Diffstat (limited to 'innobase')
100 files changed, 8282 insertions, 2995 deletions
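Editor's note: the recurring pattern in the hunks below is the new rec_get_offsets() convention that accompanies the compact record format: callers declare a small on-stack array such as "ulint offsets_[100] = { 100, };", pass it to rec_get_offsets() together with ULINT_UNDEFINED and a mem_heap_t pointer, and free the heap only if one was actually created. The following is a minimal standalone sketch of that calling convention only, written against invented stand-ins (get_offsets, rec_t, lens); it is not the InnoDB API and does not reproduce the real rec_get_offsets() signature or offset encoding.

/*
 * Standalone sketch (assumed, not InnoDB code) of the offsets-buffer
 * convention used throughout this patch: the caller supplies a small
 * stack array whose first element records its capacity; the helper
 * fills it in place when it fits and falls back to a heap allocation
 * otherwise.  The caller frees the heap only if one was created.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long ulint;

/* Hypothetical record: just a field count and per-field lengths. */
typedef struct {
	ulint	n_fields;
	ulint	lens[32];
} rec_t;

/* Fill offsets[1 + i] with the end offset of field i.  Uses the
caller's buffer if its capacity (stored in offsets[0]) suffices,
otherwise allocates a larger one and remembers it in *heap. */
static ulint*
get_offsets(const rec_t* rec, ulint* offsets, void** heap)
{
	ulint	need = 1 + rec->n_fields;
	ulint	i;
	ulint	end = 0;

	if (offsets == NULL || offsets[0] < need) {
		*heap = malloc(need * sizeof *offsets);
		offsets = *heap;
		offsets[0] = need;
	}

	for (i = 0; i < rec->n_fields; i++) {
		end += rec->lens[i];
		offsets[1 + i] = end;
	}

	return(offsets);
}

int
main(void)
{
	rec_t	rec = { 3, { 4, 8, 20 } };
	void*	heap = NULL;			/* created only on demand */
	ulint	offsets_[100] = { 100, };	/* capacity kept in element 0 */
	ulint*	offsets = offsets_;
	ulint	i;

	offsets = get_offsets(&rec, offsets, &heap);

	for (i = 0; i < rec.n_fields; i++) {
		printf("field %lu ends at %lu\n", i, offsets[1 + i]);
	}

	if (heap) {	/* mirrors the patch's "if (heap) { mem_heap_free(heap); }" */
		free(heap);
	}

	return(0);
}

The point of the convention, as far as the diff shows, is that hot B-tree functions avoid any heap allocation on the common path (few fields, stack buffer suffices) while still handling arbitrarily wide records.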
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index ae967e0525e..4fb930da50f 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -86,15 +86,6 @@ btr_page_create( page_t* page, /* in: page to be created */ dict_tree_t* tree, /* in: index tree */ mtr_t* mtr); /* in: mtr */ -/****************************************************************** -Sets the child node file address in a node pointer. */ -UNIV_INLINE -void -btr_node_ptr_set_child_page_no( -/*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr); /* in: mtr */ /**************************************************************** Returns the upper level node pointer to a page. It is assumed that mtr holds an x-latch on the tree. */ @@ -128,7 +119,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple should be first */ - dtuple_t* tuple); /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap); /* in: temporary memory heap */ /****************************************************************** Gets the root node of a tree and x-latches it. */ @@ -143,11 +137,13 @@ btr_root_get( ulint space; ulint root_page_no; page_t* root; + ibool comp = UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp; space = dict_tree_get_space(tree); root_page_no = dict_tree_get_page(tree); root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(root) == comp); return(root); } @@ -194,6 +190,7 @@ btr_get_prev_user_rec( MTR_MEMO_PAGE_S_FIX)) || (mtr_memo_contains(mtr, buf_block_align(prev_page), MTR_MEMO_PAGE_X_FIX))); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); prev_rec = page_rec_get_prev(page_get_supremum_rec(prev_page)); @@ -246,6 +243,7 @@ btr_get_next_user_rec( || (mtr_memo_contains(mtr, buf_block_align(next_page), MTR_MEMO_PAGE_X_FIX))); + ut_a(page_is_comp(next_page) == page_is_comp(page)); next_rec = page_rec_get_next(page_get_infimum_rec(next_page)); return(next_rec); @@ -267,7 +265,8 @@ btr_page_create( { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - page_create(page, mtr); + page_create(page, mtr, + UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, tree->id, mtr); @@ -503,20 +502,21 @@ UNIV_INLINE void btr_node_ptr_set_child_page_no( /*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint page_no,/* in: child node address */ + mtr_t* mtr) /* in: mtr */ { - ulint n_fields; byte* field; ulint len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr)); - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); @@ -529,16 +529,18 @@ static page_t* btr_node_ptr_get_child( /*===================*/ - /* out: child page, x-latched */ - rec_t* node_ptr, /* in: node pointer */ - mtr_t* mtr) /* in: mtr */ + /* out: child page, x-latched 
*/ + rec_t* node_ptr,/* in: node pointer */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr */ { ulint page_no; ulint space; page_t* page; - + + ut_ad(rec_offs_validate(node_ptr, NULL, offsets)); space = buf_frame_get_space_id(node_ptr); - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); page = btr_page_get(space, page_no, RW_X_LATCH, mtr); @@ -564,6 +566,9 @@ btr_page_get_father_for_rec( dtuple_t* tuple; btr_cur_t cursor; rec_t* node_ptr; + dict_index_t* index; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -576,18 +581,21 @@ btr_page_get_father_for_rec( tuple = dict_tree_build_node_ptr(tree, user_rec, 0, heap, btr_page_get_level(page, mtr)); + index = UT_LIST_GET_FIRST(tree->tree_indexes); /* In the following, we choose just any index from the tree as the first parameter for btr_cur_search_to_nth_level. */ - - btr_cur_search_to_nth_level(UT_LIST_GET_FIRST(tree->tree_indexes), + + btr_cur_search_to_nth_level(index, btr_page_get_level(page, mtr) + 1, tuple, PAGE_CUR_LE, BTR_CONT_MODIFY_TREE, &cursor, 0, mtr); node_ptr = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page)) { fputs("InnoDB: Dump of the child page:\n", stderr); buf_page_print(buf_frame_align(page)); @@ -595,17 +603,22 @@ btr_page_get_father_for_rec( buf_page_print(buf_frame_align(node_ptr)); fputs("InnoDB: Corruption of an index tree: table ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->table_name); + ut_print_name(stderr, NULL, index->table_name); fputs(", index ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->name); + ut_print_name(stderr, NULL, index->name); fprintf(stderr, ",\n" "InnoDB: father ptr page no %lu, child page no %lu\n", - (ulong) btr_node_ptr_get_child_page_no(node_ptr), + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), (ulong) buf_frame_get_page_no(page)); - page_rec_print(page_rec_get_next(page_get_infimum_rec(page))); - page_rec_print(node_ptr); + offsets = rec_get_offsets(page_rec_get_next( + page_get_infimum_rec(page)), index, + offsets, ULINT_UNDEFINED, &heap); + page_rec_print(page_rec_get_next(page_get_infimum_rec(page)), + offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); fputs( "InnoDB: You should dump + drop + reimport the table to fix the\n" @@ -614,7 +627,7 @@ btr_page_get_father_for_rec( "InnoDB: forcing recovery. 
Then dump + drop + reimport.\n", stderr); } - ut_a(btr_node_ptr_get_child_page_no(node_ptr) == + ut_a(btr_node_ptr_get_child_page_no(node_ptr, offsets) == buf_frame_get_page_no(page)); mem_heap_free(heap); @@ -649,6 +662,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr) /* in: mini-transaction handle */ { ulint page_no; @@ -716,7 +730,7 @@ btr_create( } /* Create a new index page on the the allocated segment page */ - page = page_create(frame, mtr); + page = page_create(frame, mtr, comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ @@ -821,12 +835,14 @@ static void btr_page_reorganize_low( /*====================*/ - ibool recovery,/* in: TRUE if called in recovery: locks should not - be updated, i.e., there cannot exist locks on the - page, and a hash index should not be dropped: it - cannot exist */ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + ibool recovery,/* in: TRUE if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_t* new_page; ulint log_mode; @@ -841,7 +857,9 @@ btr_page_reorganize_low( max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); /* Write the log record */ - mlog_write_initial_log_record(page, MLOG_PAGE_REORGANIZE, mtr); + mlog_open_and_write_index(mtr, page, index, index->table->comp + ? MLOG_COMP_PAGE_REORGANIZE + : MLOG_PAGE_REORGANIZE, 0); /* Turn logging off */ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); @@ -858,14 +876,14 @@ btr_page_reorganize_low( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, index->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ page_copy_rec_list_end_no_locks(page, new_page, - page_get_infimum_rec(new_page), mtr); + page_get_infimum_rec(new_page), index, mtr); /* Copy max trx id to recreated page */ page_set_max_trx_id(page, page_get_max_trx_id(new_page)); @@ -901,10 +919,11 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - btr_page_reorganize_low(FALSE, page, mtr); + btr_page_reorganize_low(FALSE, page, index, mtr); } /*************************************************************** @@ -913,18 +932,20 @@ Parses a redo log record of reorganizing a page. 
*/ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr __attribute__((unused)), /* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), + /* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ut_ad(ptr && end_ptr); /* The record is empty, except for the record initial part */ if (page) { - btr_page_reorganize_low(TRUE, page, mtr); + btr_page_reorganize_low(TRUE, page, index, mtr); } return(ptr); @@ -946,7 +967,7 @@ btr_page_empty( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; } @@ -1011,7 +1032,7 @@ btr_root_raise_and_insert( /* Move the records from root to the new page */ page_move_rec_list_end(new_page, root, page_get_infimum_rec(root), - mtr); + cursor->index, mtr); /* If this is a pessimistic insert which is actually done to perform a pessimistic update then we have stored the lock information of the record to be inserted on the infimum of the @@ -1031,7 +1052,7 @@ btr_root_raise_and_insert( node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level); /* Reorganize the root to get free space */ - btr_page_reorganize(root, mtr); + btr_page_reorganize(root, cursor->index, mtr); page_cursor = btr_cur_get_page_cur(cursor); @@ -1039,7 +1060,8 @@ btr_root_raise_and_insert( page_cur_set_before_first(root, page_cursor); - node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, mtr); + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + cursor->index, mtr); ut_ad(node_ptr_rec); @@ -1047,7 +1069,7 @@ btr_root_raise_and_insert( as there is no lower alphabetical limit to records in the leftmost node of a level: */ - btr_set_min_rec_mark(node_ptr_rec, mtr); + btr_set_min_rec_mark(node_ptr_rec, cursor->index->table->comp, mtr); /* Free the memory heap */ mem_heap_free(heap); @@ -1060,7 +1082,8 @@ btr_root_raise_and_insert( ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), new_page); /* Reposition the cursor to the child node */ - page_cur_search(new_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(new_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); /* Split the child and insert tuple */ return(btr_page_split_and_insert(cursor, tuple, mtr)); @@ -1190,11 +1213,13 @@ btr_page_get_sure_split_rec( rec_t* rec; rec_t* next_rec; ulint n; - + mem_heap_t* heap; + ulint* offsets; + page = btr_cur_get_page(cursor); - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + insert_size = rec_get_converted_size(cursor->index, tuple); + free_space = page_get_free_space_of_empty(cursor->index->table->comp); /* free_space is now the free space of a created new page */ @@ -1208,6 +1233,9 @@ btr_page_get_sure_split_rec( ins_rec = btr_cur_get_rec(cursor); rec = page_get_infimum_rec(page); + heap = NULL; + offsets = NULL; + /* We start to include records to the left half, and when the space reserved by them exceeds half of total_space, then if the included records fit on the left page, they will be put there @@ -1230,7 +1258,9 @@ btr_page_get_sure_split_rec( /* Include tuple */ incl_data += 
insert_size; } else { - incl_data += rec_get_size(rec); + offsets = rec_get_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); } n++; @@ -1252,11 +1282,16 @@ btr_page_get_sure_split_rec( next_rec = page_rec_get_next(rec); } if (next_rec != page_get_supremum_rec(page)) { - + if (heap) { + mem_heap_free(heap); + } return(next_rec); } } + if (heap) { + mem_heap_free(heap); + } return(rec); } } @@ -1275,7 +1310,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple to be inserted should be first */ - dtuple_t* tuple) /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap) /* in: temporary memory heap */ { page_t* page; ulint insert_size; @@ -1284,11 +1322,19 @@ btr_page_insert_fits( ulint total_n_recs; rec_t* rec; rec_t* end_rec; + ulint* offs; page = btr_cur_get_page(cursor); - - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + + ut_ad(!split_rec == !offsets); + ut_ad(!offsets + || cursor->index->table->comp == rec_offs_comp(offsets)); + ut_ad(!offsets + || rec_offs_validate(split_rec, cursor->index, offsets)); + ut_ad(page_is_comp(page) == cursor->index->table->comp); + + insert_size = rec_get_converted_size(cursor->index, tuple); + free_space = page_get_free_space_of_empty(cursor->index->table->comp); /* free_space is now the free space of a created new page */ @@ -1303,7 +1349,7 @@ btr_page_insert_fits( rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); - } else if (cmp_dtuple_rec(tuple, split_rec) >= 0) { + } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = split_rec; @@ -1321,11 +1367,16 @@ btr_page_insert_fits( return(TRUE); } + offs = NULL; + while (rec != end_rec) { /* In this loop we calculate the amount of reserved space after rec is removed from page. 
*/ - total_data -= rec_get_size(rec); + offs = rec_get_offsets(rec, cursor->index, offs, + ULINT_UNDEFINED, &heap); + + total_data -= rec_offs_size(offs); total_n_recs--; if (total_data + page_dir_calc_reserved_space(total_n_recs) @@ -1411,6 +1462,10 @@ btr_attach_half_pages( MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page), MTR_MEMO_PAGE_X_FIX)); + ut_a(page_is_comp(page) == page_is_comp(new_page)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); /* Based on split direction, decide upper and lower pages */ if (direction == FSP_DOWN) { @@ -1426,7 +1481,12 @@ btr_attach_half_pages( /* Replace the address of the old child node (= page) with the address of the new lower half */ - btr_node_ptr_set_child_page_no(node_ptr, lower_page_no, mtr); + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, + UT_LIST_GET_FIRST(tree->tree_indexes), + NULL, ULINT_UNDEFINED, &heap), + lower_page_no, mtr); + mem_heap_empty(heap); } else { lower_page_no = buf_frame_get_page_no(page); upper_page_no = buf_frame_get_page_no(new_page); @@ -1434,9 +1494,6 @@ btr_attach_half_pages( upper_page = new_page; } - /* Create a memory heap where the data tuple is stored */ - heap = mem_heap_create(100); - /* Get the level of the split pages */ level = btr_page_get_level(page, mtr); @@ -1465,6 +1522,7 @@ btr_attach_half_pages( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, lower_page_no, mtr); } @@ -1472,6 +1530,7 @@ btr_attach_half_pages( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, upper_page_no, mtr); } @@ -1522,7 +1581,15 @@ btr_page_split_and_insert( ibool insert_will_fit; ulint n_iterations = 0; rec_t* rec; + mem_heap_t* heap; + ulint n_uniq; + ulint* offsets; + + heap = mem_heap_create(1024); + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); func_start: + mem_heap_empty(heap); + offsets = NULL; tree = btr_cur_get_tree(cursor); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), @@ -1574,9 +1641,10 @@ func_start: first_rec = split_rec; move_limit = split_rec; } else { - buf = mem_alloc(rec_get_converted_size(tuple)); + buf = mem_alloc(rec_get_converted_size(cursor->index, tuple)); - first_rec = rec_convert_dtuple_to_rec(buf, tuple); + first_rec = rec_convert_dtuple_to_rec(buf, + cursor->index, tuple); move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); } @@ -1593,7 +1661,16 @@ func_start: We can then move the records after releasing the tree latch, thus reducing the tree latch contention. 
*/ - insert_will_fit = btr_page_insert_fits(cursor, split_rec, tuple); + if (split_rec) { + offsets = rec_get_offsets(split_rec, cursor->index, offsets, + n_uniq, &heap); + + insert_will_fit = btr_page_insert_fits(cursor, + split_rec, offsets, tuple, heap); + } else { + insert_will_fit = btr_page_insert_fits(cursor, + NULL, NULL, tuple, heap); + } if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) { @@ -1605,7 +1682,8 @@ func_start: if (direction == FSP_DOWN) { /* fputs("Split left\n", stderr); */ - page_move_rec_list_start(new_page, page, move_limit, mtr); + page_move_rec_list_start(new_page, page, move_limit, + cursor->index, mtr); left_page = new_page; right_page = page; @@ -1613,7 +1691,8 @@ func_start: } else { /* fputs("Split right\n", stderr); */ - page_move_rec_list_end(new_page, page, move_limit, mtr); + page_move_rec_list_end(new_page, page, move_limit, + cursor->index, mtr); left_page = page; right_page = new_page; @@ -1626,19 +1705,25 @@ func_start: if (split_rec == NULL) { insert_page = right_page; - } else if (cmp_dtuple_rec(tuple, first_rec) >= 0) { - - insert_page = right_page; } else { - insert_page = left_page; + offsets = rec_get_offsets(first_rec, cursor->index, + offsets, n_uniq, &heap); + + if (cmp_dtuple_rec(tuple, first_rec, offsets) >= 0) { + + insert_page = right_page; + } else { + insert_page = left_page; + } } /* 7. Reposition the cursor for insert and try insertion */ page_cursor = btr_cur_get_page_cur(cursor); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec != NULL) { /* Insert fit on the page: update the free bits for the @@ -1650,15 +1735,17 @@ func_start: /* fprintf(stderr, "Split and insert done %lu %lu\n", buf_frame_get_page_no(left_page), buf_frame_get_page_no(right_page)); */ + mem_heap_free(heap); return(rec); } /* 8. If insert did not fit, try page reorganization */ - btr_page_reorganize(insert_page, mtr); + btr_page_reorganize(insert_page, cursor->index, mtr); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec == NULL) { /* The insert did not fit on the page: loop back to the @@ -1688,6 +1775,7 @@ func_start: ut_ad(page_validate(left_page, UT_LIST_GET_FIRST(tree->tree_indexes))); ut_ad(page_validate(right_page, UT_LIST_GET_FIRST(tree->tree_indexes))); + mem_heap_free(heap); return(rec); } @@ -1721,6 +1809,7 @@ btr_level_list_remove( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, next_page_no, mtr); } @@ -1728,6 +1817,7 @@ btr_level_list_remove( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, prev_page_no, mtr); } @@ -1741,9 +1831,11 @@ void btr_set_min_rec_mark_log( /*=====================*/ rec_t* rec, /* in: record */ + ibool comp, /* TRUE=compact record format */ mtr_t* mtr) /* in: mtr */ { - mlog_write_initial_log_record(rec, MLOG_REC_MIN_MARK, mtr); + mlog_write_initial_log_record(rec, + comp ? 
MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr); /* Write rec offset as a 2-byte ulint */ mlog_catenate_ulint(mtr, rec - buf_frame_align(rec), MLOG_2BYTES); @@ -1759,6 +1851,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -1772,7 +1865,7 @@ btr_parse_set_min_rec_mark( if (page) { rec = page + mach_read_from_2(ptr); - btr_set_min_rec_mark(rec, mtr); + btr_set_min_rec_mark(rec, comp, mtr); } return(ptr + 2); @@ -1785,15 +1878,16 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr) /* in: mtr */ { ulint info_bits; - info_bits = rec_get_info_bits(rec); + info_bits = rec_get_info_bits(rec, comp); - rec_set_info_bits(rec, info_bits | REC_INFO_MIN_REC_FLAG); + rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG); - btr_set_min_rec_mark_log(rec, mtr); + btr_set_min_rec_mark_log(rec, comp, mtr); } /***************************************************************** @@ -1842,18 +1936,19 @@ btr_lift_page_up( record from the page should be removed */ mtr_t* mtr) /* in: mtr */ { - rec_t* node_ptr; - page_t* father_page; - ulint page_level; - + page_t* father_page; + ulint page_level; + dict_index_t* index; + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); - father_page = buf_frame_align(node_ptr); + father_page = buf_frame_align( + btr_page_get_father_node_ptr(tree, page, mtr)); page_level = btr_page_get_level(page, mtr); + index = UT_LIST_GET_FIRST(tree->tree_indexes); btr_search_drop_page_hash_index(page); @@ -1862,7 +1957,7 @@ btr_lift_page_up( /* Move records to the father */ page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page), - mtr); + index, mtr); lock_update_copy_and_discard(father_page, page); btr_page_set_level(father_page, page_level, mtr); @@ -1871,10 +1966,8 @@ btr_lift_page_up( btr_page_free(tree, page, mtr); /* We play safe and reset the free bits for the father */ - ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), - father_page); - ut_ad(page_validate(father_page, - UT_LIST_GET_FIRST(tree->tree_indexes))); + ibuf_reset_free_bits(index, father_page); + ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(tree, father_page, mtr)); } @@ -1914,9 +2007,11 @@ btr_compress( ulint max_ins_size; ulint max_ins_size_reorg; ulint level; - + ibool comp = cursor->index->table->comp; + page = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); + ut_a(comp == page_is_comp(page)); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -1932,7 +2027,9 @@ btr_compress( right_page_no); */ node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); + ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); father_page = buf_frame_align(node_ptr); + ut_a(comp == page_is_comp(father_page)); /* Decide the page to which we try to merge and which will inherit the locks */ @@ -1957,6 +2054,7 @@ btr_compress( n_recs = page_get_n_recs(page); data_size = page_get_data_size(page); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); max_ins_size_reorg = page_get_max_insert_size_after_reorganize( merge_page, n_recs); @@ -1975,7 +2073,7 @@ btr_compress( /* 
We have to reorganize merge_page */ - btr_page_reorganize(merge_page, mtr); + btr_page_reorganize(merge_page, cursor->index, mtr); max_ins_size = page_get_max_insert_size(merge_page, n_recs); @@ -1999,11 +2097,18 @@ btr_compress( if (is_left) { btr_node_ptr_delete(tree, page, mtr); } else { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; /* Replace the address of the old child node (= page) with the address of the merge page to the right */ - btr_node_ptr_set_child_page_no(node_ptr, right_page_no, mtr); - + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, cursor->index, + offsets_, ULINT_UNDEFINED, &heap), + right_page_no, mtr); + if (heap) { + mem_heap_free(heap); + } btr_node_ptr_delete(tree, merge_page, mtr); } @@ -2012,14 +2117,14 @@ btr_compress( orig_pred = page_rec_get_prev( page_get_supremum_rec(merge_page)); page_copy_rec_list_start(merge_page, page, - page_get_supremum_rec(page), mtr); + page_get_supremum_rec(page), cursor->index, mtr); lock_update_merge_left(merge_page, orig_pred, page); } else { orig_succ = page_rec_get_next( page_get_infimum_rec(merge_page)); page_copy_rec_list_end(merge_page, page, - page_get_infimum_rec(page), mtr); + page_get_infimum_rec(page), cursor->index, mtr); lock_update_merge_right(orig_succ, page); } @@ -2133,6 +2238,7 @@ btr_discard_page( return; } + ut_a(page_is_comp(merge_page) == page_is_comp(page)); btr_search_drop_page_hash_index(page); if (left_page_no == FIL_NULL && btr_page_get_level(page, mtr) > 0) { @@ -2144,7 +2250,8 @@ btr_discard_page( ut_ad(node_ptr != page_get_supremum_rec(merge_page)); - btr_set_min_rec_mark(node_ptr, mtr); + btr_set_min_rec_mark(node_ptr, + cursor->index->table->comp, mtr); } btr_node_ptr_delete(tree, page, mtr); @@ -2215,6 +2322,8 @@ btr_print_recursive( page_t* page, /* in: index page */ ulint width, /* in: print this many entries from start and end */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ + ulint** offsets,/* in/out: buffer for rec_get_offsets() */ mtr_t* mtr) /* in: mtr */ { page_cur_t cursor; @@ -2223,14 +2332,16 @@ btr_print_recursive( mtr_t mtr2; rec_t* node_ptr; page_t* child; - + dict_index_t* index; + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", (ulong) btr_page_get_level(page, mtr), (ulong) buf_frame_get_page_no(page)); - page_print(page, width, width); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + page_print(page, index, width, width); n_recs = page_get_n_recs(page); @@ -2249,9 +2360,12 @@ btr_print_recursive( node_ptr = page_cur_get_rec(&cursor); - child = btr_node_ptr_get_child(node_ptr, &mtr2); - - btr_print_recursive(tree, child, width, &mtr2); + *offsets = rec_get_offsets(node_ptr, index, *offsets, + ULINT_UNDEFINED, heap); + child = btr_node_ptr_get_child(node_ptr, + *offsets, &mtr2); + btr_print_recursive(tree, child, width, + heap, offsets, &mtr2); mtr_commit(&mtr2); } @@ -2270,8 +2384,12 @@ btr_print_tree( ulint width) /* in: print this many entries from start and end */ { - mtr_t mtr; - page_t* root; + mtr_t mtr; + page_t* root; + mem_heap_t* heap = NULL; + ulint offsets_[100] + = { 100, }; + ulint* offsets = offsets_; fputs("--------------------------\n" "INDEX TREE PRINT\n", stderr); @@ -2280,7 +2398,10 @@ btr_print_tree( root = btr_root_get(tree, &mtr); - btr_print_recursive(tree, root, width, &mtr); + btr_print_recursive(tree, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } mtr_commit(&mtr); @@ -2323,7 +2444,10 @@ 
btr_check_node_ptr( page_rec_get_next(page_get_infimum_rec(page)), 0, heap, btr_page_get_level(page, mtr)); - ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr) == 0); + ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr, + rec_get_offsets(node_ptr, + dict_tree_find_index(tree, node_ptr), + NULL, ULINT_UNDEFINED, &heap)) == 0); mem_heap_free(heap); @@ -2360,10 +2484,14 @@ btr_index_rec_validate( should print hex dump of record and page on error */ { - ulint len; - ulint n; - ulint i; - page_t* page; + ulint len; + ulint n; + ulint i; + page_t* page; + mem_heap_t* heap = NULL; + ulint offsets_[100] + = { 100, }; + ulint* offsets = offsets_; page = buf_frame_align(rec); @@ -2377,36 +2505,35 @@ btr_index_rec_validate( n = dict_index_get_n_fields(index); - if (rec_get_n_fields(rec) != n) { + if (!index->table->comp && rec_get_n_fields_old(rec) != n) { btr_index_rec_validate_report(page, rec, index); fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", - (ulong) rec_get_n_fields(rec), (ulong) n); + (ulong) rec_get_n_fields_old(rec), (ulong) n); - if (!dump_on_error) { + if (dump_on_error) { + buf_page_print(page); - return(FALSE); + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); } - - buf_page_print(page); - - fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); - putc('\n', stderr); - return(FALSE); } + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + for (i = 0; i < n; i++) { dtype_t* type = dict_index_get_nth_type(index, i); + ulint fixed_size = dtype_get_fixed_size(type); - rec_get_nth_field(rec, i, &len); + rec_get_nth_field(rec, offsets, i, &len); /* Note that prefix indexes are not fixed size even when their type is CHAR. */ if ((dict_index_get_nth_field(index, i)->prefix_len == 0 - && len != UNIV_SQL_NULL && dtype_is_fixed_size(type) - && len != dtype_get_fixed_size(type)) + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) || (dict_index_get_nth_field(index, i)->prefix_len > 0 && len != UNIV_SQL_NULL @@ -2418,21 +2545,23 @@ btr_index_rec_validate( "InnoDB: field %lu len is %lu, should be %lu\n", (ulong) i, (ulong) len, (ulong) dtype_get_fixed_size(type)); - if (!dump_on_error) { - - return(FALSE); - } - - buf_page_print(page); - - fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); - putc('\n', stderr); + if (dump_on_error) { + buf_page_print(page); + fputs("InnoDB: corrupt record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + if (heap) { + mem_heap_free(heap); + } return(FALSE); } } + if (heap) { + mem_heap_free(heap); + } return(TRUE); } @@ -2527,15 +2656,18 @@ btr_validate_level( page_t* right_father_page; rec_t* node_ptr; rec_t* right_node_ptr; + rec_t* rec; ulint right_page_no; ulint left_page_no; page_cur_t cursor; - mem_heap_t* heap; dtuple_t* node_ptr_tuple; ibool ret = TRUE; dict_index_t* index; mtr_t mtr; - + mem_heap_t* heap = mem_heap_create(256); + ulint* offsets = NULL; + ulint* offsets2= NULL; + mtr_start(&mtr); mtr_x_lock(dict_tree_get_lock(tree), &mtr); @@ -2544,6 +2676,8 @@ btr_validate_level( space = buf_frame_get_space_id(page); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + while (level != btr_page_get_level(page, &mtr)) { ut_a(btr_page_get_level(page, &mtr) > 0); @@ -2552,14 +2686,16 @@ btr_validate_level( page_cur_move_to_next(&cursor); node_ptr = page_cur_get_rec(&cursor); - page = btr_node_ptr_get_child(node_ptr, &mtr); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page = 
btr_node_ptr_get_child(node_ptr, offsets, &mtr); } - index = UT_LIST_GET_FIRST(tree->tree_indexes); - /* Now we are on the desired level. Loop through the pages on that level. */ loop: + mem_heap_empty(heap); + offsets = offsets2 = NULL; mtr_x_lock(dict_tree_get_lock(tree), &mtr); /* Check ordering etc. of records */ @@ -2588,12 +2724,19 @@ loop: (buf_frame_get_page_no(page) == dict_tree_get_page(tree)))); if (right_page_no != FIL_NULL) { - + rec_t* right_rec; right_page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); - if (cmp_rec_rec(page_rec_get_prev(page_get_supremum_rec(page)), - page_rec_get_next(page_get_infimum_rec(right_page)), - UT_LIST_GET_FIRST(tree->tree_indexes)) >= 0) { + ut_a(page_is_comp(right_page) == page_is_comp(page)); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + offsets = rec_get_offsets(rec, index, + offsets, ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, &heap); + if (cmp_rec_rec(rec, right_rec, offsets, offsets2, index) + >= 0) { btr_validate_report2(index, level, page, right_page); @@ -2604,12 +2747,13 @@ loop: buf_page_print(right_page); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_prev( - page_get_supremum_rec(page))); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); putc('\n', stderr); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(right_page))); + rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + rec_print(stderr, rec, index); putc('\n', stderr); ret = FALSE; @@ -2618,7 +2762,8 @@ loop: if (level > 0 && left_page_no == FIL_NULL) { ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - page_rec_get_next(page_get_infimum_rec(page)))); + page_rec_get_next(page_get_infimum_rec(page)), + index->table->comp)); } if (buf_frame_get_page_no(page) != dict_tree_get_page(tree)) { @@ -2627,12 +2772,14 @@ loop: node_ptr = btr_page_get_father_node_ptr(tree, page, &mtr); father_page = buf_frame_align(node_ptr); + offsets = rec_get_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, &heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page) || node_ptr != btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)) { + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr)) { btr_validate_report1(index, level, page); fputs("InnoDB: node pointer to the page is wrong\n", @@ -2642,17 +2789,18 @@ loop: buf_page_print(page); fputs("InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print_new(stderr, node_ptr, offsets); fprintf(stderr, "\n" "InnoDB: node ptr child page n:o %lu\n", - (unsigned long) btr_node_ptr_get_child_page_no(node_ptr)); + (unsigned long) btr_node_ptr_get_child_page_no( + node_ptr, offsets)); fputs("InnoDB: record on page ", stderr); - rec_print(stderr, - btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)); + rec = btr_page_get_father_for_rec(tree, page, + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr); + rec_print(stderr, rec, index); putc('\n', stderr); ret = FALSE; @@ -2660,7 +2808,8 @@ loop: } if (btr_page_get_level(page, &mtr) > 0) { - heap = mem_heap_create(256); + offsets = rec_get_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, &heap); node_ptr_tuple = dict_tree_build_node_ptr( tree, @@ -2669,7 
+2818,10 @@ loop: 0, heap, btr_page_get_level(page, &mtr)); - if (cmp_dtuple_rec(node_ptr_tuple, node_ptr) != 0) { + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); btr_validate_report1(index, level, page); @@ -2679,18 +2831,14 @@ loop: fputs("InnoDB: Error: node ptrs differ" " on levels > 0\n" "InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print_new(stderr, node_ptr, offsets); fputs("InnoDB: first rec ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(page))); + rec_print(stderr, first_rec, index); putc('\n', stderr); ret = FALSE; - mem_heap_free(heap); goto node_ptr_fails; } - - mem_heap_free(heap); } if (left_page_no == FIL_NULL) { @@ -2701,7 +2849,7 @@ loop: if (right_page_no == FIL_NULL) { ut_a(node_ptr == page_rec_get_prev( - page_get_supremum_rec(father_page))); + page_get_supremum_rec(father_page))); ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); } @@ -2771,13 +2919,16 @@ node_ptr_fails: mtr_commit(&mtr); if (right_page_no != FIL_NULL) { + ibool comp = page_is_comp(page); mtr_start(&mtr); page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); + ut_a(page_is_comp(page) == comp); goto loop; } + mem_heap_free(heap); return(ret); } diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 48de5644908..4c2a501a08a 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -73,8 +73,9 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*********************************************************************** Adds path information to the cursor for the current page, for which the binary search has been performed. 
*/ @@ -96,6 +97,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -108,9 +110,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec); /* in: record */ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*==================== B-TREE SEARCH =========================*/ @@ -137,11 +140,13 @@ btr_cur_latch_leaves( if (latch_mode == BTR_SEARCH_LEAF) { get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_LEAF) { get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_TREE) { @@ -152,11 +157,13 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { get_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; right_page_no = btr_page_get_next(page, mtr); @@ -176,11 +183,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_PREV) { @@ -191,11 +201,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else { ut_error; @@ -261,6 +274,9 @@ btr_cur_search_to_nth_level( #ifdef BTR_CUR_ADAPT btr_search_t* info; #endif + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; /* Currently, PAGE_CUR_LE is the only search mode used for searches ending to upper levels */ @@ -379,7 +395,7 @@ btr_cur_search_to_nth_level( page_mode = mode; break; } - + /* Loop and search until we arrive at the desired level */ for (;;) { @@ -414,7 +430,9 @@ retry_page_get: cursor->thr)) { /* Insertion to the insert buffer succeeded */ cursor->flag = BTR_CUR_INSERT_TO_IBUF; - + if (heap) { + mem_heap_free(heap); + } return; } @@ -470,9 +488,9 @@ retry_page_get: page_mode = mode; } - page_cur_search_with_match(page, tuple, page_mode, &up_match, - &up_bytes, &low_match, &low_bytes, - 
page_cursor); + page_cur_search_with_match(page, index, tuple, page_mode, + &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); if (estimate) { btr_cur_add_path_info(cursor, height, root_height); } @@ -486,7 +504,9 @@ retry_page_get: if (level > 0) { /* x-latch the page */ - btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(btr_page_get(space, + page_no, RW_X_LATCH, mtr)) + == index->table->comp); } break; @@ -498,9 +518,14 @@ retry_page_get: guess = NULL; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (heap) { + mem_heap_free(heap); } if (level == 0) { @@ -552,6 +577,9 @@ btr_cur_open_at_index_side( rec_t* node_ptr; ulint estimate; ulint savepoint; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; estimate = latch_mode & BTR_ESTIMATE; latch_mode = latch_mode & ~BTR_ESTIMATE; @@ -576,7 +604,7 @@ btr_cur_open_at_index_side( page_no = dict_tree_get_page(tree); height = ULINT_UNDEFINED; - + for (;;) { page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET, @@ -645,9 +673,14 @@ btr_cur_open_at_index_side( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (heap) { + mem_heap_free(heap); } } @@ -669,6 +702,9 @@ btr_cur_open_at_rnd_pos( ulint space; ulint height; rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; tree = index->tree; @@ -717,9 +753,14 @@ btr_cur_open_at_rnd_pos( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (heap) { + mem_heap_free(heap); } } @@ -758,18 +799,20 @@ btr_cur_insert_if_possible( page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (!rec) { /* If record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, cursor->index, mtr); *reorg = TRUE; - page_cur_search(page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, mtr); } return(rec); @@ -887,8 +930,6 @@ btr_cur_optimistic_insert( ibool reorg; ibool inherit; ulint rec_size; - ulint data_size; - ulint extra_size; ulint type; ulint err; @@ -914,13 +955,11 @@ btr_cur_optimistic_insert( calculate_sizes_again: /* Calculate the record size when entry is converted to a record */ - data_size = dtuple_get_data_size(entry); - extra_size = rec_get_converted_extra_size(data_size, - dtuple_get_n_fields(entry)); - rec_size = data_size + extra_size; + rec_size = rec_get_converted_size(index, entry); - if ((rec_size >= page_get_free_space_of_empty() / 2) - || (rec_size >= REC_MAX_DATA_SIZE)) { + if (rec_size >= + 
ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -983,19 +1022,18 @@ calculate_sizes_again: /* Now, try the insert */ - *rec = page_cur_insert_rec_low(page_cursor, entry, data_size, - NULL, mtr); + *rec = page_cur_insert_rec_low(page_cursor, entry, index, NULL, mtr); if (!(*rec)) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); ut_ad(page_get_max_insert_size(page, 1) == max_size); reorg = TRUE; - page_cur_search(page, entry, PAGE_CUR_LE, page_cursor); + page_cur_search(page, index, entry, PAGE_CUR_LE, page_cursor); - *rec = page_cur_tuple_insert(page_cursor, entry, mtr); + *rec = page_cur_tuple_insert(page_cursor, entry, index, mtr); if (!*rec) { fputs("InnoDB: Error: cannot insert tuple ", stderr); @@ -1123,9 +1161,9 @@ btr_cur_pessimistic_insert( } } - if ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE)) { + if (rec_get_converted_size(index, entry) >= + ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -1212,8 +1250,14 @@ btr_cur_upd_lock_and_undo( err = DB_SUCCESS; if (!(flags & BTR_NO_LOCKING_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; err = lock_clust_rec_modify_check_and_lock(flags, rec, index, - thr); + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), thr); + if (heap) { + mem_heap_free(heap); + } if (err != DB_SUCCESS) { return(err); @@ -1243,14 +1287,17 @@ btr_cur_update_in_place_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(flags < 256); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); - - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_UPDATE_IN_PLACE, log_ptr, mtr); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + MLOG_BUF_MARGIN); - mach_write_to_1(log_ptr, flags); - log_ptr++; + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } /* The code below assumes index is a clustered index: change index to the clustered index if we are updating a secondary index record (or we @@ -1259,6 +1306,9 @@ btr_cur_update_in_place_log( index = dict_table_get_first_index(index->table); + mach_write_to_1(log_ptr, flags); + log_ptr++; + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, mtr); mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); @@ -1273,10 +1323,11 @@ Parses a redo log record of updating a record in-place. 
*/ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index) /* in: index corresponding to page */ { ulint flags; rec_t* rec; @@ -1286,6 +1337,7 @@ btr_cur_parse_update_in_place( dulint roll_ptr; ulint rec_offset; mem_heap_t* heap; + ulint* offsets; if (end_ptr < ptr + 1) { @@ -1333,11 +1385,14 @@ btr_cur_parse_update_in_place( /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. */ + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, roll_ptr); + row_upd_rec_sys_fields_in_recovery(rec, offsets, + pos, trx_id, roll_ptr); } - row_upd_rec_in_place(rec, update); + row_upd_rec_in_place(rec, offsets, update); mem_heap_free(heap); @@ -1369,14 +1424,19 @@ btr_cur_update_in_place( dulint roll_ptr = ut_dulint_zero; trx_t* trx; ibool was_delete_marked; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; rec = btr_cur_get_rec(cursor); index = cursor->index; trx = thr_get_trx(thr); - + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(trx, index, "update "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } /* Do lock checking and undo logging */ @@ -1384,6 +1444,9 @@ btr_cur_update_in_place( thr, &roll_ptr); if (err != DB_SUCCESS) { + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -1405,15 +1468,15 @@ btr_cur_update_in_place( } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } /* FIXME: in a mixed tree, all records may not have enough ordering fields for btr search: */ - was_delete_marked = rec_get_deleted_flag(rec); - - row_upd_rec_in_place(rec, update); + was_delete_marked = rec_get_deleted_flag(rec, index->table->comp); + + row_upd_rec_in_place(rec, offsets, update); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -1421,13 +1484,16 @@ btr_cur_update_in_place( btr_cur_update_in_place_log(flags, rec, index, update, trx, roll_ptr, mtr); - if (was_delete_marked && !rec_get_deleted_flag(rec)) { + if (was_delete_marked && !rec_get_deleted_flag(rec, index->table->comp)) { /* The new updated record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } + if (heap) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -1469,24 +1535,28 @@ btr_cur_optimistic_update( mem_heap_t* heap; ibool reorganized = FALSE; ulint i; - + ulint* offsets; + page = btr_cur_get_page(cursor); rec = btr_cur_get_rec(cursor); index = cursor->index; + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "update "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - if (!row_upd_changes_field_size_or_external(rec, index, update)) { + if 
(!row_upd_changes_field_size_or_external(index, offsets, update)) { /* The simplest and the most common case: the update does not change the size of any field and none of the updated fields is externally stored in rec or update */ - + mem_heap_free(heap); return(btr_cur_update_in_place(flags, cursor, update, cmpl_info, thr, mtr)); } @@ -1497,29 +1567,30 @@ btr_cur_optimistic_update( /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } } - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + if (rec_offs_any_extern(offsets)) { /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } page_cursor = btr_cur_get_page_cur(cursor); - heap = mem_heap_create(1024); - new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, NULL); - old_rec_size = rec_get_size(rec); - new_rec_size = rec_get_converted_size(new_entry); + old_rec_size = rec_offs_size(offsets); + new_rec_size = rec_get_converted_size(index, new_entry); - if (new_rec_size >= page_get_free_space_of_empty() / 2) { + if (new_rec_size >= + page_get_free_space_of_empty(index->table->comp) / 2) { mem_heap_free(heap); @@ -1570,7 +1641,7 @@ btr_cur_optimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, mtr); page_cur_move_to_prev(page_cursor); @@ -1587,11 +1658,13 @@ btr_cur_optimistic_update( ut_a(rec); /* <- We calculated above the insert would fit */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } /* Restore the old explicit lock state on the record */ @@ -1690,6 +1763,7 @@ btr_cur_pessimistic_update( ulint* ext_vect; ulint n_ext_vect; ulint reserve_flag; + ulint* offsets = NULL; *big_rec = NULL; @@ -1743,6 +1817,7 @@ btr_cur_pessimistic_update( } heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); trx = thr_get_trx(thr); @@ -1767,28 +1842,29 @@ btr_cur_pessimistic_update( ut_a(big_rec_vec == NULL); - btr_rec_free_updated_extern_fields(index, rec, update, - TRUE, mtr); + btr_rec_free_updated_extern_fields(index, rec, offsets, + update, TRUE, mtr); } /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ - ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update); - - if ((rec_get_converted_size(new_entry) >= - page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) { + ext_vect = mem_heap_alloc(heap, sizeof(ulint) + * dict_index_get_n_fields(index)); + ut_ad(!cursor->index->table->comp || !rec_get_node_ptr_flag(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); + + if (rec_get_converted_size(index, new_entry) >= + ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { big_rec_vec = dtuple_convert_big_rec(index, new_entry, ext_vect, n_ext_vect); if (big_rec_vec == NULL) { - 
mem_heap_free(heap); - err = DB_TOO_BIG_RECORD; - goto return_after_reservations; } } @@ -1808,7 +1884,7 @@ btr_cur_pessimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, mtr); page_cur_move_to_prev(page_cursor); @@ -1817,21 +1893,22 @@ btr_cur_pessimistic_update( ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + lock_rec_restore_from_page_infimum(rec, page); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, + ext_vect, n_ext_vect, mtr); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } btr_cur_compress_if_useful(cursor, mtr); err = DB_SUCCESS; - mem_heap_free(heap); - goto return_after_reservations; } @@ -1856,13 +1933,14 @@ btr_cur_pessimistic_update( ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } lock_rec_restore_from_page_infimum(rec, page); @@ -1876,9 +1954,8 @@ btr_cur_pessimistic_update( btr_cur_pess_upd_restore_supremum(rec, mtr); } - mem_heap_free(heap); - return_after_reservations: + mem_heap_free(heap); if (n_extents > 0) { fil_space_release_free_extents(cursor->index->space, @@ -1908,11 +1985,18 @@ btr_cur_del_mark_set_clust_rec_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(flags < 256); + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + 14 + 2); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_CLUST_DELETE_MARK, log_ptr, mtr); + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } mach_write_to_1(log_ptr, flags); log_ptr++; @@ -1934,10 +2018,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page) /* in: page or NULL */ { ulint flags; ibool val; @@ -1978,15 +2063,22 @@ btr_cur_parse_del_mark_set_clust_rec( rec = page + offset; if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, - roll_ptr); + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + row_upd_rec_sys_fields_in_recovery(rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + pos, trx_id, roll_ptr); + if (heap) { + mem_heap_free(heap); + } } /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); } return(ptr); @@ -2015,22 +2107,30 @@ btr_cur_del_mark_set_clust_rec( ulint err; rec_t* rec; trx_t* trx; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + rec = btr_cur_get_rec(cursor); index = cursor->index; - + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); } ut_ad(index->type & DICT_CLUSTERED); - ut_ad(rec_get_deleted_flag(rec) == FALSE); + ut_ad(rec_get_deleted_flag(rec, index->table->comp) == FALSE); - err = lock_clust_rec_modify_check_and_lock(flags, rec, index, thr); + err = lock_clust_rec_modify_check_and_lock(flags, + rec, index, offsets, thr); if (err != DB_SUCCESS) { + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -2039,6 +2139,9 @@ btr_cur_del_mark_set_clust_rec( &roll_ptr); if (err != DB_SUCCESS) { + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -2048,13 +2151,12 @@ btr_cur_del_mark_set_clust_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); trx = thr_get_trx(thr); if (!(flags & BTR_KEEP_SYS_FLAG)) { - - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } if (block->is_hashed) { @@ -2063,6 +2165,9 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, roll_ptr, mtr); + if (heap) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -2073,16 +2178,24 @@ UNIV_INLINE void btr_cur_del_mark_set_sec_rec_log( /*=============================*/ - rec_t* rec, /* in: record */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_SEC_DELETE_MARK + : MLOG_REC_SEC_DELETE_MARK, + 1 + 2); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } mach_write_to_1(log_ptr, val); log_ptr++; @@ -2100,10 +2213,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page) /* in: page or NULL */ { ibool val; ulint offset; @@ -2129,7 +2243,7 @@ btr_cur_parse_del_mark_set_sec_rec( is only being recovered, and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); } return(ptr); @@ -2158,7 +2272,7 @@ btr_cur_del_mark_set_sec_rec( if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), cursor->index, "del mark "); - rec_print(stderr, rec); + rec_print(stderr, rec, cursor->index); } err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index, @@ -2174,13 +2288,13 @@ btr_cur_del_mark_set_sec_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, cursor->index->table->comp, val); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); } - btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); + btr_cur_del_mark_set_sec_rec_log(rec, cursor->index, val, mtr); return(DB_SUCCESS); } @@ -2192,15 +2306,16 @@ used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { /* We do not need to reserve btr_search_latch, as the page has just been read to the buffer pool and there cannot be a hash index to it. */ - rec_set_deleted_flag(rec, FALSE); + rec_set_deleted_flag(rec, index->table->comp, FALSE); - btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); + btr_cur_del_mark_set_sec_rec_log(rec, index, FALSE, mtr); } /*==================== B-TREE RECORD REMOVE =========================*/ @@ -2279,8 +2394,12 @@ btr_cur_optimistic_delete( successor of the deleted record */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - ulint max_ins_size; + page_t* page; + ulint max_ins_size; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); @@ -2290,26 +2409,34 @@ btr_cur_optimistic_delete( ut_ad(btr_page_get_level(page, mtr) == 0); - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { - - return(FALSE); - } + rec = btr_cur_get_rec(cursor); + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); - if (btr_cur_can_delete_without_compress(cursor, mtr)) { + if (!rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr)) { - lock_update_delete(btr_cur_get_rec(cursor)); + lock_update_delete(rec); btr_search_update_hash_on_delete(cursor); max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, mtr); ibuf_update_free_bits_low(cursor->index, page, max_ins_size, mtr); + if (heap) { + mem_heap_free(heap); + } return(TRUE); } + if (heap) { + mem_heap_free(heap); + } return(FALSE); } @@ -2375,8 +2502,21 @@ btr_cur_pessimistic_delete( } } - btr_rec_free_externally_stored_fields(cursor->index, - btr_cur_get_rec(cursor), in_rollback, mtr); + heap = mem_heap_create(256); + rec = btr_cur_get_rec(cursor); + + /* Free externally stored fields if the record is neither + a node pointer nor in two-byte format. + This avoids unnecessary calls to rec_get_offsets(). */ + if (cursor->index->table->comp + ? 
!rec_get_node_ptr_flag(rec) + : !rec_get_1byte_offs_flag(rec)) { + btr_rec_free_externally_stored_fields(cursor->index, + rec, rec_get_offsets(rec, cursor->index, + NULL, ULINT_UNDEFINED, &heap), + in_rollback, mtr); + mem_heap_empty(heap); + } if ((page_get_n_recs(page) < 2) && (dict_tree_get_page(btr_cur_get_tree(cursor)) @@ -2393,8 +2533,6 @@ btr_cur_pessimistic_delete( goto return_after_reservations; } - rec = btr_cur_get_rec(cursor); - lock_update_delete(rec); if ((btr_page_get_level(page, mtr) > 0) @@ -2406,7 +2544,8 @@ btr_cur_pessimistic_delete( non-leaf level, we must mark the new leftmost node pointer as the predefined minimum record */ - btr_set_min_rec_mark(page_rec_get_next(rec), mtr); + btr_set_min_rec_mark(page_rec_get_next(rec), + cursor->index->table->comp, mtr); } else { /* Otherwise, if we delete the leftmost node pointer on a page, we have to change the father node pointer @@ -2415,8 +2554,6 @@ btr_cur_pessimistic_delete( btr_node_ptr_delete(tree, page, mtr); - heap = mem_heap_create(256); - node_ptr = dict_tree_build_node_ptr( tree, page_rec_get_next(rec), buf_frame_get_page_no(page), @@ -2425,20 +2562,19 @@ btr_cur_pessimistic_delete( btr_insert_on_non_leaf_level(tree, btr_page_get_level(page, mtr) + 1, node_ptr, mtr); - - mem_heap_free(heap); } } btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, mtr); ut_ad(btr_check_node_ptr(tree, page, mtr)); *err = DB_SUCCESS; return_after_reservations: + mem_heap_free(heap); if (ret == FALSE) { ret = btr_cur_compress_if_useful(cursor, mtr); @@ -2663,6 +2799,11 @@ btr_estimate_number_of_different_key_vals( ulint j; ulint add_on; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets1_[100] = { 100, }; + ulint offsets2_[100] = { 100, }; + ulint* offsets1 = offsets1_; + ulint* offsets2 = offsets2_; n_cols = dict_index_get_n_unique(index); @@ -2697,10 +2838,16 @@ btr_estimate_number_of_different_key_vals( while (rec != page_get_supremum_rec(page) && page_rec_get_next(rec) != page_get_supremum_rec(page)) { + rec_t* next_rec = page_rec_get_next(rec); matched_fields = 0; matched_bytes = 0; + offsets1 = rec_get_offsets(rec, index, offsets1, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(next_rec, index, offsets2, + n_cols, &heap); - cmp_rec_rec_with_match(rec, page_rec_get_next(rec), + cmp_rec_rec_with_match(rec, next_rec, + offsets1, offsets2, index, &matched_fields, &matched_bytes); @@ -2712,7 +2859,8 @@ btr_estimate_number_of_different_key_vals( } total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len( + rec, offsets1); rec = page_rec_get_next(rec); } @@ -2736,8 +2884,11 @@ btr_estimate_number_of_different_key_vals( } } + offsets1 = rec_get_offsets(rec, index, offsets1, + ULINT_UNDEFINED, &heap); total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len(rec, + offsets1); mtr_commit(&mtr); } @@ -2778,6 +2929,9 @@ btr_estimate_number_of_different_key_vals( } mem_free(n_diff); + if (heap) { + mem_heap_free(heap); + } } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ @@ -2788,9 +2942,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec) /* in: record */ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets)/* in: array 
returned by rec_get_offsets() */ { ulint n_fields; byte* data; @@ -2799,17 +2954,13 @@ btr_rec_get_externally_stored_len( ulint total_extern_len = 0; ulint i; - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return(0); - } - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); local_len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -2830,16 +2981,17 @@ static void btr_cur_set_ownership_of_extern_field( /*==================================*/ - rec_t* rec, /* in: clustered index record */ - ulint i, /* in: field number */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: clustered index record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint i, /* in: field number */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* data; ulint local_len; ulint byte_val; - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); @@ -2866,19 +3018,22 @@ to free the field. */ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr) /* in: mtr */ { ibool is_updated; ulint n; ulint j; ulint i; - - n = rec_get_n_fields(rec); + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -2894,8 +3049,8 @@ btr_cur_mark_extern_inherited_fields( } if (!is_updated) { - btr_cur_set_ownership_of_extern_field(rec, i, - FALSE, mtr); + btr_cur_set_ownership_of_extern_field(rec, + offsets, i, FALSE, mtr); } } } @@ -2967,18 +3122,20 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n; ulint i; - n = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { - - btr_cur_set_ownership_of_extern_field(rec, i, + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field(rec, offsets, i, TRUE, mtr); } } @@ -3028,10 +3185,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated + ulint* ext_vect,/* in: array of ulints, must be preallocated to have space for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update) /* in: update vector or NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* 
update) /* in: update vector or NULL */ { ulint n_pushed = 0; ibool is_updated; @@ -3054,10 +3211,10 @@ btr_push_update_extern_fields( } } - n = rec_get_n_fields(rec); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -3119,6 +3276,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr @@ -3139,6 +3297,7 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), @@ -3152,8 +3311,8 @@ btr_store_big_rec_extern_fields( for (i = 0; i < big_rec_vec->n_fields; i++) { - data = rec_get_nth_field(rec, big_rec_vec->fields[i].field_no, - &local_len); + data = rec_get_nth_field(rec, offsets, + big_rec_vec->fields[i].field_no, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; extern_len = big_rec_vec->fields[i].len; @@ -3254,7 +3413,7 @@ btr_store_big_rec_extern_fields( /* Set the bit denoting that this field in rec is stored externally */ - rec_set_nth_field_extern_bit(rec, + rec_set_nth_field_extern_bit(rec, index, big_rec_vec->fields[i].field_no, TRUE, &mtr); } @@ -3407,6 +3566,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -3419,21 +3579,18 @@ btr_rec_free_externally_stored_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return; - } - /* Free possible externally stored fields in the record */ - n_fields = rec_get_n_fields(rec); + ut_ad(index->table->comp == rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3450,6 +3607,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -3463,13 +3621,10 @@ btr_rec_free_updated_extern_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - return; - } - /* Free possible externally stored fields in the record */ n_fields = upd_get_n_fields(update); @@ -3477,9 +3632,10 @@ btr_rec_free_updated_extern_fields( for 
(i = 0; i < n_fields; i++) { ufield = upd_get_nth_field(update, i); - if (rec_get_nth_field_extern_bit(rec, ufield->field_no)) { + if (rec_offs_nth_extern(offsets, ufield->field_no)) { - data = rec_get_nth_field(rec, ufield->field_no, &len); + data = rec_get_nth_field(rec, offsets, + ufield->field_no, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3583,7 +3739,8 @@ byte* btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ - rec_t* rec, /* in: record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap) /* in: mem heap */ @@ -3591,7 +3748,8 @@ btr_rec_copy_externally_stored_field( ulint local_len; byte* data; - ut_a(rec_get_nth_field_extern_bit(rec, no)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(rec_offs_nth_extern(offsets, no)); /* An externally stored field can contain some initial data from the field, and in the last 20 bytes it has the @@ -3602,7 +3760,7 @@ btr_rec_copy_externally_stored_field( limit so that field offsets are stored in two bytes, and the extern bit is available in those two bytes. */ - data = rec_get_nth_field(rec, no, &local_len); + data = rec_get_nth_field(rec, offsets, no, &local_len); return(btr_copy_externally_stored_field(len, data, local_len, heap)); } diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index cf8a612ef28..ceaa4f41a18 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -45,12 +45,12 @@ btr_pcur_free_for_mysql( mem_free(cursor->old_rec_buf); - cursor->old_rec = NULL; cursor->old_rec_buf = NULL; } cursor->btr_cur.page_cur.rec = NULL; cursor->old_rec = NULL; + cursor->old_n_fields = 0; cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; cursor->latch_mode = BTR_NO_LATCHES; @@ -133,9 +133,10 @@ btr_pcur_store_position( cursor->old_stored = BTR_PCUR_OLD_STORED; cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec, - &(cursor->old_rec_buf), - &(cursor->buf_size)); - + &cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); + cursor->block_when_stored = buf_block_align(page); cursor->modify_clock = buf_frame_get_modify_clock(page); } @@ -166,6 +167,8 @@ btr_pcur_copy_stored_position( pcur_receive->old_rec = pcur_receive->old_rec_buf + (pcur_donate->old_rec - pcur_donate->old_rec_buf); } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; } /****************************************************************** @@ -228,6 +231,7 @@ btr_pcur_restore_position( } ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); page = btr_cur_get_page(btr_pcur_get_btr_cur(cursor)); @@ -242,17 +246,31 @@ btr_pcur_restore_position( buf_page_dbg_add_level(page, SYNC_TREE_NODE); #endif /* UNIV_SYNC_DEBUG */ if (cursor->rel_pos == BTR_PCUR_ON) { - +#ifdef UNIV_DEBUG + rec_t* rec; + ulint* offsets1; + ulint* offsets2; + dict_index_t* index; +#endif /* UNIV_DEBUG */ cursor->latch_mode = latch_mode; - - ut_ad(cmp_rec_rec(cursor->old_rec, - btr_pcur_get_rec(cursor), - dict_tree_find_index( - btr_cur_get_tree( +#ifdef UNIV_DEBUG + rec = btr_pcur_get_rec(cursor); + index = dict_tree_find_index( + btr_cur_get_tree( btr_pcur_get_btr_cur(cursor)), - btr_pcur_get_rec(cursor))) - == 0); + rec); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets(cursor->old_rec, + index, NULL, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets(rec, index, NULL, + 
cursor->old_n_fields, &heap); + ut_ad(cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, index) == 0); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ return(TRUE); } @@ -265,7 +283,8 @@ btr_pcur_restore_position( heap = mem_heap_create(256); tree = btr_cur_get_tree(btr_pcur_get_btr_cur(cursor)); - tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, heap); + tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, + cursor->old_n_fields, heap); /* Save the old search mode of the cursor */ old_mode = cursor->search_mode; @@ -287,7 +306,10 @@ btr_pcur_restore_position( if (cursor->rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(cursor, mtr) - && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) { + && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + btr_pcur_get_btr_cur(cursor)->index, + NULL, ULINT_UNDEFINED, &heap))) { /* We have to store the NEW value for the modify clock, since the cursor can now be on a different page! But we can retain @@ -376,6 +398,7 @@ btr_pcur_move_to_next_page( ut_ad(next_page_no != FIL_NULL); next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); buf_block_align(next_page)->check_index_page_at_flush = TRUE; btr_leaf_page_release(page, cursor->latch_mode, mtr); diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index 9384168df88..dc712f650e7 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -411,11 +411,16 @@ btr_search_update_hash_ref( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) || rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + ut_ad(buf_block_align(btr_cur_get_rec(cursor)) == block); + ut_a(!block->is_hashed || block->index == cursor->index); + if (block->is_hashed && (info->n_hash_potential > 0) && (block->curr_n_fields == info->n_fields) && (block->curr_n_bytes == info->n_bytes) && (block->curr_side == info->side)) { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; rec = btr_cur_get_rec(cursor); @@ -425,10 +430,13 @@ btr_search_update_hash_ref( } tree_id = ((cursor->index)->tree)->id; - - fold = rec_fold(rec, block->curr_n_fields, - block->curr_n_bytes, tree_id); - + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, + offsets_, ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, tree_id); + if (heap) { + mem_heap_free(heap); + } #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ @@ -535,15 +543,19 @@ btr_search_check_guess( or PAGE_CUR_GE */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - rec_t* rec; - rec_t* prev_rec; - rec_t* next_rec; - ulint n_unique; - ulint match; - ulint bytes; - int cmp; - + page_t* page; + rec_t* rec; + rec_t* prev_rec; + rec_t* next_rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ibool success = FALSE; + n_unique = dict_index_get_n_unique_in_tree(cursor->index); rec = btr_cur_get_rec(cursor); @@ -554,45 +566,43 @@ btr_search_check_guess( match = 0; bytes = 0; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &match, &bytes); + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); if (mode == PAGE_CUR_GE) { if (cmp == 1) { - - return(FALSE); + goto exit_func; } cursor->up_match = match; if (match >= n_unique) { - - return(TRUE); + success = TRUE; 
+ goto exit_func; } } else if (mode == PAGE_CUR_LE) { if (cmp == -1) { - - return(FALSE); + goto exit_func; } cursor->low_match = match; } else if (mode == PAGE_CUR_G) { if (cmp != -1) { - - return(FALSE); + goto exit_func; } } else if (mode == PAGE_CUR_L) { if (cmp != 1) { - - return(FALSE); + goto exit_func; } } if (can_only_compare_to_cursor_rec) { /* Since we could not determine if our guess is right just by looking at the record under the cursor, return FALSE */ - - return(FALSE); + goto exit_func; } match = 0; @@ -605,30 +615,21 @@ btr_search_check_guess( prev_rec = page_rec_get_prev(rec); if (prev_rec == page_get_infimum_rec(page)) { - - if (btr_page_get_prev(page, mtr) != FIL_NULL) { - - return(FALSE); - } - - return(TRUE); + success = btr_page_get_prev(page, mtr) == FIL_NULL; + goto exit_func; } + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + n_unique, &heap); cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, - &match, &bytes); + offsets, &match, &bytes); if (mode == PAGE_CUR_GE) { - if (cmp != 1) { - - return(FALSE); - } + success = cmp == 1; } else { - if (cmp == -1) { - - return(FALSE); - } + success = cmp != -1; } - return(TRUE); + goto exit_func; } ut_ad(rec != page_get_supremum_rec(page)); @@ -636,34 +637,30 @@ btr_search_check_guess( next_rec = page_rec_get_next(rec); if (next_rec == page_get_supremum_rec(page)) { - if (btr_page_get_next(page, mtr) == FIL_NULL) { cursor->up_match = 0; - - return(TRUE); + success = TRUE; } - return(FALSE); + goto exit_func; } - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &match, &bytes); - + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, &bytes); if (mode == PAGE_CUR_LE) { - if (cmp != -1) { - - return(FALSE); - } - + success = cmp == -1; cursor->up_match = match; } else { - if (cmp == 1) { - - return(FALSE); - } + success = cmp != 1; } - - return(TRUE); +exit_func: + if (heap) { + mem_heap_free(heap); + } + return(success); } /********************************************************************** @@ -926,6 +923,8 @@ btr_search_drop_page_hash_index( ulint n_recs; ulint* folds; ulint i; + mem_heap_t* heap; + ulint* offsets; #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); @@ -973,10 +972,10 @@ btr_search_drop_page_hash_index( rec = page_rec_get_next(rec); if (rec != sup) { - ut_a(n_fields <= rec_get_n_fields(rec)); + ut_a(n_fields <= rec_get_n_fields(rec, block->index)); if (n_bytes > 0) { - ut_a(n_fields < rec_get_n_fields(rec)); + ut_a(n_fields < rec_get_n_fields(rec, block->index)); } } @@ -984,11 +983,15 @@ btr_search_drop_page_hash_index( prev_fold = 0; + heap = NULL; + offsets = NULL; + while (rec != sup) { /* FIXME: in a mixed tree, not all records may have enough ordering fields: */ - - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(rec, block->index, + offsets, n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); if (fold == prev_fold && prev_fold != 0) { @@ -1005,6 +1008,10 @@ next_rec: prev_fold = fold; } + if (heap) { + mem_heap_free(heap); + } + rw_lock_x_lock(&btr_search_latch); for (i = 0; i < n_cached; i++) { @@ -1013,6 +1020,7 @@ next_rec: } block->is_hashed = FALSE; + block->index = NULL; rw_lock_x_unlock(&btr_search_latch); @@ -1069,8 +1077,7 @@ static void btr_search_build_page_hash_index( /*=============================*/ - dict_index_t* index, /* in: index for which to build, or NULL 
if - not known */ + dict_index_t* index, /* in: index for which to build */ page_t* page, /* in: index page, s- or x-latched */ ulint n_fields,/* in: hash this many full fields */ ulint n_bytes,/* in: hash this many bytes from the next @@ -1090,7 +1097,12 @@ btr_search_build_page_hash_index( ulint* folds; rec_t** recs; ulint i; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + + ut_ad(index); + block = buf_block_align(page); table = btr_search_sys->hash_index; @@ -1127,9 +1139,9 @@ btr_search_build_page_hash_index( return; } - if (index && (dict_index_get_n_unique_in_tree(index) < n_fields + if (dict_index_get_n_unique_in_tree(index) < n_fields || (dict_index_get_n_unique_in_tree(index) == n_fields - && n_bytes > 0))) { + && n_bytes > 0)) { return; } @@ -1148,18 +1160,20 @@ btr_search_build_page_hash_index( rec = page_get_infimum_rec(page); rec = page_rec_get_next(rec); + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + if (rec != sup) { - ut_a(n_fields <= rec_get_n_fields(rec)); + ut_a(n_fields <= rec_offs_n_fields(offsets)); if (n_bytes > 0) { - ut_a(n_fields < rec_get_n_fields(rec)); + ut_a(n_fields < rec_offs_n_fields(offsets)); } } /* FIXME: in a mixed tree, all records may not have enough ordering fields: */ - - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); if (side == BTR_SEARCH_LEFT_SIDE) { @@ -1183,7 +1197,10 @@ btr_search_build_page_hash_index( break; } - next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); if (fold != next_fold) { /* Insert an entry into the hash index */ @@ -1211,13 +1228,7 @@ btr_search_build_page_hash_index( if (block->is_hashed && ((block->curr_n_fields != n_fields) || (block->curr_n_bytes != n_bytes) || (block->curr_side != side))) { - - rw_lock_x_unlock(&btr_search_latch); - - mem_free(folds); - mem_free(recs); - - return; + goto exit_func; } block->is_hashed = TRUE; @@ -1226,16 +1237,21 @@ btr_search_build_page_hash_index( block->curr_n_fields = n_fields; block->curr_n_bytes = n_bytes; block->curr_side = side; + block->index = index; for (i = 0; i < n_cached; i++) { ha_insert_for_fold(table, folds[i], recs[i]); } +exit_func: rw_lock_x_unlock(&btr_search_latch); mem_free(folds); mem_free(recs); + if (heap) { + mem_heap_free(heap); + } } /************************************************************************ @@ -1247,10 +1263,13 @@ parameters as page (this often happens when a page is split). 
*/ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page) /* in: index page from which records were - copied, and the copied records will be deleted - from this page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index) /* in: record descriptor */ { buf_block_t* block; buf_block_t* new_block; @@ -1260,11 +1279,14 @@ btr_search_move_or_delete_hash_entries( block = buf_block_align(page); new_block = buf_block_align(new_page); + ut_a(page_is_comp(page) == page_is_comp(new_page)); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + ut_a(!new_block->is_hashed || new_block->index == index); + ut_a(!block->is_hashed || block->index == index); rw_lock_s_lock(&btr_search_latch); @@ -1290,8 +1312,8 @@ btr_search_move_or_delete_hash_entries( rw_lock_s_unlock(&btr_search_latch); ut_a(n_fields + n_bytes > 0); - - btr_search_build_page_hash_index(NULL, new_page, n_fields, + + btr_search_build_page_hash_index(index, new_page, n_fields, n_bytes, side); ut_a(n_fields == block->curr_n_fields); ut_a(n_bytes == block->curr_n_bytes); @@ -1319,6 +1341,8 @@ btr_search_update_hash_on_delete( ulint fold; dulint tree_id; ibool found; + ulint offsets_[100] = { 100, }; + mem_heap_t* heap = NULL; rec = btr_cur_get_rec(cursor); @@ -1333,14 +1357,18 @@ btr_search_update_hash_on_delete( return; } + ut_a(block->index == cursor->index); ut_a(block->curr_n_fields + block->curr_n_bytes > 0); table = btr_search_sys->hash_index; tree_id = cursor->index->tree->id; - - fold = rec_fold(rec, block->curr_n_fields, block->curr_n_bytes, - tree_id); + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_, + ULINT_UNDEFINED, &heap), block->curr_n_fields, + block->curr_n_bytes, tree_id); + if (heap) { + mem_heap_free(heap); + } rw_lock_x_lock(&btr_search_latch); found = ha_search_and_delete_if_found(table, fold, rec); @@ -1376,6 +1404,8 @@ btr_search_update_hash_node_on_insert( return; } + ut_a(block->index == cursor->index); + rw_lock_x_lock(&btr_search_latch); if ((cursor->flag == BTR_CUR_HASH) @@ -1420,7 +1450,10 @@ btr_search_update_hash_on_insert( ulint n_fields; ulint n_bytes; ulint side; - ibool locked = FALSE; + ibool locked = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; table = btr_search_sys->hash_index; @@ -1439,6 +1472,8 @@ btr_search_update_hash_on_insert( return; } + ut_a(block->index == cursor->index); + tree_id = ((cursor->index)->tree)->id; n_fields = block->curr_n_fields; @@ -1449,15 +1484,21 @@ btr_search_update_hash_on_insert( next_rec = page_rec_get_next(ins_rec); page = buf_frame_align(rec); - - ins_fold = rec_fold(ins_rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(ins_rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, tree_id); if (next_rec != page_get_supremum_rec(page)) { - next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); } if (rec != page_get_infimum_rec(page)) { - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + offsets 
= rec_get_offsets(rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); } else { if (side == BTR_SEARCH_LEFT_SIDE) { @@ -1527,6 +1568,9 @@ check_next_rec: } function_exit: + if (heap) { + mem_heap_free(heap); + } if (locked) { rw_lock_x_unlock(&btr_search_latch); } @@ -1546,6 +1590,9 @@ btr_search_validate(void) ulint n_page_dumps = 0; ibool ok = TRUE; ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; rw_lock_x_lock(&btr_search_latch); @@ -1555,9 +1602,14 @@ btr_search_validate(void) while (node != NULL) { block = buf_block_align(node->data); page = buf_frame_align(node->data); + offsets = rec_get_offsets((rec_t*) node->data, + block->index, offsets, + block->curr_n_fields + + (block->curr_n_bytes > 0), &heap); if (!block->is_hashed || node->fold != rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))) { @@ -1573,12 +1625,14 @@ btr_search_validate(void) (ulong) ut_dulint_get_low(btr_page_get_index_id(page)), (ulong) node->fold, (ulong) rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))); fputs("InnoDB: Record ", stderr); - rec_print(stderr, (rec_t*)(node->data)); + rec_print_new(stderr, (rec_t*)node->data, + offsets); fprintf(stderr, "\nInnoDB: on that page." "Page mem address %p, is hashed %lu, n fields %lu, n bytes %lu\n" "side %lu\n", @@ -1602,6 +1656,9 @@ btr_search_validate(void) } rw_lock_x_unlock(&btr_search_latch); + if (heap) { + mem_heap_free(heap); + } return(ok); } diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 699ad5fb42e..89f851709db 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -331,33 +331,43 @@ buf_page_is_corrupted( } } #endif - old_checksum = buf_calc_page_old_checksum(read_buf); - - old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE + + /* If we use checksums validation, make additional check before returning + TRUE to ensure that the checksum is not equal to BUF_NO_CHECKSUM_MAGIC which + might be stored by InnoDB with checksums disabled. + Otherwise, skip checksum calculation and return FALSE */ + + if (srv_use_checksums) { + old_checksum = buf_calc_page_old_checksum(read_buf); + + old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); - /* There are 2 valid formulas for old_checksum_field: - 1. Very old versions of InnoDB only stored 8 byte lsn to the start - and the end of the page. - 2. Newer InnoDB versions store the old formula checksum there. */ + /* There are 2 valid formulas for old_checksum_field: + 1. Very old versions of InnoDB only stored 8 byte lsn to the start + and the end of the page. + 2. Newer InnoDB versions store the old formula checksum there. 
*/ - if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) - && old_checksum_field != old_checksum) { + if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && old_checksum_field != old_checksum + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC) { - return(TRUE); - } + return(TRUE); + } - checksum = buf_calc_page_new_checksum(read_buf); - checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + checksum = buf_calc_page_new_checksum(read_buf); + checksum_field = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM); - /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id - (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ - if (checksum_field != 0 && checksum_field != checksum) { - - return(TRUE); - } + if (checksum_field != 0 && checksum_field != checksum + && checksum_field != BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + } + return(FALSE); } @@ -379,8 +389,10 @@ buf_page_print( ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE); fputs("InnoDB: End of page dump\n", stderr); - checksum = buf_calc_page_new_checksum(read_buf); - old_checksum = buf_calc_page_old_checksum(read_buf); + checksum = srv_use_checksums ? + buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + old_checksum = srv_use_checksums ? + buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; ut_print_timestamp(stderr); fprintf(stderr, @@ -460,6 +472,7 @@ buf_block_init( block->file_page_was_freed = FALSE; block->check_index_page_at_flush = FALSE; + block->index = NULL; block->in_free_list = FALSE; block->in_LRU_list = FALSE; @@ -547,7 +560,7 @@ buf_pool_init( } /*----------------------------------------*/ } else { - buf_pool->frame_mem = ut_malloc_low( + buf_pool->frame_mem = os_mem_alloc_large( UNIV_PAGE_SIZE * (n_frames + 1), TRUE, FALSE); } @@ -1535,6 +1548,7 @@ buf_page_init( block->offset = offset; block->check_index_page_at_flush = FALSE; + block->index = NULL; block->lock_hash_val = lock_rec_hash(space, offset); block->lock_mutex = NULL; @@ -2137,6 +2151,31 @@ buf_print(void) } /************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void) +{ + buf_block_t* block; + ulint i; + ulint fixed_pages_number = 0; + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (((block->buf_fix_count != 0) || (block->io_fix != 0)) && + block->magic_n == BUF_BLOCK_MAGIC_N ) + fixed_pages_number++; + } + + mutex_exit(&(buf_pool->mutex)); + return fixed_pages_number; +} + +/************************************************************************* Returns the number of pending buf pool ios. 
*/ ulint diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 964c396dd08..a0ca614d9b3 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -273,6 +273,10 @@ buf_flush_buffered_writes(void) } } + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written+= trx_doublewrite->first_free; + srv_dblwr_writes++; + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { @@ -444,7 +448,8 @@ buf_flush_init_for_writing( /* Store the new formula checksum */ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, - buf_calc_page_new_checksum(page)); + srv_use_checksums ? + buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC); /* We overwrite the first 4 bytes of the end lsn field to store the old formula checksum. Since it depends also on the field @@ -452,7 +457,8 @@ buf_flush_init_for_writing( new formula checksum. */ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, - buf_calc_page_old_checksum(page)); + srv_use_checksums ? + buf_calc_page_old_checksum(page) : BUF_NO_CHECKSUM_MAGIC); } /************************************************************************ @@ -901,6 +907,9 @@ buf_flush_batch( (ulong) page_count); } + if (page_count != ULINT_UNDEFINED) + srv_buf_pool_flushed+= page_count; + return(page_count); } diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index f3fb19ae183..8460a049d3e 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -465,6 +465,7 @@ loop: /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); + ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 58287d37387..1ce52c8ee31 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -20,6 +20,10 @@ Created 11/5/1995 Heikki Tuuri #include "os0file.h" #include "srv0start.h" +extern ulint srv_read_ahead_rnd; +extern ulint srv_read_ahead_seq; +extern ulint srv_buf_pool_reads; + /* The size in blocks of the area where the random read-ahead algorithm counts the accessed pages when deciding whether to read-ahead */ #define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA @@ -291,6 +295,7 @@ buf_read_ahead_random( (ulong) count); } + ++srv_read_ahead_rnd; return(count); } @@ -323,6 +328,7 @@ buf_read_page( count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, tablespace_version, offset); + srv_buf_pool_reads+= count2; if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, @@ -575,6 +581,7 @@ buf_read_ahead_linear( (ulong) space, (ulong) offset, (ulong) count); } + ++srv_read_ahead_seq; return(count); } diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index 97ec1a1acd9..25ba19d0296 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -500,7 +500,7 @@ dtuple_convert_big_rec( ut_a(dtuple_check_typed_no_assert(entry)); - size = rec_get_converted_size(entry); + size = rec_get_converted_size(index, entry); if (size > 1000000000) { fprintf(stderr, @@ -524,9 +524,10 @@ dtuple_convert_big_rec( n_fields = 0; - while ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) { + while (rec_get_converted_size(index, entry) + >= ut_min(page_get_free_space_of_empty( + index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { longest = 0; for (i = dict_index_get_n_unique_in_tree(index); diff --git a/innobase/data/data0type.c 
b/innobase/data/data0type.c index dab14df4240..00048bf6fbb 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -206,7 +206,7 @@ dtype_validate( ut_a((type->mtype >= DATA_VARCHAR) && (type->mtype <= DATA_MYSQL)); if (type->mtype == DATA_SYS) { - ut_a(type->prtype <= DATA_MIX_ID); + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); } return(TRUE); diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index f156cf67a18..0f6d55c9341 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -158,7 +158,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_TABLES_ID, mtr); + DICT_HDR_SPACE, DICT_TABLES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -168,7 +168,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, - DICT_TABLE_IDS_ID, mtr); + DICT_TABLE_IDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -178,7 +178,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_COLUMNS_ID, mtr); + DICT_HDR_SPACE, DICT_COLUMNS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -188,7 +188,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_INDEXES_ID, mtr); + DICT_HDR_SPACE, DICT_INDEXES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -198,7 +198,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_FIELDS_ID, mtr); + DICT_HDR_SPACE, DICT_FIELDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -223,6 +223,7 @@ dict_boot(void) dict_index_t* index; dict_hdr_t* dict_hdr; mtr_t mtr; + ibool success; mtr_start(&mtr); @@ -254,7 +255,7 @@ dict_boot(void) /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ - table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE,8); + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, FALSE); dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -275,22 +276,22 @@ dict_boot(void) dict_mem_index_add_field(index, "NAME", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLES, - MLOG_4BYTES, &mtr); index->id = DICT_TABLES_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_TABLES, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ index = dict_mem_index_create("SYS_TABLES", "ID_IND", DICT_HDR_SPACE, DICT_UNIQUE, 1); dict_mem_index_add_field(index, "ID", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_TABLE_IDS, - MLOG_4BYTES, &mtr); index->id = DICT_TABLE_IDS_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_TABLE_IDS, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_COLUMNS",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 
0); @@ -311,12 +312,12 @@ dict_boot(void) dict_mem_index_add_field(index, "TABLE_ID", 0, 0); dict_mem_index_add_field(index, "POS", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_COLUMNS, - MLOG_4BYTES, &mtr); index->id = DICT_COLUMNS_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_COLUMNS, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_INDEXES",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -333,6 +334,9 @@ dict_boot(void) #if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2 #error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2" #endif +#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2 +#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2" +#endif table->id = DICT_INDEXES_ID; dict_table_add_to_cache(table); @@ -344,12 +348,12 @@ dict_boot(void) dict_mem_index_add_field(index, "TABLE_ID", 0, 0); dict_mem_index_add_field(index, "ID", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_INDEXES, - MLOG_4BYTES, &mtr); index->id = DICT_INDEXES_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_INDEXES, MLOG_4BYTES, &mtr)); + ut_a(success); /*-------------------------*/ - table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE,3); + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, FALSE); dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); @@ -365,10 +369,10 @@ dict_boot(void) dict_mem_index_add_field(index, "INDEX_ID", 0, 0); dict_mem_index_add_field(index, "POS", 0, 0); - index->page_no = mtr_read_ulint(dict_hdr + DICT_HDR_FIELDS, - MLOG_4BYTES, &mtr); index->id = DICT_FIELDS_ID; - ut_a(dict_index_add_to_cache(table, index)); + success = dict_index_add_to_cache(table, index, mtr_read_ulint( + dict_hdr + DICT_HDR_FIELDS, MLOG_4BYTES, &mtr)); + ut_a(success); mtr_commit(&mtr); /*-------------------------*/ diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index cbdc0aab53c..3c496bae5b4 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -84,7 +84,8 @@ dict_create_sys_tables_tuple( dfield = dtuple_get_nth_field(entry, 5); ptr = mem_heap_alloc(heap, 4); - mach_write_to_4(ptr, table->mix_len); + mach_write_to_4(ptr, (table->mix_len & 0x7fffffff) | + ((ulint) table->comp << 31)); dfield_set_data(dfield, ptr, 4); /* 8: CLUSTER_NAME ---------------------*/ @@ -543,9 +544,7 @@ dict_build_index_def_step( table in the same tablespace */ index->space = table->space; - - index->page_no = FIL_NULL; - + node->page_no = FIL_NULL; row = dict_create_sys_indexes_tuple(index, node->heap); node->ind_row = row; @@ -623,18 +622,18 @@ dict_create_index_tree_step( btr_pcur_move_to_next_user_rec(&pcur, &mtr); - index->page_no = btr_create(index->type, index->space, index->id, - &mtr); + node->page_no = btr_create(index->type, index->space, index->id, + table->comp, &mtr); /* printf("Created a new index tree in space %lu root page %lu\n", index->space, index->page_no); */ page_rec_write_index_page_no(btr_pcur_get_rec(&pcur), DICT_SYS_INDEXES_PAGE_NO_FIELD, - index->page_no, &mtr); + node->page_no, &mtr); btr_pcur_close(&pcur); mtr_commit(&mtr); - if (index->page_no == FIL_NULL) { + if (node->page_no == 
FIL_NULL) { return(DB_OUT_OF_FILE_SPACE); } @@ -660,8 +659,9 @@ dict_drop_index_tree( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_a(!dict_sys->sys_indexes->comp); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); ut_ad(len == 4); @@ -673,8 +673,9 @@ dict_drop_index_tree( return; } - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); - + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + ut_ad(len == 4); space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); @@ -699,8 +700,103 @@ dict_drop_index_tree( root_page_no); */ btr_free_root(space, root_page_no, mtr); - page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, - FIL_NULL, mtr); + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, FIL_NULL, mtr); +} + +/*********************************************************************** +Truncates the index tree associated with a row in SYS_INDEXES table. */ + +void +dict_truncate_index_tree( +/*=====================*/ + dict_table_t* table, /* in: the table the index belongs to */ + rec_t* rec, /* in: record in the clustered index of + SYS_INDEXES table */ + mtr_t* mtr) /* in: mtr having the latch + on the record page */ +{ + ulint root_page_no; + ulint space; + ulint type; + dulint index_id; + byte* ptr; + ulint len; + ibool comp; + dict_index_t* index; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(!dict_sys->sys_indexes->comp); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (root_page_no == FIL_NULL) { + /* The tree has been freed. */ + + return; + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + + ut_ad(len == 4); + + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (!fil_tablespace_exists_in_mem(space)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + return; + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_TYPE_FIELD, &len); + ut_ad(len == 4); + type = mach_read_from_4(ptr); + + ptr = rec_get_nth_field_old(rec, 1, &len); + ut_ad(len == 8); + index_id = mach_read_from_8(ptr); + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we create the b-tree and write its new root page number to the + appropriate field in the SYS_INDEXES record: this mini-transaction + marks the B-tree totally truncated */ + + comp = page_is_comp(btr_page_get( + space, root_page_no, RW_X_LATCH, mtr)); + + btr_free_root(space, root_page_no, mtr); + + /* Find the index corresponding to this SYS_INDEXES record. 
*/ + for (index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (!ut_dulint_cmp(index->id, index_id)) { + break; + } + } + + root_page_no = btr_create(type, space, index_id, comp, mtr); + if (index) { + index->tree->page = root_page_no; + } + + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, + root_page_no, mtr); } /************************************************************************* @@ -759,6 +855,7 @@ ind_create_graph_create( node->index = index; node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; node->heap = mem_heap_create(256); node->ind_def = ins_node_create(INS_DIRECT, @@ -978,7 +1075,8 @@ dict_create_index_step( if (node->state == INDEX_ADD_TO_CACHE) { - success = dict_index_add_to_cache(node->table, node->index); + success = dict_index_add_to_cache(node->table, node->index, + node->page_no); ut_a(success); diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index 2e6504cac11..12749f7704f 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -814,23 +814,22 @@ dict_table_add_to_cache( system columns. */ dict_mem_table_add_col(table, "DB_ROW_ID", DATA_SYS, - DATA_ROW_ID, 0, 0); + DATA_ROW_ID | DATA_NOT_NULL, DATA_ROW_ID_LEN, 0); #if DATA_ROW_ID != 0 #error "DATA_ROW_ID != 0" #endif dict_mem_table_add_col(table, "DB_TRX_ID", DATA_SYS, - DATA_TRX_ID, 0, 0); + DATA_TRX_ID | DATA_NOT_NULL, DATA_TRX_ID_LEN, 0); #if DATA_TRX_ID != 1 #error "DATA_TRX_ID != 1" #endif dict_mem_table_add_col(table, "DB_ROLL_PTR", DATA_SYS, - DATA_ROLL_PTR, 0, 0); + DATA_ROLL_PTR | DATA_NOT_NULL, DATA_ROLL_PTR_LEN, 0); #if DATA_ROLL_PTR != 2 #error "DATA_ROLL_PTR != 2" #endif - dict_mem_table_add_col(table, "DB_MIX_ID", DATA_SYS, - DATA_MIX_ID, 0, 0); + DATA_MIX_ID | DATA_NOT_NULL, DATA_MIX_ID_LEN, 0); #if DATA_MIX_ID != 3 #error "DATA_MIX_ID != 3" #endif @@ -1375,8 +1374,9 @@ dict_index_add_to_cache( /*====================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table on which the index is */ - dict_index_t* index) /* in, own: index; NOTE! The index memory + dict_index_t* index, /* in, own: index; NOTE! The index memory object is freed in this function! */ + ulint page_no)/* in: root page number of the index */ { dict_index_t* new_index; dict_tree_t* tree; @@ -1462,10 +1462,9 @@ dict_index_add_to_cache( tree = dict_index_get_tree( UT_LIST_GET_FIRST(cluster->indexes)); new_index->tree = tree; - new_index->page_no = tree->page; } else { /* Create an index tree memory object for the index */ - tree = dict_tree_create(new_index); + tree = dict_tree_create(new_index, page_no); ut_ad(tree); new_index->tree = tree; @@ -1588,7 +1587,7 @@ dict_index_find_cols( /*********************************************************************** Adds a column to index. */ -UNIV_INLINE + void dict_index_add_col( /*===============*/ @@ -1604,6 +1603,34 @@ dict_index_add_col( field = dict_index_get_nth_field(index, index->n_def - 1); field->col = col; + field->fixed_len = dtype_get_fixed_size(&col->type); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = prefix_len; + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. 
*/ + + if (field->fixed_len > DICT_MAX_COL_PREFIX_LEN) { + field->fixed_len = 0; + } + + if (!(dtype_get_prtype(&col->type) & DATA_NOT_NULL)) { + index->n_nullable++; + } + + if (index->n_def > 1) { + const dict_field_t* field2 = + dict_index_get_nth_field(index, index->n_def - 2); + field->fixed_offs = (!field2->fixed_len || + field2->fixed_offs == ULINT_UNDEFINED) + ? ULINT_UNDEFINED + : field2->fixed_len + field2->fixed_offs; + } else { + field->fixed_offs = 0; + } } /*********************************************************************** @@ -1722,7 +1749,6 @@ dict_index_build_internal_clust( new_index->n_user_defined_cols = index->n_fields; new_index->id = index->id; - new_index->page_no = index->page_no; if (table->type != DICT_TABLE_ORDINARY) { /* The index is mixed: copy common key prefix fields */ @@ -1901,7 +1927,6 @@ dict_index_build_internal_non_clust( new_index->n_user_defined_cols = index->n_fields; new_index->id = index->id; - new_index->page_no = index->page_no; /* Copy fields from index to new_index */ dict_index_copy(new_index, index, 0, index->n_fields); @@ -3538,9 +3563,10 @@ dict_tree_t* dict_tree_create( /*=============*/ /* out, own: created tree */ - dict_index_t* index) /* in: the index for which to create: in the + dict_index_t* index, /* in: the index for which to create: in the case of a mixed tree, this should be the index of the cluster object */ + ulint page_no)/* in: root page number of the index */ { dict_tree_t* tree; @@ -3550,7 +3576,7 @@ dict_tree_create( tree->type = index->type; tree->space = index->space; - tree->page = index->page_no; + tree->page = page_no; tree->id = index->id; @@ -3604,9 +3630,10 @@ dict_tree_find_index_low( && (table->type != DICT_TABLE_ORDINARY)) { /* Get the mix id of the record */ + ut_a(!table->comp); mix_id = mach_dulint_read_compressed( - rec_get_nth_field(rec, table->mix_len, &len)); + rec_get_nth_field_old(rec, table->mix_len, &len)); while (ut_dulint_cmp(table->mix_id, mix_id) != 0) { @@ -3685,6 +3712,29 @@ dict_tree_find_index_for_tuple( return(index); } +/*********************************************************************** +Checks if a table which is a mixed cluster member owns a record. */ + +ibool +dict_is_mixed_table_rec( +/*====================*/ + /* out: TRUE if the record belongs to this + table */ + dict_table_t* table, /* in: table in a mixed cluster */ + rec_t* rec) /* in: user record in the clustered index */ +{ + byte* mix_id_field; + ulint len; + + ut_ad(!table->comp); + + mix_id_field = rec_get_nth_field_old(rec, + table->mix_len, &len); + + return(len == table->mix_id_len + && !ut_memcmp(table->mix_id_buf, mix_id_field, len)); +} + /************************************************************************** Checks that a tuple has n_fields_cmp value in a sensible range, so that no comparison can occur with the page number field in a node pointer. 
*/ @@ -3739,7 +3789,8 @@ dict_tree_build_node_ptr( on non-leaf levels we remove the last field, which contains the page number of the child page */ - n_unique = rec_get_n_fields(rec); + ut_a(!ind->table->comp); + n_unique = rec_get_n_fields_old(rec); if (level > 0) { ut_a(n_unique > 1); @@ -3768,9 +3819,11 @@ dict_tree_build_node_ptr( field = dtuple_get_nth_field(tuple, n_unique); dfield_set_data(field, buf, 4); - dtype_set(dfield_get_type(field), DATA_SYS_CHILD, 0, 0, 0); + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4, 0); - rec_copy_prefix_to_dtuple(tuple, rec, n_unique, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) | + REC_STATUS_NODE_PTR); ut_ad(dtuple_check_typed(tuple)); @@ -3787,27 +3840,26 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size)/* in/out: buffer size */ { - dict_index_t* ind; - rec_t* order_rec; - ulint n_fields; - - ind = dict_tree_find_index_low(tree, rec); + dict_index_t* index; + ulint n; - n_fields = dict_index_get_n_unique_in_tree(ind); - - if (tree->type & DICT_UNIVERSAL) { + index = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + if (tree->type & DICT_UNIVERSAL) { + ut_a(!index->table->comp); + n = rec_get_n_fields_old(rec); + } else { + n = dict_index_get_n_unique_in_tree(index); } - order_rec = rec_copy_prefix_to_buf(rec, n_fields, buf, buf_size); - - return(order_rec); -} + *n_fields = n; + return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); +} /************************************************************************** Builds a typed data tuple out of a physical record. */ @@ -3818,21 +3870,21 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap) /* in: memory heap where tuple created */ { dtuple_t* tuple; dict_index_t* ind; - ulint n_fields; ind = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + ut_ad(ind->table->comp || n_fields <= rec_get_n_fields_old(rec)); tuple = dtuple_create(heap, n_fields); dict_index_copy_types(tuple, ind, n_fields); - rec_copy_prefix_to_dtuple(tuple, rec, n_fields, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_fields, heap); ut_ad(dtuple_check_typed(tuple)); @@ -3850,6 +3902,27 @@ dict_index_calc_min_rec_len( ulint sum = 0; ulint i; + if (index->table->comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + dtype_t*t = dict_index_get_nth_type(index, i); + ulint size = dtype_get_fixed_size(t); + sum += size; + if (!size) { + size = dtype_get_len(t); + sum += size < 128 ? 
1 : 2; + } + if (!(dtype_get_prtype(t) & DATA_NOT_NULL)) + nullable++; + } + + /* round the NULL flags up to full bytes */ + sum += (nullable + 7) / 8; + + return(sum); + } + for (i = 0; i < dict_index_get_n_fields(index); i++) { sum += dtype_get_fixed_size(dict_index_get_nth_type(index, i)); } @@ -3860,7 +3933,7 @@ dict_index_calc_min_rec_len( sum += dict_index_get_n_fields(index); } - sum += REC_N_EXTRA_BYTES; + sum += REC_N_OLD_EXTRA_BYTES; return(sum); } diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index 61facc8818d..18910acb01d 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -8,6 +8,7 @@ Created 4/24/1996 Heikki Tuuri *******************************************************/ #include "dict0load.h" +#include "mysql_version.h" #ifdef UNIV_NONINL #include "dict0load.ic" @@ -55,6 +56,7 @@ dict_get_first_table_name_in_db( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -77,7 +79,7 @@ loop: return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); if (len < strlen(name) || ut_memcmp(name, field, strlen(name)) != 0) { @@ -90,7 +92,7 @@ loop: return(NULL); } - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -163,9 +165,9 @@ loop: return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -231,6 +233,7 @@ dict_check_tablespaces_and_store_max_id( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); @@ -257,15 +260,15 @@ loop: return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ char* name = mem_strdupl((char*) field, len); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); ut_a(len == 4); space_id = mach_read_from_4(field); @@ -338,6 +341,7 @@ dict_load_columns( sys_columns = dict_table_get_low("SYS_COLUMNS"); sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); + ut_a(!sys_columns->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -356,28 +360,27 @@ dict_load_columns( ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_columns->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_dulint_cmp(table->id, mach_read_from_8(field)) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 4); ut_a(i == mach_read_from_4(field)); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); name = mem_heap_strdupl(heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); mtype = mach_read_from_4(field); - field = 
rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); prtype = mach_read_from_4(field); if (dtype_is_non_binary_string_type(mtype, prtype) @@ -389,15 +392,14 @@ dict_load_columns( data_mysql_default_charset_coll); } - field = rec_get_nth_field(rec, 7, &len); + field = rec_get_nth_field_old(rec, 7, &len); col_len = mach_read_from_4(field); ut_a(0 == ut_strcmp("PREC", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); prec = mach_read_from_4(field); dict_mem_table_add_col(table, name, mtype, prtype, col_len, @@ -462,6 +464,7 @@ dict_load_fields( sys_fields = dict_table_get_low("SYS_FIELDS"); sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); + ut_a(!sys_fields->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -479,15 +482,15 @@ dict_load_fields( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_fields->comp)) { dict_load_report_deleted_index(table->name, i); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_memcmp(buf, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); /* The next field stores the field position in the index @@ -513,10 +516,9 @@ dict_load_fields( ut_a(0 == ut_strcmp("COL_NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_fields), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); dict_mem_index_add_field(index, mem_heap_strdupl(heap, (char*) field, len), 0, prefix_len); @@ -575,6 +577,7 @@ dict_load_indexes( sys_indexes = dict_table_get_low("SYS_INDEXES"); sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); + ut_a(!sys_indexes->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -595,14 +598,14 @@ dict_load_indexes( rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); if (ut_memcmp(buf, field, len) != 0) { break; } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { dict_load_report_deleted_index(table->name, ULINT_UNDEFINED); @@ -612,33 +615,31 @@ dict_load_indexes( return(FALSE); } - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 8); id = mach_read_from_8(field); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_indexes), 4))->name)); - - field = rec_get_nth_field(rec, 4, &name_len); + dict_index_get_nth_field(sys_index, 4))->name)); + + field = rec_get_nth_field_old(rec, 4, &name_len); name_buf = mem_heap_strdupl(heap, (char*) field, name_len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); n_fields = mach_read_from_4(field); - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); type = mach_read_from_4(field); - field = rec_get_nth_field(rec, 7, &len); + field = rec_get_nth_field_old(rec, 7, &len); space = mach_read_from_4(field); ut_a(0 == ut_strcmp("PAGE_NO", dict_field_get_col( - dict_index_get_nth_field( - 
dict_table_get_first_index(sys_indexes), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); page_no = mach_read_from_4(field); if (page_no == FIL_NULL) { @@ -680,12 +681,10 @@ dict_load_indexes( } else { index = dict_mem_index_create(table->name, name_buf, space, type, n_fields); - index->page_no = page_no; index->id = id; dict_load_fields(table, index, heap); - - dict_index_add_to_cache(table, index); + dict_index_add_to_cache(table, index, page_no); } btr_pcur_move_to_next_user_rec(&pcur, &mtr); @@ -741,6 +740,7 @@ dict_load_table( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -753,7 +753,7 @@ dict_load_table( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -763,7 +763,7 @@ dict_load_table( return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the table name in record is the searched one */ if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) { @@ -775,7 +775,7 @@ dict_load_table( return(NULL); } -#if MYSQL_VERSION_ID < 50300 +#if MYSQL_VERSION_ID < 50003 /* Starting from MySQL 5.0.3, the high-order bit of MIX_LEN is the "compact format" flag. */ field = rec_get_nth_field(rec, 7, &len); @@ -793,10 +793,9 @@ dict_load_table( ut_a(0 == ut_strcmp("SPACE", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 9))->name)); + dict_index_get_nth_field(sys_index, 9))->name)); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); space = mach_read_from_4(field); /* Check if the tablespace exists and has the right name */ @@ -828,43 +827,45 @@ dict_load_table( ut_a(0 == ut_strcmp("N_COLS", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); n_cols = mach_read_from_4(field); - table = dict_mem_table_create(name, space, n_cols); + /* table->comp will be initialized later, in this function */ + table = dict_mem_table_create(name, space, n_cols, FALSE); table->ibd_file_missing = ibd_file_missing; ut_a(0 == ut_strcmp("ID", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 3))->name)); + dict_index_get_nth_field(sys_index, 3))->name)); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); table->id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); table->type = mach_read_from_4(field); if (table->type == DICT_TABLE_CLUSTER_MEMBER) { ut_error; #if 0 /* clustered tables have not been implemented yet */ - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); table->mix_id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); table->cluster_name = mem_heap_strdupl(heap, (char*) field, len); #endif } + /* The high-order bit of MIX_LEN is the "compact format" flag */ + field = rec_get_nth_field_old(rec, 7, &len); + table->comp = 
!!(mach_read_from_1(field) & 0x80); + if ((table->type == DICT_TABLE_CLUSTER) || (table->type == DICT_TABLE_CLUSTER_MEMBER)) { - - field = rec_get_nth_field(rec, 7, &len); - table->mix_len = mach_read_from_4(field); + + table->mix_len = mach_read_from_4(field) & 0x7fffffff; } btr_pcur_close(&pcur); @@ -942,6 +943,7 @@ dict_load_table_on_id( sys_tables = dict_sys->sys_tables; sys_table_ids = dict_table_get_next_index( dict_table_get_first_index(sys_tables)); + ut_a(!sys_tables->comp); heap = mem_heap_create(256); tuple = dtuple_create(heap, 1); @@ -958,7 +960,7 @@ dict_load_table_on_id( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -973,7 +975,7 @@ dict_load_table_on_id( table ID and NAME */ rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); /* Check if the table id in record is the one searched for */ @@ -987,7 +989,7 @@ dict_load_table_on_id( } /* Now we get the table name from the record */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); /* Load the table definition to memory */ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len)); @@ -1055,6 +1057,7 @@ dict_load_foreign_cols( sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); + ut_a(!sys_foreign_cols->comp); tuple = dtuple_create(foreign->heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1069,21 +1072,21 @@ dict_load_foreign_cols( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_foreign_cols->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == ut_strlen(id)); ut_a(ut_memcmp(id, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); ut_a(i == mach_read_from_4(field)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); foreign->foreign_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); foreign->referenced_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1127,6 +1130,7 @@ dict_load_foreign( sys_foreign = dict_table_get_low("SYS_FOREIGN"); sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes); + ut_a(!sys_foreign->comp); tuple = dtuple_create(heap2, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1139,7 +1143,7 @@ dict_load_foreign( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_foreign->comp)) { /* Not found */ fprintf(stderr, @@ -1153,7 +1157,7 @@ dict_load_foreign( return(DB_ERROR); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the id in record is the searched one */ if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) { @@ -1176,7 +1180,8 @@ dict_load_foreign( foreign = dict_mem_foreign_create(); - foreign->n_fields = mach_read_from_4(rec_get_nth_field(rec, 5, &len)); + foreign->n_fields = + mach_read_from_4(rec_get_nth_field_old(rec, 5, &len)); ut_a(len == 4); @@ -1187,11 +1192,11 @@ dict_load_foreign( foreign->id = 
mem_heap_strdup(foreign->heap, id); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); foreign->foreign_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); - - field = rec_get_nth_field(rec, 4, &len); + + field = rec_get_nth_field_old(rec, 4, &len); foreign->referenced_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1260,6 +1265,7 @@ dict_load_foreigns( return(DB_ERROR); } + ut_a(!sys_foreign->comp); mtr_start(&mtr); /* Get the secondary index based on FOR_NAME from table @@ -1291,7 +1297,7 @@ loop: name and a foreign constraint ID */ rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the table name in the record is the one searched for; the following call does the comparison in the latin1_swedish_ci @@ -1314,13 +1320,13 @@ loop: goto next_rec; } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_foreign->comp)) { goto next_rec; } /* Now we get a foreign key constraint id */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); id = mem_heap_strdupl(heap, (char*) field, len); btr_pcur_store_position(&pcur, &mtr); diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index 1d45585aac1..48b9f28d292 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -35,7 +35,8 @@ dict_mem_table_create( the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols) /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp) /* in: TRUE=compact page format */ { dict_table_t* table; mem_heap_t* heap; @@ -54,6 +55,7 @@ dict_mem_table_create( table->space = space; table->ibd_file_missing = FALSE; table->tablespace_discarded = FALSE; + table->comp = comp; table->n_def = 0; table->n_cols = n_cols + DATA_N_SYS_COLS; table->mem_fix = 0; @@ -110,7 +112,8 @@ dict_mem_cluster_create( { dict_table_t* cluster; - cluster = dict_mem_table_create(name, space, n_cols); + /* Clustered tables cannot work with the compact record format. 
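The mixed-cluster record handling, e.g. dict_is_mixed_table_rec(), reads DB_MIX_ID with the old-style rec_get_nth_field_old() accessor.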
*/ + cluster = dict_mem_table_create(name, space, n_cols, FALSE); cluster->type = DICT_TABLE_CLUSTER; cluster->mix_len = mix_len; @@ -197,7 +200,7 @@ dict_mem_index_create( index->name = mem_heap_strdup(heap, index_name); index->table_name = table_name; index->table = NULL; - index->n_def = 0; + index->n_def = index->n_nullable = 0; index->n_fields = n_fields; index->fields = mem_heap_alloc(heap, 1 + n_fields * sizeof(dict_field_t)); diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index d1a083fcd66..f2d0790892e 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -88,6 +88,9 @@ but in the MySQL Embedded Server Library and ibbackup it is not the default directory, and we must set the base file path explicitly */ const char* fil_path_to_mysql_datadir = "."; +/* The number of fsyncs done to the log */ +ulint fil_n_log_flushes = 0; + ulint fil_n_pending_log_flushes = 0; ulint fil_n_pending_tablespace_flushes = 0; @@ -657,9 +660,9 @@ fil_try_to_close_file_in_LRU( fputs("InnoDB: cannot close file ", stderr); ut_print_filename(stderr, node->name); fprintf(stderr, - ", because mod_count %lld != fl_count %lld\n", - node->modification_counter, - node->flush_counter); + ", because mod_count %ld != fl_count %ld\n", + (long) node->modification_counter, + (long) node->flush_counter); } node = UT_LIST_GET_PREV(LRU, node); @@ -1622,30 +1625,38 @@ fil_op_write_log( mtr_t* mtr) /* in: mini-transaction handle */ { byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } - log_ptr = mlog_open(mtr, 30); - log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0, log_ptr, mtr); /* Let us store the strings as null-terminated for easier readability and handling */ - mach_write_to_2(log_ptr, ut_strlen(name) + 1); + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) name, ut_strlen(name) + 1); + mlog_catenate_string(mtr, (byte*) name, len); if (type == MLOG_FILE_RENAME) { - log_ptr = mlog_open(mtr, 30); - mach_write_to_2(log_ptr, ut_strlen(new_name) + 1); + ulint len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) new_name, - ut_strlen(new_name) + 1); + mlog_catenate_string(mtr, (byte*) new_name, len); } } #endif @@ -3747,6 +3758,12 @@ fil_io( mode = OS_AIO_NORMAL; } + if (type == OS_FILE_READ) { + srv_data_read+= len; + } else if (type == OS_FILE_WRITE) { + srv_data_written+= len; + } + /* Reserve the fil_system mutex and make sure that we can open at least one file while holding it, if the file is not already open */ @@ -4032,6 +4049,7 @@ fil_flush( fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; + fil_n_log_flushes++; } #ifdef __WIN__ if (node->is_raw_disk) { diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index e1621cc2765..ef8e70646c6 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -910,7 +910,7 @@ fsp_header_init( if (space == 0) { fsp_fill_free_list(FALSE, space, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, - ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr); + ut_dulint_add(DICT_IBUF_ID_MIN, space), FALSE, mtr); } else { fsp_fill_free_list(TRUE, space, header, mtr); } diff --git a/innobase/ibuf/ibuf0ibuf.c 
b/innobase/ibuf/ibuf0ibuf.c index 2191cdc0ee6..5ad61e2590f 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -46,7 +46,7 @@ Note that contary to what we planned in the 1990's, there will only be one insert buffer tree, and that is in the system tablespace of InnoDB. 1. The first field is the space id. -2. The second field is a one-byte marker which differentiates records from +2. The second field is a one-byte marker (0) which differentiates records from the < 4.1.x storage format. 3. The third field is the page number. 4. The fourth field contains the type info, where we have also added 2 bytes to @@ -55,7 +55,14 @@ insert buffer tree, and that is in the system tablespace of InnoDB. can use in the binary search on the index page in the ibuf merge phase. 5. The rest of the fields contain the fields of the actual index record. -*/ +In versions >= 5.0.3: + +The first byte of the fourth field is an additional marker (0) if the record +is in the compact format. The presence of this marker can be detected by +looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. + +The high-order bit of the character set field in the type info is the +"nullable" flag for the field. */ /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM @@ -525,8 +532,8 @@ ibuf_data_init_for_space( ibuf_exit(); sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space); - - table = dict_mem_table_create(buf, space, 2); + /* use old-style record format for the insert buffer */ + table = dict_mem_table_create(buf, space, 2, FALSE); dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0); @@ -541,11 +548,9 @@ ibuf_data_init_for_space( dict_mem_index_add_field(index, "PAGE_NO", 0, 0); dict_mem_index_add_field(index, "TYPES", 0, 0); - index->page_no = FSP_IBUF_TREE_ROOT_PAGE_NO; - index->id = ut_dulint_add(DICT_IBUF_ID_MIN, space); - dict_index_add_to_cache(table, index); + dict_index_add_to_cache(table, index, FSP_IBUF_TREE_ROOT_PAGE_NO); data->index = dict_table_get_first_index(table); @@ -1049,20 +1054,20 @@ ibuf_rec_get_page_no( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 2, &len); + field = rec_get_nth_field_old(rec, 2, &len); } else { ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); } ut_a(len == 4); @@ -1084,15 +1089,15 @@ ibuf_rec_get_space( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); return(mach_read_from_4(field)); @@ -1105,6 +1110,162 @@ ibuf_rec_get_space( } /************************************************************************ +Creates a dummy index for inserting a record to a non-clustered index. 
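+The insert buffer record stores only the column types of the buffered entry, not the identity of the index, so a throw-away index object must be built in order to interpret the record.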
+*/ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + /* out: dummy index */ + ulint n, /* in: number of fields */ + ibool comp) /* in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + table = dict_mem_table_create("IBUF_DUMMY", + DICT_HDR_SPACE, n, comp); + index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + return(index); +} +/************************************************************************ +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*====================*/ + dict_index_t* index, /* in: dummy index */ + dtype_t* type, /* in: the data type of the column */ + ulint len) /* in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, "DUMMY", + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type), + dtype_get_prec(type)); + dict_index_add_col(index, + dict_table_get_nth_col(index->table, i), 0, len); +} +/************************************************************************ +Deallocates a dummy index for inserting a record to a non-clustered index. +*/ +static +void +ibuf_dummy_index_free( +/*====================*/ + dict_index_t* index) /* in: dummy index */ +{ + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); +} + +/************************************************************************* +Builds the entry to insert into a non-clustered index when we have the +corresponding record in an ibuf index. */ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec( +/*===========================*/ + /* out, own: entry to insert to + a non-clustered index; NOTE that + as we copy pointers to fields in + ibuf_rec, the caller must hold a + latch to the ibuf_rec page as long + as the entry is used! 
*/ + rec_t* ibuf_rec, /* in: record in an insert buffer */ + mem_heap_t* heap, /* in: heap where built */ + dict_index_t** pindex) /* out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + byte* types; + const byte* data; + ulint len; + ulint i; + dict_index_t* index; + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); + + if (len > 1) { + /* This a < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + *pindex = ibuf_dummy_index_create(n_fields, FALSE); + return(tuple); + } + + /* This a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > 4); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + index = ibuf_dummy_index_create(n_fields, + len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + len--; + ut_a(*types == 0); + types++; + } + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field), len); + } + + *pindex = index; + return(tuple); +} + +/************************************************************************ Returns the space taken by a stored non-clustered index entry if converted to an index record. 
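That is, the size of the converted record plus the page directory space reserved for one record (page_dir_calc_reserved_space(1)).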
*/ static @@ -1125,43 +1286,60 @@ ibuf_rec_get_volume( ulint i; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(ibuf_rec) > 2); - - data = rec_get_nth_field(ibuf_rec, 1, &len); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); if (len > 1) { - /* < 4.1.x format record */ + /* < 4.1.x format record */ ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - n_fields = rec_get_n_fields(ibuf_rec) - 2; + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; - types = rec_get_nth_field(ibuf_rec, 1, &len); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); } else { - /* >= 4.1.x format record */ + /* >= 4.1.x format record */ ut_a(trx_sys_multiple_tablespace_format); - new_format = TRUE; + ut_a(*data == 0); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + dtuple_t* entry = + ibuf_build_entry_from_ibuf_rec( + ibuf_rec, heap, &dummy_index); + volume = rec_get_converted_size(dummy_index, entry); + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + return(volume + page_dir_calc_reserved_space(1)); + } - n_fields = rec_get_n_fields(ibuf_rec) - 4; + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; - types = rec_get_nth_field(ibuf_rec, 3, &len); + new_format = TRUE; } for (i = 0; i < n_fields; i++) { if (new_format) { - data = rec_get_nth_field(ibuf_rec, i + 4, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); dtype_new_read_for_order_and_null_size(&dtype, - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); } else { - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); dtype_read_for_order_and_null_size(&dtype, - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); } if (len == UNIV_SQL_NULL) { @@ -1187,6 +1365,7 @@ ibuf_entry_build( must be kept because we copy pointers to its fields */ dtuple_t* entry, /* in: entry for a non-clustered index */ + ibool comp, /* in: flag: TRUE=compact record format */ ulint space, /* in: space id */ ulint page_no,/* in: index page number where entry should be inserted */ @@ -1202,11 +1381,14 @@ ibuf_entry_build( /* Starting from 4.1.x, we have to build a tuple whose (1) first field is the space id, - (2) the second field a single marker byte to tell that this + (2) the second field a single marker byte (0) to tell that this is a new format record, (3) the third contains the page number, and (4) the fourth contains the relevent type information of each data - field, + field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is + (a) 0 for b-trees in the old format, and + (b) 1 for b-trees in the compact format, the first byte of the field + being the marker (0); (5) and the rest of the fields are copied from entry. All fields in the tuple are ordered like the type binary in our insert buffer tree. 
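For example, a buffered insert of a two-field entry destined for page 17 of space 5 in a compact-format index becomes a six-field ibuf tuple: (5)(0)(17)(a 0 byte followed by 2 * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes of type info)(data of field 0)(data of field 1).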
*/ @@ -1247,10 +1429,15 @@ ibuf_entry_build( dfield_set_data(field, buf, 4); + ut_ad(comp == 0 || comp == 1); /* Store the type info in buf2, and add the fields from entry to tuple */ buf2 = mem_heap_alloc(heap, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); + if (comp) { + *buf2++ = 0; /* write the compact format indicator */ + } for (i = 0; i < n_fields; i++) { /* We add 4 below because we have the 4 extra fields at the start of an ibuf record */ @@ -1268,8 +1455,13 @@ ibuf_entry_build( field = dtuple_get_nth_field(tuple, 3); + if (comp) { + buf2--; + } + dfield_set_data(field, buf2, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); /* Set all the types in the new tuple binary */ dtuple_set_types_binary(tuple, n_fields + 4); @@ -1278,88 +1470,6 @@ ibuf_entry_build( } /************************************************************************* -Builds the entry to insert into a non-clustered index when we have the -corresponding record in an ibuf index. */ -static -dtuple_t* -ibuf_build_entry_from_ibuf_rec( -/*===========================*/ - /* out, own: entry to insert to - a non-clustered index; NOTE that - as we copy pointers to fields in - ibuf_rec, the caller must hold a - latch to the ibuf_rec page as long - as the entry is used! */ - rec_t* ibuf_rec, /* in: record in an insert buffer */ - mem_heap_t* heap) /* in: heap where built */ -{ - dtuple_t* tuple; - dfield_t* field; - ulint n_fields; - byte* types; - byte* data; - ulint len; - ulint i; - - data = rec_get_nth_field(ibuf_rec, 1, &len); - - if (len > 1) { - /* This a < 4.1.x format record */ - - ut_a(trx_doublewrite_must_reset_space_ids); - ut_a(!trx_sys_multiple_tablespace_format); - - n_fields = rec_get_n_fields(ibuf_rec) - 2; - tuple = dtuple_create(heap, n_fields); - types = rec_get_nth_field(ibuf_rec, 1, &len); - - ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 2, &len); - - dfield_set_data(field, data, len); - - dtype_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); - } - - /* This a >= 4.1.x format record */ - - ut_a(trx_sys_multiple_tablespace_format); - - ut_a(rec_get_n_fields(ibuf_rec) > 4); - - n_fields = rec_get_n_fields(ibuf_rec) - 4; - - tuple = dtuple_create(heap, n_fields); - - types = rec_get_nth_field(ibuf_rec, 3, &len); - - ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 4, &len); - - dfield_set_data(field, data, len); - - dtype_new_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); -} - -/************************************************************************* Builds a search tuple used to search buffered inserts for an index page. 
This is for < 4.1.x format records */ static @@ -2047,8 +2157,7 @@ loop: mutex_exit(&ibuf_mutex); sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, page_nos, &n_stored); #ifdef UNIV_IBUF_DEBUG /* fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", sync, n_stored, sum_sizes); */ @@ -2344,6 +2453,7 @@ ibuf_update_max_tablespace_id(void) ibuf_data = fil_space_get_ibuf_data(0); ibuf_index = ibuf_data->index; + ut_a(!ibuf_index->table->comp); ibuf_enter(); @@ -2360,7 +2470,7 @@ ibuf_update_max_tablespace_id(void) } else { rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); @@ -2479,7 +2589,7 @@ ibuf_insert_low( ibuf_enter(); } - entry_size = rec_get_converted_size(entry); + entry_size = rec_get_converted_size(index, entry); heap = mem_heap_create(512); @@ -2487,7 +2597,8 @@ ibuf_insert_low( the first fields and the type information for other fields, and which will be inserted to the insert buffer. */ - ibuf_entry = ibuf_entry_build(entry, space, page_no, heap); + ibuf_entry = ibuf_entry_build(entry, index->table->comp, + space, page_no, heap); /* Open a cursor to the insert buffer tree to calculate if we can add the new entry to it without exceeding the free space limit for the @@ -2532,8 +2643,8 @@ ibuf_insert_low( do_merge = TRUE; ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, + page_nos, &n_stored); goto function_exit; } @@ -2656,8 +2767,8 @@ ibuf_insert( ut_a(!(index->type & DICT_CLUSTERED)); - if (rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) { + if (rec_get_converted_size(index, entry) + >= page_get_free_space_of_empty(index->table->comp) / 2) { return(FALSE); } @@ -2692,6 +2803,7 @@ ibuf_insert_to_index_page( dtuple_t* entry, /* in: buffered entry to insert */ page_t* page, /* in: index page where the buffered entry should be placed */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { page_cur_t page_cur; @@ -2703,13 +2815,20 @@ ibuf_insert_to_index_page( ut_ad(ibuf_inside()); ut_ad(dtuple_check_typed(entry)); - if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page))) - != dtuple_get_n_fields(entry)) { - - fprintf(stderr, + if (index->table->comp != page_is_comp(page)) { + fputs( "InnoDB: Trying to insert a record from the insert buffer to an index page\n" -"InnoDB: but the number of fields does not match!\n"); +"InnoDB: but the 'compact' flag does not match!\n", stderr); + goto dump; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + if (rec_get_n_fields(rec, index) != dtuple_get_n_fields(entry)) { + fputs( +"InnoDB: Trying to insert a record from the insert buffer to an index page\n" +"InnoDB: but the number of fields does not match!\n", stderr); + dump: buf_page_print(page); dtuple_print(stderr, entry); @@ -2723,31 +2842,34 @@ ibuf_insert_to_index_page( return; } - low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + low_match = page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); if (low_match == dtuple_get_n_fields(entry)) { rec = page_cur_get_rec(&page_cur); - btr_cur_del_unmark_for_ibuf(rec, mtr); + btr_cur_del_unmark_for_ibuf(rec, index, mtr); } else { - rec = page_cur_tuple_insert(&page_cur, entry, mtr); + rec = page_cur_tuple_insert(&page_cur, entry, index, mtr); if (rec == NULL) { /* If the 
record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); - page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); /* This time the record must fit */ - if (!page_cur_tuple_insert(&page_cur, entry, mtr)) { + if (!page_cur_tuple_insert(&page_cur, entry, + index, mtr)) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n", (ulong) page_get_max_insert_size(page, 1), - (ulong) rec_get_converted_size(entry)); + (ulong) rec_get_converted_size(index, entry)); fputs("InnoDB: Cannot insert index record ", stderr); dtuple_print(stderr, entry); @@ -2836,11 +2958,12 @@ ibuf_delete_rec( "InnoDB: ibuf record inserted to page %lu\n", (ulong) page_no); fflush(stderr); - rec_print(stderr, btr_pcur_get_rec(pcur)); - rec_print(stderr, pcur->old_rec); + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); dtuple_print(stderr, search_tuple); - rec_print(stderr, page_rec_get_next(btr_pcur_get_rec(pcur))); + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); fflush(stderr); btr_pcur_commit_specify_mtr(pcur, mtr); @@ -3075,7 +3198,7 @@ loop: if (corruption_noticed) { fputs("InnoDB: Discarding record\n ", stderr); - rec_print(stderr, ibuf_rec); + rec_print_old(stderr, ibuf_rec); fputs("\n from the insert buffer!\n\n", stderr); } else if (page) { /* Now we have at pcur a record which should be @@ -3083,19 +3206,22 @@ loop: copies pointers to fields in ibuf_rec, and we must keep the latch to the ibuf_rec page until the insertion is finished! */ - - dulint max_trx_id = page_get_max_trx_id( + dict_index_t* dummy_index; + dulint max_trx_id = page_get_max_trx_id( buf_frame_align(ibuf_rec)); page_update_max_trx_id(page, max_trx_id); - entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap); + entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, + heap, &dummy_index); #ifdef UNIV_IBUF_DEBUG - volume += rec_get_converted_size(entry) + volume += rec_get_converted_size(dummy_index, entry) + page_dir_calc_reserved_space(1); ut_a(volume <= 4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif - ibuf_insert_to_index_page(entry, page, &mtr); + ibuf_insert_to_index_page(entry, page, + dummy_index, &mtr); + ibuf_dummy_index_free(dummy_index); } n_inserts++; diff --git a/innobase/include/Makefile.am b/innobase/include/Makefile.am index 102d25566da..eb1e3b72877 100644 --- a/innobase/include/Makefile.am +++ b/innobase/include/Makefile.am @@ -49,7 +49,7 @@ noinst_HEADERS = btr0btr.h btr0btr.ic btr0cur.h btr0cur.ic \ thr0loc.h thr0loc.ic trx0purge.h trx0purge.ic trx0rec.h \ trx0rec.ic trx0roll.h trx0roll.ic trx0rseg.h trx0rseg.ic \ trx0sys.h trx0sys.ic trx0trx.h trx0trx.ic trx0types.h \ - trx0undo.h trx0undo.ic univ.i \ + trx0undo.h trx0undo.ic trx0xa.h univ.i \ usr0sess.h usr0sess.ic usr0types.h ut0byte.h ut0byte.ic \ ut0dbg.h ut0lst.h ut0mem.h ut0mem.ic ut0rnd.h ut0rnd.ic \ ut0sort.h ut0ut.h ut0ut.ic diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h index 8606fcd2a5c..0b19e64d4e0 100644 --- a/innobase/include/btr0btr.h +++ b/innobase/include/btr0btr.h @@ -155,7 +155,8 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec); /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ 
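With the new offsets parameter, callers first compute the offsets array for the node pointer record before reading the child page number. A minimal usage sketch follows; the exact argument list of rec_get_offsets() is assumed from the parameter comments above, and the heap handling shown is illustrative only:

	mem_heap_t*	heap	= NULL;
	ulint*		offsets;
	ulint		child_page_no;

	/* node_ptr is a node pointer record on a non-leaf page of index */
	offsets = rec_get_offsets(node_ptr, index, NULL,
						ULINT_UNDEFINED, &heap);
	child_page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

	if (heap) {
		mem_heap_free(heap);
	}
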
/**************************************************************** Creates the root node for a new index tree. */ @@ -167,6 +168,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr); /* in: mini-transaction handle */ /**************************************************************** Frees a B-tree except the root page, which MUST be freed after this @@ -210,8 +212,9 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Decides if the page should be split at the convergence point of inserts converging to left. */ @@ -273,6 +276,7 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes on the upper level the node pointer to a page. */ @@ -332,6 +336,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** @@ -340,11 +345,12 @@ Parses a redo log record of reorganizing a page. */ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /****************************************************************** Gets the number of pages in a B-tree. */ diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic index b0aa0756307..1d1f97d3668 100644 --- a/innobase/include/btr0btr.ic +++ b/innobase/include/btr0btr.ic @@ -183,17 +183,18 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec) /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; byte* field; ulint len; ulint page_no; - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h index f1334656d53..0a8d8ceaeb7 100644 --- a/innobase/include/btr0cur.h +++ b/innobase/include/btr0cur.h @@ -34,7 +34,7 @@ page_cur_t* btr_cur_get_page_cur( /*=================*/ /* out: pointer to page cursor component */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the record pointer of a tree cursor. 
*/ UNIV_INLINE @@ -42,14 +42,14 @@ rec_t* btr_cur_get_rec( /*============*/ /* out: pointer to record */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Invalidates a tree cursor by setting record pointer to NULL. */ UNIV_INLINE void btr_cur_invalidate( /*===============*/ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the page of a tree cursor. */ UNIV_INLINE @@ -57,7 +57,7 @@ page_t* btr_cur_get_page( /*=============*/ /* out: pointer to page */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the tree of a cursor. */ UNIV_INLINE @@ -65,7 +65,7 @@ dict_tree_t* btr_cur_get_tree( /*=============*/ /* out: tree */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Positions a tree cursor at a given record. */ UNIV_INLINE @@ -283,8 +283,9 @@ only used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Tries to compress a page of the tree on the leaf level. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid @@ -361,10 +362,11 @@ Parses a redo log record of updating a record in-place. */ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index); /* in: index corresponding to page */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a clustered index record. */ @@ -372,10 +374,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page); /* in: page or NULL */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a secondary index record. */ @@ -383,10 +386,11 @@ index record. 
*/ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page); /* in: page or NULL */ /*********************************************************************** Estimates the number of rows in a given index range. */ @@ -417,9 +421,10 @@ to free the field. */ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr); /* in: mtr */ /*********************************************************************** The complement of the previous function: in an update entry may inherit some externally stored fields from a record. We must mark them as inherited @@ -456,6 +461,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr); /* in: mtr containing the latch to @@ -496,6 +502,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -510,6 +517,7 @@ btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap); /* in: mem heap */ @@ -540,10 +548,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated - to have place for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update); /* in: update vector */ + ulint* ext_vect,/* in: array of ulints, must be preallocated + to have space for all fields in rec */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector or NULL */ /*######################################################################*/ diff --git a/innobase/include/btr0cur.ic b/innobase/include/btr0cur.ic index a3a04b60c45..dcad3e9e14d 100644 --- a/innobase/include/btr0cur.ic +++ b/innobase/include/btr0cur.ic @@ -134,17 +134,15 @@ btr_cur_can_delete_without_compress( /* out: TRUE if can be deleted without recommended compression */ btr_cur_t* cursor, /* in: btr cursor */ + ulint rec_size,/* in: rec_get_size(btr_cur_get_rec(cursor))*/ mtr_t* mtr) /* in: mtr */ { - ulint rec_size; page_t* page; ut_ad(mtr_memo_contains(mtr, buf_block_align( btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); - rec_size = rec_get_size(btr_cur_get_rec(cursor)); - page = btr_cur_get_page(cursor); if 
((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 81f19af4d40..6384222be51 100644 --- a/innobase/include/btr0pcur.h +++ b/innobase/include/btr0pcur.h @@ -462,6 +462,7 @@ struct btr_pcur_struct{ contains an initial segment of the latest record cursor was positioned either on, before, or after */ + ulint old_n_fields; /* number of fields in old_rec */ ulint rel_pos; /* BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the diff --git a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h index ce4140ecf92..78e88a24083 100644 --- a/innobase/include/btr0sea.h +++ b/innobase/include/btr0sea.h @@ -77,8 +77,10 @@ parameters as page (this often happens when a page is split). */ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page); /* in: index page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page */ + dict_index_t* index); /* in: record descriptor */ /************************************************************************ Drops a page hash index. */ @@ -129,8 +131,8 @@ Validates the search system. */ ibool btr_search_validate(void); -/*=====================*/ - +/*======================*/ + /* out: TRUE if ok */ /* Search info directions */ #define BTR_SEA_NO_DIRECTION 1 diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 53599d03c73..5ee323f1b1e 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -52,11 +52,15 @@ Created 11/5/1995 Heikki Tuuri /* Modes for buf_page_get_known_nowait */ #define BUF_MAKE_YOUNG 51 #define BUF_KEEP_OLD 52 +/* Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL extern buf_pool_t* buf_pool; /* The buffer pool of the database */ extern ibool buf_debug_prints;/* If this is set TRUE, the program prints info whenever read or flush occurs */ +extern ulint srv_buf_pool_write_requests; /* variable to count write request + issued */ /************************************************************************ Creates the buffer pool. */ @@ -496,6 +500,12 @@ void buf_print(void); /*============*/ /************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void); +/*==============================*/ +/************************************************************************* Returns the number of pending buf pool ios. */ ulint @@ -731,6 +741,8 @@ struct buf_block_struct{ buffer pool which are index pages, but this flag is not set because we do not keep track of all pages */ + dict_index_t* index; /* index for which the adaptive + hash index has been created */ /* 2. 
Page flushing fields */ UT_LIST_NODE_T(buf_block_t) flush_list; diff --git a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic index d6dbdcc0865..9a8a021e029 100644 --- a/innobase/include/buf0flu.ic +++ b/innobase/include/buf0flu.ic @@ -61,6 +61,8 @@ buf_flush_note_modification( ut_ad(ut_dulint_cmp(block->oldest_modification, mtr->start_lsn) <= 0); } + + ++srv_buf_pool_write_requests; } /************************************************************************ diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index 946b646ffbf..0b92ffbe7f1 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -8,6 +8,17 @@ Created 1/16/1996 Heikki Tuuri #include "mach0data.h" +/********************************************************************** +Determines whether the given character set is of variable length. + +NOTE: the prototype of this function is copied from ha_innodb.cc! If you change +this function, you MUST change also the prototype here! */ +extern +ibool +innobase_is_mb_cset( +/*================*/ + ulint cset); /* in: MySQL charset-collation code */ + /************************************************************************* Sets a data type structure. */ UNIV_INLINE @@ -149,8 +160,10 @@ dtype_new_store_for_order_and_null_size( bytes where we store the info */ dtype_t* type) /* in: type struct */ { - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + buf[0] = (byte)(type->mtype & 0xFFUL); if (type->prtype & DATA_BINARY_TYPE) { @@ -166,10 +179,12 @@ dtype_new_store_for_order_and_null_size( mach_write_to_2(buf + 2, type->len & 0xFFFFUL); + ut_ad(dtype_get_charset_coll(type->prtype) < 256); mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); - /* Note that the second last byte is left unused, because the - charset-collation code is always < 256 */ + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } } /************************************************************************** @@ -211,20 +226,26 @@ dtype_new_read_for_order_and_null_size( { ulint charset_coll; - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif type->mtype = buf[0] & 63; type->prtype = buf[1]; if (buf[0] & 128) { - type->prtype = type->prtype | DATA_BINARY_TYPE; + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; } type->len = mach_read_from_2(buf + 2); mach_read_from_2(buf + 4); - charset_coll = mach_read_from_2(buf + 4); + charset_coll = mach_read_from_2(buf + 4) & 0x7fff; if (dtype_is_string_type(type->mtype)) { ut_a(charset_coll < 256); @@ -257,23 +278,39 @@ dtype_get_fixed_size( mtype = dtype_get_mtype(type); switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + default: + ut_ad(0); + return(0); + case DATA_ROW_ID: + ut_ad(type->len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(type->len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(type->len == DATA_ROLL_PTR_LEN); + break; + case DATA_MIX_ID: + ut_ad(type->len == DATA_MIX_ID_LEN); + break; + } +#endif /* UNIV_DEBUG */ case DATA_CHAR: case DATA_FIXBINARY: case DATA_INT: case DATA_FLOAT: case DATA_DOUBLE: case DATA_MYSQL: - return(dtype_get_len(type)); - - case DATA_SYS: if (type->prtype == DATA_ROW_ID) { - return(DATA_ROW_ID_LEN); - } else if (type->prtype == DATA_TRX_ID) { - 
return(DATA_TRX_ID_LEN); - } else if (type->prtype == DATA_ROLL_PTR) { - return(DATA_ROLL_PTR_LEN); - } else { - return(0); + if ((type->prtype & DATA_BINARY_TYPE) + || !innobase_is_mb_cset( + dtype_get_charset_coll( + type->prtype))) { + return(dtype_get_len(type)); } + /* fall through for variable-length charsets */ case DATA_VARCHAR: case DATA_BINARY: case DATA_DECIMAL: diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h index 35eff5af29a..86702cbca05 100644 --- a/innobase/include/dict0boot.h +++ b/innobase/include/dict0boot.h @@ -119,6 +119,7 @@ dict_create(void); clustered index */ #define DICT_SYS_INDEXES_PAGE_NO_FIELD 8 #define DICT_SYS_INDEXES_SPACE_NO_FIELD 7 +#define DICT_SYS_INDEXES_TYPE_FIELD 6 /* When a row id which is zero modulo this number (which must be a power of two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is diff --git a/innobase/include/dict0crea.h b/innobase/include/dict0crea.h index 8b6944fc605..7164e53bceb 100644 --- a/innobase/include/dict0crea.h +++ b/innobase/include/dict0crea.h @@ -54,6 +54,17 @@ dict_create_index_step( /* out: query thread to run next or NULL */ que_thr_t* thr); /* in: query thread */ /*********************************************************************** +Truncates the index tree associated with a row in SYS_INDEXES table. */ + +void +dict_truncate_index_tree( +/*=====================*/ + dict_table_t* table, /* in: the table the index belongs to */ + rec_t* rec, /* in: record in the clustered index of + SYS_INDEXES table */ + mtr_t* mtr); /* in: mtr having the latch + on the record page */ +/*********************************************************************** Drops the index tree associated with a row in SYS_INDEXES table. */ void @@ -142,6 +153,7 @@ struct ind_node_struct{ /*----------------------*/ /* Local storage for this graph node */ ulint state; /* node execution state */ + ulint page_no;/* root page number of the index */ dict_table_t* table; /* table which owns the index */ dtuple_t* ind_row;/* index definition row built */ ulint field_no;/* next field definition to insert */ diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index ca632691450..fdcb6c1c4e1 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -508,8 +508,9 @@ dict_index_add_to_cache( /*====================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table on which the index is */ - dict_index_t* index); /* in, own: index; NOTE! The index memory + dict_index_t* index, /* in, own: index; NOTE! The index memory object is freed in this function! */ + ulint page_no);/* in: root page number of the index */ /************************************************************************ Gets the number of fields in the internal representation of an index, including fields added by the dictionary system. */ @@ -639,6 +640,16 @@ dict_index_get_sys_col_pos( dict_index_t* index, /* in: index */ ulint type); /* in: DATA_ROW_ID, ... */ /*********************************************************************** +Adds a column to index. */ + +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /* in: index */ + dict_col_t* col, /* in: column */ + ulint order, /* in: order criterion */ + ulint prefix_len); /* in: column prefix length */ +/*********************************************************************** Copies types of fields contained in index to tuple. 
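The data0type.ic change above packs the new DATA_NOT_NULL flag into the high bit of the two bytes that store the charset-collation code (always < 256, so that bit was previously unused), and the reader masks it back out with 0x7fff. A minimal standalone sketch of that encoding; this is not InnoDB code, and the _DEMO name is made up for illustration:

#include <stdio.h>

#define DATA_NOT_NULL_DEMO	0x100U	/* stand-in for the real DATA_NOT_NULL bit */

static void
store_coll_and_null(unsigned char* buf, unsigned coll, unsigned prtype)
{
	buf[4] = (unsigned char) (coll >> 8);	/* big-endian, like mach_write_to_2() */
	buf[5] = (unsigned char) (coll & 0xFF);	/* high byte stays 0 while coll < 256 */

	if (prtype & DATA_NOT_NULL_DEMO) {
		buf[4] |= 0x80;			/* "buf[4] |= 128" in the patch */
	}
}

static void
read_coll_and_null(const unsigned char* buf, unsigned* coll, int* not_null)
{
	*not_null = (buf[4] & 0x80) != 0;
	*coll = (((unsigned) buf[4] << 8) | buf[5]) & 0x7FFF;
}

int
main(void)
{
	unsigned char	buf[6] = {0};
	unsigned	coll;
	int		nn;

	store_coll_and_null(buf, 63, DATA_NOT_NULL_DEMO);
	read_coll_and_null(buf, &coll, &nn);
	printf("coll=%u not_null=%d\n", coll, nn);	/* prints: coll=63 not_null=1 */
	return(0);
}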
*/ void @@ -647,18 +658,6 @@ dict_index_copy_types( dtuple_t* tuple, /* in: data tuple */ dict_index_t* index, /* in: index */ ulint n_fields); /* in: number of field types to copy */ -/************************************************************************ -Gets the value of a system column in a clustered index record. The clustered -index must contain the system column: if the index is unique, row id is -not contained there! */ -UNIV_INLINE -dulint -dict_index_rec_get_sys_col( -/*=======================*/ - /* out: system column value */ - dict_index_t* index, /* in: clustered index describing the record */ - ulint type, /* in: column type: DATA_ROLL_PTR, ... */ - rec_t* rec); /* in: record */ /************************************************************************* Gets the index tree where the index is stored. */ UNIV_INLINE @@ -688,9 +687,10 @@ dict_tree_t* dict_tree_create( /*=============*/ /* out, own: created tree */ - dict_index_t* index); /* in: the index for which to create: in the + dict_index_t* index, /* in: the index for which to create: in the case of a mixed tree, this should be the index of the cluster object */ + ulint page_no);/* in: root page number of the index */ /************************************************************************** Frees an index tree struct. */ @@ -720,7 +720,7 @@ dict_tree_find_index_for_tuple( dtuple_t* tuple); /* in: tuple for which to find index */ /*********************************************************************** Checks if a table which is a mixed cluster member owns a record. */ -UNIV_INLINE + ibool dict_is_mixed_table_rec( /*====================*/ @@ -770,6 +770,7 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size);/* in/out: buffer size */ @@ -782,6 +783,7 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap); /* in: memory heap where tuple created */ /************************************************************************* Gets the space id of the root of the index tree. */ diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 85e4aaf1a05..928a693f860 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -9,7 +9,6 @@ Created 1/8/1996 Heikki Tuuri #include "dict0load.h" #include "trx0undo.h" #include "trx0sys.h" -#include "rem0rec.h" /************************************************************************* Gets the column data type. */ @@ -168,7 +167,7 @@ dict_table_get_sys_col( col = dict_table_get_nth_col(table, table->n_cols - DATA_N_SYS_COLS + sys); ut_ad(col->type.mtype == DATA_SYS); - ut_ad(col->type.prtype == sys); + ut_ad(col->type.prtype == (sys | DATA_NOT_NULL)); return(col); } @@ -312,49 +311,6 @@ dict_index_get_sys_col_pos( dict_table_get_sys_col_no(index->table, type))); } -/************************************************************************ -Gets the value of a system column in a clustered index record. The clustered -index must contain the system column: if the index is unique, row id is -not contained there! 
*/ -UNIV_INLINE -dulint -dict_index_rec_get_sys_col( -/*=======================*/ - /* out: system column value */ - dict_index_t* index, /* in: clustered index describing the record */ - ulint type, /* in: column type: DATA_ROLL_PTR, ... */ - rec_t* rec) /* in: record */ -{ - ulint pos; - byte* field; - ulint len; - - ut_ad(index); - ut_ad(index->type & DICT_CLUSTERED); - - pos = dict_index_get_sys_col_pos(index, type); - - ut_ad(pos != ULINT_UNDEFINED); - - field = rec_get_nth_field(rec, pos, &len); - - if (type == DATA_ROLL_PTR) { - ut_ad(len == 7); - - return(trx_read_roll_ptr(field)); - } else if (type == DATA_TRX_ID) { - - return(trx_read_trx_id(field)); - } else if (type == DATA_MIX_ID) { - - return(mach_dulint_read_compressed(field)); - } else { - ut_a(type == DATA_ROW_ID); - - return(mach_read_from_6(field)); - } -} - /************************************************************************* Gets the index tree where the index is stored. */ UNIV_INLINE @@ -662,28 +618,3 @@ dict_table_get_index( return(index); } - -/*********************************************************************** -Checks if a table which is a mixed cluster member owns a record. */ -UNIV_INLINE -ibool -dict_is_mixed_table_rec( -/*====================*/ - /* out: TRUE if the record belongs to this - table */ - dict_table_t* table, /* in: table in a mixed cluster */ - rec_t* rec) /* in: user record in the clustered index */ -{ - byte* mix_id_field; - ulint len; - - mix_id_field = rec_get_nth_field(rec, table->mix_len, &len); - - if ((len != table->mix_id_len) - || (0 != ut_memcmp(table->mix_id_buf, mix_id_field, len))) { - - return(FALSE); - } - - return(TRUE); -} diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 1e496a25477..ff6c4ec9b28 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -54,7 +54,8 @@ dict_mem_table_create( of the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols); /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp); /* in: TRUE=compact page format */ /************************************************************************** Creates a cluster memory object. 
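The new comp argument to dict_mem_table_create() above (kept in the table object as the comp flag of dict_table_struct further down) is what ultimately selects the on-disk record format. A hedged sketch of how the flag reaches page creation, using only declarations that appear in this diff (page_create() with its new comp argument, page_is_comp()); the local names and surrounding control flow are assumptions:

	ibool	comp = table->comp;	/* TRUE => compact page format */

	/* when an index page for this table is built: */
	page_create(frame, mtr, comp);

	/* existing pages report their format via page_is_comp(page),
	so callers never need to remember it separately */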
*/ @@ -171,6 +172,13 @@ struct dict_field_struct{ DICT_MAX_COL_PREFIX_LEN; NOTE that in the UTF-8 charset, MySQL sets this to 3 * the prefix len in UTF-8 chars */ + ulint fixed_len; /* 0 or the fixed length of the + column if smaller than + DICT_MAX_COL_PREFIX_LEN */ + ulint fixed_offs; /* offset to the field, or + ULINT_UNDEFINED if it is not fixed + within the record (due to preceding + variable-length fields) */ }; /* Data structure for an index tree */ @@ -210,7 +218,6 @@ struct dict_index_struct{ const char* table_name; /* table name */ dict_table_t* table; /* back pointer to table */ ulint space; /* space where the index tree is placed */ - ulint page_no;/* page number of the index tree root */ ulint trx_id_offset;/* position of the the trx id column in a clustered index record, if the fields before it are known to be of a fixed size, @@ -225,6 +232,7 @@ struct dict_index_struct{ ulint n_def; /* number of fields defined so far */ ulint n_fields;/* number of fields in the index */ dict_field_t* fields; /* array of field descriptions */ + ulint n_nullable;/* number of nullable fields */ UT_LIST_NODE_T(dict_index_t) indexes;/* list of indexes of the table */ dict_tree_t* tree; /* index tree struct */ @@ -320,6 +328,7 @@ struct dict_table_struct{ ibool tablespace_discarded;/* this flag is set TRUE when the user calls DISCARD TABLESPACE on this table, and reset to FALSE in IMPORT TABLESPACE */ + ibool comp; /* flag: TRUE=compact page format */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index c1a127aadca..aa1ec5c25a5 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -89,6 +89,8 @@ extern fil_addr_t fil_addr_null; #define FIL_TABLESPACE 501 #define FIL_LOG 502 +extern ulint fil_n_log_flushes; + extern ulint fil_n_pending_log_flushes; extern ulint fil_n_pending_tablespace_flushes; diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 1fd7492d517..710c945375c 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -47,7 +47,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Checks if some transaction has an implicit x-lock on a record in a clustered index. */ @@ -58,7 +59,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /***************************************************************** Resets the lock bits for a single record. Releases transactions waiting for lock requests here. 
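Throughout this patch, record-level functions now take a const ulint* offsets argument documented as "rec_get_offsets(rec, index)": the caller computes the field offsets once and hands the array down instead of having every callee re-parse the record. A hedged sketch of that calling convention; the exact prototype of rec_get_offsets(), the REC_OFFS_NORMAL_SIZE constant and the error handling are assumptions, only the pattern is the point:

	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	trx_t*		trx;

	*offsets_ = (sizeof offsets_) / sizeof *offsets_;	/* capacity of the array */

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	trx = lock_clust_rec_some_has_impl(rec, index, offsets);

	if (heap) {
		mem_heap_free(heap);	/* only allocated if offsets_ was too small */
	}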
*/ @@ -275,6 +277,7 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr); /* in: query thread */ /************************************************************************* Checks if locks of other transactions prevent an immediate modify @@ -308,6 +311,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -333,6 +337,34 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint mode, /* in: mode of the lock which the read cursor + should set on records: LOCK_S or LOCK_X; the + latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". */ + +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + rec_t* rec, /* in: user record or page supremum record + which should be read or passed over by a read + cursor */ + dict_index_t* index, /* in: clustered index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -350,6 +382,7 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view); /* in: consistent read view */ /************************************************************************* Checks that a non-clustered index record is seen in a consistent read. 
*/ @@ -499,6 +532,7 @@ lock_check_trx_id_sanity( dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex);/* in: TRUE if the caller owns the kernel mutex */ /************************************************************************* @@ -509,7 +543,8 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index); /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Prints info of a table lock. */ @@ -577,6 +612,8 @@ extern lock_sys_t* lock_sys; #define LOCK_TABLE 16 /* these type values should be so high that */ #define LOCK_REC 32 /* they can be ORed to the lock mode */ #define LOCK_TABLE_EXP 80 /* explicit table lock (80 = 16 + 64) */ +#define LOCK_TABLE_TRANSACTIONAL 144 + /* transactional table lock (144 = 16 + 128)*/ #define LOCK_TYPE_MASK 0xF0UL /* mask used to extract lock type from the type_mode field in a lock */ /* Waiting lock flag */ diff --git a/innobase/include/lock0lock.ic b/innobase/include/lock0lock.ic index fabc9256401..c7a71bb45d8 100644 --- a/innobase/include/lock0lock.ic +++ b/innobase/include/lock0lock.ic @@ -60,7 +60,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dulint trx_id; @@ -70,7 +71,7 @@ lock_clust_rec_some_has_impl( ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); if (trx_is_active(trx_id)) { /* The modifying or inserting transaction is active */ diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h index 9c9c6f696e8..c0636ea1e1e 100644 --- a/innobase/include/mtr0log.h +++ b/innobase/include/mtr0log.h @@ -11,6 +11,7 @@ Created 12/7/1995 Heikki Tuuri #include "univ.i" #include "mtr0mtr.h" +#include "dict0types.h" /************************************************************ Writes 1 - 4 bytes to a file page buffered in the buffer pool. @@ -173,6 +174,38 @@ mlog_parse_string( byte* page); /* in: page where to apply the log record, or NULL */ +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). */ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size); /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. 
*/ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index); /* out, own: dummy index */ + /* Insert, update, and maybe other functions may use this value to define an extra mlog buffer size for variable size data */ #define MLOG_BUF_MARGIN 256 diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h index e8c68a91dad..071279d5259 100644 --- a/innobase/include/mtr0mtr.h +++ b/innobase/include/mtr0mtr.h @@ -102,7 +102,31 @@ flag value must give the length also! */ file rename */ #define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd file deletion */ -#define MLOG_BIGGEST_TYPE ((byte)35) /* biggest value (used in +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /* mark a compact index record + as the predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /* create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /* compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /* mark compact clustered index + record deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/* mark compact secondary index + record deleted */ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/* update of a compact record, + preserves record field sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /* delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /* delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /* delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /* copy compact record list end + to a new created index page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ + +#define MLOG_BIGGEST_TYPE ((byte)46) /* biggest value (used in asserts) */ /******************************************************************* diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index d1439faf29f..599e78bab48 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -24,6 +24,9 @@ extern ibool os_aio_print_debug; extern ulint os_file_n_pending_preads; extern ulint os_file_n_pending_pwrites; +extern ulint os_n_pending_reads; +extern ulint os_n_pending_writes; + #ifdef __WIN__ /* We define always WIN_ASYNC_IO, and check at run-time whether diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index d0d3cf82e38..b0b72e18675 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -12,6 +12,11 @@ Created 9/30/1995 Heikki Tuuri #include "univ.i" +#ifdef UNIV_LINUX +#include <sys/ipc.h> +#include <sys/shm.h> +#endif + typedef void* os_process_t; typedef unsigned long int os_process_id_t; @@ -27,6 +32,10 @@ page size of an Intel x86 processor. We cannot use AWE with 2 MB or 4 MB pages. */ #define OS_AWE_X86_PAGE_SIZE 4096 +extern ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +extern ulint os_large_page_size; + /******************************************************************** Windows AWE support. 
Tries to enable the "lock pages in memory" privilege for the current process so that the current process can allocate memory-locked @@ -103,6 +112,25 @@ os_mem_alloc_nocache( /* out: allocated memory */ ulint n); /* in: number of bytes */ /******************************************************************** +Allocates large pages memory. */ + +void* +os_mem_alloc_large( +/*=================*/ + /* out: allocated memory */ + ulint n, /* in: number of bytes */ + ibool set_to_zero, /* in: TRUE if allocated memory should be set + to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error); /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ +/******************************************************************** +Frees large pages memory. */ + +void +os_mem_free_large( +/*=================*/ +void *ptr); /* in: number of bytes */ +/******************************************************************** Sets the priority boost for threads released from waiting within the current process. */ diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h index c85669ed4df..a693931968e 100644 --- a/innobase/include/page0cur.h +++ b/innobase/include/page0cur.h @@ -128,7 +128,8 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -142,6 +143,7 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -155,9 +157,9 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ mtr_t* mtr); /* in: mini-transaction handle */ /***************************************************************** Copies records from page to a newly created page, from a given record onward, @@ -166,10 +168,11 @@ including that record. Infimum and supremum records are not copied. */ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /*************************************************************** Deletes a record at the page cursor. The cursor is moved to the next record after the deleted one. 
*/ @@ -178,6 +181,7 @@ void page_cur_delete_rec( /*================*/ page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /******************************************************************** Searches the right position for a page cursor. */ @@ -187,6 +191,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -198,6 +203,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -229,34 +235,37 @@ Parses a log record of a record insert on a page. */ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /************************************************************** Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses log record of a record delete on a page. 
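The MLOG_COMP_* record types introduced in mtr0mtr.h above exist because a compact record cannot be parsed without knowing the index definition, so the log body now begins with an index description that mlog_parse_index() turns into a dummy index. A hedged sketch of how a recovery-side dispatcher could combine it with the parse functions declared here, for an insert record; the variable set-up and the surrounding recovery loop are assumptions:

	byte*		ptr;		/* log record body */
	byte*		end_ptr;	/* end of the parse buffer */
	byte		type;		/* MLOG_* type already read */
	page_t*		page;		/* page to apply to, or NULL */
	mtr_t*		mtr;
	dict_index_t*	index	= NULL;
	ibool		comp	= (type == MLOG_COMP_REC_INSERT);

	ptr = mlog_parse_index(ptr, end_ptr, comp, &index);

	if (ptr != NULL) {		/* NULL means the record is incomplete */
		ptr = page_cur_parse_insert_rec(FALSE /* not the short form */,
					ptr, end_ptr, index, page, mtr);
	}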
*/ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /* Index page cursor */ diff --git a/innobase/include/page0cur.ic b/innobase/include/page0cur.ic index 39f8ab11513..03010fbd766 100644 --- a/innobase/include/page0cur.ic +++ b/innobase/include/page0cur.ic @@ -143,7 +143,7 @@ UNIV_INLINE void page_cur_move_to_prev( /*==================*/ - page_cur_t* cur) /* in: cursor; must not before first */ + page_cur_t* cur) /* in: page cursor, not before first */ { ut_ad(!page_cur_is_before_first(cur)); @@ -158,6 +158,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -170,7 +171,7 @@ page_cur_search( ut_ad(dtuple_check_typed(tuple)); - page_cur_search_with_match(page, tuple, mode, + page_cur_search_with_match(page, index, tuple, mode, &up_matched_fields, &up_matched_bytes, &low_matched_fields, @@ -190,16 +191,11 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - ulint data_size; - - ut_ad(dtuple_check_typed(tuple)); - - data_size = dtuple_get_data_size(tuple); - - return(page_cur_insert_rec_low(cursor, tuple, data_size, NULL, mtr)); + return(page_cur_insert_rec_low(cursor, tuple, index, NULL, mtr)); } /*************************************************************** @@ -214,8 +210,9 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - return(page_cur_insert_rec_low(cursor, NULL, 0, rec, mtr)); + return(page_cur_insert_rec_low(cursor, NULL, index, rec, mtr)); } diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 969313614e3..d3ef8214eb6 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -37,7 +37,8 @@ typedef byte page_header_t; /*-----------------------------*/ #define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ #define PAGE_HEAP_TOP 2 /* pointer to record heap top */ -#define PAGE_N_HEAP 4 /* number of records in the heap */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ #define PAGE_FREE 6 /* pointer to start of page free record list */ #define PAGE_GARBAGE 8 /* number of bytes in deleted records */ #define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or @@ -79,15 +80,24 @@ typedef byte page_header_t; #define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) /* start of data on the page */ -#define PAGE_INFIMUM (PAGE_DATA + 1 + REC_N_EXTRA_BYTES) - /* offset of the page infimum record on the - page */ -#define PAGE_SUPREMUM (PAGE_DATA + 2 + 2 * 
REC_N_EXTRA_BYTES + 8) - /* offset of the page supremum record on the - page */ -#define PAGE_SUPREMUM_END (PAGE_SUPREMUM + 9) +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) /* offset of the page supremum record end on - the page */ + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ /*-----------------------------*/ /* Directions of cursor movement */ @@ -233,6 +243,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -259,6 +270,22 @@ page_rec_get_n_recs_before( /* out: number of records */ rec_t* rec); /* in: the physical record */ /***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page); /* in: index page */ +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap);/* in: number of records */ +/***************************************************************** Gets the number of dir slots in directory. */ UNIV_INLINE ulint @@ -267,6 +294,15 @@ page_dir_get_n_slots( /* out: number of slots */ page_t* page); /* in: index page */ /***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots);/* in: number of slots */ +/***************************************************************** Gets pointer to nth directory slot. */ UNIV_INLINE page_dir_slot_t* @@ -333,7 +369,16 @@ ulint page_dir_find_owner_slot( /*=====================*/ /* out: the directory slot number */ - rec_t* rec); /* in: the physical record */ + rec_t* rec); /* in: the physical record */ +/**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ibool +page_is_comp( +/*=========*/ + /* out: TRUE if the page is in compact format + FALSE if it is in old-style format */ + page_t* page); /* in: index page */ /**************************************************************** Gets the pointer to the next record on the page. 
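Because the infimum and supremum now live at different offsets depending on the page format, callers are expected to reach them through the accessors above rather than through a PAGE_* constant. A small format-agnostic sketch of walking the user records of a page; only functions declared in this file are used, and the loop body is illustrative:

	rec_t*	rec = page_rec_get_next(page_get_infimum_rec(page));

	while (rec != page_get_supremum_rec(page)) {
		/* process the user record, e.g. after computing
		rec_get_offsets(rec, index) as elsewhere in this patch */

		rec = page_rec_get_next(rec);
	}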
*/ UNIV_INLINE @@ -359,9 +404,10 @@ UNIV_INLINE rec_t* page_rec_get_prev( /*==============*/ - /* out: pointer to previous record */ - rec_t* rec); /* in: pointer to record, must not be page - infimum */ + /* out: pointer to previous record */ + rec_t* rec); /* in: pointer to record, + must not be page infimum */ + /**************************************************************** TRUE if the record is a user record on the page. */ UNIV_INLINE @@ -446,9 +492,11 @@ page_get_max_insert_size_after_reorganize( Calculates free space if a page is emptied. */ UNIV_INLINE ulint -page_get_free_space_of_empty(void); -/*==============================*/ - /* out: free space */ +page_get_free_space_of_empty( +/*=========================*/ + /* out: free space */ + ibool comp) /* in: TRUE=compact page format */ + __attribute__((const)); /**************************************************************** Returns the sum of the sizes of the records in the record list excluding the infimum and supremum records. */ @@ -464,20 +512,23 @@ Allocates a block of memory from an index page. */ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no);/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no);/* out: this contains the heap number + of the allocated record + if allocation succeeds */ /**************************************************************** Puts a record to free list. */ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec); /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index); /* in: record descriptor */ /************************************************************** The index page creation function. */ @@ -487,7 +538,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr); /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp); /* in: TRUE=compact page format */ /***************************************************************** Differs from page_copy_rec_list_end, because this function does not touch the lock table and max trx id on page. */ @@ -495,10 +547,11 @@ touch the lock table and max trx id on page. */ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, from the given record onward, including that record. Infimum and supremum records are not copied. @@ -507,10 +560,11 @@ The records are copied to the start of the record list on new_page. 
*/ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. @@ -519,10 +573,11 @@ The records are copied to the end of the record list on new_page. */ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from a page from a given record onward, including that record. The infimum and supremum records are not deleted. */ @@ -530,14 +585,15 @@ The infimum and supremum records are not deleted. */ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from page, up to the given record, NOT including that record. Infimum and supremum records are not deleted. */ @@ -545,9 +601,10 @@ that record. Infimum and supremum records are not deleted. */ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list end to another page. Moved records include split_rec. */ @@ -555,10 +612,11 @@ split_rec. */ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list start to another page. Moved records do not include split_rec. 
*/ @@ -566,10 +624,11 @@ split_rec. */ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /******************************************************************** Splits a directory slot which owns too many records. */ @@ -595,13 +654,16 @@ Parses a log record of a record list end or start deletion. */ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses a redo log record of creating a page. */ @@ -611,6 +673,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /**************************************************************** @@ -620,7 +683,8 @@ the index page context. */ void page_rec_print( /*===========*/ - rec_t* rec); + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: record descriptor */ /******************************************************************* This is used to print the contents of the directory for debugging purposes. */ @@ -637,8 +701,9 @@ debugging purposes. */ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n); /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n); /* in: print n first and n last entries */ /******************************************************************* Prints the info in a page header. */ @@ -653,9 +718,12 @@ debugging purposes. */ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn); /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn); /* in: print rn first and last records + in directory */ /******************************************************************* The following is used to validate a record on a page. This function differs from rec_validate as it can also check the n_owned field and @@ -664,8 +732,9 @@ the heap_no field. 
*/ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Checks that the first directory slot points to the infimum record and the last to the supremum. This function is intended to track if the diff --git a/innobase/include/page0page.ic b/innobase/include/page0page.ic index 3d2bf3b090e..a63b5ca4238 100644 --- a/innobase/include/page0page.ic +++ b/innobase/include/page0page.ic @@ -73,7 +73,8 @@ page_header_set_field( { ut_ad(page); ut_ad(field <= PAGE_N_RECS); - ut_ad(val < UNIV_PAGE_SIZE); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); mach_write_to_2(page + PAGE_HEADER + field, val); } @@ -152,6 +153,19 @@ page_header_reset_last_insert( } /**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ibool +page_is_comp( +/*=========*/ + /* out: TRUE if the page is in compact format + FALSE if it is in old-style format */ + page_t* page) /* in: index page */ +{ + return(!!(page_header_get_field(page, PAGE_N_HEAP) & 0x8000)); +} + +/**************************************************************** Gets the first record on the page. */ UNIV_INLINE rec_t* @@ -162,7 +176,11 @@ page_get_infimum_rec( { ut_ad(page); - return(page + PAGE_INFIMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_INFIMUM); + } else { + return(page + PAGE_OLD_INFIMUM); + } } /**************************************************************** @@ -176,7 +194,11 @@ page_get_supremum_rec( { ut_ad(page); - return(page + PAGE_SUPREMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_SUPREMUM); + } else { + return(page + PAGE_OLD_SUPREMUM); + } } /**************************************************************** @@ -309,6 +331,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -320,6 +343,7 @@ page_cmp_dtuple_rec_with_match( page_t* page; ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); page = buf_frame_align(rec); @@ -328,7 +352,7 @@ page_cmp_dtuple_rec_with_match( } else if (rec == page_get_supremum_rec(page)) { return(-1); } else { - return(cmp_dtuple_rec_with_match(dtuple, rec, + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields, matched_bytes)); } @@ -358,6 +382,45 @@ page_dir_get_n_slots( { return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); } +/***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots)/* in: number of slots */ +{ + page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots); +} + +/***************************************************************** +Gets the number of records in the heap. 
*/ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap) /* in: number of records */ +{ + ut_ad(n_heap < 0x8000); + + page_header_set_field(page, PAGE_N_HEAP, n_heap | (0x8000 & + page_header_get_field(page, PAGE_N_HEAP))); +} /***************************************************************** Gets pointer to nth directory slot. */ @@ -369,7 +432,7 @@ page_dir_get_nth_slot( page_t* page, /* in: index page */ ulint n) /* in: position */ { - ut_ad(page_header_get_field(page, PAGE_N_DIR_SLOTS) > n); + ut_ad(page_dir_get_n_slots(page) > n); return(page + UNIV_PAGE_SIZE - PAGE_DIR - (n + 1) * PAGE_DIR_SLOT_SIZE); @@ -431,7 +494,8 @@ page_dir_slot_get_n_owned( /* out: number of records */ page_dir_slot_t* slot) /* in: page directory slot */ { - return(rec_get_n_owned(page_dir_slot_get_rec(slot))); + return(rec_get_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(buf_frame_align(slot)))); } /******************************************************************* @@ -444,7 +508,8 @@ page_dir_slot_set_n_owned( ulint n) /* in: number of records owned by the slot */ { - rec_set_n_owned(page_dir_slot_get_rec(slot), n); + rec_set_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(buf_frame_align(slot)), n); } /**************************************************************** @@ -477,7 +542,7 @@ page_rec_get_next( page = buf_frame_align(rec); - offs = rec_get_next_offs(rec); + offs = rec_get_next_offs(rec, page_is_comp(page)); if (offs >= UNIV_PAGE_SIZE) { fprintf(stderr, @@ -513,6 +578,7 @@ page_rec_set_next( infimum */ { page_t* page; + ulint offs; ut_ad(page_rec_check(rec)); ut_a((next == NULL) @@ -523,11 +589,13 @@ page_rec_set_next( ut_ad(rec != page_get_supremum_rec(page)); ut_ad(next != page_get_infimum_rec(page)); - if (next == NULL) { - rec_set_next_offs(rec, 0); + if (next) { + offs = (ulint) (next - page); } else { - rec_set_next_offs(rec, (ulint)(next - page)); + offs = 0; } + + rec_set_next_offs(rec, page_is_comp(page), offs); } /**************************************************************** @@ -545,6 +613,7 @@ page_rec_get_prev( rec_t* rec2; rec_t* prev_rec = NULL; page_t* page; + ibool comp; ut_ad(page_rec_check(rec)); @@ -559,6 +628,7 @@ page_rec_get_prev( slot = page_dir_get_nth_slot(page, slot_no - 1); rec2 = page_dir_slot_get_rec(slot); + comp = page_is_comp(page); while (rec != rec2) { prev_rec = rec2; @@ -579,9 +649,12 @@ page_rec_find_owner_rec( /* out: the owner record */ rec_t* rec) /* in: the physical record */ { + ibool comp; + ut_ad(page_rec_check(rec)); + comp = page_is_comp(buf_frame_align(rec)); - while (rec_get_n_owned(rec) == 0) { + while (rec_get_n_owned(rec, comp) == 0) { rec = page_rec_get_next(rec); } @@ -601,7 +674,9 @@ page_get_data_size( ulint ret; ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) - page_header_get_field(page, PAGE_GARBAGE)); ut_ad(ret < UNIV_PAGE_SIZE); @@ -613,12 +688,13 @@ page_get_data_size( Calculates free space if a page is emptied. 
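The PAGE_N_HEAP accessors above show the whole trick behind the format flag: bit 15 of the two-byte field marks a compact page, the low 15 bits hold the record heap count, so page_dir_set_n_heap() must mask the new count and preserve the flag. A standalone demo of that encoding (not InnoDB code; the 16-bit page header field is simulated with a plain unsigned):

#include <stdio.h>

static unsigned	n_heap_field = 0x8002;	/* compact page with 2 heap records:
					the infimum and the supremum */

static unsigned	get_n_heap(void)	{ return(n_heap_field & 0x7FFF); }
static int	is_comp(void)		{ return((n_heap_field & 0x8000) != 0); }

static void
set_n_heap(unsigned n)
{
	/* keep bit 15 untouched, as page_dir_set_n_heap() does */
	n_heap_field = (n & 0x7FFF) | (n_heap_field & 0x8000);
}

int
main(void)
{
	set_n_heap(get_n_heap() + 1);	/* a record was allocated from the heap */
	printf("comp=%d n_heap=%u\n", is_comp(), get_n_heap());
	/* prints: comp=1 n_heap=3 */
	return(0);
}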
*/ UNIV_INLINE ulint -page_get_free_space_of_empty(void) -/*==============================*/ +page_get_free_space_of_empty( +/*=========================*/ /* out: free space */ + ibool comp) /* in: TRUE=compact page layout */ { return((ulint)(UNIV_PAGE_SIZE - - PAGE_SUPREMUM_END + - (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END) - PAGE_DIR - 2 * PAGE_DIR_SLOT_SIZE)); } @@ -640,13 +716,16 @@ page_get_max_insert_size( { ulint occupied; ulint free_space; + ibool comp; + + comp = page_is_comp(page); occupied = page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END + - (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END) + page_dir_calc_reserved_space( - n_recs + (page_header_get_field(page, PAGE_N_HEAP) - 2)); + n_recs + page_dir_get_n_heap(page) - 2); - free_space = page_get_free_space_of_empty(); + free_space = page_get_free_space_of_empty(comp); /* Above the 'n_recs +' part reserves directory space for the new inserted records; the '- 2' excludes page infimum and supremum @@ -673,11 +752,14 @@ page_get_max_insert_size_after_reorganize( { ulint occupied; ulint free_space; + ibool comp; + + comp = page_is_comp(page); occupied = page_get_data_size(page) + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); - free_space = page_get_free_space_of_empty(); + free_space = page_get_free_space_of_empty(comp); if (occupied > free_space) { @@ -693,11 +775,12 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec) /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index) /* in: record descriptor */ { - rec_t* free; - ulint garbage; + rec_t* free; + ulint garbage; free = page_header_get_ptr(page, PAGE_FREE); @@ -707,7 +790,7 @@ page_mem_free( garbage = page_header_get_field(page, PAGE_GARBAGE); page_header_set_field(page, PAGE_GARBAGE, - garbage + rec_get_size(rec)); + garbage + rec_get_size(rec, index)); } #ifdef UNIV_MATERIALIZE diff --git a/innobase/include/que0que.h b/innobase/include/que0que.h index e1874edcaf2..298ec494750 100644 --- a/innobase/include/que0que.h +++ b/innobase/include/que0que.h @@ -359,6 +359,7 @@ struct que_thr_struct{ the control came */ ulint resource; /* resource usage of the query thread thus far */ + ulint lock_state; /* lock state of thread (table or row) */ }; #define QUE_THR_MAGIC_N 8476583 @@ -482,6 +483,11 @@ struct que_fork_struct{ #define QUE_THR_SUSPENDED 7 #define QUE_THR_ERROR 8 +/* Query thread lock states */ +#define QUE_THR_LOCK_NOLOCK 0 +#define QUE_THR_LOCK_ROW 1 +#define QUE_THR_LOCK_TABLE 2 + /* From where the cursor position is counted */ #define QUE_CUR_NOT_DEFINED 1 #define QUE_CUR_START 2 diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h index 712e263350e..1b1ee26b809 100644 --- a/innobase/include/rem0cmp.h +++ b/innobase/include/rem0cmp.h @@ -90,6 +90,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -107,7 +108,8 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array 
returned by rec_get_offsets() */ /****************************************************************** Checks if a dtuple is a prefix of a record. The last field in dtuple is allowed to be a prefix of the corresponding field in the record. */ @@ -116,23 +118,9 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ dtuple_t* dtuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ - ulint n_fields); /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /***************************************************************** This function is used to compare two physical records. Only the common first fields are compared, and if an externally stored field is @@ -146,6 +134,8 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, @@ -167,6 +157,8 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index); /* in: data dictionary index */ diff --git a/innobase/include/rem0cmp.ic b/innobase/include/rem0cmp.ic index 75cb3ef04e8..b86534e0a6a 100644 --- a/innobase/include/rem0cmp.ic +++ b/innobase/include/rem0cmp.ic @@ -57,10 +57,13 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index) /* in: data dictionary index */ { ulint match_f = 0; ulint match_b = 0; - return(cmp_rec_rec_with_match(rec1, rec2, index, &match_f, &match_b)); + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, + &match_f, &match_b)); } diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 86bf263170f..ab89b912523 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -23,9 +23,18 @@ Created 5/30/1994 Heikki Tuuri info bits of a record */ #define REC_INFO_MIN_REC_FLAG 0x10UL -/* Number of extra bytes in a record, in addition to the data and the -offsets */ -#define REC_N_EXTRA_BYTES 6 +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define 
REC_STATUS_SUPREMUM 3 /********************************************************** The following function is used to get the offset of the @@ -36,7 +45,8 @@ rec_get_next_offs( /*==============*/ /* out: the page offset of the next chained record */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the next record offset field of the record. */ @@ -45,17 +55,28 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint next); /* in: offset of the next record */ /********************************************************** The following function is used to get the number of fields -in the record. */ +in an old-style record. */ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec); /* in: physical record */ /********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ +/********************************************************** The following function is used to get the number of records owned by the previous directory record. */ UNIV_INLINE @@ -63,7 +84,8 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the number of owned records. */ @@ -72,6 +94,7 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint n_owned); /* in: the number of owned */ /********************************************************** The following function is used to retrieve the info bits of @@ -81,7 +104,8 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the info bits of a record. */ UNIV_INLINE @@ -89,15 +113,26 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint bits); /* in: info bits */ /********************************************************** -Gets the value of the deleted falg in info bits. */ +The following function retrieves the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits); /* in: info bits from a record */ +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function is used to set the status bits of a new-style record. 
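The four REC_STATUS_* values above fit in a 3-bit field of the new-style record header, read by rec_get_status() above and written by rec_set_status() declared just below. A hedged, self-contained illustration of such a get/set pair on a plain header byte (the real accessors operate on the byte REC_NEW_STATUS bytes before the record origin):

#include <assert.h>
#include <stdio.h>

#define STATUS_ORDINARY 0
#define STATUS_NODE_PTR 1
#define STATUS_INFIMUM  2
#define STATUS_SUPREMUM 3
#define STATUS_MASK     0x7U    /* low three bits of the header byte */

static unsigned get_status(unsigned char header_byte)
{
        return header_byte & STATUS_MASK;
}

static unsigned char set_status(unsigned char header_byte, unsigned bits)
{
        assert((bits & ~STATUS_MASK) == 0);
        return (unsigned char) ((header_byte & ~STATUS_MASK) | bits);
}

int main(void)
{
        unsigned char b = 0xF8;         /* other header bits already set */

        b = set_status(b, STATUS_NODE_PTR);
        printf("status=%u byte=0x%02x\n", get_status(b), (unsigned) b);
        return 0;
}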
*/ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits); /* in: info bits */ + /********************************************************** The following function tells if record is delete marked. */ UNIV_INLINE @@ -105,7 +140,8 @@ ibool rec_get_deleted_flag( /*=================*/ /* out: TRUE if delete marked */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the deleted bit. */ UNIV_INLINE @@ -113,8 +149,25 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ibool flag); /* in: TRUE if delete marked */ /********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to flag a record as a node pointer. */ +UNIV_INLINE +void +rec_set_node_ptr_flag( +/*=================*/ + rec_t* rec, /* in: physical record */ + ibool flag); /* in: TRUE if the record is a node pointer */ +/********************************************************** The following function is used to get the order number of the record in the heap of the index page. */ UNIV_INLINE @@ -122,7 +175,8 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the heap number field in the record. */ @@ -131,6 +185,7 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint heap_no);/* in: the heap number */ /********************************************************** The following function is used to test whether the data offsets @@ -141,31 +196,65 @@ rec_get_1byte_offs_flag( /*====================*/ /* out: TRUE if 1-byte form */ rec_t* rec); /* in: physical record */ +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ + +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array consisting of offsets[0] + allocated elements, or an array from + rec_get_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line); /* in: line number where called */ + +#define rec_get_offsets(rec,index,offsets,n,heap) \ + rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__) + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). 
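rec_get_offsets() above is a macro that forwards the caller's __FILE__ and __LINE__ to rec_get_offsets_func(), so heap allocations and validation failures can be traced to the call site. A generic sketch of the same pattern, with a hypothetical traced_alloc() standing in for the real function:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical allocator that records where it was called from. */
static void* traced_alloc_func(size_t size, const char* file, unsigned line)
{
        void* p = malloc(size);

        fprintf(stderr, "alloc %zu bytes at %s:%u -> %p\n",
                size, file, line, p);
        return p;
}

/* Callers use the short name; the call site is captured automatically. */
#define traced_alloc(size) traced_alloc_func(size, __FILE__, __LINE__)

int main(void)
{
        char* buf = traced_alloc(64);

        free(buf);
        return 0;
}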
*/ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + rec_t* rec, /* in: record */ + dict_index_t* index,/* in: record descriptor */ + ulint* offsets);/* in: array returned by rec_get_offsets() */ + /**************************************************************** The following function is used to get a pointer to the nth -data field in the record. */ +data field in an old-style record. */ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL null */ /**************************************************************** -Return field length or UNIV_SQL_NULL. */ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n); /* in: index of the field */ -/**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -173,131 +262,194 @@ rec_get_nth_field_size( /* out: field size in bytes */ rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +/**************************************************************** +The following function is used to get a pointer to the nth +data field in an old-style record. */ +UNIV_INLINE +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len); /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +/********************************************************** +Determine if the offsets are for a record in the new +compact format. */ UNIV_INLINE ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i); /* in: ith field */ +rec_offs_comp( +/*==========*/ + /* out: TRUE if compact format */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/********************************************************** +Returns TRUE if the nth field of rec is SQL NULL. */ +UNIV_INLINE +ibool +rec_offs_nth_null( +/*==============*/ + /* out: TRUE if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Returns TRUE if the extern bit is set in nth field of rec. 
*/ +UNIV_INLINE +ibool +rec_offs_nth_extern( +/*================*/ + /* out: TRUE if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ + /********************************************************** Returns TRUE if the extern bit is set in any of the fields of rec. */ UNIV_INLINE ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec); /* in: record */ +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*************************************************************** Sets the value of the ith field extern storage bit. */ - +UNIV_INLINE void rec_set_nth_field_extern_bit( /*=========================*/ - rec_t* rec, /* in: record */ - ulint i, /* in: ith field */ - ibool val, /* in: value to set */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page where - rec is, or NULL; in the NULL case we do not - write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. */ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ -/**************************************************************** -The following function is used to get a copy of the nth -data field in the record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL - null */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** -This is used to modify the value of an already existing field in -a physical record. The previous value must have exactly the same -size as the new value. If len is UNIV_SQL_NULL then the field is -treated as SQL null. */ +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. 
*/ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len); /* in: length of the data or UNIV_SQL_NULL. - If not SQL null, must have the same length as the - previous value. If SQL null, previous value must be - SQL null. */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data if not SQL null */ + ulint len); /* in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec); /* in: physical record */ /************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*===============*/ + /* out: number of fields */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of record minus data size of record. The value returned by the function is the distance from record start to record origin in bytes. */ UNIV_INLINE ulint -rec_get_extra_size( -/*===============*/ - /* out: size */ - rec_t* rec); /* in: physical record */ -/************************************************************** +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of a physical record. */ UNIV_INLINE ulint +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns the total size of a physical record. */ + +ulint rec_get_size( /*=========*/ - /* out: size */ - rec_t* rec); /* in: physical record */ + /* out: size */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ /************************************************************** Returns a pointer to the start of the record. 
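The rec_offs_*() size functions above and rec_get_start()/rec_get_end() that follow rely on the convention that a record pointer addresses the record origin: the 'extra' header bytes lie below the origin and the data bytes at or above it. A toy sketch of those relationships, assuming nothing about the actual header layout:

#include <assert.h>
#include <string.h>

/* Toy record: extra_size header bytes, then data_size data bytes;
   rec points at the origin, i.e. the first data byte. */
typedef struct {
        unsigned char*  rec;            /* record origin */
        size_t          extra_size;     /* bytes before the origin */
        size_t          data_size;      /* bytes from origin to record end */
} toy_rec_t;

static unsigned char* rec_start(const toy_rec_t* r) { return r->rec - r->extra_size; }
static unsigned char* rec_end(const toy_rec_t* r)   { return r->rec + r->data_size; }
static size_t rec_size(const toy_rec_t* r)          { return r->extra_size + r->data_size; }

/* Copying preserves the origin convention: the copy's origin lies
   extra_size bytes into the destination buffer. */
static unsigned char* rec_copy_toy(unsigned char* buf, const toy_rec_t* r)
{
        memcpy(buf, rec_start(r), rec_size(r));
        return buf + r->extra_size;
}

int main(void)
{
        unsigned char storage[32] = "headerDATA";
        unsigned char buf[32];
        toy_rec_t r = { storage + 6, 6, 4 };    /* origin after "header" */

        assert((size_t) (rec_end(&r) - rec_start(&r)) == rec_size(&r));
        assert(rec_copy_toy(buf, &r)[0] == 'D');
        return 0;
}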
*/ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** Returns a pointer to the end of the record. */ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Copies a physical record to a buffer. */ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec); /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /****************************************************************** Copies the first n fields of a physical record to a new physical record in a buffer. */ @@ -305,49 +457,43 @@ a buffer. */ rec_t* rec_copy_prefix_to_buf( /*===================*/ - /* out, own: copied record */ - rec_t* rec, /* in: physical record */ - ulint n_fields, /* in: number of fields to copy */ - byte** buf, /* in/out: memory buffer for the copied prefix, - or NULL */ - ulint* buf_size); /* in/out: buffer size */ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size); /* in/out: buffer size */ /**************************************************************** Folds a prefix of a physical record to a ulint. */ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id); /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id); /* in: index tree id */ /************************************************************* Builds a physical record out of a data tuple and stores it beginning from address destination. */ -UNIV_INLINE + rec_t* rec_convert_dtuple_to_rec( /*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple); /* in: data tuple */ -/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. 
*/ - -rec_t* -rec_convert_dtuple_to_rec_low( -/*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size); /* in: data size of dtuple */ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ /************************************************************** -Returns the extra size of a physical record if we know its +Returns the extra size of an old-style physical record if we know its data size and number of fields. */ UNIV_INLINE ulint @@ -355,7 +501,8 @@ rec_get_converted_extra_size( /*=========================*/ /* out: extra size */ ulint data_size, /* in: data size */ - ulint n_fields); /* in: number of fields */ + ulint n_fields) /* in: number of fields */ + __attribute__((const)); /************************************************************** The following function returns the size of a data tuple when converted to a physical record. */ @@ -364,6 +511,7 @@ ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple);/* in: data tuple */ /****************************************************************** Copies the first n fields of a physical record to a data tuple. @@ -374,6 +522,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap); /* in: memory heap */ /******************************************************************* @@ -382,16 +531,35 @@ Validates the consistency of a physical record. */ ibool rec_validate( /*=========*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: physical record */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec); /* in: physical record */ +/******************************************************************* +Prints a physical record. */ + +void +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Prints a physical record. */ void rec_print( /*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec); /* in: physical record */ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ #define REC_INFO_BITS 6 /* This is single byte bit-field */ diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index c36bf8f6d6e..7d35e8e4110 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -8,9 +8,19 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" +#include "dict0dict.h" -/* Offsets of the bit-fields in the record. NOTE! In the table the most -significant bytes and bits are written below less significant. 
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */ +#define REC_OFFS_COMPACT ((ulint) 1 << 31) +/* SQL NULL flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_SQL_NULL ((ulint) 1 << 31) +/* External flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_EXTERNAL ((ulint) 1 << 30) +/* Mask for offsets returned by rec_get_offsets() */ +#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1) + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. (1) byte offset (2) bit usage within byte downward from @@ -25,6 +35,35 @@ significant bytes and bits are written below less significant. 4 bits info bits */ +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod UNIV_PAGE_SIZE + 3 3 bits status: + 000=conventional record + 001=node pointer record (inside B-tree) + 010=infimum record + 011=supremum record + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + /* We list the byte offsets from the origin of the record, the mask, and the shift needed to obtain each bit-field of the record. */ @@ -32,22 +71,30 @@ and the shift needed to obtain each bit-field of the record. */ #define REC_NEXT_MASK 0xFFFFUL #define REC_NEXT_SHIFT 0 -#define REC_SHORT 3 /* This is single byte bit-field */ -#define REC_SHORT_MASK 0x1UL -#define REC_SHORT_SHIFT 0 +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 -#define REC_N_FIELDS 4 -#define REC_N_FIELDS_MASK 0x7FEUL -#define REC_N_FIELDS_SHIFT 1 +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 -#define REC_HEAP_NO 5 +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 + +#define REC_OLD_HEAP_NO 5 +#define REC_NEW_HEAP_NO 4 #define REC_HEAP_NO_MASK 0xFFF8UL #define REC_HEAP_NO_SHIFT 3 -#define REC_N_OWNED 6 /* This is single byte bit-field */ +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ #define REC_N_OWNED_MASK 0xFUL #define REC_N_OWNED_SHIFT 0 +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ #define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 @@ -65,26 +112,24 @@ a field stored to another page: */ #define REC_2BYTE_EXTERN_MASK 0x4000UL -/**************************************************************** -Return field length or UNIV_SQL_NULL. 
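The REC_OFFS_SQL_NULL, REC_OFFS_EXTERNAL and REC_OFFS_MASK definitions above pack two flag bits into the high end of each stored field-end offset. A hedged sketch of that encoding on a plain 32-bit value (the real values live in the array returned by rec_get_offsets()):

#include <assert.h>
#include <stdint.h>

#define OFFS_SQL_NULL   ((uint32_t) 1 << 31)    /* field is SQL NULL */
#define OFFS_EXTERNAL   ((uint32_t) 1 << 30)    /* field stored off-page */
#define OFFS_MASK       (OFFS_EXTERNAL - 1)     /* the offset itself */

static uint32_t encode(uint32_t end_offset, int is_null, int is_extern)
{
        assert(end_offset <= OFFS_MASK);
        return end_offset
                | (is_null ? OFFS_SQL_NULL : 0)
                | (is_extern ? OFFS_EXTERNAL : 0);
}

int main(void)
{
        uint32_t v = encode(120, 0, 1);

        assert((v & OFFS_MASK) == 120);         /* end offset of the field */
        assert(!(v & OFFS_SQL_NULL));           /* not SQL NULL */
        assert(v & OFFS_EXTERNAL);              /* extern bit set */
        return 0;
}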
*/ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n) /* in: index of the field */ -{ - ulint len; - - rec_get_nth_field(rec, n, &len); - - return(len); -} +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif /*************************************************************** -Sets the value of the ith field SQL null bit. */ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -93,8 +138,8 @@ rec_set_nth_field_null_bit( ulint i, /* in: ith field */ ibool val); /* in: value to set */ /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ void rec_set_nth_field_sql_null( @@ -102,6 +147,32 @@ rec_set_nth_field_sql_null( rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ +/*************************************************************** +Sets the value of the ith field extern storage bit of an old-style record. */ + +void +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page where + rec is, or NULL; in the NULL case we do not + write to log about the change */ +/*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. */ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ + /********************************************************** Gets a bit field from within 1 byte. 
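The #if/#error blocks above replace the old run-time debug check: the masks of the header bit-fields, shifted to their byte positions, must XOR to an all-ones pattern, i.e. cover every bit exactly once, and the preprocessor can verify that at compile time. A reduced sketch of the same trick for a hypothetical two-field layout:

/* Two fields packed into one header byte: if their masks overlap or
   leave a gap, the XOR below is nonzero and compilation fails. */
#define HDR_LOW_MASK    0x0FUL  /* bits 0..3 */
#define HDR_HIGH_MASK   0xF0UL  /* bits 4..7 */

#if HDR_LOW_MASK ^ HDR_HIGH_MASK ^ 0xFFUL
# error "header bit-field masks do not cover the byte exactly"
#endif

int main(void)
{
        return 0;       /* nothing to do at run time; the check is static */
}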
*/ UNIV_INLINE @@ -131,7 +202,7 @@ rec_set_bit_field_1( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask); ut_ad(mask <= 0xFFUL); ut_ad(((mask >> shift) << shift) == mask); @@ -171,30 +242,14 @@ rec_set_bit_field_2( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask > 0xFFUL); ut_ad(mask <= 0xFFFFUL); ut_ad((mask >> shift) & 1); ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); ut_ad(((mask >> shift) << shift) == mask); ut_ad(((val << shift) & mask) == (val << shift)); -#ifdef UNIV_DEBUG - { - ulint m; - - /* The following assertion checks that the masks of currently - defined bit-fields in bytes 3-6 do not overlap. */ - m = (ulint)((REC_SHORT_MASK << (8 * (REC_SHORT - 3))) - + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4))) - + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4))) - + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3))) - + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); - if (m != ut_dbg_zero + 0xFFFFFFFFUL) { - fprintf(stderr, "Sum of masks %lx\n", m); - ut_error; - } - } -#endif + mach_write_to_2(rec - offs, (mach_read_from_2(rec - offs) & ~mask) | (val << shift)); @@ -207,18 +262,38 @@ UNIV_INLINE ulint rec_get_next_offs( /*==============*/ - /* out: the page offset of the next chained record */ - rec_t* rec) /* in: physical record */ + /* out: the page offset of the next chained record, or + 0 if none */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { - ulint ret; + ulint field_value; + + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); - ut_ad(rec); + field_value = mach_read_from_2(rec - REC_NEXT); - ret = rec_get_bit_field_2(rec, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); - ut_ad(ret < UNIV_PAGE_SIZE); + if (comp) { +#if UNIV_PAGE_SIZE <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ - return(ret); + ut_ad((int16_t)field_value + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + if (field_value == 0) { + + return(0); + } + + return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE)); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return(field_value); + } } /********************************************************** @@ -229,21 +304,42 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ - ulint next) /* in: offset of the next record */ + ibool comp, /* in: TRUE=compact page format */ + ulint next) /* in: offset of the next record, or 0 if none */ { ut_ad(rec); ut_ad(UNIV_PAGE_SIZE > next); + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + if (comp) { + ulint field_value; - rec_set_bit_field_2(rec, next, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); + if (next) { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint)((lint)next + - (lint)ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } else { + field_value = 0; + } + + mach_write_to_2(rec - REC_NEXT, field_value); + } else { + mach_write_to_2(rec - REC_NEXT, next); + } } /********************************************************** -The following function is used to get the number of fields in the record. */ +The following function is used to get the number of fields +in an old-style record. 
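In the compact format, rec_set_next_offs() above stores the next-record pointer as a relative offset, (next - this) mod 64Ki, and rec_get_next_offs() recovers the absolute page offset by adding the stored value back and reducing modulo the page size. A self-contained sketch of that round trip, with an assumed 16 KiB page size:

#include <assert.h>
#include <stdint.h>

#define TOY_PAGE_SIZE   16384U          /* assumed page size, a power of two */

/* Encode: unsigned 16-bit difference of the two page offsets. */
static uint16_t rel_next(unsigned this_offs, unsigned next_offs)
{
        return (uint16_t) (next_offs - this_offs);      /* mod 64Ki */
}

/* Decode: add the stored difference back and reduce mod the page size. */
static unsigned abs_next(unsigned this_offs, uint16_t stored)
{
        return (this_offs + stored) % TOY_PAGE_SIZE;
}

int main(void)
{
        unsigned rec = 15000, next = 200;       /* next record earlier in page */
        uint16_t stored = rel_next(rec, next);

        assert(abs_next(rec, stored) == next);
        return 0;
}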
*/ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec) /* in: physical record */ { @@ -251,8 +347,8 @@ rec_get_n_fields( ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); ut_ad(ret <= REC_MAX_N_FIELDS); ut_ad(ret > 0); @@ -260,12 +356,12 @@ rec_get_n_fields( } /********************************************************** -The following function is used to set the number of fields field in the -record. */ +The following function is used to set the number of fields +in an old-style record. */ UNIV_INLINE void -rec_set_n_fields( -/*=============*/ +rec_set_n_fields_old( +/*=================*/ rec_t* rec, /* in: physical record */ ulint n_fields) /* in: the number of fields */ { @@ -273,8 +369,58 @@ rec_set_n_fields( ut_ad(n_fields <= REC_MAX_N_FIELDS); ut_ad(n_fields > 0); - rec_set_bit_field_2(rec, n_fields, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/********************************************************** +The following function retrieves the status bits of a new-style record. */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec) /* in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + if (!index->table->comp) { + return(rec_get_n_fields_old(rec)); + } + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } } /********************************************************** @@ -285,14 +431,16 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); ut_ad(ret <= REC_MAX_N_OWNED); return(ret); @@ -305,13 +453,15 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint n_owned) /* in: the number of owned */ { ut_ad(rec); ut_ad(n_owned <= REC_MAX_N_OWNED); - rec_set_bit_field_1(rec, n_owned, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + rec_set_bit_field_1(rec, n_owned, + comp ? 
REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); } /********************************************************** @@ -321,14 +471,16 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); ut_ad((ret & ~REC_INFO_BITS_MASK) == 0); return(ret); @@ -341,30 +493,31 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint bits) /* in: info bits */ { ut_ad(rec); ut_ad((bits & ~REC_INFO_BITS_MASK) == 0); - rec_set_bit_field_1(rec, bits, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + rec_set_bit_field_1(rec, bits, + comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } /********************************************************** -Gets the value of the deleted flag in info bits. */ +The following function is used to set the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits) /* in: info bits from a record */ +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits) /* in: info bits */ { - if (info_bits & REC_INFO_DELETED_FLAG) { - - return(TRUE); - } + ut_ad(rec); + ut_ad((bits & ~REC_NEW_STATUS_MASK) == 0); - return(FALSE); + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); } /********************************************************** @@ -374,9 +527,10 @@ ibool rec_get_deleted_flag( /*=================*/ /* out: TRUE if delete marked */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { - if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec)) { + if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec, comp)) { return(TRUE); } @@ -391,6 +545,7 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ibool flag) /* in: TRUE if delete marked */ { ulint old_val; @@ -399,7 +554,7 @@ rec_set_deleted_flag( ut_ad(TRUE == 1); ut_ad(flag <= TRUE); - old_val = rec_get_info_bits(rec); + old_val = rec_get_info_bits(rec, comp); if (flag) { new_val = REC_INFO_DELETED_FLAG | old_val; @@ -407,7 +562,39 @@ rec_set_deleted_flag( new_val = ~REC_INFO_DELETED_FLAG & old_val; } - rec_set_info_bits(rec, new_val); + rec_set_info_bits(rec, comp, new_val); +} + +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec) /* in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/********************************************************** +The following function is used to flag a record as a node pointer. 
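rec_set_deleted_flag() above only ORs in or masks out REC_INFO_DELETED_FLAG within the info bits, leaving the other info bits untouched. A small sketch of that read-modify-write, using a placeholder flag value:

#include <assert.h>

#define INFO_DELETED_FLAG       0x20U   /* placeholder bit value */

static unsigned set_deleted(unsigned info_bits, int flag)
{
        return flag ? (info_bits | INFO_DELETED_FLAG)
                    : (info_bits & ~INFO_DELETED_FLAG);
}

static int get_deleted(unsigned info_bits)
{
        return (info_bits & INFO_DELETED_FLAG) != 0;
}

int main(void)
{
        unsigned bits = 0x10U;                  /* some other info bit set */

        bits = set_deleted(bits, 1);
        assert(get_deleted(bits) && (bits & 0x10U));    /* other bit kept */
        bits = set_deleted(bits, 0);
        assert(!get_deleted(bits) && (bits & 0x10U));
        return 0;
}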
*/ +UNIV_INLINE +void +rec_set_node_ptr_flag( +/*=================*/ + rec_t* rec, /* in: physical record */ + ibool flag) /* in: TRUE if the record is a node pointer */ +{ + ulint status; + ut_ad(flag <= TRUE); + ut_ad(REC_STATUS_NODE_PTR >= rec_get_status(rec)); + if (flag) { + status = REC_STATUS_NODE_PTR; + } else { + status = REC_STATUS_ORDINARY; + } + rec_set_status(rec, status); } /********************************************************** @@ -418,14 +605,16 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + ret = rec_get_bit_field_2(rec, + comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); ut_ad(ret <= REC_MAX_HEAP_NO); return(ret); @@ -438,12 +627,14 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint heap_no)/* in: the heap number */ { ut_ad(heap_no <= REC_MAX_HEAP_NO); - rec_set_bit_field_2(rec, heap_no, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + rec_set_bit_field_2(rec, heap_no, + comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } /********************************************************** @@ -456,10 +647,12 @@ rec_get_1byte_offs_flag( /* out: TRUE if 1-byte form */ rec_t* rec) /* in: physical record */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif - return(rec_get_bit_field_1(rec, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT)); + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); } /********************************************************** @@ -471,11 +664,13 @@ rec_set_1byte_offs_flag( rec_t* rec, /* in: physical record */ ibool flag) /* in: TRUE if 1byte form */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif ut_ad(flag <= TRUE); - rec_set_bit_field_1(rec, flag, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT); + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); } /********************************************************** @@ -492,9 +687,9 @@ rec_1_get_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n + 1))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); } /********************************************************** @@ -511,68 +706,289 @@ rec_2_get_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); } -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + +/* Get the base address of offsets. 
The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/************************************************************** +The following function returns the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + /* out: number of elements */ + const ulint* offsets)/* in: array for rec_get_offsets() */ +{ + ulint n_alloc; + ut_ad(offsets); + n_alloc = offsets[0]; + ut_ad(n_alloc > 0); + return(n_alloc); +} + +/************************************************************** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /* in: array for rec_get_offsets() */ + ulint n_alloc) /* in: number of elements */ +{ + ut_ad(offsets); + ut_ad(n_alloc > 0); + offsets[0] = n_alloc; +} + +/************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*===============*/ + /* out: number of fields */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). */ UNIV_INLINE ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i) /* in: ith field */ +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ibool comp = (*rec_offs_base(offsets) & REC_OFFS_COMPACT) != 0; + + if (rec) { + ut_ad((ulint) rec == offsets[2]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[3]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = + dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. 
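Putting the pieces above together, the offsets array begins with a small header (the allocated length, the field count and, in debug builds, the rec and index pointers checked by rec_offs_validate()), followed by the extra size word and one cumulative end offset per field. A hedged sketch, assuming the non-debug two-element header, of building and reading such an array for a toy two-field record:

#include <assert.h>
#include <stdint.h>

#define HDR_SIZE        2               /* non-debug header: n_alloc, n_fields */
#define OFFS_SQL_NULL   ((uint32_t) 1 << 31)
#define OFFS_MASK       (((uint32_t) 1 << 30) - 1)
#define offs_base(o)    ((o) + HDR_SIZE)

int main(void)
{
        /* Toy record with 6 extra bytes and two fields: 4 data bytes,
           then a SQL NULL field (end offset unchanged, NULL flag set). */
        uint32_t offsets[HDR_SIZE + 1 + 2];

        offsets[0] = sizeof offsets / sizeof *offsets;  /* n_alloc */
        offsets[1] = 2;                                 /* n_fields */
        offs_base(offsets)[0] = 6;                      /* extra size */
        offs_base(offsets)[1] = 4;                      /* end of field 0 */
        offs_base(offsets)[2] = 4 | OFFS_SQL_NULL;      /* field 1 is NULL */

        /* Read it back the way rec_offs_data_size()/rec_offs_nth_null()
           would: the last end offset is the data size, flags are masked. */
        assert((offs_base(offsets)[offsets[1]] & OFFS_MASK) == 4);
        assert(offs_base(offsets)[2] & OFFS_SQL_NULL);
        assert(!(offs_base(offsets)[1] & OFFS_SQL_NULL));
        return 0;
}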
*/ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + rec_t* rec __attribute__((unused)), + /* in: record */ + dict_index_t* index __attribute__((unused)), + /* in: record descriptor */ + ulint* offsets __attribute__((unused))) + /* in: array returned by rec_get_offsets() */ { - ulint info; +#ifdef UNIV_DEBUG + ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ +} - if (rec_get_1byte_offs_flag(rec)) { +/**************************************************************** +The following function is used to get a pointer to the nth +data field in an old-style record. */ +UNIV_INLINE +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len) /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + byte* field; + ulint length; + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); - return(FALSE); + if (n == 0) { + field = rec; + } else { + field = rec + (rec_offs_base(offsets)[n] & REC_OFFS_MASK); } - info = rec_2_get_field_end_info(rec, i); + length = rec_offs_base(offsets)[1 + n]; - if (info & REC_2BYTE_EXTERN_MASK) { - return(TRUE); + if (length & REC_OFFS_SQL_NULL) { + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= field - rec; } - return(FALSE); + *len = length; + return(field); } /********************************************************** -Returns TRUE if the extern bit is set in any of the fields -of rec. */ +Determine if the offsets are for a record in the new +compact format. */ UNIV_INLINE ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec) /* in: record */ +rec_offs_comp( +/*==========*/ + /* out: TRUE if compact format */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n; - ulint i; - - if (rec_get_1byte_offs_flag(rec)) { - - return(FALSE); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return((*rec_offs_base(offsets) & REC_OFFS_COMPACT) != 0); +} - n = rec_get_n_fields(rec); +/********************************************************** +Returns TRUE if the nth field of rec is SQL NULL. */ +UNIV_INLINE +ibool +rec_offs_nth_null( +/*==============*/ + /* out: TRUE if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL) != 0); +} +/********************************************************** +Returns TRUE if the extern bit is set in nth field of rec. */ +UNIV_INLINE +ibool +rec_offs_nth_extern( +/*================*/ + /* out: TRUE if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL) != 0); +} - for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { +/********************************************************** +Gets the physical size of a field. 
*/ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n]) + & REC_OFFS_MASK); +} +/********************************************************** +Returns TRUE if the extern bit is set in any of the fields +of an old-style record. */ +UNIV_INLINE +ibool +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i; + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { return(TRUE); } } - return(FALSE); } +/*************************************************************** +Sets the value of the ith field extern storage bit. */ +UNIV_INLINE +void +rec_set_nth_field_extern_bit( +/*=========================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + if (index->table->comp) { + rec_set_nth_field_extern_bit_new(rec, index, i, val, mtr); + } else { + rec_set_nth_field_extern_bit_old(rec, i, val, mtr); + } +} + /********************************************************** Returns the offset of n - 1th field end if the record is stored in the 1-byte offsets form. If the field is SQL null, the flag is ORed in the returned @@ -589,9 +1005,9 @@ rec_1_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); } /********************************************************** @@ -608,9 +1024,9 @@ rec_2_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); } /********************************************************** @@ -625,9 +1041,9 @@ rec_1_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_1(rec - (REC_N_EXTRA_BYTES + n + 1), info); + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); } /********************************************************** @@ -642,9 +1058,9 @@ rec_2_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2), info); + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); } /********************************************************** @@ -659,7 +1075,7 @@ rec_1_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -682,7 +1098,7 @@ rec_2_get_field_start_offs( ulint n) /* in: field 
index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -707,7 +1123,7 @@ rec_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -723,8 +1139,9 @@ rec_get_field_start_offs( } /**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -744,133 +1161,134 @@ rec_get_nth_field_size( return(next_os - os); } -/**************************************************************** -The following function is used to get a copy of the nth data field in a -record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len) /* out: length of the field; UNIV_SQL_NULL if SQL - null */ -{ - byte* ptr; - - ut_ad(buf && rec && len); - - ptr = rec_get_nth_field(rec, n, len); - - if (*len == UNIV_SQL_NULL) { - - return; - } - - ut_memcpy(buf, ptr, *len); -} - /*************************************************************** This is used to modify the value of an already existing field in a record. The previous value must have exactly the same size as the new value. If len -is UNIV_SQL_NULL then the field is treated as an SQL null. */ +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. */ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len) /* in: length of the data or UNIV_SQL_NULL */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data + if not SQL null */ + ulint len) /* in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ { byte* data2; ulint len2; - ut_ad((len == UNIV_SQL_NULL) - || (rec_get_nth_field_size(rec, n) == len)); - + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + if (len == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_sql_null(rec, n); return; } - data2 = rec_get_nth_field(rec, n, &len2); - - ut_memcpy(data2, data, len); - + data2 = rec_get_nth_field(rec, offsets, n, &len2); if (len2 == UNIV_SQL_NULL) { - + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_null_bit(rec, n, FALSE); + ut_ad(len == rec_get_nth_field_size(rec, n)); + } else { + ut_ad(len2 == len); } + + ut_memcpy(data2, data, len); } /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. 
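A short sketch of how the offsets-based accessors above fit together; it assumes the rec_get_offsets(rec, index, offsets, n_fields, &heap) interface that the parameter comments refer to, and that passing NULL for the offsets buffer lets the function allocate the array from the heap (both are assumptions, not shown in this hunk):

#include "rem0rec.h"
#include "dict0dict.h"
#include "mem0mem.h"

/* Illustrative only: visit every field of rec through one offsets array
instead of re-parsing the record for each field. */
static void
rec_visit_fields(
/*=============*/
	rec_t*		rec,	/* in: physical record */
	dict_index_t*	index)	/* in: record descriptor */
{
	mem_heap_t*	heap	= NULL;
	ulint*		offsets;
	byte*		field;
	ulint		len;
	ulint		i;

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		field = rec_get_nth_field(rec, offsets, i, &len);

		if (len == UNIV_SQL_NULL) {
			/* SQL NULL: rec_offs_nth_size(offsets, i) still
			gives the stored length of the field */
			continue;
		}

		/* ... use field[0 .. len - 1] ... */
	}

	if (rec_offs_any_extern(offsets)) {
		/* at least one column of rec is stored externally */
	}

	if (heap) {
		mem_heap_free(heap);
	}
}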
*/ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec) /* in: physical record */ { ut_ad(rec); - return(rec_get_field_start_offs(rec, rec_get_n_fields(rec))); + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); } /************************************************************** -Returns the total size of record minus data size of record. The value -returned by the function is the distance from record start to record origin -in bytes. */ +The following function sets the number of fields in offsets. */ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + ulint* offsets, /* in: array returned by rec_get_offsets() */ + ulint n_fields) /* in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = n_fields; +} + +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_extra_size( +rec_offs_data_size( /*===============*/ - /* out: size */ - rec_t* rec) /* in: physical record */ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); - - if (rec_get_1byte_offs_flag(rec)) { + ulint size; - return(REC_N_EXTRA_BYTES + n_fields); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} - return(REC_N_EXTRA_BYTES + 2 * n_fields); +/************************************************************** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~REC_OFFS_COMPACT; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); } -/************************************************************** +/************************************************************** Returns the total size of a physical record. 
*/ UNIV_INLINE ulint -rec_get_size( -/*=========*/ - /* out: size */ - rec_t* rec) /* in: physical record */ +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); - - if (rec_get_1byte_offs_flag(rec)) { - - return(REC_N_EXTRA_BYTES + n_fields - + rec_1_get_field_start_offs(rec, n_fields)); - } - - return(REC_N_EXTRA_BYTES + 2 * n_fields - + rec_2_get_field_start_offs(rec, n_fields)); + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); } /************************************************************** @@ -879,10 +1297,11 @@ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec + rec_get_data_size(rec)); + return(rec + rec_offs_data_size(offsets)); } /************************************************************** @@ -891,10 +1310,11 @@ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec - rec_get_extra_size(rec)); + return(rec - rec_offs_extra_size(offsets)); } /******************************************************************* @@ -903,18 +1323,20 @@ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec) /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint extra_len; ulint data_len; ut_ad(rec && buf); - ut_ad(rec_validate(rec)); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); - extra_len = rec_get_extra_size(rec); - data_len = rec_get_data_size(rec); + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); ut_memcpy(buf, rec - extra_len, extra_len + data_len); @@ -922,8 +1344,8 @@ rec_copy( } /************************************************************** -Returns the extra size of a physical record if we know its data size and -the number of fields. */ +Returns the extra size of an old-style physical record if we know its +data size and number of fields. */ UNIV_INLINE ulint rec_get_converted_extra_size( @@ -934,28 +1356,51 @@ rec_get_converted_extra_size( { if (data_size <= REC_1BYTE_OFFS_LIMIT) { - return(REC_N_EXTRA_BYTES + n_fields); + return(REC_N_OLD_EXTRA_BYTES + n_fields); } - return(REC_N_EXTRA_BYTES + 2 * n_fields); + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); } /************************************************************** The following function returns the size of a data tuple when converted to +a new-style physical record. */ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ +/************************************************************** +The following function returns the size of a data tuple when converted to a physical record. 
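Given such an offsets array, copying a record no longer needs the removed rec_get_size()/rec_get_extra_size() pair; a minimal sketch of a heap copy (the helper name is made up for illustration):

#include "rem0rec.h"
#include "mem0mem.h"

/* Illustrative only: make a heap-allocated copy of a physical record. */
static rec_t*
rec_copy_to_heap(
/*=============*/
	rec_t*		rec,	/* in: physical record */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	mem_heap_t*	heap)	/* in: memory heap */
{
	ulint	size	= rec_offs_size(offsets); /* extra + data bytes */
	byte*	buf	= mem_heap_alloc(heap, size);
	rec_t*	copy	= rec_copy(buf, rec, offsets);

	/* the copy spans [rec_get_start(copy, offsets),
	rec_get_end(copy, offsets)), and its origin is buf + extra size */
	ut_ad(rec_get_end(copy, offsets) - rec_get_start(copy, offsets)
	      == (lint) size);

	return(copy);
}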
*/ UNIV_INLINE ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple) /* in: data tuple */ { ulint data_size; ulint extra_size; - + + ut_ad(index); ut_ad(dtuple); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(index->type & DICT_UNIVERSAL + || dtuple_get_n_fields(dtuple) == + (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (index->table->comp) { + return(rec_get_converted_size_new(index, dtuple)); + } + data_size = dtuple_get_data_size(dtuple); extra_size = rec_get_converted_extra_size( @@ -971,12 +1416,15 @@ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id) /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id) /* in: index tree id */ { ulint i; byte* data; @@ -984,12 +1432,13 @@ rec_fold( ulint fold; ulint n_fields_rec; - ut_ad(rec_validate(rec)); - ut_ad(n_fields <= rec_get_n_fields(rec)); - ut_ad((n_fields < rec_get_n_fields(rec)) || (n_bytes == 0)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); ut_ad(n_fields + n_bytes > 0); - - n_fields_rec = rec_get_n_fields(rec); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); if (n_fields > n_fields_rec) { n_fields = n_fields_rec; @@ -1002,7 +1451,7 @@ rec_fold( fold = ut_fold_dulint(tree_id); for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { fold = ut_fold_ulint_pair(fold, @@ -1011,7 +1460,7 @@ rec_fold( } if (n_bytes > 0) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { if (len > n_bytes) { @@ -1025,19 +1474,3 @@ rec_fold( return(fold); } - -/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -the address destination. */ -UNIV_INLINE -rec_t* -rec_convert_dtuple_to_rec( -/*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple) /* in: data tuple */ -{ - return(rec_convert_dtuple_to_rec_low(destination, dtuple, - dtuple_get_data_size(dtuple))); -} diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 13773ed380d..2ef260829fc 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -239,6 +239,17 @@ row_update_for_mysql( the MySQL format */ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL handle */ + +/************************************************************************* +Does an unlock of a row for MySQL. 
*/ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ + /************************************************************************* Creates an query graph node of 'update' type to be used in the MySQL interface. */ @@ -352,6 +363,15 @@ row_get_background_drop_list_len_low(void); /*======================================*/ /* out: how many tables in list */ /************************************************************************* +Truncates a table for MySQL. */ + +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* Drops a table for MySQL. If the name of the dropped table ends to characters INNODB_MONITOR, then this also stops printing of monitor output by the master thread. */ @@ -569,6 +589,10 @@ struct row_prebuilt_struct { allocated mem buf start, because there is a 4 byte magic number at the start and at the end */ + ibool keep_other_fields_on_keyread; /* when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in mysql row + row buffer.*/ ulint fetch_cache_first;/* position of the first not yet fetched row in fetch_cache */ ulint n_fetch_cached; /* number of not yet fetched rows diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h index 951e211fb37..782973d8f5d 100644 --- a/innobase/include/row0row.h +++ b/innobase/include/row0row.h @@ -27,7 +27,8 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Reads the roll pointer field from a clustered index record. */ UNIV_INLINE @@ -36,7 +37,8 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Writes the trx id field to a clustered index record. */ UNIV_INLINE @@ -45,7 +47,8 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ - dulint trx_id); /* in: value of the field */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + dulint trx_id);/* in: value of the field */ /************************************************************************* Sets the roll pointer field in a clustered index record. */ UNIV_INLINE @@ -54,6 +57,7 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr);/* in: value of the field */ /********************************************************************* When an insert to a table is performed, this function builds the entry which @@ -90,6 +94,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! 
*/ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap); /* in: memory heap from which the memory needed is allocated */ /*********************************************************************** @@ -175,14 +182,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec); /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Searches the clustered index record for a row, if we have the row reference. */ diff --git a/innobase/include/row0row.ic b/innobase/include/row0row.ic index 8e5121f5a96..85410beacf0 100644 --- a/innobase/include/row0row.ic +++ b/innobase/include/row0row.ic @@ -20,7 +20,8 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Sets the trx id or roll ptr field in a clustered index record: this function is slower than the specialized inline functions. 
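With the extra offsets argument, the system-column helpers stay cheap for callers that already computed the offsets array; a sketch of re-stamping DB_TRX_ID and DB_ROLL_PTR on a clustered index record (the helper and its arguments are illustrative, not part of this change):

#include "row0row.h"

/* Illustrative only: overwrite the system columns if the trx id differs. */
static void
row_restamp_sys_cols(
/*=================*/
	rec_t*		rec,		/* in/out: clustered index record */
	dict_index_t*	clust_index,	/* in: clustered index */
	const ulint*	offsets,	/* in: rec_get_offsets(rec, index) */
	dulint		trx_id,		/* in: new transaction id */
	dulint		roll_ptr)	/* in: new roll pointer */
{
	ut_ad(clust_index->type & DICT_CLUSTERED);
	ut_ad(rec_offs_validate(rec, clust_index, offsets));

	if (ut_dulint_cmp(row_get_rec_trx_id(rec, clust_index, offsets),
			  trx_id) != 0) {

		row_set_rec_trx_id(rec, clust_index, offsets, trx_id);
		row_set_rec_roll_ptr(rec, clust_index, offsets, roll_ptr);
	}
}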
*/ @@ -32,6 +33,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val); /* in: value to set */ /************************************************************************* @@ -42,18 +44,21 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_trx_id(rec + offset)); } else { - return(row_get_rec_sys_field(DATA_TRX_ID, rec, index)); + return(row_get_rec_sys_field(DATA_TRX_ID, + rec, index, offsets)); } } @@ -65,18 +70,21 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); } else { - return(row_get_rec_sys_field(DATA_ROLL_PTR, rec, index)); + return(row_get_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets)); } } @@ -88,18 +96,21 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint trx_id) /* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_trx_id(rec + offset, trx_id); } else { - row_set_rec_sys_field(DATA_TRX_ID, rec, index, trx_id); + row_set_rec_sys_field(DATA_TRX_ID, + rec, index, offsets, trx_id); } } @@ -111,18 +122,21 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr)/* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); } else { - row_set_rec_sys_field(DATA_ROLL_PTR, rec, index, roll_ptr); + row_set_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets, roll_ptr); } } @@ -133,14 +147,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec) /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { dfield_t* 
dfield; byte* field; @@ -149,6 +164,7 @@ row_build_row_ref_fast( ulint field_no; ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ref_len = dtuple_get_n_fields(ref); for (i = 0; i < ref_len; i++) { @@ -158,7 +174,8 @@ row_build_row_ref_fast( if (field_no != ULINT_UNDEFINED) { - field = rec_get_nth_field(rec, field_no, &len); + field = rec_get_nth_field(rec, offsets, + field_no, &len); dfield_set_data(dfield, field, len); } } diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h index 28210364833..673e0511153 100644 --- a/innobase/include/row0upd.h +++ b/innobase/include/row0upd.h @@ -80,6 +80,7 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* @@ -124,8 +125,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update);/* in: update vector */ /*************************************************************** Replaces the new column values stored in the update vector to the record @@ -135,8 +136,9 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update);/* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector */ /******************************************************************* Builds an update vector from those fields which in a secondary index entry differ from a record that has the equal ordering fields. NOTE: we compare @@ -274,10 +276,11 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr);/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* Parses the log data written by row_upd_index_write_log. 
*/ diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic index a124228a0de..e2d81a39cfa 100644 --- a/innobase/include/row0upd.ic +++ b/innobase/include/row0upd.ic @@ -106,15 +106,17 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr)/* in: roll ptr of the undo log record */ { ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); #ifdef UNIV_SYNC_DEBUG ut_ad(!buf_block_align(rec)->is_hashed || rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - row_set_rec_trx_id(rec, index, trx->id); - row_set_rec_roll_ptr(rec, index, roll_ptr); + row_set_rec_trx_id(rec, index, offsets, trx->id); + row_set_rec_roll_ptr(rec, index, offsets, roll_ptr); } diff --git a/innobase/include/row0vers.h b/innobase/include/row0vers.h index 30cf82144e9..0dd40fda65f 100644 --- a/innobase/include/row0vers.h +++ b/innobase/include/row0vers.h @@ -30,7 +30,8 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index); /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /********************************************************************* Finds out if we must preserve a delete marked earlier version of a clustered index record, because it is >= the purge view. */ diff --git a/innobase/include/row0vers.ic b/innobase/include/row0vers.ic index 5ece47c35d1..ab1e264635b 100644 --- a/innobase/include/row0vers.ic +++ b/innobase/include/row0vers.ic @@ -11,73 +11,3 @@ Created 2/6/1997 Heikki Tuuri #include "read0read.h" #include "page0page.h" #include "log0recv.h" - -/************************************************************************* -Fetches the trx id of a clustered index record or version. */ -UNIV_INLINE -dulint -row_vers_get_trx_id( -/*================*/ - /* out: trx id or ut_dulint_zero if the - clustered index record not found */ - rec_t* rec, /* in: clustered index record, or an old - version of it */ - dict_table_t* table) /* in: table */ -{ - return(row_get_rec_trx_id(rec, dict_table_get_first_index(table))); -} - -/************************************************************************* -Checks if a consistent read can be performed immediately on the index -record, or if an older version is needed. */ -UNIV_INLINE -ibool -row_vers_clust_rec_sees_older( -/*==========================*/ - /* out: FALSE if can read immediately */ - rec_t* rec, /* in: record which should be read or passed - over by a read cursor */ - dict_index_t* index, /* in: clustered index */ - read_view_t* view) /* in: read view */ -{ - ut_ad(index->type & DICT_CLUSTERED); - - if (read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) { - - return(FALSE); - } - - return(TRUE); -} - -/************************************************************************* -Checks if a secondary index record can be read immediately by a consistent -read, or if an older version may be needed. To be sure, we will have to -look in the clustered index. 
*/ -UNIV_INLINE -ibool -row_vers_sec_rec_may_see_older( -/*===========================*/ - /* out: FALSE if can be read immediately */ - rec_t* rec, /* in: record which should be read or passed */ - dict_index_t* index __attribute__((unused)),/* in: secondary index */ - read_view_t* view) /* in: read view */ -{ - page_t* page; - - ut_ad(!(index->type & DICT_CLUSTERED)); - - page = buf_frame_align(rec); - - if ((ut_dulint_cmp(page_get_max_trx_id(page), view->up_limit_id) >= 0) - || recv_recovery_is_on()) { - - /* It may be that the record was inserted or modified by a - transaction the view should not see: we have to look in the - clustered index */ - - return(TRUE); - } - - return(FALSE); -} diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 6cfe9cef927..c5374fd00fa 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -107,6 +107,7 @@ extern ibool srv_very_fast_shutdown; /* if this TRUE, do not flush the extern ibool srv_innodb_status; extern ibool srv_use_doublewrite_buf; +extern ibool srv_use_checksums; extern ibool srv_set_thread_priorities; extern int srv_query_thread_priority; @@ -133,6 +134,8 @@ extern ibool srv_lock_timeout_and_monitor_active; extern ibool srv_error_monitor_active; extern ulint srv_n_spin_wait_rounds; +extern ulint srv_n_free_tickets_to_enter; +extern ulint srv_thread_sleep_delay; extern ulint srv_spin_wait_delay; extern ibool srv_priority_boost; @@ -184,6 +187,63 @@ i/o handler thread */ extern const char* srv_io_thread_op_info[]; extern const char* srv_io_thread_function[]; +/* the number of the log write requests done */ +extern ulint srv_log_write_requests; + +/* the number of physical writes to the log performed */ +extern ulint srv_log_writes; + +/* amount of data written to the log files in bytes */ +extern ulint srv_os_log_written; + +/* amount of writes being done to the log files */ +extern ulint srv_os_log_pending_writes; + +/* we increase this counter, when there we don't have enough space in the +log buffer and have to flush it */ +extern ulint srv_log_waits; + +/* variable that counts amount of data read in total (in bytes) */ +extern ulint srv_data_read; + +/* here we count the amount of data written in total (in bytes) */ +extern ulint srv_data_written; + +/* this variable counts the amount of times, when the doublewrite buffer +was flushed */ +extern ulint srv_dblwr_writes; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +extern ulint srv_dblwr_pages_written; + +/* in this variable we store the number of write requests issued */ +extern ulint srv_buf_pool_write_requests; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. 
*/ +extern ulint srv_buf_pool_wait_free; + +/* variable to count the number of pages that were written from the +buffer pool to disk */ +extern ulint srv_buf_pool_flushed; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +extern ulint srv_buf_pool_reads; + +/* variable to count the number of sequential read-aheads were done */ +extern ulint srv_read_ahead_seq; + +/* variable to count the number of random read-aheads were done */ +extern ulint srv_read_ahead_rnd; + +/* In this structure we store status variables to be passed to MySQL */ +typedef struct export_var_struct export_struc; + +extern export_struc export_vars; + typedef struct srv_sys_struct srv_sys_t; /* The server system */ @@ -400,7 +460,12 @@ void srv_printf_innodb_monitor( /*======================*/ FILE* file); /* in: output stream */ +/************************************************************************ +Function to pass InnoDB status variables to MySQL */ +void +srv_export_innodb_status(void); +/*=====================*/ /* Types for the threads existing in the system. Threads of types 4 - 9 are called utility threads. Note that utility threads are mainly disk @@ -426,6 +491,53 @@ typedef struct srv_slot_struct srv_slot_t; /* Thread table is an array of slots */ typedef srv_slot_t srv_table_t; +/* In this structure we store status variables to be passed to MySQL */ +struct export_var_struct{ + ulint innodb_data_pending_reads; + ulint innodb_data_pending_writes; + ulint innodb_data_pending_fsyncs; + ulint innodb_data_fsyncs; + ulint innodb_data_read; + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; + ulint innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; + ulint innodb_buffer_pool_pages_misc; + ulint innodb_buffer_pool_pages_free; + ulint innodb_buffer_pool_pages_latched; + ulint innodb_buffer_pool_read_requests; + ulint innodb_buffer_pool_reads; + ulint innodb_buffer_pool_wait_free; + ulint innodb_buffer_pool_pages_flushed; + ulint innodb_buffer_pool_write_requests; + ulint innodb_buffer_pool_read_ahead_seq; + ulint innodb_buffer_pool_read_ahead_rnd; + ulint innodb_dblwr_pages_written; + ulint innodb_dblwr_writes; + ulint innodb_log_waits; + ulint innodb_log_write_requests; + ulint innodb_log_writes; + ulint innodb_os_log_written; + ulint innodb_os_log_fsyncs; + ulint innodb_os_log_pending_writes; + ulint innodb_os_log_pending_fsyncs; + ulint innodb_page_size; + ulint innodb_pages_created; + ulint innodb_pages_read; + ulint innodb_pages_written; + ulint innodb_row_lock_waits; + ulint innodb_row_lock_current_waits; + ib_longlong innodb_row_lock_time; + ulint innodb_row_lock_time_avg; + ulint innodb_row_lock_time_max; + ulint innodb_rows_read; + ulint innodb_rows_inserted; + ulint innodb_rows_updated; + ulint innodb_rows_deleted; +}; + /* The server system struct */ struct srv_sys_struct{ os_event_t operational; /* created threads must wait for the @@ -434,6 +546,10 @@ struct srv_sys_struct{ srv_table_t* threads; /* server thread table */ UT_LIST_BASE_NODE_T(que_thr_t) tasks; /* task queue */ + dict_index_t* dummy_ind1; /* dummy index for old-style + supremum and infimum records */ + dict_index_t* dummy_ind2; /* dummy index for new-style + supremum and infimum records */ }; extern ulint srv_n_threads_active[]; diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h index 9a988a03e92..911c8ac3f4a 100644 --- a/innobase/include/sync0rw.h +++ 
b/innobase/include/sync0rw.h @@ -61,8 +61,8 @@ Creates, or rather, initializes an rw-lock object in a specified memory location (which must be appropriately aligned). The rw-lock is initialized to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free is necessary only if the memory block containing it is freed. */ - -#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__) +#define rw_lock_create(L) rw_lock_create_func((L), __FILE__, __LINE__, #L) + /*=====================*/ /********************************************************************** Creates, or rather, initializes an rw-lock object in a specified memory @@ -75,7 +75,8 @@ rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline); /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name); /* in: mutex name */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing the rw-lock is freed. Removes an rw-lock object from the global list. The diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 8e0ec715b12..5046a960bcf 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -17,6 +17,8 @@ Created 9/5/1995 Heikki Tuuri #include "os0sync.h" #include "sync0arr.h" +extern my_bool timed_mutexes; + /********************************************************************** Initializes the synchronization data structures. */ @@ -35,8 +37,7 @@ location (which must be appropriately aligned). The mutex is initialized in the reset state. Explicit freeing of the mutex with mutex_free is necessary only if the memory block containing it is freed. */ - -#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__) +#define mutex_create(M) mutex_create_func((M), __FILE__, __LINE__, #M) /*===================*/ /********************************************************************** Creates, or rather, initializes a mutex object in a specified memory @@ -49,7 +50,8 @@ mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline); /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name); /* in: mutex name */ /********************************************************************** Calling this function is obligatory only if the memory buffer containing the mutex is freed. Removes a mutex object from the mutex list. The mutex @@ -413,6 +415,8 @@ or row lock! 
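The srv_* counters and the export_var_struct introduced in srv0srv.h above suggest a one-to-one mapping inside srv_export_innodb_status(); the fragment below is only a guess at a few of those assignments (a hypothetical helper name is used, and the real body lives in srv0srv.c, which this diff does not show):

#include "srv0srv.h"

/* Illustrative only: the kind of copying srv_export_innodb_status()
presumably does from the global counters into export_vars. */
static void
srv_export_some_counters(void)
/*==========================*/
{
	export_vars.innodb_log_waits = srv_log_waits;
	export_vars.innodb_log_write_requests = srv_log_write_requests;
	export_vars.innodb_log_writes = srv_log_writes;
	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
	export_vars.innodb_buffer_pool_read_ahead_seq = srv_read_ahead_seq;
	export_vars.innodb_buffer_pool_read_ahead_rnd = srv_read_ahead_rnd;
	export_vars.innodb_data_read = srv_data_read;
	export_vars.innodb_data_written = srv_data_written;
}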
*/ /*------------------------------------- Insert buffer tree */ #define SYNC_IBUF_BITMAP_MUTEX 351 #define SYNC_IBUF_BITMAP 350 +/*------------------------------------- MySQL query cache mutex */ +/*------------------------------------- MySQL binlog mutex */ /*-------------------------------*/ #define SYNC_KERNEL 300 #define SYNC_REC_LOCK 299 @@ -471,6 +475,15 @@ struct mutex_struct { const char* cfile_name;/* File name where mutex created */ ulint cline; /* Line where created */ ulint magic_n; + ulong count_using; /* count of times mutex used */ + ulong count_spin_loop; /* count of spin loops */ + ulong count_spin_rounds; /* count of spin rounds */ + ulong count_os_wait; /* count of os_wait */ + ulong count_os_yield; /* count of os_wait */ + ulonglong lspent_time; /* mutex os_wait timer msec */ + ulonglong lmax_spent_time; /* mutex os_wait timer msec */ + const char* cmutex_name;/* mutex name */ + ulint mutex_type;/* 0 - usual mutex 1 - rw_lock mutex */ }; #define MUTEX_MAGIC_N (ulint)979585 @@ -504,6 +517,13 @@ extern ibool sync_order_checks_on; /* This variable is set to TRUE when sync_init is called */ extern ibool sync_initialized; +/* Global list of database mutexes (not OS mutexes) created. */ +UT_LIST_BASE_NODE_T(mutex_t) mutex_list; + +/* Mutex protecting the mutex_list variable */ +mutex_t mutex_list_mutex; + + #ifndef UNIV_NONINL #include "sync0sync.ic" #endif diff --git a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic index aaf5e1fd9e9..f26f3788dc3 100644 --- a/innobase/include/sync0sync.ic +++ b/innobase/include/sync0sync.ic @@ -249,8 +249,11 @@ mutex_enter_func( /* Note that we do not peek at the value of lock_word before trying the atomic test_and_set; we could peek, and possibly save time. */ + + mutex->count_using++; - if (!mutex_test_and_set(mutex)) { + if (!mutex_test_and_set(mutex)) + { #ifdef UNIV_SYNC_DEBUG mutex_set_debug_info(mutex, file_name, line); #endif @@ -258,4 +261,5 @@ mutex_enter_func( } mutex_spin_wait(mutex, file_name, line); + } diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h index 9d7f41cd94e..4387ce1a61e 100644 --- a/innobase/include/trx0rec.h +++ b/innobase/include/trx0rec.h @@ -246,6 +246,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers);/* out, own: previous version, or NULL if diff --git a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h index 6004551f456..9d025da4a5f 100644 --- a/innobase/include/trx0roll.h +++ b/innobase/include/trx0roll.h @@ -104,11 +104,20 @@ trx_rollback( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. 
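The widened mutex_create() macro records the creation site plus a textual name, and mutex_struct now carries usage counters; a small sketch of creating an instrumented mutex and reading the new statistics back (printing them like this is only an illustration):

#include <stdio.h>
#include "sync0sync.h"

/* Illustrative only: an instrumented mutex and its new counters. */
static mutex_t	example_mutex;

static void
example_mutex_demo(void)
/*====================*/
{
	mutex_create(&example_mutex);
	/* expands to mutex_create_func(&example_mutex, __FILE__, __LINE__,
	"&example_mutex"), so cmutex_name names the mutex in any output */

	mutex_enter(&example_mutex);	/* bumps count_using; the spin and
					os-wait counters are presumably
					maintained in mutex_spin_wait() */
	mutex_exit(&example_mutex);

	fprintf(stderr,
		"mutex %s: used %lu times, %lu spin loops, %lu OS waits\n",
		example_mutex.cmutex_name,
		example_mutex.count_using,
		example_mutex.count_spin_loop,
		example_mutex.count_os_wait);
}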
*/ -void -trx_rollback_or_clean_all_without_sess(void); -/*========================================*/ +#ifndef __WIN__ +void* +#else +ulint +#endif +trx_rollback_or_clean_all_without_sess( +/*===================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))); + /* in: a dummy parameter required by + os_thread_create */ /******************************************************************** Finishes a transaction rollback. */ diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 8336e05bdb0..76b051105de 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -16,6 +16,7 @@ Created 3/26/1996 Heikki Tuuri #include "que0types.h" #include "mem0mem.h" #include "read0types.h" +#include "trx0xa.h" extern ulint trx_n_mysql_transactions; @@ -156,6 +157,36 @@ trx_commit_for_mysql( /*=================*/ /* out: 0 or error number */ trx_t* trx); /* in: trx handle */ + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. */ + +int +trx_recover_for_mysql( +/*=================*/ + /* out: number of prepared transactions */ + XID* xid_list, /* in/out: prepared transactions */ + uint len); /* in: number of slots in xid_list */ + +/*********************************************************************** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state */ +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid); /* in: X/Open XA Transaction Idenfication */ + /************************************************************************** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. 
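The new trx_prepare_for_mysql(), trx_recover_for_mysql() and trx_get_trx_by_xid() declarations are the two-phase-commit hooks MySQL's XA layer calls into; a rough sketch of the intended call sequence (error handling omitted; whether a recovered branch is committed or rolled back is decided by the transaction coordinator, not by code like this):

#include "trx0trx.h"

/* Illustrative only: prepare at commit time, then resolve branches that
were left in the prepared state after a crash. */
static void
xa_demo(
/*====*/
	trx_t*	trx)	/* in: transaction being committed under XA */
{
	XID	xid_list[16];
	trx_t*	recovered;
	int	n;
	int	i;

	/* phase one: make the transaction durable in the prepared state */
	trx_prepare_for_mysql(trx);

	/* phase two would normally be trx_commit_for_mysql(trx) */

	/* after a restart: list prepared branches and resolve each one */
	n = trx_recover_for_mysql(xid_list, 16);

	for (i = 0; i < n; i++) {
		recovered = trx_get_trx_by_xid(&xid_list[i]);

		if (recovered) {
			trx_commit_for_mysql(recovered);
			/* or roll it back, depending on the coordinator */
		}
	}
}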
*/ @@ -339,6 +370,9 @@ struct trx_struct{ if we can use the insert buffer for them, we set this FALSE */ dulint id; /* transaction id */ + XID xid; /* X/Open XA transaction + identification to identify a + transaction branch */ dulint no; /* transaction serialization number == max trx id when the transaction is moved to COMMITTED_IN_MEMORY state */ @@ -353,8 +387,10 @@ struct trx_struct{ dulint table_id; /* table id if the preceding field is TRUE */ /*------------------------------*/ - void* mysql_thd; /* MySQL thread handle corresponding - to this trx, or NULL */ + int active_trans; /* whether a transaction in MySQL + is active */ + void* mysql_thd; /* MySQL thread handle corresponding + to this trx, or NULL */ char** mysql_query_str;/* pointer to the field in mysqld_thd which contains the pointer to the current SQL query string */ @@ -436,9 +472,15 @@ struct trx_struct{ lock_t* auto_inc_lock; /* possible auto-inc lock reserved by the transaction; note that it is also in the lock list trx_locks */ + ibool trx_create_lock;/* this is TRUE if we have created a + new lock for a record accessed */ ulint n_lock_table_exp;/* number of explicit table locks (LOCK TABLES) reserved by the transaction, stored in trx_locks */ + ulint n_lock_table_transactional; + /* number of transactional table locks + (LOCK TABLES..WHERE ENGINE) reserved by + the transaction, stored in trx_locks */ UT_LIST_NODE_T(trx_t) trx_list; /* list of transactions */ UT_LIST_NODE_T(trx_t) @@ -554,6 +596,7 @@ struct trx_struct{ #define TRX_NOT_STARTED 1 #define TRX_ACTIVE 2 #define TRX_COMMITTED_IN_MEMORY 3 +#define TRX_PREPARED 4 /* Support for 2PC/XA */ /* Transaction execution states when trx state is TRX_ACTIVE */ #define TRX_QUE_RUNNING 1 /* transaction is running */ diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h index 20002076cc3..fce62e46046 100644 --- a/innobase/include/trx0undo.h +++ b/innobase/include/trx0undo.h @@ -14,6 +14,7 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" #include "page0types.h" +#include "trx0xa.h" /*************************************************************************** Builds a roll pointer dulint. */ @@ -36,7 +37,7 @@ trx_undo_decode_roll_ptr( ibool* is_insert, /* out: TRUE if insert undo log */ ulint* rseg_id, /* out: rollback segment id */ ulint* page_no, /* out: page number */ - ulint* offset); /* out: offset of the undo entry within page */ + ulint* offset); /* out: offset of the undo entry within page */ /*************************************************************************** Returns TRUE if the roll pointer is of the insert type. */ UNIV_INLINE @@ -239,6 +240,18 @@ trx_undo_set_state_at_finish( trx_t* trx, /* in: transaction */ trx_undo_t* undo, /* in: undo log memory copy */ mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. 
*/ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr); /* in: mtr */ + /************************************************************************** Adds the update undo log header as the first in the history list, and frees the memory object, or puts it to the list of cached update undo log @@ -294,7 +307,23 @@ trx_undo_parse_discard_latest( byte* end_ptr,/* in: buffer end */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ +/************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid); /* in: X/Open XA Transaction Identification */ + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ + +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid); /* out: X/Open XA Transaction Identification */ /* Types of an undo log segment */ #define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */ @@ -310,6 +339,8 @@ trx_undo_parse_discard_latest( #define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be reused: it can be freed in purge when all undo data in it is removed */ +#define TRX_UNDO_PREPARED 5 /* contains an undo log of an + prepared transaction */ /* Transaction undo log memory object; this is protected by the undo_mutex in the corresponding transaction object */ @@ -332,6 +363,8 @@ struct trx_undo_struct{ field */ dulint trx_id; /* id of the trx assigned to the undo log */ + XID xid; /* X/Open XA transaction + identification */ ibool dict_operation; /* TRUE if a dict operation trx */ dulint table_id; /* if a dict operation, then the table id */ @@ -436,7 +469,10 @@ page of an update undo log segment. */ log start, and therefore this is not necessarily the same as this log header end offset */ -#define TRX_UNDO_DICT_OPERATION 20 /* TRUE if the transaction is a table +#define TRX_UNDO_XID_EXISTS 20 /* TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /* TRUE if the transaction is a table create, index create, or drop transaction: in recovery the transaction cannot be rolled back @@ -452,7 +488,17 @@ page of an update undo log segment. 
*/ #define TRX_UNDO_HISTORY_NODE 34 /* If the log is put to the history list, the file list node is here */ /*-------------------------------------------------------------*/ -#define TRX_UNDO_LOG_HDR_SIZE (34 + FLST_NODE_SIZE) +/* X/Open XA Transaction Identification (XID) */ + +#define TRX_UNDO_XA_FORMAT (34 + FLST_NODE_SIZE) +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +#define TRX_UNDO_XA_LEN (TRX_UNDO_XA_XID + XIDDATASIZE) + +/*-------------------------------------------------------------*/ +#define TRX_UNDO_LOG_HDR_SIZE (TRX_UNDO_XA_LEN) +/*-------------------------------------------------------------*/ #ifndef UNIV_NONINL #include "trx0undo.ic" diff --git a/innobase/include/trx0xa.h b/innobase/include/trx0xa.h new file mode 100644 index 00000000000..34b7a2f95a8 --- /dev/null +++ b/innobase/include/trx0xa.h @@ -0,0 +1,182 @@ +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +#define XIDDATASIZE 128 /* size in bytes */ +#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */ + +struct xid_t { + long formatID; /* format identifier */ + long gtrid_length; /* value from 1 through 64 */ + long bqual_length; /* value from 1 through 64 */ + char data[XIDDATASIZE]; +}; +typedef struct xid_t XID; +#endif +/* + * A value of -1 in formatID means that the XID is null. + */ + + +#ifdef NOTDEFINED +/* Let us comment this out to remove compiler errors!!!!!!!!!!!! */ + +/* + * Declarations of routines by which RMs call TMs: + */ +extern int ax_reg __P((int, XID *, long)); +extern int ax_unreg __P((int, long)); + +/* + * XA Switch Data Structure + */ +#define RMNAMESZ 32 /* length of resource manager name, */ + /* including the null terminator */ +#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */ + /* strings, including the null + terminator */ + + +struct xa_switch_t { + char name[RMNAMESZ]; /* name of resource manager */ + long flags; /* resource manager specific options */ + long version; /* must be 0 */ + int (*xa_open_entry) /* xa_open function pointer */ + __P((char *, int, long)); + int (*xa_close_entry) /* xa_close function pointer */ + __P((char *, int, long)); + int (*xa_start_entry) /* xa_start function pointer */ + __P((XID *, int, long)); + int (*xa_end_entry) /* xa_end function pointer */ + __P((XID *, int, long)); + int (*xa_rollback_entry) /* xa_rollback function pointer */ + __P((XID *, int, long)); + int (*xa_prepare_entry) /* xa_prepare function pointer */ + __P((XID *, int, long)); + int (*xa_commit_entry) /* xa_commit function pointer */ + __P((XID *, int, long)); + int (*xa_recover_entry) /* xa_recover function pointer */ + __P((XID *, long, int, long)); + int (*xa_forget_entry) /* xa_forget function pointer */ + __P((XID *, int, long)); + int (*xa_complete_entry) /* xa_complete function pointer */ + __P((int *, int *, int, long)); +}; +#endif /* NOTDEFINED */ + + +/* + * Flag definitions for the RM switch + */ +#define TMNOFLAGS 0x00000000L /* no resource manager features + selected */ +#define TMREGISTER 0x00000001L /* resource manager dynamically + registers */ +#define TMNOMIGRATE 0x00000002L /* resource manager does not support + association migration */ +#define TMUSEASYNC 0x00000004L /* resource 
manager supports + asynchronous operations */ +/* + * Flag definitions for xa_ and ax_ routines + */ +/* use TMNOFLAGGS, defined above, when not specifying other flags */ +#define TMASYNC 0x80000000L /* perform routine asynchronously */ +#define TMONEPHASE 0x40000000L /* caller is using one-phase commit + optimisation */ +#define TMFAIL 0x20000000L /* dissociates caller and marks + transaction branch rollback-only */ +#define TMNOWAIT 0x10000000L /* return if blocking condition + exists */ +#define TMRESUME 0x08000000L /* caller is resuming association with + suspended transaction branch */ +#define TMSUCCESS 0x04000000L /* dissociate caller from transaction + branch */ +#define TMSUSPEND 0x02000000L /* caller is suspending, not ending, + association */ +#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */ +#define TMENDRSCAN 0x00800000L /* end a recovery scan */ +#define TMMULTIPLE 0x00400000L /* wait for any asynchronous + operation */ +#define TMJOIN 0x00200000L /* caller is joining existing + transaction branch */ +#define TMMIGRATE 0x00100000L /* caller intends to perform + migration */ + +/* + * ax_() return codes (transaction manager reports to resource manager) + */ +#define TM_JOIN 2 /* caller is joining existing + transaction branch */ +#define TM_RESUME 1 /* caller is resuming association with + suspended transaction branch */ +#define TM_OK 0 /* normal execution */ +#define TMER_TMERR -1 /* an error occurred in the transaction + manager */ +#define TMER_INVAL -2 /* invalid arguments were given */ +#define TMER_PROTO -3 /* routine invoked in an improper + context */ + +/* + * xa_() return codes (resource manager reports to transaction manager) + */ +#define XA_RBBASE 100 /* The inclusive lower bound of the + rollback codes */ +#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an + unspecified reason */ +#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a + communication failure */ +#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */ +#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the + integrity of the resources was + detected */ +#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the + transaction branch for a reason not + on this list */ +#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the + resource manager */ +#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */ +#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */ +#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the + rollback codes */ +#define XA_NOMIGRATE 9 /* resumption must occur where + suspension occurred */ +#define XA_HEURHAZ 8 /* the transaction branch may have + been heuristically completed */ +#define XA_HEURCOM 7 /* the transaction branch has been + heuristically committed */ +#define XA_HEURRB 6 /* the transaction branch has been + heuristically rolled back */ +#define XA_HEURMIX 5 /* the transaction branch has been + heuristically committed and rolled + back */ +#define XA_RETRY 4 /* routine returned with no effect and + may be re-issued */ +#define XA_RDONLY 3 /* the transaction branch was read-only + and has been committed */ +#define XA_OK 0 /* normal execution */ +#define XAER_ASYNC -2 /* asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /* a resource manager error occurred in + the transaction branch */ +#define XAER_NOTA -4 /* the XID is not valid */ +#define XAER_INVAL -5 /* invalid arguments were given */ +#define XAER_PROTO -6 /* 
routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /* resource manager unavailable */ +#define XAER_DUPID -8 /* the XID already exists */ +#define XAER_OUTSIDE -9 /* resource manager doing work outside + transaction */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/innobase/include/univ.i b/innobase/include/univ.i index be71d4211b3..80024f71992 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -88,6 +88,7 @@ memory is read outside the allocated blocks. */ #define UNIV_SEARCH_DEBUG #define UNIV_SYNC_PERF_STAT #define UNIV_SEARCH_PERF_STAT +#define UNIV_SRV_PRINT_LATCH_WAITS; */ #define UNIV_LIGHT_MEM_DEBUG diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index a62c2e2e318..22d488abeaf 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -208,7 +208,20 @@ ut_align_down( /*==========*/ /* out: aligned pointer */ void* ptr, /* in: pointer */ - ulint align_no); /* in: align by this number */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*==========*/ + /* out: distance from aligned + pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic index 5a70dcf12a8..e141de3aa3f 100644 --- a/innobase/include/ut0byte.ic +++ b/innobase/include/ut0byte.ic @@ -335,6 +335,27 @@ ut_align_down( return((void*)((((ulint)ptr)) & ~(align_no - 1))); } +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + /* out: distance from + aligned pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint)ptr) & (align_no - 1)); +} + /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h index dee8785c9e7..8938957cd12 100644 --- a/innobase/include/ut0ut.h +++ b/innobase/include/ut0ut.h @@ -139,6 +139,14 @@ ib_time_t ut_time(void); /*=========*/ /************************************************************** +Returns system time. */ + +void +ut_usectime( +/*========*/ + ulint* sec, /* out: seconds since the Epoch */ + ulint* ms); /* out: microseconds since the Epoch+*sec */ +/************************************************************** Returns the difference of two times in seconds. 
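ut_align_offset() is the complement of ut_align_down(): it gives the distance of a pointer from the previous aligned boundary; a typical use is recovering a byte offset within a page-aligned buffer frame, sketched below:

#include "ut0byte.h"

/* Illustrative only: offset of a pointer within its page-aligned frame. */
static ulint
ptr_offset_in_page(
/*===============*/
	byte*	ptr)	/* in: pointer within a buffer frame */
{
	ulint	offs = ut_align_offset(ptr, UNIV_PAGE_SIZE);

	/* consistent with rounding the pointer down to the frame start */
	ut_ad((byte*) ut_align_down(ptr, UNIV_PAGE_SIZE) + offs == ptr);

	return(offs);
}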
*/ double diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 29a274261f8..05466764063 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -424,12 +424,15 @@ lock_check_trx_id_sanity( /* out: TRUE if ok */ dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ - dict_index_t* index, /* in: clustered index */ + dict_index_t* index, /* in: index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex)/* in: TRUE if the caller owns the kernel mutex */ { ibool is_ok = TRUE; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!has_kernel_mutex) { mutex_enter(&kernel_mutex); } @@ -442,7 +445,7 @@ lock_check_trx_id_sanity( fputs(" InnoDB: Error: transaction id associated" " with record\n", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); fputs("InnoDB: in ", stderr); dict_index_name_print(stderr, NULL, index); fprintf(stderr, "\n" @@ -474,18 +477,20 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view) /* in: consistent read view */ { dulint trx_id; ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); /* NOTE that we call this function while holding the search system latch. To obey the latching order we must NOT reserve the kernel mutex here! */ - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); if (read_view_sees_trx_id(view, trx_id)) { @@ -1256,6 +1261,7 @@ lock_rec_get_next( /*==============*/ /* out: next lock, NULL if none exists */ rec_t* rec, /* in: record on a page */ + ibool comp, /* in: TRUE=compact page format */ lock_t* lock) /* in: lock */ { #ifdef UNIV_SYNC_DEBUG @@ -1271,7 +1277,7 @@ lock_rec_get_next( return(NULL); } - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { + if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec, comp))) { return(lock); } @@ -1288,15 +1294,17 @@ lock_rec_get_first( rec_t* rec) /* in: record on a page */ { lock_t* lock; + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first_on_page(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock) { - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { + if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec, comp))) { break; } @@ -1463,6 +1471,7 @@ lock_rec_has_expl( for a supremum record we regard this always a gap type request */ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ trx_t* trx) /* in: transaction */ { lock_t* lock; @@ -1492,7 +1501,7 @@ lock_rec_has_expl( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1511,6 +1520,7 @@ lock_rec_other_has_expl_req( ulint wait, /* in: LOCK_WAIT if also waiting locks are taken into account, or 0 if not */ rec_t* rec, /* in: record to look at */ + ibool comp, /* in: TRUE=compact record format */ trx_t* trx) /* in: transaction, or NULL if requests by all transactions are taken into account */ { @@ -1535,7 +1545,7 @@ lock_rec_other_has_expl_req( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1556,12 +1566,13 @@ lock_rec_other_has_conflicting( trx_t* trx) /* in: our transaction */ { lock_t* lock; - + ibool comp; 
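
The lock-queue helpers above now take a comp flag because rec_get_heap_no() reads the heap number from different header fields in the old and the compact record formats; that heap number then selects a bit in the per-page bitmap carried by each record lock. A rough sketch of that bitmap bookkeeping, with hypothetical names and layout (not the real lock_t):

#include <stdio.h>
#include <stdlib.h>

/* One record-lock object covering a page: bit i of the bitmap is set when
   the lock also applies to the record whose heap number is i. */
struct rec_lock {
	unsigned	n_bits;		/* bitmap size in bits */
	unsigned char	bitmap[1];	/* really n_bits / 8 + 1 bytes */
};

static struct rec_lock *rec_lock_create(unsigned n_bits)
{
	struct rec_lock	*lock = calloc(1, sizeof(struct rec_lock)
					+ n_bits / 8 + 1);
	if (lock) {
		lock->n_bits = n_bits;
	}
	return lock;
}

static void rec_lock_set_nth_bit(struct rec_lock *lock, unsigned heap_no)
{
	lock->bitmap[heap_no / 8] |= (unsigned char) (1U << (heap_no % 8));
}

static int rec_lock_get_nth_bit(const struct rec_lock *lock, unsigned heap_no)
{
	if (heap_no >= lock->n_bits) {
		return 0;
	}
	return (lock->bitmap[heap_no / 8] >> (heap_no % 8)) & 1;
}

int main(void)
{
	struct rec_lock	*lock = rec_lock_create(64);

	if (!lock) {
		return 1;
	}
	rec_lock_set_nth_bit(lock, 5);	/* lock the record at heap number 5 */
	printf("heap 5: %d, heap 6: %d\n",
	       rec_lock_get_nth_bit(lock, 5), rec_lock_get_nth_bit(lock, 6));
	free(lock);
	return 0;
}
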
#ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock) { if (lock_rec_has_to_wait(trx, mode, lock, @@ -1570,7 +1581,7 @@ lock_rec_other_has_conflicting( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1596,8 +1607,7 @@ lock_rec_find_similar_on_page( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); - + heap_no = rec_get_heap_no(rec, page_is_comp(buf_frame_align(rec))); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1624,7 +1634,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { page_t* page; @@ -1633,6 +1644,7 @@ lock_sec_rec_some_has_impl_off_kernel( #endif /* UNIV_SYNC_DEBUG */ ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); page = buf_frame_align(rec); @@ -1652,8 +1664,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* Ok, in this case it is possible that some transaction has an implicit x-lock. We have to look in the clustered index. */ - if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), rec, index, - TRUE)) { + if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), + rec, index, offsets, TRUE)) { buf_page_print(page); /* The page is corrupt: try to avoid a crash by returning @@ -1661,7 +1673,7 @@ lock_sec_rec_some_has_impl_off_kernel( return(NULL); } - return(row_vers_impl_x_locked_off_kernel(rec, index)); + return(row_vers_impl_x_locked_off_kernel(rec, index, offsets)); } /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ @@ -1695,7 +1707,7 @@ lock_rec_create( page = buf_frame_align(rec); space = buf_frame_get_space_id(page); page_no = buf_frame_get_page_no(page); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(page)); /* If rec is the supremum record, then we reset the gap and LOCK_REC_NOT_GAP bits, as all locks on the supremum are @@ -1708,8 +1720,7 @@ lock_rec_create( } /* Make lock bitmap bigger by a safety margin */ - n_bits = page_header_get_field(page, PAGE_N_HEAP) - + LOCK_PAGE_BITMAP_MARGIN; + n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN; n_bytes = 1 + n_bits / 8; lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes); @@ -1740,6 +1751,9 @@ lock_rec_create( HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); + /* Note that we have create a new lock */ + trx->trx_create_lock = TRUE; + if (type_mode & LOCK_WAIT) { lock_set_lock_and_trx_wait(lock, trx); @@ -1811,7 +1825,8 @@ lock_rec_enqueue_waiting( if (lock_deadlock_occurs(lock, trx)) { lock_reset_lock_and_trx_wait(lock); - lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec)); + lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec, + page_is_comp(buf_frame_align(rec)))); return(DB_DEADLOCK); } @@ -1861,7 +1876,7 @@ lock_rec_add_to_queue( lock_t* lock; lock_t* similar_lock = NULL; ulint heap_no; - page_t* page; + page_t* page = buf_frame_align(rec); ibool somebody_waits = FALSE; #ifdef UNIV_SYNC_DEBUG @@ -1869,15 +1884,15 @@ lock_rec_add_to_queue( #endif /* UNIV_SYNC_DEBUG */ ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != 
LOCK_S) - || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, + rec, page_is_comp(page), trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) - || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + rec, page_is_comp(page), trx)); type_mode = type_mode | LOCK_REC; - page = buf_frame_align(rec); - /* If rec is the supremum record, then we can reset the gap bit, as all locks on the supremum are automatically of the gap type, and we try to avoid unnecessary memory consumption of a new record lock @@ -1894,7 +1909,7 @@ lock_rec_add_to_queue( /* Look for a waiting lock request on the same record or on a gap */ - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(page)); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1914,6 +1929,15 @@ lock_rec_add_to_queue( if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) { + /* If the nth bit of a record lock is already set then we + do not set a new lock bit, otherwice we set */ + + if (lock_rec_get_nth_bit(similar_lock, heap_no)) { + trx->trx_create_lock = FALSE; + } else { + trx->trx_create_lock = TRUE; + } + lock_rec_set_nth_bit(similar_lock, heap_no); return(similar_lock); @@ -1945,6 +1969,7 @@ lock_rec_lock_fast( { lock_t* lock; ulint heap_no; + trx_t* trx; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -1959,13 +1984,16 @@ lock_rec_lock_fast( || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(buf_frame_align(rec))); lock = lock_rec_get_first_on_page(rec); + trx = thr_get_trx(thr); + trx->trx_create_lock = FALSE; + if (lock == NULL) { if (!impl) { - lock_rec_create(mode, rec, index, thr_get_trx(thr)); + lock_rec_create(mode, rec, index, trx); } return(TRUE); @@ -1976,13 +2004,23 @@ lock_rec_lock_fast( return(FALSE); } - if (lock->trx != thr_get_trx(thr) + if (lock->trx != trx || lock->type_mode != (mode | LOCK_REC) || lock_rec_get_n_bits(lock) <= heap_no) { return(FALSE); } if (!impl) { + + /* If the nth bit of a record lock is already set then we + do not set a new lock bit, otherwice we set */ + + if (lock_rec_get_nth_bit(lock, heap_no)) { + trx->trx_create_lock = FALSE; + } else { + trx->trx_create_lock = TRUE; + } + lock_rec_set_nth_bit(lock, heap_no); } @@ -2027,7 +2065,8 @@ lock_rec_lock_slow( trx = thr_get_trx(thr); - if (lock_rec_has_expl(mode, rec, trx)) { + if (lock_rec_has_expl(mode, rec, + page_is_comp(buf_frame_align(rec)), trx)) { /* The trx already has a strong enough lock on rec: do nothing */ @@ -2168,7 +2207,8 @@ lock_grant( release it at the end of the SQL statement */ lock->trx->auto_inc_lock = lock; - } else if (lock_get_type(lock) == LOCK_TABLE_EXP) { + } else if (lock_get_type(lock) == LOCK_TABLE_EXP || + lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL) { ut_a(lock_get_mode(lock) == LOCK_S || lock_get_mode(lock) == LOCK_X); } @@ -2343,12 +2383,14 @@ lock_rec_reset_and_release_wait( { lock_t* lock; ulint heap_no; - + ibool comp; + #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); + comp = page_is_comp(buf_frame_align(rec)); + heap_no = rec_get_heap_no(rec, comp); lock = lock_rec_get_first(rec); @@ -2359,7 +2401,7 @@ lock_rec_reset_and_release_wait( lock_rec_reset_nth_bit(lock, heap_no); } - 
lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2377,12 +2419,13 @@ lock_rec_inherit_to_gap( the locks on this record */ { lock_t* lock; - + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock != NULL) { if (!lock_rec_get_insert_intention(lock)) { @@ -2392,7 +2435,7 @@ lock_rec_inherit_to_gap( heir, lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2409,12 +2452,13 @@ lock_rec_inherit_to_gap_if_gap_lock( the locks on this record */ { lock_t* lock; - + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock != NULL) { if (!lock_rec_get_insert_intention(lock) @@ -2426,7 +2470,7 @@ lock_rec_inherit_to_gap_if_gap_lock( heir, lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2439,7 +2483,8 @@ lock_rec_move( /*==========*/ rec_t* receiver, /* in: record which gets locks; this record must have no lock requests on it! */ - rec_t* donator) /* in: record which gives locks */ + rec_t* donator, /* in: record which gives locks */ + ibool comp) /* in: TRUE=compact page format */ { lock_t* lock; ulint heap_no; @@ -2449,7 +2494,7 @@ lock_rec_move( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(donator); + heap_no = rec_get_heap_no(donator, comp); lock = lock_rec_get_first(donator); @@ -2469,7 +2514,7 @@ lock_rec_move( lock_rec_add_to_queue(type_mode, receiver, lock->index, lock->trx); - lock = lock_rec_get_next(donator, lock); + lock = lock_rec_get_next(donator, comp, lock); } ut_ad(lock_rec_get_first(donator) == NULL); @@ -2495,6 +2540,7 @@ lock_move_reorganize_page( UT_LIST_BASE_NODE_T(lock_t) old_locks; mem_heap_t* heap = NULL; rec_t* sup; + ibool comp; lock_mutex_enter_kernel(); @@ -2535,6 +2581,9 @@ lock_move_reorganize_page( lock = UT_LIST_GET_FIRST(old_locks); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(old_page)); + while (lock) { /* NOTE: we copy also the locks set on the infimum and supremum of the page; the infimum may carry locks if an @@ -2546,12 +2595,12 @@ lock_move_reorganize_page( /* Set locks according to old locks */ for (;;) { - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2)); + old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2), + comp); if (lock_rec_get_nth_bit(lock, old_heap_no)) { @@ -2610,6 +2659,7 @@ lock_move_rec_list_end( ulint heap_no; rec_t* sup; ulint type_mode; + ibool comp; lock_mutex_enter_kernel(); @@ -2623,6 +2673,8 @@ lock_move_rec_list_end( lock = lock_rec_get_first_on_page(page); + comp = page_is_comp(page); + while (lock != NULL) { page_cur_position(rec, &cur1); @@ -2638,13 +2690,12 @@ lock_move_rec_list_end( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != sup) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = 
rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2694,12 +2745,15 @@ lock_move_rec_list_start( page_cur_t cur2; ulint heap_no; ulint type_mode; + ibool comp; ut_a(new_page); lock_mutex_enter_kernel(); lock = lock_rec_get_first_on_page(page); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); while (lock != NULL) { @@ -2713,13 +2767,12 @@ lock_move_rec_list_start( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != rec) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2759,13 +2812,16 @@ lock_update_split_right( page_t* right_page, /* in: right page */ page_t* left_page) /* in: left page */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + /* Move the locks on the supremum of the left page to the supremum of the right page */ lock_rec_move(page_get_supremum_rec(right_page), - page_get_supremum_rec(left_page)); + page_get_supremum_rec(left_page), comp); /* Inherit the locks to the supremum of left page from the successor of the infimum on right page */ @@ -2819,13 +2875,16 @@ lock_update_root_raise( page_t* new_page, /* in: index page to which copied */ page_t* root) /* in: root page */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(root); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the root to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(root)); + page_get_supremum_rec(root), comp); lock_mutex_exit_kernel(); } @@ -2839,13 +2898,16 @@ lock_update_copy_and_discard( page_t* new_page, /* in: index page to which copied */ page_t* page) /* in: index page; NOT the root! 
*/ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the old page to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(page)); + page_get_supremum_rec(page), comp); lock_rec_free_all_from_discard_page(page); lock_mutex_exit_kernel(); @@ -2883,8 +2945,11 @@ lock_update_merge_left( page_t* right_page) /* in: merged index page which will be discarded */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + if (page_rec_get_next(orig_pred) != page_get_supremum_rec(left_page)) { /* Inherit the locks on the supremum of the left page to the @@ -2904,7 +2969,7 @@ lock_update_merge_left( of the left page */ lock_rec_move(page_get_supremum_rec(left_page), - page_get_supremum_rec(right_page)); + page_get_supremum_rec(right_page), comp); lock_rec_free_all_from_discard_page(right_page); @@ -3031,12 +3096,14 @@ lock_rec_store_on_page_infimum( bits are reset on the record */ { page_t* page; + ibool comp; page = buf_frame_align(rec); + comp = page_is_comp(page); lock_mutex_enter_kernel(); - lock_rec_move(page_get_infimum_rec(page), rec); + lock_rec_move(page_get_infimum_rec(page), rec, comp); lock_mutex_exit_kernel(); } @@ -3053,9 +3120,12 @@ lock_rec_restore_from_page_infimum( whose infimum stored the lock state; lock bits are reset on the infimum */ { + ibool comp; lock_mutex_enter_kernel(); - - lock_rec_move(rec, page_get_infimum_rec(page)); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + + lock_rec_move(rec, page_get_infimum_rec(page), comp); lock_mutex_exit_kernel(); } @@ -3352,6 +3422,10 @@ lock_table_create( lock->trx->n_lock_table_exp++; } + if (lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL) { + lock->trx->n_lock_table_transactional++; + } + lock->un_member.tab_lock.table = table; UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); @@ -3389,7 +3463,11 @@ lock_table_remove_low( } if (lock_get_type(lock) == LOCK_TABLE_EXP) { - lock->trx->n_lock_table_exp--; + trx->n_lock_table_exp--; + } + + if (lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL) { + trx->n_lock_table_transactional--; } UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock); @@ -3523,7 +3601,8 @@ lock_table( DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, does nothing; - if LOCK_TABLE_EXP bits are set, + if LOCK_TABLE_EXP|LOCK_TABLE_TRANSACTIONAL + bits are set, creates an explicit table lock */ dict_table_t* table, /* in: database table in dictionary cache */ ulint mode, /* in: lock mode */ @@ -3539,7 +3618,8 @@ lock_table( return(DB_SUCCESS); } - ut_a(flags == 0 || flags == LOCK_TABLE_EXP); + ut_a(flags == 0 || flags == LOCK_TABLE_EXP || + flags == LOCK_TABLE_TRANSACTIONAL); trx = thr_get_trx(thr); @@ -3562,7 +3642,7 @@ lock_table( /* Another trx has a request on the table in an incompatible mode: this trx may have to wait */ - err = lock_table_enqueue_waiting(mode, table, thr); + err = lock_table_enqueue_waiting(mode | flags, table, thr); lock_mutex_exit_kernel(); @@ -3653,7 +3733,8 @@ lock_table_dequeue( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_a(lock_get_type(in_lock) == LOCK_TABLE || - lock_get_type(in_lock) == LOCK_TABLE_EXP); + lock_get_type(in_lock) == LOCK_TABLE_EXP || + lock_get_type(in_lock) == LOCK_TABLE_TRANSACTIONAL); lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock); @@ -3757,7 
+3838,9 @@ lock_release_off_kernel( } lock_table_dequeue(lock); - if (lock_get_type(lock) == LOCK_TABLE_EXP) { + + if (lock_get_type(lock) == LOCK_TABLE_EXP || + lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL) { ut_a(lock_get_mode(lock) == LOCK_S || lock_get_mode(lock) == LOCK_X); } @@ -3781,6 +3864,7 @@ lock_release_off_kernel( ut_a(trx->auto_inc_lock == NULL); ut_a(trx->n_lock_table_exp == 0); + ut_a(trx->n_lock_table_transactional == 0); } /************************************************************************* @@ -3846,6 +3930,7 @@ lock_release_tables_off_kernel( } ut_a(trx->n_lock_table_exp == 0); + ut_a(trx->n_lock_table_transactional == 0); } /************************************************************************* @@ -3959,11 +4044,15 @@ lock_table_print( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_a(lock_get_type(lock) == LOCK_TABLE || - lock_get_type(lock) == LOCK_TABLE_EXP); + lock_get_type(lock) == LOCK_TABLE_EXP || + lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL); if (lock_get_type(lock) == LOCK_TABLE_EXP) { fputs("EXPLICIT ", file); + } else if (lock_get_type(lock) == LOCK_TABLE_TRANSACTIONAL) { + fputs("TRANSACTIONAL ", file); } + fputs("TABLE LOCK table ", file); ut_print_name(file, lock->trx, lock->un_member.tab_lock.table->name); fprintf(file, " trx id %lu %lu", @@ -3999,11 +4088,14 @@ lock_rec_print( FILE* file, /* in: file where to print */ lock_t* lock) /* in: record type lock */ { - page_t* page; - ulint space; - ulint page_no; - ulint i; - mtr_t mtr; + page_t* page; + ulint space; + ulint page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -4082,8 +4174,11 @@ lock_rec_print( fprintf(file, "Record lock, heap no %lu ", (ulong) i); if (page) { - rec_print(file, - page_find_rec_with_heap_no(page, i)); + rec_t* rec + = page_find_rec_with_heap_no(page, i); + offsets = rec_get_offsets(rec, lock->index, + offsets, ULINT_UNDEFINED, &heap); + rec_print_new(file, rec, offsets); } putc('\n', file); @@ -4091,8 +4186,11 @@ lock_rec_print( } mtr_commit(&mtr); -} - + if (heap) { + mem_heap_free(heap); + } +} + /************************************************************************* Calculates the number of record lock structs in the record lock hash table. 
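
The `ulint offsets_[100] = { 100, };` idiom appearing in lock_rec_print() above (and throughout this patch) pre-sizes an on-stack offsets array whose first slot records its capacity; rec_get_offsets() then appears to use that buffer when it is large enough and to spill to the supplied heap otherwise, which is why callers free the heap only when one was created. A standalone sketch of the caller-buffer-or-heap pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

/* Fill out[1..n_needed] with computed values, using the caller's buffer when
   it is large enough (its capacity is kept in slot 0) and a heap block
   otherwise.  The shape mirrors the rec_get_offsets() calls above; the
   computation itself is a stand-in. */
static unsigned long *get_offsets(unsigned long *buf, unsigned long n_needed,
				  unsigned long **heap)
{
	unsigned long	*out = buf;
	unsigned long	i;

	if (buf[0] < n_needed + 1) {
		*heap = malloc((n_needed + 1) * sizeof **heap);
		if (!*heap) {
			return NULL;	/* error handling kept minimal */
		}
		out = *heap;
		out[0] = n_needed + 1;
	}
	for (i = 1; i <= n_needed; i++) {
		out[i] = i * 4;		/* stand-in for real field offsets */
	}
	return out;
}

int main(void)
{
	unsigned long	offsets_[10] = { 10, };	/* capacity in slot 0 */
	unsigned long	*heap = NULL;
	unsigned long	*offsets = get_offsets(offsets_, 200, &heap);

	if (!offsets) {
		return 1;
	}
	printf("offset of field 3: %lu (spilled to heap: %s)\n",
	       offsets[3], heap ? "yes" : "no");
	if (heap) {
		free(heap);	/* free only when we actually spilled */
	}
	return 0;
}
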
*/ static @@ -4340,6 +4438,7 @@ lock_table_queue_validate( while (lock) { ut_a(((lock->trx)->conc_state == TRX_ACTIVE) + || ((lock->trx)->conc_state == TRX_PREPARED) || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY)); if (!lock_get_wait(lock)) { @@ -4368,12 +4467,16 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index) /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; lock_t* lock; - + ibool comp; + ut_a(rec); + ut_ad(rec_offs_validate(rec, index, offsets)); + comp = page_is_comp(buf_frame_align(rec)); lock_mutex_enter_kernel(); @@ -4383,6 +4486,7 @@ lock_rec_queue_validate( while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); @@ -4396,7 +4500,7 @@ lock_rec_queue_validate( ut_a(lock->index == index); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } lock_mutex_exit_kernel(); @@ -4406,13 +4510,13 @@ lock_rec_queue_validate( if (index && (index->type & DICT_CLUSTERED)) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, comp, impl_trx)) { ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + comp, impl_trx)); } } @@ -4422,13 +4526,14 @@ lock_rec_queue_validate( next function call: we have to release lock table mutex to obey the latching order */ - impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, comp, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + rec, comp, impl_trx)); } } @@ -4436,6 +4541,7 @@ lock_rec_queue_validate( while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); ut_a(trx_in_trx_list(lock->trx)); @@ -4447,10 +4553,10 @@ lock_rec_queue_validate( if (lock_get_mode(lock) == LOCK_S) { ut_a(!lock_rec_other_has_expl_req(LOCK_X, - 0, 0, rec, lock->trx)); + 0, 0, rec, comp, lock->trx)); } else { ut_a(!lock_rec_other_has_expl_req(LOCK_S, - 0, 0, rec, lock->trx)); + 0, 0, rec, comp, lock->trx)); } } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { @@ -4458,7 +4564,7 @@ lock_rec_queue_validate( ut_a(lock_rec_has_to_wait_in_queue(lock)); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } lock_mutex_exit_kernel(); @@ -4480,10 +4586,13 @@ lock_rec_validate_page( page_t* page; lock_t* lock; rec_t* rec; - ulint nth_lock = 0; - ulint nth_bit = 0; + ulint nth_lock = 0; + ulint nth_bit = 0; ulint i; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); @@ -4515,6 +4624,7 @@ loop: ut_a(trx_in_trx_list(lock->trx)); ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { @@ -4523,13 +4633,15 @@ loop: index = 
lock->index; rec = page_find_rec_with_heap_no(page, i); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); fprintf(stderr, "Validating %lu %lu\n", (ulong) space, (ulong) page_no); lock_mutex_exit_kernel(); - lock_rec_queue_validate(rec, index); + lock_rec_queue_validate(rec, index, offsets); lock_mutex_enter_kernel(); @@ -4549,6 +4661,9 @@ function_exit: mtr_commit(&mtr); + if (heap) { + mem_heap_free(heap); + } return(TRUE); } @@ -4721,8 +4836,20 @@ lock_rec_insert_check_and_lock( page_update_max_trx_id(buf_frame_align(rec), thr_get_trx(thr)->id); } - - ut_ad(lock_rec_queue_validate(next_rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + const ulint* offsets = rec_get_offsets( + next_rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ut_ad(lock_rec_queue_validate(next_rec, index, offsets)); + if (heap) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ return(err); } @@ -4736,7 +4863,8 @@ void lock_rec_convert_impl_to_expl( /*==========================*/ rec_t* rec, /* in: user record on page */ - dict_index_t* index) /* in: index of record */ + dict_index_t* index, /* in: index of record */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; @@ -4744,11 +4872,14 @@ lock_rec_convert_impl_to_expl( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(page_is_comp(buf_frame_align(rec)) == index->table->comp); if (index->type & DICT_CLUSTERED) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); } else { - impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); } if (impl_trx) { @@ -4756,7 +4887,7 @@ lock_rec_convert_impl_to_expl( record, set one for it */ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)) { + index->table->comp, impl_trx)) { lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, rec, index, @@ -4782,17 +4913,19 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(index->type & DICT_CLUSTERED); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); } - ut_ad(index->type & DICT_CLUSTERED); - lock_mutex_enter_kernel(); ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); @@ -4800,13 +4933,13 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4850,8 +4983,19 @@ lock_sec_rec_modify_check_and_lock( err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - - ut_ad(lock_rec_queue_validate(rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + const ulint* offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); + 
ut_ad(lock_rec_queue_validate(rec, index, offsets)); + if (heap) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ if (err == DB_SUCCESS) { /* Update the page max trx id field */ @@ -4878,6 +5022,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4889,6 +5034,7 @@ lock_sec_rec_read_check_and_lock( ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -4911,14 +5057,14 @@ lock_sec_rec_read_check_and_lock( || recv_recovery_is_on()) && !page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4942,6 +5088,7 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4955,6 +5102,8 @@ lock_clust_rec_read_check_and_lock( ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP || gap_mode == LOCK_REC_NOT_GAP); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); @@ -4969,14 +5118,56 @@ lock_clust_rec_read_check_and_lock( if (!page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); - + ut_ad(lock_rec_queue_validate(rec, index, offsets)); + return(err); } +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
*/ + +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + rec_t* rec, /* in: user record or page supremum record + which should be read or passed over by a read + cursor */ + dict_index_t* index, /* in: clustered index */ + ulint mode, /* in: mode of the lock which the read cursor + should set on records: LOCK_S or LOCK_X; the + latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* tmp_heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ulint ret; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + ret = lock_clust_rec_read_check_and_lock(flags, rec, index, + offsets, mode, gap_mode, thr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(ret); +} diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index e08adb013b5..1ab91b71e8f 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -190,6 +190,8 @@ loop: log_buffer_flush_to_disk(); + srv_log_waits++; + ut_ad(++count < 50); goto loop; @@ -292,6 +294,8 @@ part_loop: if (str_len > 0) { goto part_loop; } + + srv_log_write_requests++; } /**************************************************************** @@ -1112,11 +1116,15 @@ log_group_file_header_flush( if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf, group); + + srv_os_log_pending_writes--; } } @@ -1181,6 +1189,8 @@ loop: log_group_file_header_flush(group, next_offset / group->file_size, start_lsn); + srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE; + srv_log_writes++; } if ((next_offset % group->file_size) + len > group->file_size) { @@ -1225,9 +1235,16 @@ loop: if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE, write_len, buf, group); + + srv_os_log_pending_writes--; + + srv_os_log_written+= write_len; + srv_log_writes++; } if (write_len < len) { diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index ae84f085523..35dc9a06020 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -756,81 +756,124 @@ recv_parse_or_apply_log_rec_body( mtr_t* mtr) /* in: mtr or NULL; should be non-NULL if and only if page is non-NULL */ { - byte* new_ptr; - - if (type <= MLOG_8BYTES) { - new_ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); - - } else if (type == MLOG_REC_INSERT) { - new_ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_CLUST_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_SEC_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_UPDATE_IN_PLACE) { - new_ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page); - - } else if ((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)) { - new_ptr = page_parse_delete_rec_list(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_LIST_END_COPY_CREATED) { - new_ptr = page_parse_copy_rec_list_to_created_page(ptr, - end_ptr, page, 
mtr); - } else if (type == MLOG_PAGE_REORGANIZE) { - new_ptr = btr_parse_page_reorganize(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_PAGE_CREATE) { - new_ptr = page_parse_create(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_INSERT) { - new_ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); - - } else if (type == MLOG_UNDO_ERASE_END) { - new_ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, - mtr); - } else if (type == MLOG_UNDO_INIT) { - new_ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_HDR_DISCARD) { - new_ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, - mtr); - } else if ((type == MLOG_UNDO_HDR_CREATE) - || (type == MLOG_UNDO_HDR_REUSE)) { - new_ptr = trx_undo_parse_page_header(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_MIN_MARK) { - new_ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_REC_DELETE) { - new_ptr = page_cur_parse_delete_rec(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_IBUF_BITMAP_INIT) { - new_ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_INIT_FILE_PAGE) { - new_ptr = fsp_parse_init_file_page(ptr, end_ptr, page); - - } else if (type == MLOG_WRITE_STRING) { - new_ptr = mlog_parse_string(ptr, end_ptr, page); - - } else if (type == MLOG_FILE_CREATE - || type == MLOG_FILE_RENAME - || type == MLOG_FILE_DELETE) { - new_ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, + dict_index_t* index = NULL; + + switch (type) { + case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: + ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); + break; + case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_INSERT, &index))) { + ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_CLUST_DELETE_MARK, &index))) { + ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, + end_ptr, index, page); + } + break; + case MLOG_REC_SEC_DELETE_MARK: case MLOG_COMP_REC_SEC_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_SEC_DELETE_MARK, &index))) { + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, + index, page); + } + break; + case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_UPDATE_IN_PLACE, &index))) { + ptr = btr_cur_parse_update_in_place(ptr, end_ptr, + page, index); + } + break; + case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE: + case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE, &index))) { + ptr = page_parse_delete_rec_list(type, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_COPY_CREATED, &index))) { + ptr = page_parse_copy_rec_list_to_created_page(ptr, + end_ptr, index, page, mtr); + } + break; + case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_PAGE_REORGANIZE, &index))) { + ptr = btr_parse_page_reorganize(ptr, end_ptr, index, 
+ page, mtr); + } + break; + case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: + ptr = page_parse_create(ptr, end_ptr, + type == MLOG_COMP_PAGE_CREATE, page, mtr); + break; + case MLOG_UNDO_INSERT: + ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); + break; + case MLOG_UNDO_ERASE_END: + ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_INIT: + ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_DISCARD: + ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_CREATE: + case MLOG_UNDO_HDR_REUSE: + ptr = trx_undo_parse_page_header(type, ptr, end_ptr, + page, mtr); + break; + case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: + ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, + type == MLOG_COMP_REC_MIN_MARK, page, mtr); + break; + case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_DELETE, &index))) { + ptr = page_cur_parse_delete_rec(ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_IBUF_BITMAP_INIT: + ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); + break; + case MLOG_INIT_FILE_PAGE: + ptr = fsp_parse_init_file_page(ptr, end_ptr, page); + break; + case MLOG_WRITE_STRING: + ptr = mlog_parse_string(ptr, end_ptr, page); + break; + case MLOG_FILE_CREATE: + case MLOG_FILE_RENAME: + case MLOG_FILE_DELETE: + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, ULINT_UNDEFINED); - } else { - new_ptr = NULL; - + break; + default: + ptr = NULL; recv_sys->found_corrupt_log = TRUE; } - ut_ad(!page || new_ptr); + ut_ad(!page || ptr); + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } - return(new_ptr); + return(ptr); } /************************************************************************* @@ -2851,11 +2894,13 @@ void recv_recovery_from_checkpoint_finish(void) /*======================================*/ { + int i; + os_thread_id_t recovery_thread_id; + /* Rollback the uncommitted transactions which have no user session */ - if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { - trx_rollback_or_clean_all_without_sess(); - } + fprintf(stderr, + "InnoDB: Starting to apply log records to the database...\n"); /* Apply the hashed log records to the respective file pages */ @@ -2888,9 +2933,14 @@ recv_recovery_from_checkpoint_finish(void) /* Free the resources of the recovery system */ recv_recovery_on = FALSE; + #ifndef UNIV_LOG_DEBUG recv_sys_free(); #endif + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + os_thread_create(trx_rollback_or_clean_all_without_sess, + (void *)&i, &recovery_thread_id); + } } /********************************************************** diff --git a/innobase/mtr/mtr0log.c b/innobase/mtr/mtr0log.c index 82baa8905ba..4f826f242e8 100644 --- a/innobase/mtr/mtr0log.c +++ b/innobase/mtr/mtr0log.c @@ -384,3 +384,161 @@ mlog_parse_string( return(ptr + len); } + +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. 
*/ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size) /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ +{ + byte* log_ptr; + const byte* log_start; + const byte* log_end; + + if (!index->table->comp) { + log_start = log_ptr = mlog_open(mtr, 11 + size); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + log_end = log_ptr + 11 + size; + } else { + ulint i; + ulint n = dict_index_get_n_fields(index); + /* total size needed */ + ulint total = 11 + size + (n + 2) * 2; + ulint alloc = total; + /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + mach_write_to_2(log_ptr, n); + log_ptr += 2; + mach_write_to_2(log_ptr, + dict_index_get_n_unique_in_tree(index)); + log_ptr += 2; + for (i = 0; i < n; i++) { + dict_field_t* field; + dtype_t* type; + ulint len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + len = field->fixed_len; + ut_ad(len < 0x7fff); + if (len == 0 && (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB)) { + /* variable-length field + with maximum length > 255 */ + len = 0x7fff; + } + if (dtype_get_prtype(type) & DATA_NOT_NULL) { + len |= 0x8000; + } + if (log_ptr + 2 > log_end) { + mlog_close(mtr, log_ptr); + ut_a(total > (ulint) (log_ptr - log_start)); + total -= log_ptr - log_start; + alloc = total; + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + } + mach_write_to_2(log_ptr, len); + log_ptr += 2; + } + } + if (size == 0) { + mlog_close(mtr, log_ptr); + log_ptr = NULL; + } else if (log_ptr + size > log_end) { + mlog_close(mtr, log_ptr); + log_ptr = mlog_open(mtr, size); + } + return(log_ptr); +} + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. 
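
mlog_open_and_write_index() above encodes one 2-byte descriptor per indexed column, with the NOT NULL flag in the top bit and the fixed length in the low 15 bits (0 for a short variable-length field, 0x7fff for a long variable-length or BLOB field); mlog_parse_index(), which follows, decodes the same words. A small illustrative encode/decode sketch of that convention as read from the code:

#include <stdint.h>
#include <stdio.h>

/* Encode one column descriptor: bit 15 = NOT NULL, bits 0..14 = fixed length
   (0 = short variable-length field, 0x7fff = long variable-length or BLOB).
   Helper names are illustrative. */
static uint16_t encode_field(unsigned fixed_len, int long_varlen, int not_null)
{
	uint16_t	len = (uint16_t) fixed_len;

	if (fixed_len == 0 && long_varlen) {
		len = 0x7fff;
	}
	if (not_null) {
		len |= 0x8000;
	}
	return len;
}

static void decode_field(uint16_t word)
{
	unsigned	len = word & 0x7fff;

	printf("not_null=%d, ", (word & 0x8000) != 0);
	if (len == 0 || len == 0x7fff) {
		printf("variable-length (%s)\n", len ? "long" : "short");
	} else {
		printf("fixed length %u\n", len);
	}
}

int main(void)
{
	decode_field(encode_field(4, 0, 1));	/* e.g. a NOT NULL 4-byte INT */
	decode_field(encode_field(0, 1, 0));	/* e.g. a nullable BLOB column */
	return 0;
}
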
*/ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index) /* out, own: dummy index */ +{ + ulint i, n, n_uniq; + dict_table_t* table; + dict_index_t* ind; + + if (comp) { + if (end_ptr < ptr + 4) { + return(NULL); + } + n = mach_read_from_2(ptr); + ptr += 2; + n_uniq = mach_read_from_2(ptr); + ut_ad(n_uniq <= n); + if (end_ptr < ptr + (n + 1) * 2) { + return(NULL); + } + } else { + n = n_uniq = 1; + } + table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, comp); + ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", + DICT_HDR_SPACE, 0, n); + ind->table = table; + ind->n_uniq = n_uniq; + if (n_uniq != n) { + ind->type = DICT_CLUSTERED; + } + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + ind->cached = TRUE; + if (comp) { + for (i = 0; i < n; i++) { + ulint len = mach_read_from_2(ptr += 2); + /* The high-order bit of len is the NOT NULL flag; + the rest is 0 or 0x7fff for variable-length fields, + and 1..0x7ffe for fixed-length fields. */ + dict_mem_table_add_col(table, "DUMMY", + ((len + 1) & 0x7fff) <= 1 + ? DATA_BINARY + : DATA_FIXBINARY, + len & 0x8000 ? DATA_NOT_NULL : 0, + len & 0x7fff, 0); + dict_index_add_col(ind, + dict_table_get_nth_col(table, i), 0, 0); + } + ptr += 2; + } + *index = ind; + return(ptr); +} diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 7090e8662f3..64d80350275 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -155,6 +155,10 @@ os_mutex_t os_file_count_mutex; ulint os_file_n_pending_preads = 0; ulint os_file_n_pending_pwrites = 0; +/* These are not protected by any mutex */ +ulint os_n_pending_writes = 0; +ulint os_n_pending_reads = 0; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -1683,7 +1687,7 @@ os_file_set_size( } /* Print about progress for each 100 MB written */ - if ((offset + n_bytes) / (ib_longlong)(100 * 1024 * 1024) + if ((ib_longlong) (offset + n_bytes) / (ib_longlong)(100 * 1024 * 1024) != offset / (ib_longlong)(100 * 1024 * 1024)) { fprintf(stderr, " %lu00", @@ -2058,8 +2062,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2072,8 +2080,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2161,8 +2173,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2175,8 +2191,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2258,7 +2278,11 @@ retry: return(FALSE); } + os_n_pending_writes++; + ret = WriteFile(file, buf, n, &len, NULL); + + os_n_pending_writes--; /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. 
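
That comment is the rationale for the synchronous flushes in this layer: a page write not followed by a flush may be only partially on disk after a crash. A minimal POSIX write-then-fsync sketch of the idea (illustrative, not the os_file_* API):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char	page[512];
	int	fd = open("page.dat", O_CREAT | O_WRONLY, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(page, 0xab, sizeof(page));

	/* Write the page image at its offset, then flush so the data itself
	   (not merely the request) is on disk before the write is treated
	   as durable. */
	if (pwrite(fd, page, sizeof(page), 0) != (ssize_t) sizeof(page)
	    || fsync(fd) != 0) {
		perror("pwrite/fsync");
		close(fd);
		return 1;
	}
	return close(fd);
}
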
*/ @@ -2319,8 +2343,12 @@ retry: #else ssize_t ret; + os_n_pending_writes++; + ret = os_file_pwrite(file, buf, n, offset, offset_high); + os_n_pending_writes--; + if ((ulint)ret == n) { return(TRUE); diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 2f155788420..167aed93de7 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -69,6 +69,10 @@ byte* os_awe_window; ulint os_awe_window_size; #endif +ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +ulint os_large_page_size; + /******************************************************************** Windows AWE support. Tries to enable the "lock pages in memory" privilege for the current process so that the current process can allocate memory-locked @@ -516,6 +520,89 @@ os_mem_alloc_nocache( } /******************************************************************** +Allocates large pages memory. */ + +void* +os_mem_alloc_large( +/*=================*/ + /* out: allocated memory */ + ulint n, /* in: number of bytes */ + ibool set_to_zero, /* in: TRUE if allocated memory should be set + to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error) /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ +{ +#ifdef HAVE_LARGE_PAGES + ulint size; + int shmid; + void *ptr = NULL; + struct shmid_ds buf; + + if (!os_use_large_pages || !os_large_page_size) { + goto skip; + } + +#ifdef UNIV_LINUX + /* Align block size to os_large_page_size */ + size = ((n - 1) & ~(os_large_page_size - 1)) + os_large_page_size; + + shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. " + "errno %d\n", n, errno); + } else { + ptr = shmat(shmid, NULL, 0); + if (ptr == (void *)-1) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to attach shared memory " + "segment, errno %d\n", errno); + } + /* + Remove the shared memory segment so that it will be automatically freed + after memory is detached or process exits + */ + shmctl(shmid, IPC_RMID, &buf); + } +#endif + + if (ptr) { + if (set_to_zero) { +#ifdef UNIV_SET_MEM_TO_ZERO + memset(ptr, '\0', size); +#endif + } + + return(ptr); + } + + fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional memory pool\n"); +skip: +#endif /* HAVE_LARGE_PAGES */ + + return(ut_malloc_low(n, set_to_zero, assert_on_error)); +} + +/******************************************************************** +Frees large pages memory. */ + +void +os_mem_free_large( +/*=================*/ + void *ptr) /* in: number of bytes */ +{ +#ifdef HAVE_LARGE_PAGES + if (os_use_large_pages && os_large_page_size +#ifdef UNIV_LINUX + && !shmdt(ptr) +#endif + ) { + return; + } +#endif + + ut_free(ptr); +} + +/******************************************************************** Sets the priority boost for threads released from waiting within the current process. 
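
os_mem_alloc_large() above rounds the request up to a whole number of large pages with `((n - 1) & ~(os_large_page_size - 1)) + os_large_page_size`, which assumes the large page size is a power of two. A tiny standalone check that this expression matches the usual round-up-to-multiple formula:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long	ps = 2UL * 1024 * 1024;	/* assumed 2 MiB large pages */
	unsigned long	n;

	for (n = 1; n < 5 * ps; n += ps / 3 + 1) {
		/* Expression used by os_mem_alloc_large() above... */
		unsigned long	a = ((n - 1) & ~(ps - 1)) + ps;
		/* ...and the conventional round-up form. */
		unsigned long	b = (n + ps - 1) & ~(ps - 1);

		assert(a == b && a % ps == 0 && a >= n);
	}
	printf("round-up forms agree\n");
	return 0;
}
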
*/ diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 459ab986610..53c3c573b8e 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -29,7 +29,9 @@ UNIV_INLINE ibool page_cur_try_search_shortcut( /*=========================*/ + /* out: TRUE on success */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint* iup_matched_fields, /* in/out: already matched fields in upper @@ -55,9 +57,15 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_DEBUG page_cur_t cursor2; #endif + ibool success = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; ut_ad(dtuple_check_typed(tuple)); rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); ut_ad(rec); ut_ad(page_rec_is_user_rec(rec)); @@ -69,26 +77,26 @@ page_cur_try_search_shortcut( up_match = low_match; up_bytes = low_bytes; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &low_match, + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, offsets, &low_match, &low_bytes); if (cmp == -1) { - - return(FALSE); + goto exit_func; } next_rec = page_rec_get_next(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &up_match, - &up_bytes); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes); if (cmp != -1) { - - return(FALSE); + goto exit_func; } cursor->rec = rec; #ifdef UNIV_SEARCH_DEBUG - page_cur_search_with_match(page, tuple, PAGE_CUR_DBG, + page_cur_search_with_match(page, index, tuple, PAGE_CUR_DBG, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -117,7 +125,12 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_PERF_STAT page_cur_short_succ++; #endif - return(TRUE); + success = TRUE; +exit_func: + if (heap) { + mem_heap_free(heap); + } + return(success); } #endif @@ -130,22 +143,24 @@ static ibool page_cur_rec_field_extends( /*=======================*/ - /* out: TRUE if rec field extends tuple - field */ - dtuple_t* tuple, /* in: data tuple */ - rec_t* rec, /* in: record */ - ulint n) /* in: compare nth field */ + /* out: TRUE if rec field + extends tuple field */ + dtuple_t* tuple, /* in: data tuple */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: compare nth field */ { dtype_t* type; dfield_t* dfield; byte* rec_f; ulint rec_f_len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); dfield = dtuple_get_nth_field(tuple, n); type = dfield_get_type(dfield); - rec_f = rec_get_nth_field(rec, n, &rec_f_len); + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); if (type->mtype == DATA_VARCHAR || type->mtype == DATA_CHAR @@ -176,6 +191,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -212,6 +228,10 @@ page_cur_search_with_match( ulint dbg_matched_fields; ulint dbg_matched_bytes; #endif + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_ad(page && tuple && iup_matched_fields && iup_matched_bytes && ilow_matched_fields && ilow_matched_bytes && cursor); ut_ad(dtuple_validate(tuple)); @@ -229,7 +249,7 @@ page_cur_search_with_match( && 
(page_header_get_ptr(page, PAGE_LAST_INSERT)) && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { - if (page_cur_try_search_shortcut(page, tuple, + if (page_cur_try_search_shortcut(page, index, tuple, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -279,7 +299,10 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); if (cmp == 1) { @@ -288,10 +311,12 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { + offsets, cur_matched_fields)) { low = mid; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; @@ -329,7 +354,10 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); if (cmp == 1) { @@ -338,9 +366,12 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), &heap); + if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { + offsets, cur_matched_fields)) { low_rec = mid_rec; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; @@ -368,7 +399,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, + offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -390,7 +423,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -419,6 +454,9 @@ page_cur_search_with_match( *iup_matched_bytes = up_matched_bytes; *ilow_matched_fields = low_matched_fields; *ilow_matched_bytes = low_matched_bytes; + if (heap) { + mem_heap_free(heap); + } } /*************************************************************** @@ -463,10 +501,12 @@ static void page_cur_insert_rec_write_log( /*==========================*/ - rec_t* insert_rec, /* in: inserted physical record */ - ulint rec_size, /* in: insert_rec size */ - rec_t* cursor_rec, /* in: record the cursor is pointing to */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* insert_rec, /* in: inserted physical record */ + ulint rec_size, /* in: insert_rec size */ + rec_t* cursor_rec, /* in: record the + cursor is pointing to */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { ulint cur_rec_size; 
ulint extra_size; @@ -476,22 +516,33 @@ page_cur_insert_rec_write_log( byte* cur_ptr; ulint extra_info_yes; byte* log_ptr; + byte* log_end; ulint i; ut_a(rec_size < UNIV_PAGE_SIZE); - ut_ad(rec_size == rec_get_size(insert_rec)); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + { + mem_heap_t* heap = NULL; + ulint cur_offs_[100] = { 100, }; + ulint ins_offs_[100] = { 100, }; - if (log_ptr == NULL) { + ulint* cur_offs; + ulint* ins_offs; - return; - } + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); - extra_size = rec_get_extra_size(insert_rec); + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); - cur_extra_size = rec_get_extra_size(cursor_rec); - cur_rec_size = rec_get_size(cursor_rec); + if (heap) { + mem_heap_free(heap); + } + } ins_ptr = insert_rec - extra_size; @@ -514,7 +565,9 @@ page_cur_insert_rec_write_log( ins_ptr++; cur_ptr++; } else if ((i < extra_size) - && (i >= extra_size - REC_N_EXTRA_BYTES)) { + && (i >= extra_size - (index->table->comp + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES))) { i = extra_size; ins_ptr = insert_rec; cur_ptr = cursor_rec; @@ -525,16 +578,35 @@ page_cur_insert_rec_write_log( } if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { - - log_ptr = mlog_write_initial_log_record_fast(insert_rec, - MLOG_REC_INSERT, log_ptr, mtr); + + log_ptr = mlog_open_and_write_index(mtr, insert_rec, index, + index->table->comp + ? MLOG_COMP_REC_INSERT : MLOG_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; /* Write the cursor rec offset as a 2-byte ulint */ mach_write_to_2(log_ptr, cursor_rec - buf_frame_align(cursor_rec)); log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; } - if ((rec_get_info_bits(insert_rec) != rec_get_info_bits(cursor_rec)) + if ((rec_get_info_bits(insert_rec, index->table->comp) != + rec_get_info_bits(cursor_rec, index->table->comp)) || (extra_size != cur_extra_size) || (rec_size != cur_rec_size)) { @@ -549,7 +621,8 @@ page_cur_insert_rec_write_log( + extra_info_yes); if (extra_info_yes) { /* Write the info bits */ - mach_write_to_1(log_ptr, rec_get_info_bits(insert_rec)); + mach_write_to_1(log_ptr, + rec_get_info_bits(insert_rec, index->table->comp)); log_ptr++; /* Write the record origin offset */ @@ -565,17 +638,15 @@ page_cur_insert_rec_write_log( /* Write to the log the inserted index record end segment which differs from the cursor record */ - if (rec_size - i < MLOG_BUF_MARGIN) { - ut_memcpy(log_ptr, ins_ptr, rec_size - i); - log_ptr += rec_size - i; - } - - mlog_close(mtr, log_ptr); - - ut_a(rec_size - i < UNIV_PAGE_SIZE); + rec_size -= i; - if (rec_size - i >= MLOG_BUF_MARGIN) { - mlog_catenate_string(mtr, ins_ptr, rec_size - i); + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); } } @@ 
-585,12 +656,13 @@ Parses a log record of a record insert on a page. */ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint extra_info_yes; ulint offset = 0; /* remove warning */ @@ -603,6 +675,9 @@ page_cur_parse_insert_rec( byte* ptr2 = ptr; ulint info_bits = 0; /* remove warning */ page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; if (!is_short) { /* Read the cursor rec offset as a 2-byte ulint */ @@ -689,11 +764,14 @@ page_cur_parse_insert_rec( cursor_rec = page + offset; } + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (extra_info_yes == 0) { - info_bits = rec_get_info_bits(cursor_rec); - origin_offset = rec_get_extra_size(cursor_rec); - mismatch_index = rec_get_size(cursor_rec) - end_seg_len; - } + info_bits = rec_get_info_bits(cursor_rec, index->table->comp); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - end_seg_len; + } if (mismatch_index + end_seg_len < sizeof buf1) { buf = buf1; @@ -722,20 +800,34 @@ page_cur_parse_insert_rec( ut_error; } - ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index); + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); ut_memcpy(buf + mismatch_index, ptr, end_seg_len); - rec_set_info_bits(buf + origin_offset, info_bits); + rec_set_info_bits(buf + origin_offset, index->table->comp, info_bits); + + /* Set the status bits for new-style records. */ + if (index->table->comp) { + /* Leaf pages (level 0) contain ordinary records; + non-leaf pages contain node pointer records. */ + ulint level = page_header_get_field( + buf_frame_align(cursor_rec), PAGE_LEVEL); + rec_set_status(buf + origin_offset, + level ? 
REC_STATUS_NODE_PTR : REC_STATUS_ORDINARY); + } page_cur_position(cursor_rec, &cursor); - page_cur_rec_insert(&cursor, buf + origin_offset, mtr); + page_cur_rec_insert(&cursor, buf + origin_offset, index, mtr); if (buf != buf1) { mem_free(buf); } + if (heap) { + mem_heap_free(heap); + } + return(ptr + end_seg_len); } @@ -751,68 +843,83 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ mtr_t* mtr) /* in: mini-transaction handle */ { - byte* insert_buf = NULL; - ulint rec_size; - byte* page; /* the relevant page */ - rec_t* last_insert; /* cursor position at previous insert */ - rec_t* insert_rec; /* inserted record */ - ulint heap_no; /* heap number of the inserted record */ - rec_t* current_rec; /* current record after which the - new record is inserted */ - rec_t* next_rec; /* next record after current before - the insertion */ - ulint owner_slot; /* the slot which owns the inserted record */ - rec_t* owner_rec; - ulint n_owned; - + byte* insert_buf = NULL; + ulint rec_size; + byte* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous insert */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted record */ + rec_t* current_rec; /* current record after which the + new record is inserted */ + rec_t* next_rec; /* next record after current before + the insertion */ + ulint owner_slot; /* the slot which owns the + inserted record */ + rec_t* owner_rec; + ulint n_owned; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ibool comp = index->table->comp; + ut_ad(cursor && mtr); ut_ad(tuple || rec); ut_ad(!(tuple && rec)); ut_ad(rec || dtuple_check_typed(tuple)); - ut_ad(rec || (dtuple_get_data_size(tuple) == data_size)); page = page_cur_get_page(cursor); + ut_ad(page_is_comp(page) == comp); + ut_ad(cursor->rec != page_get_supremum_rec(page)); /* 1. Get the size of the physical record in the page */ if (tuple != NULL) { - rec_size = data_size + rec_get_converted_extra_size( - data_size, - dtuple_get_n_fields(tuple)); + rec_size = rec_get_converted_size(index, tuple); } else { - rec_size = rec_get_size(rec); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + rec_size = rec_offs_size(offsets); } /* 2. Try to find suitable space from page memory management */ - insert_buf = page_mem_alloc(page, rec_size, &heap_no); + insert_buf = page_mem_alloc(page, rec_size, index, &heap_no); if (insert_buf == NULL) { - + if (heap) { + mem_heap_free(heap); + } return(NULL); } /* 3. Create the record */ if (tuple != NULL) { - insert_rec = rec_convert_dtuple_to_rec_low(insert_buf, tuple, - data_size); + insert_rec = rec_convert_dtuple_to_rec(insert_buf, + index, tuple); + offsets = rec_get_offsets(insert_rec, index, offsets, + ULINT_UNDEFINED, &heap); } else { - insert_rec = rec_copy(insert_buf, rec); + insert_rec = rec_copy(insert_buf, rec, offsets); + ut_ad(rec_offs_validate(rec, index, offsets)); + rec_offs_make_valid(insert_rec, index, offsets); } ut_ad(insert_rec); - ut_ad(rec_size == rec_get_size(insert_rec)); + ut_ad(rec_size == rec_offs_size(offsets)); /* 4. 
Insert the record in the linked list of records */ - current_rec = cursor->rec; + ut_ad(!comp || rec_get_status(current_rec) <= REC_STATUS_INFIMUM); + ut_ad(!comp || rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + next_rec = page_rec_get_next(current_rec); + ut_ad(!comp || rec_get_status(next_rec) != REC_STATUS_INFIMUM); page_rec_set_next(insert_rec, next_rec); page_rec_set_next(current_rec, insert_rec); @@ -821,12 +928,15 @@ page_cur_insert_rec_low( /* 5. Set the n_owned field in the inserted record to zero, and set the heap_no field */ - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, heap_no); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, heap_no); /* 6. Update the last insertion info in page header */ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !comp + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); if (last_insert == NULL) { page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); @@ -855,8 +965,8 @@ page_cur_insert_rec_low( /* 7. It remains to update the owner record. */ owner_rec = page_rec_find_owner_rec(insert_rec); - n_owned = rec_get_n_owned(owner_rec); - rec_set_n_owned(owner_rec, n_owned + 1); + n_owned = rec_get_n_owned(owner_rec, comp); + rec_set_n_owned(owner_rec, comp, n_owned + 1); /* 8. Now we have incremented the n_owned field of the owner record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, @@ -868,8 +978,12 @@ page_cur_insert_rec_low( } /* 9. Write log record of the insert */ - page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, mtr); + page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, + index, mtr); + if (heap) { + mem_heap_free(heap); + } return(insert_rec); } @@ -879,17 +993,19 @@ UNIV_INLINE byte* page_copy_rec_list_to_created_page_write_log( /*=========================================*/ - /* out: 4-byte field where to write the log data - length */ - page_t* page, /* in: index page */ - mtr_t* mtr) /* in: mtr */ + /* out: 4-byte field where to + write the log data length */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; - - mlog_write_initial_log_record(page, MLOG_LIST_END_COPY_CREATED, mtr); - - log_ptr = mlog_open(mtr, 4); + log_ptr = mlog_open_and_write_index(mtr, page, index, + index->table->comp + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + ut_a(log_ptr); mlog_close(mtr, log_ptr + 4); return(log_ptr); @@ -901,11 +1017,12 @@ Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { byte* rec_end; ulint log_data_len; @@ -931,7 +1048,8 @@ page_parse_copy_rec_list_to_created_page( } while (ptr < rec_end) { - ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, page, mtr); + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + index, page, mtr); } ut_a(ptr == rec_end); @@ -950,10 +1068,11 @@ including that record. Infimum and supremum records are not copied. 
*/ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot = 0; /* remove warning */ byte* heap_top; @@ -966,9 +1085,14 @@ page_copy_rec_list_end_to_created_page( ulint log_mode; byte* log_ptr; ulint log_data_len; + ibool comp = page_is_comp(page); + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; - ut_ad(page_header_get_field(new_page, PAGE_N_HEAP) == 2); + ut_ad(page_dir_get_n_heap(new_page) == 2); ut_ad(page != new_page); + ut_ad(comp == page_is_comp(new_page)); if (rec == page_get_infimum_rec(page)) { @@ -983,12 +1107,13 @@ page_copy_rec_list_end_to_created_page( #ifdef UNIV_DEBUG /* To pass the debug tests we have to set these dummy values in the debug version */ - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, UNIV_PAGE_SIZE / 2); + page_dir_set_n_slots(new_page, UNIV_PAGE_SIZE / 2); page_header_set_ptr(new_page, PAGE_HEAP_TOP, new_page + UNIV_PAGE_SIZE - 1); #endif - log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, mtr); + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); log_data_len = dyn_array_get_data_size(&(mtr->log)); @@ -997,22 +1122,29 @@ page_copy_rec_list_end_to_created_page( log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); prev_rec = page_get_infimum_rec(new_page); - heap_top = new_page + PAGE_SUPREMUM_END; + if (comp) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } count = 0; slot_index = 0; n_recs = 0; + heap = mem_heap_create(100); + /* should be do ... until, comment by Jani */ while (rec != page_get_supremum_rec(page)) { - - insert_rec = rec_copy(heap_top, rec); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); - rec_set_next_offs(prev_rec, insert_rec - new_page); + rec_set_next_offs(prev_rec, comp, insert_rec - new_page); - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, 2 + n_recs); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, 2 + n_recs); - rec_size = rec_get_size(insert_rec); + rec_size = rec_offs_size(offsets); heap_top = heap_top + rec_size; @@ -1034,7 +1166,7 @@ page_copy_rec_list_end_to_created_page( } page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, - mtr); + index, mtr); prev_rec = insert_rec; rec = page_rec_get_next(rec); } @@ -1056,22 +1188,27 @@ page_copy_rec_list_end_to_created_page( slot_index--; } + if (heap) { + mem_heap_free(heap); + } + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); mach_write_to_4(log_ptr, log_data_len); - rec_set_next_offs(insert_rec, PAGE_SUPREMUM); + rec_set_next_offs(insert_rec, comp, + comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM); slot = page_dir_get_nth_slot(new_page, 1 + slot_index); page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); page_dir_slot_set_n_owned(slot, count + 1); - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, 2 + slot_index); + page_dir_set_n_slots(new_page, 2 + slot_index); page_header_set_ptr(new_page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(new_page, PAGE_N_HEAP, 2 + n_recs); + page_dir_set_n_heap(new_page, 2 + n_recs); page_header_set_field(new_page, PAGE_N_RECS, n_recs); page_header_set_ptr(new_page, PAGE_LAST_INSERT, NULL); @@ -1089,14 +1226,27 @@ UNIV_INLINE void page_cur_delete_rec_write_log( /*==========================*/ - rec_t* cursor_rec, /* in: record to be deleted */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* rec, /* in: record to be deleted */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { - mlog_write_initial_log_record(cursor_rec, MLOG_REC_DELETE, mtr); + byte* log_ptr; + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + index->table->comp + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } /* Write the cursor rec offset as a 2-byte ulint */ - mlog_catenate_ulint(mtr, cursor_rec - buf_frame_align(cursor_rec), - MLOG_2BYTES); + mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); + + mlog_close(mtr, log_ptr + 2); } /*************************************************************** @@ -1105,11 +1255,12 @@ Parses log record of a record delete on a page. */ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; page_cur_t cursor; @@ -1128,7 +1279,7 @@ page_cur_parse_delete_rec( if (page) { page_cur_position(page + offset, &cursor); - page_cur_delete_rec(&cursor, mtr); + page_cur_delete_rec(&cursor, index, mtr); } return(ptr); @@ -1142,6 +1293,7 @@ void page_cur_delete_rec( /*================*/ page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; @@ -1169,7 +1321,7 @@ page_cur_delete_rec( cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); /* 0. Write the log record */ - page_cur_delete_rec_write_log(current_rec, mtr); + page_cur_delete_rec_write_log(current_rec, index, mtr); /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ @@ -1223,7 +1375,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - page_mem_free(page, current_rec); + page_mem_free(page, current_rec, index); /* 7. Now we have decremented the number of owned records of the slot. 
If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 343f300fc77..d71c243e7c5 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -18,6 +18,8 @@ Created 2/2/1994 Heikki Tuuri #include "fut0lst.h" #include "btr0sea.h" #include "buf0buf.h" +#include "srv0srv.h" +#include "btr0btr.h" /* THE INDEX PAGE ============== @@ -75,10 +77,14 @@ page_dir_find_owner_slot( page_t* page; page_dir_slot_t* slot; rec_t* original_rec = rec; + ibool comp; ut_ad(page_rec_check(rec)); - while (rec_get_n_owned(rec) == 0) { + page = buf_frame_align(rec); + comp = page_is_comp(page); + + while (rec_get_n_owned(rec, comp) == 0) { steps++; rec = page_rec_get_next(rec); } @@ -96,14 +102,22 @@ page_dir_find_owner_slot( "InnoDB: Original record ", (ulong) buf_frame_get_page_no(page)); - rec_print(stderr, original_rec); + if (comp) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, original_rec); + } fprintf(stderr, "\n" "InnoDB: on that page. Steps %lu.\n", (ulong) steps); fputs( "InnoDB: Cannot find the dir slot for record ", stderr); - rec_print(stderr, rec); + if (comp) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, rec); + } fputs("\n" "InnoDB: on that page!\n", stderr); @@ -136,14 +150,15 @@ page_dir_slot_check( page = buf_frame_align(slot); - n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS); + n_slots = page_dir_get_n_slots(page); ut_a(slot <= page_dir_get_nth_slot(page, 0)); ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); - ut_a(page_rec_check(page + mach_read_from_2(slot))); + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); - n_owned = rec_get_n_owned(page + mach_read_from_2(slot)); + n_owned = rec_get_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(page)); if (slot == page_dir_get_nth_slot(page, 0)) { ut_a(n_owned == 1); @@ -194,12 +209,14 @@ Allocates a block of memory from an index page. 
*/ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no)/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no)/* out: this contains the heap number + of the allocated record + if allocation succeeds */ { rec_t* rec; byte* block; @@ -213,18 +230,36 @@ page_mem_alloc( rec = page_header_get_ptr(page, PAGE_FREE); - if (rec && (rec_get_size(rec) >= need)) { + if (rec) { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (rec_offs_size(offsets) >= need) { + page_header_set_ptr(page, PAGE_FREE, + page_rec_get_next(rec)); - page_header_set_ptr(page, PAGE_FREE, page_rec_get_next(rec)); + garbage = page_header_get_field(page, PAGE_GARBAGE); + ut_ad(garbage >= need); - garbage = page_header_get_field(page, PAGE_GARBAGE); - ut_ad(garbage >= need); + page_header_set_field(page, PAGE_GARBAGE, + garbage - need); - page_header_set_field(page, PAGE_GARBAGE, garbage - need); + *heap_no = rec_get_heap_no(rec, page_is_comp(page)); - *heap_no = rec_get_heap_no(rec); + block = rec_get_start(rec, offsets); + if (heap) { + mem_heap_free(heap); + } + return(block); + } - return(rec_get_start(rec)); + if (heap) { + mem_heap_free(heap); + } } /* Could not find space from the free list, try top of heap */ @@ -235,9 +270,9 @@ page_mem_alloc( block = page_header_get_ptr(page, PAGE_HEAP_TOP); page_header_set_ptr(page, PAGE_HEAP_TOP, block + need); - *heap_no = page_header_get_field(page, PAGE_N_HEAP); + *heap_no = page_dir_get_n_heap(page); - page_header_set_field(page, PAGE_N_HEAP, 1 + *heap_no); + page_dir_set_n_heap(page, 1 + *heap_no); return(block); } @@ -253,9 +288,11 @@ page_create_write_log( /*==================*/ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ { - mlog_write_initial_log_record(frame, MLOG_PAGE_CREATE, mtr); + mlog_write_initial_log_record(frame, + comp ? 
MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE, mtr); } /*************************************************************** @@ -267,6 +304,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -275,7 +313,7 @@ page_parse_create( /* The record is empty, except for the record initial part */ if (page) { - page_create(page, mtr); + page_create(page, mtr, comp); } return(ptr); @@ -290,7 +328,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ { page_dir_slot_t* slot; mem_heap_t* heap; @@ -300,6 +339,10 @@ page_create( rec_t* infimum_rec; rec_t* supremum_rec; page_t* page; + dict_index_t* index; + ulint* offsets; + + index = comp ? srv_sys->dummy_ind2 : srv_sys->dummy_ind1; ut_ad(frame && mtr); ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE @@ -311,7 +354,7 @@ page_create( buf_frame_modify_clock_inc(frame); /* 2. WRITE LOG INFORMATION */ - page_create_write_log(frame, mtr); + page_create_write_log(frame, mtr, comp); page = frame; @@ -323,43 +366,53 @@ page_create( /* Create first a data tuple for infimum record */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "infimum", sizeof "infimum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); - + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8, 0); /* Set the corresponding physical record to its place in the page record heap */ heap_top = page + PAGE_DATA; - infimum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); + + ut_a(infimum_rec == + page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + rec_set_n_owned(infimum_rec, comp, 1); + rec_set_heap_no(infimum_rec, comp, 0); + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); - ut_a(infimum_rec == page + PAGE_INFIMUM); - - rec_set_n_owned(infimum_rec, 1); - rec_set_heap_no(infimum_rec, 0); - - heap_top = rec_get_end(infimum_rec); - /* Create then a tuple for supremum */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "supremum", sizeof "supremum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); + dfield_set_data(field, "supremum", 9 - comp); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 9 - comp, 0); - supremum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); - ut_a(supremum_rec == page + PAGE_SUPREMUM); + ut_a(supremum_rec == + page + (comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)); - rec_set_n_owned(supremum_rec, 1); - rec_set_heap_no(supremum_rec, 1); - - heap_top = rec_get_end(supremum_rec); + rec_set_n_owned(supremum_rec, comp, 1); + rec_set_heap_no(supremum_rec, comp, 1); - ut_ad(heap_top == page + PAGE_SUPREMUM_END); + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == + page + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); mem_heap_free(heap); @@ -367,7 +420,7 @@ page_create( page_header_set_field(page, PAGE_N_DIR_SLOTS, 2); page_header_set_ptr(page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(page, PAGE_N_HEAP, 2); + page_header_set_field(page, PAGE_N_HEAP, comp ? 0x8002 : 2); page_header_set_ptr(page, PAGE_FREE, NULL); page_header_set_field(page, PAGE_GARBAGE, 0); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); @@ -388,8 +441,8 @@ page_create( /* Set the next pointers in infimum and supremum */ - rec_set_next_offs(infimum_rec, (ulint)(supremum_rec - page)); - rec_set_next_offs(supremum_rec, 0); + rec_set_next_offs(infimum_rec, comp, (ulint)(supremum_rec - page)); + rec_set_next_offs(supremum_rec, comp, 0); return(page); } @@ -401,10 +454,11 @@ touch the lock table and max trx id on page. */ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; @@ -416,8 +470,11 @@ page_copy_rec_list_end_no_locks( page_cur_move_to_next(&cur1); } - - ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == PAGE_INFIMUM); + + ut_a(index->table->comp == page_is_comp(page)); + ut_a(index->table->comp == page_is_comp(new_page)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (index->table->comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); page_cur_set_before_first(new_page, &cur2); @@ -427,7 +484,7 @@ page_copy_rec_list_end_no_locks( while (sup != page_cur_get_rec(&cur1)) { if (!page_cur_rec_insert(&cur2, - page_cur_get_rec(&cur1), mtr)) { + page_cur_get_rec(&cur1), index, mtr)) { /* Track an assertion failure reported on the mailing list on June 18th, 2003 */ @@ -456,16 +513,18 @@ The records are copied to the start of the record list on new_page. 
*/ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - if (page_header_get_field(new_page, PAGE_N_HEAP) == 2) { + if (page_dir_get_n_heap(new_page) == 2) { page_copy_rec_list_end_to_created_page(new_page, page, rec, - mtr); + index, mtr); } else { - page_copy_rec_list_end_no_locks(new_page, page, rec, mtr); + page_copy_rec_list_end_no_locks(new_page, page, rec, + index, mtr); } /* Update the lock table, MAX_TRX_ID, and possible hash index */ @@ -474,7 +533,7 @@ page_copy_rec_list_end( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); + btr_search_move_or_delete_hash_entries(new_page, page, index); } /***************************************************************** @@ -485,10 +544,11 @@ The records are copied to the end of the record list on new_page. */ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; @@ -510,8 +570,8 @@ page_copy_rec_list_start( /* Copy records from the original page to the new page */ while (page_cur_get_rec(&cur1) != rec) { - ut_a( - page_cur_rec_insert(&cur2, page_cur_get_rec(&cur1), mtr)); + ut_a(page_cur_rec_insert(&cur2, + page_cur_get_rec(&cur1), index, mtr)); page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); @@ -523,7 +583,7 @@ page_copy_rec_list_start( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); + btr_search_move_or_delete_hash_entries(new_page, page, index); } /************************************************************** @@ -532,18 +592,25 @@ UNIV_INLINE void page_delete_rec_list_write_log( /*===========================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - byte type, /* in: operation type: MLOG_LIST_END_DELETE, ... */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: operation type: + MLOG_LIST_END_DELETE, ... 
*/ + mtr_t* mtr) /* in: mtr */ { - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); - - mlog_write_initial_log_record(page, type, mtr); - - /* Write the parameter as a 2-byte ulint */ - mlog_catenate_ulint(mtr, rec - page, MLOG_2BYTES); + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, page, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, rec - page); + mlog_close(mtr, log_ptr + 2); + } } /************************************************************** @@ -552,18 +619,23 @@ Parses a log record of a record list end or start deletion. */ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); /* Read the record offset as a 2-byte ulint */ @@ -580,11 +652,12 @@ page_parse_delete_rec_list( return(ptr); } - if (type == MLOG_LIST_END_DELETE) { - page_delete_rec_list_end(page, page + offset, ULINT_UNDEFINED, - ULINT_UNDEFINED, mtr); + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page, page + offset, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); } else { - page_delete_rec_list_start(page, page + offset, mtr); + page_delete_rec_list_start(page, page + offset, index, mtr); } return(ptr); @@ -597,14 +670,15 @@ The infimum and supremum records are not deleted. 
*/ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot; ulint slot_index; @@ -615,10 +689,12 @@ page_delete_rec_list_end( ulint count; ulint n_owned; rec_t* sup; + ibool comp; /* Reset the last insert info in the page header and increment the modify clock for the frame */ + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the @@ -632,7 +708,9 @@ page_delete_rec_list_end( rec = page_rec_get_next(rec); } - page_delete_rec_list_write_log(page, rec, MLOG_LIST_END_DELETE, mtr); + comp = page_is_comp(page); + page_delete_rec_list_write_log(page, rec, index, + comp ? MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); if (rec == sup) { @@ -644,19 +722,35 @@ page_delete_rec_list_end( last_rec = page_rec_get_prev(sup); if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; /* Calculate the sum of sizes and the number of records */ size = 0; n_recs = 0; rec2 = rec; while (rec2 != sup) { - size += rec_get_size(rec2); + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; n_recs++; rec2 = page_rec_get_next(rec2); } + + if (heap) { + mem_heap_free(heap); + } } + ut_ad(size < UNIV_PAGE_SIZE); + /* Update the page directory; there is no need to balance the number of the records owned by the supremum record, as it is allowed to be less than PAGE_DIR_SLOT_MIN_N_OWNED */ @@ -664,15 +758,15 @@ page_delete_rec_list_end( rec2 = rec; count = 0; - while (rec_get_n_owned(rec2) == 0) { + while (rec_get_n_owned(rec2, comp) == 0) { count++; rec2 = page_rec_get_next(rec2); } - ut_ad(rec_get_n_owned(rec2) - count > 0); + ut_ad(rec_get_n_owned(rec2, comp) - count > 0); - n_owned = rec_get_n_owned(rec2) - count; + n_owned = rec_get_n_owned(rec2, comp) - count; slot_index = page_dir_find_owner_slot(rec2); slot = page_dir_get_nth_slot(page, slot_index); @@ -680,7 +774,7 @@ page_delete_rec_list_end( page_dir_slot_set_rec(slot, sup); page_dir_slot_set_n_owned(slot, n_owned); - page_header_set_field(page, PAGE_N_DIR_SLOTS, slot_index + 1); + page_dir_set_n_slots(page, slot_index + 1); /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); @@ -706,14 +800,19 @@ that record. Infimum and supremum records are not deleted. 
*/ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; ulint log_mode; - page_delete_rec_list_write_log(page, rec, MLOG_LIST_START_DELETE, mtr); + page_delete_rec_list_write_log(page, rec, index, + index->table->comp + ? MLOG_COMP_LIST_START_DELETE + : MLOG_LIST_START_DELETE, + mtr); page_cur_set_before_first(page, &cur1); @@ -730,7 +829,7 @@ page_delete_rec_list_start( while (page_cur_get_rec(&cur1) != rec) { - page_cur_delete_rec(&cur1, mtr); + page_cur_delete_rec(&cur1, index, mtr); } /* Restore log mode */ @@ -745,10 +844,11 @@ split_rec. */ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { ulint old_data_size; ulint new_data_size; @@ -758,15 +858,15 @@ page_move_rec_list_end( old_data_size = page_get_data_size(new_page); old_n_recs = page_get_n_recs(new_page); - page_copy_rec_list_end(new_page, page, split_rec, mtr); + page_copy_rec_list_end(new_page, page, split_rec, index, mtr); new_data_size = page_get_data_size(new_page); new_n_recs = page_get_n_recs(new_page); ut_ad(new_data_size >= old_data_size); - page_delete_rec_list_end(page, split_rec, new_n_recs - old_n_recs, - new_data_size - old_data_size, mtr); + page_delete_rec_list_end(page, split_rec, index, + new_n_recs - old_n_recs, new_data_size - old_data_size, mtr); } /***************************************************************** @@ -776,14 +876,15 @@ split_rec. 
*/ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - page_copy_rec_list_start(new_page, page, split_rec, mtr); + page_copy_rec_list_start(new_page, page, split_rec, index, mtr); - page_delete_rec_list_start(page, split_rec, mtr); + page_delete_rec_list_start(page, split_rec, index, mtr); } /*************************************************************************** @@ -801,7 +902,7 @@ page_rec_write_index_page_no( byte* data; ulint len; - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); ut_ad(len == 4); @@ -885,7 +986,7 @@ page_dir_add_slots( ut_ad(start < n_slots - 1); /* Update the page header */ - page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots + n); + page_dir_set_n_slots(page, n_slots + n); /* Move slots up */ @@ -1006,8 +1107,8 @@ page_dir_balance_slot( old_rec = page_dir_slot_get_rec(slot); new_rec = page_rec_get_next(old_rec); - rec_set_n_owned(old_rec, 0); - rec_set_n_owned(new_rec, n_owned + 1); + rec_set_n_owned(old_rec, page_is_comp(page), 0); + rec_set_n_owned(new_rec, page_is_comp(page), n_owned + 1); page_dir_slot_set_rec(slot, new_rec); @@ -1080,13 +1181,15 @@ page_rec_get_n_recs_before( rec_t* slot_rec; page_t* page; ulint i; + ibool comp; lint n = 0; ut_ad(page_rec_check(rec)); page = buf_frame_align(rec); - - while (rec_get_n_owned(rec) == 0) { + comp = page_is_comp(page); + + while (rec_get_n_owned(rec, comp) == 0) { rec = page_rec_get_next(rec); n--; @@ -1096,7 +1199,7 @@ page_rec_get_n_recs_before( slot = page_dir_get_nth_slot(page, i); slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned(slot_rec); + n += rec_get_n_owned(slot_rec, comp); if (rec == slot_rec) { @@ -1118,17 +1221,21 @@ the index page context. */ void page_rec_print( /*===========*/ - rec_t* rec) + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: record descriptor */ { - rec_print(stderr, rec); + ibool comp = page_is_comp(buf_frame_align(rec)); + + ut_a(comp == rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); fprintf(stderr, " n_owned: %lu; heap_no: %lu; next rec: %lu\n", - (ulong) rec_get_n_owned(rec), - (ulong) rec_get_heap_no(rec), - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_n_owned(rec, comp), + (ulong) rec_get_heap_no(rec, comp), + (ulong) rec_get_next_offs(rec, comp)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); } /******************************************************************* @@ -1176,12 +1283,18 @@ debugging purposes. 
*/ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n) /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n) /* in: print n first and n last entries */ { page_cur_t cur; ulint count; ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + + ut_a(page_is_comp(page) == index->table->comp); fprintf(stderr, "--------------------------------\n" @@ -1193,7 +1306,9 @@ page_print_list( page_cur_set_before_first(page, &cur); count = 0; for (;;) { - page_rec_print(cur.rec); + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); if (count == pr_n) { break; @@ -1213,7 +1328,9 @@ page_print_list( page_cur_move_to_next(&cur); if (count + pr_n >= n_recs) { - page_rec_print(cur.rec); + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); } count++; } @@ -1222,6 +1339,10 @@ page_print_list( "Total of %lu records \n" "--------------------------------\n", (ulong) (count + 1)); + + if (heap) { + mem_heap_free(heap); + } } /******************************************************************* @@ -1235,14 +1356,15 @@ page_header_print( fprintf(stderr, "--------------------------------\n" "PAGE HEADER INFO\n" - "Page address %p, n records %lu\n" + "Page address %p, n records %lu (%s)\n" "n dir slots %lu, heap top %lu\n" "Page n heap %lu, free %lu, garbage %lu\n" "Page last insert %lu, direction %lu, n direction %lu\n", page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), (ulong) page_header_get_field(page, PAGE_HEAP_TOP), - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) page_header_get_field(page, PAGE_FREE), (ulong) page_header_get_field(page, PAGE_GARBAGE), (ulong) page_header_get_field(page, PAGE_LAST_INSERT), @@ -1257,13 +1379,16 @@ debugging purposes. */ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn) /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn) /* in: print rn first and last records + in directory */ { page_header_print(page); page_dir_print(page, dn); - page_print_list(page, rn); + page_print_list(page, index, rn); } /******************************************************************* @@ -1274,20 +1399,24 @@ the heap_no field. 
*/ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec) /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_owned; ulint heap_no; - page_t* page; + page_t* page; + ibool comp; page = buf_frame_align(rec); + comp = page_is_comp(page); + ut_a(comp == rec_offs_comp(offsets)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); - n_owned = rec_get_n_owned(rec); - heap_no = rec_get_heap_no(rec); + n_owned = rec_get_n_owned(rec, comp); + heap_no = rec_get_heap_no(rec, comp); if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) { fprintf(stderr, @@ -1296,11 +1425,11 @@ page_rec_validate( return(FALSE); } - if (!(heap_no < page_header_get_field(page, PAGE_N_HEAP))) { + if (!(heap_no < page_dir_get_n_heap(page))) { fprintf(stderr, "InnoDB: Heap no of rec %lu too big %lu %lu\n", (ulong)(rec - page), (ulong) heap_no, - (ulong) page_header_get_field(page, PAGE_N_HEAP)); + (ulong) page_dir_get_n_heap(page)); return(FALSE); } @@ -1358,6 +1487,7 @@ page_simple_validate( ulint count; ulint own_count; ibool ret = FALSE; + ibool comp = page_is_comp(page); /* Check first that the record heap and the directory do not overlap. */ @@ -1404,13 +1534,13 @@ page_simple_validate( goto func_exit; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count, (ulong)(rec - page)); @@ -1438,11 +1568,11 @@ page_simple_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset nonsensical %lu for rec %lu\n", - (ulong) rec_get_next_offs(rec), + (ulong) rec_get_next_offs(rec, comp), (ulong)(rec - page)); goto func_exit; @@ -1461,7 +1591,7 @@ page_simple_validate( own_count++; } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); goto func_exit; @@ -1514,10 +1644,10 @@ page_simple_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) (count + 1)); goto func_exit; @@ -1549,17 +1679,24 @@ page_validate( ulint slot_no; ulint data_size; rec_t* rec; - rec_t* old_rec = NULL; + rec_t* old_rec = NULL; ulint offs; ulint n_slots; - ibool ret = FALSE; + ibool ret = FALSE; ulint i; - + ibool comp = page_is_comp(page); + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (comp != index->table->comp) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } if (!page_simple_validate(page)) { goto func_exit2; } - heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); /* The following buffer is used to check that the records in the page record heap do not overlap */ @@ -1599,22 +1736,32 @@ page_validate( for (;;) { rec = cur.rec; + offsets = rec_get_offsets(rec, index, offsets, 
+ ULINT_UNDEFINED, &heap); - if (!page_rec_validate(rec)) { + if (comp && page_rec_is_user_rec(rec) + && rec_get_node_ptr_flag(rec) + == !btr_page_get_level_low(page)) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (!page_rec_validate(rec, offsets)) { goto func_exit; } /* Check that the records are in the ascending order */ if ((count >= 2) && (!page_cur_is_after_last(&cur))) { - if (!(1 == cmp_rec_rec(rec, old_rec, index))) { + if (!(1 == cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { fprintf(stderr, "InnoDB: Records in wrong order on page %lu", (ulong) buf_frame_get_page_no(page)); dict_index_name_print(stderr, NULL, index); fputs("\nInnoDB: previous record ", stderr); - rec_print(stderr, old_rec); + rec_print_new(stderr, old_rec, old_offsets); fputs("\nInnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', stderr); goto func_exit; @@ -1624,12 +1771,12 @@ page_validate( if ((rec != page_get_supremum_rec(page)) && (rec != page_get_infimum_rec(page))) { - data_size += rec_get_size(rec); + data_size += rec_offs_size(offsets); } - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (!buf[offs + i] == 0) { /* No other record may overlap this */ @@ -1641,12 +1788,12 @@ page_validate( buf[offs + i] = 1; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count); goto func_exit; } @@ -1671,11 +1818,11 @@ page_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset wrong %lu\n", - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_next_offs(rec, comp)); goto func_exit; } @@ -1683,9 +1830,15 @@ page_validate( page_cur_move_to_next(&cur); own_count++; old_rec = rec; + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fputs("InnoDB: n owned is zero\n", stderr); goto func_exit; } @@ -1714,15 +1867,17 @@ page_validate( rec = page_header_get_ptr(page, PAGE_FREE); while (rec != NULL) { - if (!page_rec_validate(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (!page_rec_validate(rec, offsets)) { goto func_exit; } count++; - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (buf[offs + i] != 0) { fputs( @@ -1736,9 +1891,9 @@ page_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) count + 1); goto func_exit; } @@ -1775,7 +1930,7 @@ page_find_rec_with_heap_no( page_cur_set_before_first(page, &cur); for (;;) { - if 
(rec_get_heap_no(cur.rec) == heap_no) { + if (rec_get_heap_no(cur.rec, page_is_comp(page)) == heap_no) { return(cur.rec); } diff --git a/innobase/pars/pars0pars.c b/innobase/pars/pars0pars.c index 16d630dd318..c62184abd85 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -1514,8 +1514,11 @@ pars_create_table( n_cols = que_node_list_get_len(column_defs); - table = dict_mem_table_create(table_sym->name, 0, n_cols); - + /* As the InnoDB SQL parser is for internal use only, + for creating some system tables, this function will only + create tables in the old (not compact) record format. */ + table = dict_mem_table_create(table_sym->name, 0, n_cols, FALSE); + if (not_fit_in_memory != NULL) { table->does_not_fit_in_memory = TRUE; } diff --git a/innobase/que/que0que.c b/innobase/que/que0que.c index 22878dec27f..a0a6adf9b83 100644 --- a/innobase/que/que0que.c +++ b/innobase/que/que0que.c @@ -163,6 +163,7 @@ que_thr_create( thr->run_node = NULL; thr->resource = 0; + thr->lock_state = QUE_THR_LOCK_NOLOCK; UT_LIST_ADD_LAST(thrs, parent->thrs, thr); diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index cf549284acc..193bda75f24 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -51,6 +51,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields);/* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -410,6 +411,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -439,12 +441,13 @@ cmp_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields && matched_bytes); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); cur_field = *matched_fields; cur_bytes = *matched_bytes; ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(cur_field <= rec_get_n_fields(rec)); + ut_ad(cur_field <= rec_offs_n_fields(offsets)); /* Match fields in a loop; stop if we run out of fields in dtuple or find an externally stored field */ @@ -456,7 +459,8 @@ cmp_dtuple_rec_with_match( dtuple_f_len = dfield_get_len(dtuple_field); - rec_b_ptr = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_b_ptr = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); /* If we have matched yet 0 bytes, it may be that one or both the fields are SQL null, or the record or dtuple may be @@ -466,7 +470,8 @@ cmp_dtuple_rec_with_match( if (cur_bytes == 0) { if (cur_field == 0) { - if (rec_get_info_bits(rec) + if (rec_get_info_bits(rec, + rec_offs_comp(offsets)) & REC_INFO_MIN_REC_FLAG) { if (dtuple_get_info_bits(dtuple) @@ -488,7 +493,7 @@ cmp_dtuple_rec_with_match( } } - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ @@ -619,7 +624,7 @@ cmp_dtuple_rec_with_match( up to the common fields */ order_resolved: ut_ad((ret >= - 1) && (ret <= 1)); - ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, + ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields)); ut_ad(*matched_fields == cur_field); /* In the debug 
version, the above cmp_debug_... sets @@ -640,13 +645,15 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint matched_fields = 0; ulint matched_bytes = 0; - return(cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes)); } /****************************************************************** @@ -657,22 +664,24 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + dtuple_t* dtuple, /* in: data tuple */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; ulint matched_fields = 0; ulint matched_bytes = 0; + ut_ad(rec_offs_validate(rec, NULL, offsets)); n_fields = dtuple_get_n_fields(dtuple); - if (n_fields > rec_get_n_fields(rec)) { + if (n_fields > rec_offs_n_fields(offsets)) { return(FALSE); } - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); + cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields == n_fields) { return(TRUE); @@ -687,42 +696,6 @@ cmp_dtuple_is_prefix_of_rec( return(FALSE); } -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec, /* in: physical record */ - ulint n_fields) /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ -{ - ulint matched_fields = 0; - ulint matched_bytes = 0; - - ut_ad(n_fields <= dtuple_get_n_fields(dtuple)); - - if (rec_get_n_fields(rec) < n_fields) { - - return(FALSE); - } - - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); - if (matched_fields >= n_fields) { - - return(TRUE); - } - - return(FALSE); -} - /***************************************************************** This function is used to compare two physical records. 
Only the common first fields are compared, and if an externally stored field is @@ -736,6 +709,8 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, @@ -762,17 +737,21 @@ cmp_rec_rec_with_match( ulint cur_bytes; /* number of already matched bytes in current field */ int ret = 3333; /* return value */ + ibool comp; ut_ad(rec1 && rec2 && index); + ut_ad(rec_offs_validate(rec1, index, offsets1)); + ut_ad(rec_offs_validate(rec2, index, offsets2)); + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); - rec1_n_fields = rec_get_n_fields(rec1); - rec2_n_fields = rec_get_n_fields(rec2); + comp = rec_offs_comp(offsets1); + rec1_n_fields = rec_offs_n_fields(offsets1); + rec2_n_fields = rec_offs_n_fields(offsets2); cur_field = *matched_fields; cur_bytes = *matched_bytes; - /* Match fields in a loop; stop if we run out of fields in either - record */ + /* Match fields in a loop */ while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) { @@ -784,17 +763,19 @@ cmp_rec_rec_with_match( dict_index_get_nth_field(index, cur_field))); } - rec1_b_ptr = rec_get_nth_field(rec1, cur_field, &rec1_f_len); - rec2_b_ptr = rec_get_nth_field(rec2, cur_field, &rec2_f_len); - + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + if (cur_bytes == 0) { if (cur_field == 0) { /* Test if rec is the predefined minimum record */ - if (rec_get_info_bits(rec1) + if (rec_get_info_bits(rec1, comp) & REC_INFO_MIN_REC_FLAG) { - if (rec_get_info_bits(rec2) + if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 0; } else { @@ -803,7 +784,7 @@ cmp_rec_rec_with_match( goto order_resolved; - } else if (rec_get_info_bits(rec2) + } else if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 1; @@ -812,8 +793,8 @@ cmp_rec_rec_with_match( } } - if (rec_get_nth_field_extern_bit(rec1, cur_field) - || rec_get_nth_field_extern_bit(rec2, cur_field)) { + if (rec_offs_nth_extern(offsets1, cur_field) + || rec_offs_nth_extern(offsets2, cur_field)) { /* We do not compare to an externally stored field */ @@ -968,6 +949,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields) /* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -987,14 +969,16 @@ cmp_debug_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(*matched_fields <= rec_get_n_fields(rec)); + ut_ad(*matched_fields <= rec_offs_n_fields(offsets)); cur_field = *matched_fields; if (cur_field == 0) { - if (rec_get_info_bits(rec) & REC_INFO_MIN_REC_FLAG) { + if (rec_get_info_bits(rec, rec_offs_comp(offsets)) + & REC_INFO_MIN_REC_FLAG) { if (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG) { @@ -1024,9 +1008,10 @@ cmp_debug_dtuple_rec_with_match( dtuple_f_data = dfield_get_data(dtuple_field); dtuple_f_len = 
dfield_get_len(dtuple_field); - rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_f_data = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ ret = 0; diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index 1db89241dff..30f98f457ea 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -15,8 +15,8 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" -/* PHYSICAL RECORD - =============== +/* PHYSICAL RECORD (OLD STYLE) + =========================== The physical record, which is the data type of all the records found in index pages of the database, has the following format @@ -39,7 +39,7 @@ represented on a higher text line): | 10 bits giving the number of fields in this record | | 1 bit which is set to 1 if the offsets above are given in one byte format, 0 if in two byte format | -| two bytes giving the pointer to the next record in the page | +| two bytes giving an absolute pointer to the next record in the page | ORIGIN of the record | first field of data | ... @@ -55,9 +55,50 @@ The offsets of the data fields are given as one-byte (if there are less than 127 bytes of data in the record) or two-byte unsigned integers. The most significant bit is not part of the offset, instead it indicates the SQL-null -if the bit is set to 1. +if the bit is set to 1. */ -CANONICAL COORDINATES. A record can be seen as a single +/* PHYSICAL RECORD (NEW STYLE) + =========================== + +The physical record, which is the data type of all the records +found in index pages of the database, has the following format +(lower addresses and more significant bits inside a byte are below +represented on a higher text line): + +| length of the last non-null variable-length field of data: + if the maximum length is 255, one byte; otherwise, + 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes, + length=128..16383, extern storage flag) | +... +| length of first variable-length field of data | +| SQL-null flags (1 bit per nullable field), padded to full bytes | +| 4 bits used to delete mark a record, and mark a predefined + minimum record in alphabetical order | +| 4 bits giving the number of records owned by this record + (this term is explained in page0page.h) | +| 13 bits giving the order number of this record in the + heap of the index page | +| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree), + 010=infimum, 011=supremum, 1xx=reserved | +| two bytes giving a relative pointer to the next record in the page | +ORIGIN of the record +| first field of data | +... +| last field of data | + +The origin of the record is the start address of the first field +of data. The offsets are given relative to the origin. +The offsets of the data fields are stored in an inverted +order because then the offset of the first fields are near the +origin, giving maybe a better processor cache hit rate in searches. + +The offsets of the data fields are given as one-byte +(if there are less than 127 bytes of data in the record) +or two-byte unsigned integers. The most significant bit +is not part of the offset, instead it indicates the SQL-null +if the bit is set to 1. */ + +/* CANONICAL COORDINATES. A record can be seen as a single string of 'characters' in the following way: catenate the bytes in each field, in the order of fields. 
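As a concrete illustration of the new-style header described above, take a hypothetical index on three columns c1 CHAR(4) NOT NULL, c2 VARCHAR(10) and c3 VARCHAR(10) (both nullable), holding the record ('abcd', 'hi', NULL). Counting from the origin towards lower addresses, the record needs

	2 bytes		relative pointer to the next record
	3 bytes		info bits, number of records owned, heap order number
			and record type (4 + 4 + 13 + 3 bits, rounded up)
	1 byte		SQL-null flags for the two nullable columns
	1 byte		length of c2 ('hi' is 2 bytes; no length is stored for
			the fixed-length c1 or for the NULL c3)

that is, 7 bytes of extra data before the origin, plus 4 + 2 = 6 bytes of field
data after it, 13 bytes in all.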
An SQL-null field is taken to be an empty sequence of bytes. Then after @@ -86,13 +127,237 @@ the corresponding canonical strings have the same property. */ ulint rec_dummy; /* this is used to fool compiler in rec_validate */ +/******************************************************************* +Validates the consistency of an old-style physical record. */ +static +ibool +rec_validate_old( +/*=============*/ + /* out: TRUE if ok */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function determines the offsets to each field in the +record. The offsets are written to a previously allocated array of +ulint, where rec_offs_n_fields(offsets) has been initialized to the +number of fields in the record. The rest of the array will be +initialized by this function. rec_offs_base(offsets)[0] will be set +to the extra size (if REC_OFFS_COMPACT is set, the record is in the +new format), and rec_offs_base(offsets)[1..n_fields] will be set to +offsets past the end of fields 0..n_fields, or to the beginning of +fields 1..n_fields+1. When the high-order bit of the offset at [i+1] +is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second +high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the +field i is being stored externally. */ +static +void +rec_init_offsets( +/*=============*/ + /* out: the offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint n_fields = rec_offs_n_fields(offsets); + ulint i = 0; + ulint offs; + + rec_offs_make_valid(rec, index, offsets); + + if (index->table->comp) { + const byte* nulls; + const byte* lens; + dict_field_t* field; + dtype_t* type; + ulint null_mask; + ulint status = rec_get_status(rec); + ulint n_node_ptr_field = ULINT_UNDEFINED; + + switch (status) { + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* the field is 8 bytes long */ + rec_offs_base(offsets)[0] = + REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT; + rec_offs_base(offsets)[1] = 8; + return; + case REC_STATUS_NODE_PTR: + n_node_ptr_field = + dict_index_get_n_unique_in_tree(index); + break; + case REC_STATUS_ORDINARY: + break; + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + for (; i < n_fields; i++) { + ibool is_null = FALSE, is_external = FALSE; + ulint len; + if (i == n_node_ptr_field) { + len = 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + is_null = (*nulls & null_mask) != 0; + null_mask <<= 1; + if (null_mask == 0x100) { + nulls--; + null_mask = 1; + } + } + + if (is_null) { + /* No length is stored for NULL fields. 
*/ + len = 0; + } else if (!field->fixed_len) { + /* Variable-length field: read the length */ + len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + is_external = !!(len & 0x40); + len &= 0x3f; + len <<= 8; + len |= *lens--; + } + } + } else { + len = field->fixed_len; + } + resolved: + offs += len; + len = offs; + if (is_external) { + len |= REC_OFFS_EXTERNAL; + } + if (is_null) { + len |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[i + 1] = len; + } + + *rec_offs_base(offsets) = + (rec - (lens + 1)) | REC_OFFS_COMPACT; + } else { + /* Old-style record: determine extra size and end offsets */ + offs = REC_N_OLD_EXTRA_BYTES; + if (rec_get_1byte_offs_flag(rec)) { + offs += n_fields; + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + for (; i < n_fields; i++) { + offs = rec_1_get_field_end_info(rec, i); + if (offs & REC_1BYTE_SQL_NULL_MASK) { + offs &= ~REC_1BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[1 + i] = offs; + } + } else { + offs += 2 * n_fields; + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + for (; i < n_fields; i++) { + offs = rec_2_get_field_end_info(rec, i); + if (offs & REC_2BYTE_SQL_NULL_MASK) { + offs &= ~REC_2BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + if (offs & REC_2BYTE_EXTERN_MASK) { + offs &= ~REC_2BYTE_EXTERN_MASK; + offs |= REC_OFFS_EXTERNAL; + } + rec_offs_base(offsets)[1 + i] = offs; + } + } + } +} + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously returned array. */ + +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array consisting of offsets[0] + allocated elements, or an array from + rec_get_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line) /* in: line number where called */ +{ + ulint n; + ulint size; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (index->table->comp) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (n_fields < n) { + n = n_fields; + } + + size = n + (1 + REC_OFFS_HEADER_SIZE); + + if (!offsets || rec_offs_get_n_alloc(offsets) < size) { + if (!*heap) { + *heap = mem_heap_create_func(size * sizeof(ulint), + NULL, MEM_HEAP_DYNAMIC, file, line); + } + offsets = mem_heap_alloc(*heap, size * sizeof(ulint)); + rec_offs_set_n_alloc(offsets, size); + } + + rec_offs_set_n_fields(offsets, n); + rec_init_offsets(rec, index, offsets); + return(offsets); +} + /**************************************************************** -The following function is used to get a pointer to the nth data field in a -record. */ +The following function is used to get a pointer to the nth +data field in an old-style record. 
*/ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ @@ -103,9 +368,9 @@ rec_get_nth_field( ulint next_os; ut_ad(rec && len); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - if (n > 1024) { + if (n > REC_MAX_N_FIELDS) { fprintf(stderr, "Error: trying to access field %lu in rec\n", (ulong) n); ut_error; @@ -150,8 +415,78 @@ rec_get_nth_field( return(rec + os); } +/************************************************************** +The following function returns the size of a data tuple when converted to +a new-style physical record. */ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + ulint size = REC_N_NEW_EXTRA_BYTES + + (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ut_ad(index && dtuple); + ut_ad(index->table->comp); + + switch (dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) { + case REC_STATUS_ORDINARY: + n_fields = dict_index_get_n_fields(index); + ut_ad(n_fields == dtuple_get_n_fields(dtuple)); + break; + case REC_STATUS_NODE_PTR: + n_fields = dict_index_get_n_unique_in_tree(index); + ut_ad(n_fields + 1 == dtuple_get_n_fields(dtuple)); + ut_ad(dtuple_get_nth_field(dtuple, n_fields)->len == 4); + size += 4; /* child page number */ + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record, 8 bytes */ + return(size + 8); /* no extra data needed */ + default: + ut_a(0); + return(ULINT_UNDEFINED); + } + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + ulint len = dtuple_get_nth_field(dtuple, i)->len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + ut_ad(len != UNIV_SQL_NULL || + !(dtype_get_prtype(type) & DATA_NOT_NULL)); + + if (len == UNIV_SQL_NULL) { + /* No length is stored for NULL fields. */ + continue; + } + + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + ut_ad(!field->fixed_len || len == field->fixed_len); + + if (field->fixed_len) { + } else if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + size++; + } else { + size += 2; + } + size += len; + } + + return(size); +} + /*************************************************************** -Sets the value of the ith field SQL null bit. */ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -189,12 +524,12 @@ rec_set_nth_field_null_bit( } /*************************************************************** -Sets the value of the ith field extern storage bit. */ +Sets the value of the ith field extern storage bit of an old-style record. 
*/ void -rec_set_nth_field_extern_bit( -/*=========================*/ - rec_t* rec, /* in: record */ +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ ulint i, /* in: ith field */ ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where @@ -204,7 +539,7 @@ rec_set_nth_field_extern_bit( ulint info; ut_a(!rec_get_1byte_offs_flag(rec)); - ut_a(i < rec_get_n_fields(rec)); + ut_a(i < rec_get_n_fields_old(rec)); info = rec_2_get_field_end_info(rec, i); @@ -215,36 +550,140 @@ rec_set_nth_field_extern_bit( } if (mtr) { - mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), info, - MLOG_2BYTES, mtr); + mlog_write_ulint(rec - REC_N_OLD_EXTRA_BYTES - 2 * (i + 1), + info, MLOG_2BYTES, mtr); } else { rec_2_set_field_end_info(rec, i, info); } } /*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. */ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* lens = nulls - (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ulint null_mask = 1; + ut_ad(rec && index); + ut_ad(index->table->comp); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + n_fields = dict_index_get_n_fields(index); + + ut_ad(ith < n_fields); + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + ibool is_null; + ulint len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + is_null = !(dtype_get_prtype(type) & DATA_NOT_NULL); + if (is_null) { + /* nullable field => read the null flag */ + is_null = !!(*nulls & null_mask); + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + } + if (is_null || field->fixed_len) { + /* No length (or extern bit) is stored for + fields that are NULL or fixed-length. */ + ut_ad(i != ith); + continue; + } + len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { /* 1exxxxxx: 2-byte length */ + if (i == ith) { + if (!val == !(len & 0x20)) { + return; /* no change */ + } + /* toggle the extern bit */ + len ^= 0x40; + if (mtr) { + mlog_write_ulint(lens + 1, len, + MLOG_1BYTE, mtr); + } else { + lens[1] = len; + } + return; + } + lens--; + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } +} + +/*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. 
*/ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr) /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr) /* in: mtr holding an X-latch to the + page where rec is, or NULL; + in the NULL case we do not write + to log about the change */ { ulint i; for (i = 0; i < n_fields; i++) { - rec_set_nth_field_extern_bit(rec, vec[i], TRUE, mtr); + rec_set_nth_field_extern_bit(rec, index, vec[i], TRUE, mtr); + } +} + +/************************************************************** +Returns the total size of a physical record. */ + +ulint +rec_get_size( +/*=========*/ + /* out: size */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + mem_heap_t* heap = NULL; + ulint offsets_[100 + REC_OFFS_HEADER_SIZE] + = { 100, }; + ulint* offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ulint size = rec_offs_size(offsets); + + if (heap) { + mem_heap_free(heap); } + return(size); } /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ void rec_set_nth_field_sql_null( @@ -262,20 +701,20 @@ rec_set_nth_field_sql_null( } /************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. */ - -rec_t* -rec_convert_dtuple_to_rec_low( +Builds an old-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
*/ +static +rec_t* +rec_convert_dtuple_to_rec_old( /*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size) /* in: data size of dtuple */ + /* out: pointer to the origin of + physical record */ + byte* buf, /* in: start address of the physical record */ + dtuple_t* dtuple)/* in: data tuple */ { dfield_t* field; ulint n_fields; + ulint data_size; rec_t* rec; ulint end_offset; ulint ored_offset; @@ -283,24 +722,25 @@ rec_convert_dtuple_to_rec_low( ulint len; ulint i; - ut_ad(destination && dtuple); + ut_ad(buf && dtuple); ut_ad(dtuple_validate(dtuple)); ut_ad(dtuple_check_typed(dtuple)); - ut_ad(dtuple_get_data_size(dtuple) == data_size); n_fields = dtuple_get_n_fields(dtuple); + data_size = dtuple_get_data_size(dtuple); ut_ad(n_fields > 0); /* Calculate the offset of the origin in the physical record */ - rec = destination + rec_get_converted_extra_size(data_size, n_fields); + rec = buf + rec_get_converted_extra_size(data_size, n_fields); /* Store the number of fields */ - rec_set_n_fields(rec, n_fields); + rec_set_n_fields_old(rec, n_fields); /* Set the info bits of the record */ - rec_set_info_bits(rec, dtuple_get_info_bits(dtuple)); + rec_set_info_bits(rec, FALSE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); /* Store the data and the offsets */ @@ -361,8 +801,196 @@ rec_convert_dtuple_to_rec_low( } } - ut_ad(rec_validate(rec)); + return(rec); +} + +/************************************************************* +Builds a new-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +static +rec_t* +rec_convert_dtuple_to_rec_new( +/*==========================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + dfield_t* field; + dtype_t* type; + rec_t* rec = buf + REC_N_NEW_EXTRA_BYTES; + byte* end; + byte* nulls; + byte* lens; + ulint len; + ulint i; + ulint fixed_len; + ulint null_mask = 1; + const ulint n_fields = dtuple_get_n_fields(dtuple); + const ulint status = dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK; + ut_ad(index->table->comp); + + ut_ad(n_fields > 0); + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + goto init; + default: + ut_a(0); + return(0); + } + + /* Calculate the offset of the origin in the physical record. + We must loop over all fields to do this. 
*/ + rec += (index->n_nullable + 7) / 8; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + if (status == REC_STATUS_NODE_PTR && i == n_fields - 1) { + fixed_len = 4; + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + continue; + } + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + if (len == UNIV_SQL_NULL) + continue; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + rec++; + if (len >= 128 && (dtype_get_len(type) >= 256 + || dtype_get_mtype(type) == DATA_BLOB)) { + rec++; + } + } + } + +init: + end = rec; + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + /* clear the SQL-null flags */ + memset (lens + 1, 0, nulls - lens); + + /* Set the info bits of the record */ + rec_set_status(rec, status); + + rec_set_info_bits(rec, TRUE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); + + /* Store the data and the offsets */ + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + + if (status == REC_STATUS_NODE_PTR && i == n_fields - 1) { + fixed_len = 4; + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + goto copy; + } + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field */ + ut_ad(index->n_nullable > 0); + ut_ad(*nulls < null_mask); + /* set the null flag if necessary */ + if (len == UNIV_SQL_NULL) { + *nulls |= null_mask; + } + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + if (len == UNIV_SQL_NULL) + continue; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + *lens-- = len; + } + else { + /* the extern bits will be set later */ + ut_ad(len < 16384); + *lens-- = len >> 8 | 0x80; + *lens-- = len; + } + } + copy: + memcpy(end, dfield_get_data(field), len); + end += len; + } + + return(rec); +} + +/************************************************************* +Builds a physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
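A minimal usage sketch (not part of the patch), assuming a compact-format table, a typed data tuple tuple and a caller-owned memory heap heap; the buffer is sized with rec_get_converted_size_new() defined above:

	byte*	buf;
	rec_t*	rec;
	ulint	size;

	ut_ad(index->table->comp);

	size = rec_get_converted_size_new(index, tuple);
	buf = mem_heap_alloc(heap, size);

	/* rec points to the record origin inside buf; the converted
	record occupies exactly size bytes of the buffer. */
	rec = rec_convert_dtuple_to_rec(buf, index, tuple);
	ut_ad(rec_get_size(rec, index) == size);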
*/ +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + rec_t* rec; + + ut_ad(buf && index && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + if (index->table->comp) { + rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple); + } else { + rec = rec_convert_dtuple_to_rec_old(buf, dtuple); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[100 + REC_OFFS_HEADER_SIZE] + = { 100, }; + const ulint* offsets = rec_get_offsets(rec, index, + offsets_, ULINT_UNDEFINED, &heap); + ut_ad(rec_validate(rec, offsets)); + if (heap) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ return(rec); } @@ -375,6 +1003,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap) /* in: memory heap */ { @@ -383,16 +1012,22 @@ rec_copy_prefix_to_dtuple( ulint len; byte* buf = NULL; ulint i; - - ut_ad(rec_validate(rec)); + ulint offsets_[100 + REC_OFFS_HEADER_SIZE] + = { 100, }; + ulint* offsets = offsets_; + + offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap); + + ut_ad(rec_validate(rec, offsets)); ut_ad(dtuple_check_typed(tuple)); - dtuple_set_info_bits(tuple, rec_get_info_bits(rec)); + dtuple_set_info_bits(tuple, + rec_get_info_bits(rec, index->table->comp)); for (i = 0; i < n_fields; i++) { field = dtuple_get_nth_field(tuple, i); - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { buf = mem_heap_alloc(heap, len); @@ -405,32 +1040,28 @@ rec_copy_prefix_to_dtuple( } /****************************************************************** -Copies the first n fields of a physical record to a new physical record in -a buffer. */ - +Copies the first n fields of an old-style physical record +to a new physical record in a buffer. */ +static rec_t* -rec_copy_prefix_to_buf( -/*===================*/ +rec_copy_prefix_to_buf_old( +/*=======================*/ /* out, own: copied record */ rec_t* rec, /* in: physical record */ ulint n_fields, /* in: number of fields to copy */ + ulint area_end, /* in: end of the prefix data */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size) /* in/out: buffer size */ { rec_t* copy_rec; ulint area_start; - ulint area_end; ulint prefix_len; - ut_ad(rec_validate(rec)); - - area_end = rec_get_field_start_offs(rec, n_fields); - if (rec_get_1byte_offs_flag(rec)) { - area_start = REC_N_EXTRA_BYTES + n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + n_fields; } else { - area_start = REC_N_EXTRA_BYTES + 2 * n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields; } prefix_len = area_start + area_end; @@ -448,17 +1079,114 @@ rec_copy_prefix_to_buf( copy_rec = *buf + area_start; - rec_set_n_fields(copy_rec, n_fields); + rec_set_n_fields_old(copy_rec, n_fields); return(copy_rec); } -/******************************************************************* -Validates the consistency of a physical record. */ +/****************************************************************** +Copies the first n fields of a physical record to a new physical record in +a buffer. 
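A small sketch (not part of the patch) of the intended calling pattern; rec and index are assumed valid, and the prefix buffer belongs to the caller while the function allocates or grows it as needed:

	byte*	buf	 = NULL;
	ulint	buf_size = 0;
	rec_t*	copy;

	/* Copy the first two fields of rec; *buf is allocated with
	mem_alloc() when it is NULL or too small. */
	copy = rec_copy_prefix_to_buf(rec, index, 2, &buf, &buf_size);

	/* ... use copy, then release the caller-owned buffer ... */
	mem_free(buf);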
*/ +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size) /* in/out: buffer size */ +{ + byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* lens = nulls - (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint prefix_len = 0; + ibool is_null; + ulint null_mask = 1; + ulint status; + + if (!index->table->comp) { + ut_ad(rec_validate_old(rec)); + return(rec_copy_prefix_to_buf_old(rec, n_fields, + rec_get_field_start_offs(rec, n_fields), + buf, buf_size)); + } + + status = rec_get_status(rec); + + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record: no sense to copy anything */ + default: + ut_a(0); + return(NULL); + } + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + is_null = !(dtype_get_prtype(type) & DATA_NOT_NULL); + if (is_null) { + /* nullable field => read the null flag */ + is_null = !!(*nulls & null_mask); + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + } + + if (is_null) { + } else if (field->fixed_len) { + prefix_len += field->fixed_len; + } else { + ulint len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + } + } + prefix_len += len; + } + } + + prefix_len += rec - (lens + 1); + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = mem_alloc(prefix_len); + *buf_size = prefix_len; + } + + memcpy(*buf, lens + 1, prefix_len); + + return(*buf + (rec - (lens + 1))); +} + +/******************************************************************* +Validates the consistency of an old-style physical record. */ +static ibool -rec_validate( -/*=========*/ +rec_validate_old( +/*=============*/ /* out: TRUE if ok */ rec_t* rec) /* in: physical record */ { @@ -470,7 +1198,7 @@ rec_validate( ulint i; ut_a(rec); - n_fields = rec_get_n_fields(rec); + n_fields = rec_get_n_fields_old(rec); if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { fprintf(stderr, "InnoDB: Error: record has %lu fields\n", @@ -479,7 +1207,7 @@ rec_validate( } for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { fprintf(stderr, @@ -499,45 +1227,167 @@ rec_validate( } } - if (len_sum != (ulint)(rec_get_end(rec) - rec)) { + if (len_sum != rec_get_data_size_old(rec)) { fprintf(stderr, "InnoDB: Error: record len should be %lu, len %lu\n", (ulong) len_sum, - (ulong) (rec_get_end(rec) - rec)); + rec_get_data_size_old(rec)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + return(TRUE); +} + +/******************************************************************* +Validates the consistency of a physical record. 
*/ + +ibool +rec_validate( +/*=========*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_offs_n_fields(offsets); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != (ulint)(rec_get_end(rec, offsets) - rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + (ulong) (rec_get_end(rec, offsets) - rec)); return(FALSE); } rec_dummy = sum; /* This is here only to fool the compiler */ + if (!rec_offs_comp(offsets)) { + ut_a(rec_validate_old(rec)); + } + return(TRUE); } /******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec) /* in: physical record */ +{ + const byte* data; + ulint len; + ulint n; + ulint i; + + ut_ad(rec); + + n = rec_get_n_fields_old(rec); + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " %u-byte offsets; info bits %lu\n", + (ulong) n, + rec_get_1byte_offs_flag(rec) ? 1 : 2, + (ulong) rec_get_info_bits(rec, FALSE)); + + for (i = 0; i < n; i++) { + + data = rec_get_nth_field_old(rec, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fputs("...(truncated)", file); + } + } else { + fprintf(file, " SQL NULL, size %lu ", + rec_get_nth_field_size(rec, i)); + } + putc(';', file); + } + + putc('\n', file); + + rec_validate_old(rec); +} + +/******************************************************************* Prints a physical record. */ void -rec_print( -/*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec) /* in: physical record */ +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - byte* data; - ulint len; - ulint n; - ulint i; + const byte* data; + ulint len; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_comp(offsets)) { + rec_print_old(file, rec); + return; + } ut_ad(rec); - - n = rec_get_n_fields(rec); fprintf(file, "PHYSICAL RECORD: n_fields %lu;" - " 1-byte offs %s; info bits %lu\n", - (ulong) n, rec_get_1byte_offs_flag(rec) ? 
"TRUE" : "FALSE", - (ulong) rec_get_info_bits(rec)); + " compact format; info bits %lu\n", + (ulong) rec_offs_n_fields(offsets), + (ulong) rec_get_info_bits(rec, TRUE)); - for (i = 0; i < n; i++) { + for (i = 0; i < rec_offs_n_fields(offsets); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); fprintf(file, " %lu:", (ulong) i); @@ -551,14 +1401,39 @@ rec_print( fputs("...(truncated)", file); } } else { - fprintf(file, " SQL NULL, size %lu ", - (ulong) rec_get_nth_field_size(rec, i)); - + fputs(" SQL NULL", file); } putc(';', file); } putc('\n', file); - rec_validate(rec); + rec_validate(rec, offsets); +} + +/******************************************************************* +Prints a physical record. */ + +void +rec_print( +/*======*/ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(index); + + if (!index->table->comp) { + rec_print_old(file, rec); + return; + } else { + mem_heap_t* heap = NULL; + ulint offsets_[100 + REC_OFFS_HEADER_SIZE] + = { 100, }; + rec_print_new(file, rec, rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap)); + if (heap) { + mem_heap_free(heap); + } + } } diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index f8a98f74c09..4650db7abad 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -251,7 +251,7 @@ row_ins_sec_index_entry_by_modify( rec = btr_cur_get_rec(cursor); ut_ad((cursor->index->type & DICT_CLUSTERED) == 0); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); /* We know that in the alphabetical ordering, entry and rec are identified. But in their binary form there may be differences if @@ -316,7 +316,7 @@ row_ins_clust_index_entry_by_modify( rec = btr_cur_get_rec(cursor); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); heap = mem_heap_create(1024); @@ -473,6 +473,8 @@ row_ins_cascade_calc_update_vec( if (parent_ufield->field_no == parent_field_no) { + ulint fixed_size; + /* A field in the parent index record is updated. Let us make the update vector field for the child table. 
*/ @@ -512,22 +514,22 @@ row_ins_cascade_calc_update_vec( need to pad with spaces the new value of the child column */ - if (dtype_is_fixed_size(type) + fixed_size = dtype_get_fixed_size(type); + + if (fixed_size && ufield->new_val.len != UNIV_SQL_NULL - && ufield->new_val.len - < dtype_get_fixed_size(type)) { + && ufield->new_val.len < fixed_size) { ufield->new_val.data = mem_heap_alloc(heap, - dtype_get_fixed_size(type)); - ufield->new_val.len = - dtype_get_fixed_size(type); + fixed_size); + ufield->new_val.len = fixed_size; ut_a(dtype_get_pad_char(type) != ULINT_UNDEFINED); memset(ufield->new_val.data, (byte)dtype_get_pad_char(type), - dtype_get_fixed_size(type)); + fixed_size); ut_memcpy(ufield->new_val.data, parent_ufield->new_val.data, parent_ufield->new_val.len); @@ -589,7 +591,7 @@ row_ins_foreign_report_err( ut_print_name(ef, trx, foreign->foreign_index->name); if (rec) { fputs(", there is a record:\n", ef); - rec_print(ef, rec); + rec_print(ef, rec, foreign->foreign_index); } else { fputs(", the record is not available\n", ef); } @@ -644,7 +646,7 @@ row_ins_foreign_report_add_err( } if (rec) { - rec_print(ef, rec); + rec_print(ef, rec, foreign->foreign_index); } putc('\n', ef); @@ -706,7 +708,6 @@ row_ins_foreign_check_on_constraint( dict_index_t* index; dict_index_t* clust_index; dtuple_t* ref; - mem_heap_t* tmp_heap; mem_heap_t* upd_vec_heap = NULL; rec_t* rec; rec_t* clust_rec; @@ -715,14 +716,17 @@ row_ins_foreign_check_on_constraint( ulint err; ulint i; trx_t* trx; + mem_heap_t* tmp_heap = NULL; - ut_a(thr && foreign && pcur && mtr); trx = thr_get_trx(thr); /* Since we are going to delete or update a row, we have to invalidate - the MySQL query cache for table */ + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the kernel mutex. The query cache mutex has + a rank just above the kernel mutex. 
*/ row_ins_invalidate_query_cache(thr, table->name); @@ -816,7 +820,7 @@ row_ins_foreign_check_on_constraint( err = DB_ROW_IS_REFERENCED; row_ins_foreign_report_err( -(char*)"Trying a too deep cascaded delete or update\n", +"Trying a too deep cascaded delete or update\n", thr, foreign, btr_pcur_get_rec(pcur), entry); goto nonstandard_exit_func; @@ -848,8 +852,6 @@ row_ins_foreign_check_on_constraint( PAGE_CUR_LE, BTR_SEARCH_LEAF, cascade->pcur, 0, mtr); - mem_heap_free(tmp_heap); - clust_rec = btr_pcur_get_rec(cascade->pcur); if (!page_rec_is_user_rec(clust_rec) @@ -863,10 +865,10 @@ row_ins_foreign_check_on_constraint( fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, index); fputs("\n" "InnoDB: clustered record ", stderr); - rec_print(stderr, clust_rec); + rec_print(stderr, clust_rec, clust_index); fputs("\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); @@ -884,9 +886,9 @@ row_ins_foreign_check_on_constraint( /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; we already have a normal shared lock on the appropriate gap if the search criterion was not unique */ - - err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); + + err = lock_clust_rec_read_check_and_lock_alt(0, clust_rec, + clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); } if (err != DB_SUCCESS) { @@ -894,7 +896,7 @@ row_ins_foreign_check_on_constraint( goto nonstandard_exit_func; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, table->comp)) { /* This can happen if there is a circular reference of rows such that cascading delete comes to delete a row already in the process of being delete marked */ @@ -1003,6 +1005,10 @@ row_ins_foreign_check_on_constraint( btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + if (upd_vec_heap) { mem_heap_free(upd_vec_heap); } @@ -1010,6 +1016,9 @@ row_ins_foreign_check_on_constraint( return(err); nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } if (upd_vec_heap) { mem_heap_free(upd_vec_heap); @@ -1037,16 +1046,19 @@ row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } return(err); @@ -1064,16 +1076,19 @@ row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } 
return(err); @@ -1113,7 +1128,10 @@ row_ins_check_foreign_constraint( ulint err; ulint i; mtr_t mtr; - trx_t* trx = thr_get_trx(thr); + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; run_again: #ifdef UNIV_SYNC_DEBUG @@ -1125,8 +1143,7 @@ run_again: if (trx->check_foreigns == FALSE) { /* The user has suppressed foreign key checks currently for this session */ - - return(DB_SUCCESS); + goto exit_func; } /* If any of the foreign key fields in entry is SQL NULL, we @@ -1137,7 +1154,7 @@ run_again: if (UNIV_SQL_NULL == dfield_get_len( dtuple_get_nth_field(entry, i))) { - return(DB_SUCCESS); + goto exit_func; } } @@ -1160,8 +1177,8 @@ run_again: with each foreign key constraint, one after another, and the user has problems predicting in which order they are performed. */ - - return(DB_SUCCESS); + + goto exit_func; } } @@ -1195,10 +1212,10 @@ run_again: fputs("\nor its .ibd file does not currently exist!\n", ef); mutex_exit(&dict_foreign_err_mutex); - return(DB_NO_REFERENCED_ROW); + err = DB_NO_REFERENCED_ROW; } - return(DB_SUCCESS); + goto exit_func; } ut_a(check_table && check_index); @@ -1244,10 +1261,13 @@ run_again: goto next_rec; } + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, - check_index, thr); + check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1256,29 +1276,30 @@ run_again: goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, - rec, check_index, thr); + LOCK_ORDINARY, rec, + check_index, offsets, thr); if (err != DB_SUCCESS) { break; } } else { /* Found a matching record */ + ulint lock_type; if (unique_search) { - err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP, - rec, check_index, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, - rec, check_index, thr); + lock_type = LOCK_ORDINARY; } + + err = row_ins_set_shared_rec_lock(lock_type, + rec, check_index, offsets, thr); if (err != DB_SUCCESS) { @@ -1315,7 +1336,7 @@ run_again: if (cmp < 0) { err = row_ins_set_shared_rec_lock(LOCK_GAP, - rec, check_index, thr); + rec, check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1373,6 +1394,10 @@ do_possible_lock_wait: err = trx->error_state; } +exit_func: + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -1470,19 +1495,23 @@ row_ins_dupl_error_with_rec( that the caller already has a record lock on the record! 
*/ dtuple_t* entry, /* in: entry to insert */ - dict_index_t* index) /* in: index */ + dict_index_t* index, /* in: index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint matched_fields; ulint matched_bytes; ulint n_unique; ulint i; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + n_unique = dict_index_get_n_unique(index); matched_fields = 0; matched_bytes = 0; - cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields < n_unique) { @@ -1503,7 +1532,7 @@ row_ins_dupl_error_with_rec( } } - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { return(TRUE); } @@ -1535,7 +1564,10 @@ row_ins_scan_sec_index_for_duplicate( ibool moved; mtr_t mtr; trx_t* trx; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + n_unique = dict_index_get_n_unique(index); /* If the secondary index is unique, but one of the fields in the @@ -1575,6 +1607,9 @@ row_ins_scan_sec_index_for_duplicate( trx = thr_get_trx(thr); ut_ad(trx); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (innobase_query_is_replace()) { /* The manual defines the REPLACE semantics that it @@ -1582,12 +1617,12 @@ row_ins_scan_sec_index_for_duplicate( + INSERT. Therefore, we should take X-lock for duplicates */ - err = row_ins_set_exclusive_rec_lock( - LOCK_ORDINARY,rec,index,thr); + err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } else { - err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, rec, index,thr); + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } if (err != DB_SUCCESS) { @@ -1600,10 +1635,11 @@ row_ins_scan_sec_index_for_duplicate( goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (row_ins_dupl_error_with_rec(rec, entry, index)) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { err = DB_DUPLICATE_KEY; thr_get_trx(thr)->error_info = index; @@ -1625,6 +1661,9 @@ next_rec: } } + if (heap) { + mem_heap_free(heap); + } mtr_commit(&mtr); /* Restore old value */ @@ -1654,7 +1693,11 @@ row_ins_duplicate_error_in_clust( rec_t* rec; page_t* page; ulint n_unique; - trx_t* trx = thr_get_trx(thr); + trx_t* trx = thr_get_trx(thr); + mem_heap_t*heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + UT_NOT_USED(mtr); @@ -1682,6 +1725,8 @@ row_ins_duplicate_error_in_clust( page = buf_frame_align(rec); if (rec != page_get_infimum_rec(page)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); /* We set a lock on the possible duplicate: this is needed in logical logging of MySQL to make @@ -1697,23 +1742,23 @@ row_ins_duplicate_error_in_clust( err = row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP,rec,cursor->index, - thr); + offsets, thr); } else { err = row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP,rec, cursor->index, - thr); + offsets, thr); } if (err != DB_SUCCESS) { - - return(err); + goto func_exit; } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; - return(DB_DUPLICATE_KEY); + err = DB_DUPLICATE_KEY; + goto func_exit; } } } @@ -1724,7 +1769,8 @@ row_ins_duplicate_error_in_clust( page = buf_frame_align(rec); if (rec != page_get_supremum_rec(page)) { - + offsets = rec_get_offsets(rec, cursor->index, offsets, + 
ULINT_UNDEFINED, &heap); /* The manual defines the REPLACE semantics that it is either an INSERT or DELETE(s) for duplicate key @@ -1734,32 +1780,35 @@ row_ins_duplicate_error_in_clust( if (innobase_query_is_replace()) { err = row_ins_set_exclusive_rec_lock( - LOCK_REC_NOT_GAP, - rec,cursor->index,thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } else { err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP,rec, - cursor->index, thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } if (err != DB_SUCCESS) { - - return(err); + goto func_exit; } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; - return(DB_DUPLICATE_KEY); + err = DB_DUPLICATE_KEY; + goto func_exit; } + mem_heap_free(heap); } ut_a(!(cursor->index->type & DICT_CLUSTERED)); /* This should never happen */ } - return(DB_SUCCESS); + err = DB_SUCCESS; +func_exit: + return(err); } /******************************************************************* @@ -1841,6 +1890,9 @@ row_ins_index_entry_low( ulint n_unique; big_rec_t* big_rec = NULL; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; log_free_check(); @@ -1873,8 +1925,8 @@ row_ins_index_entry_low( buf_frame_align(btr_cur_get_rec(&cursor)))); if (!page_rec_is_supremum(first_rec)) { - ut_a((rec_get_n_fields(first_rec)) - == dtuple_get_n_fields(entry)); + ut_a(rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); } n_unique = dict_index_get_n_unique(index); @@ -1952,7 +2004,7 @@ row_ins_index_entry_low( if (err == DB_SUCCESS) { if (ext_vec) { - rec_set_field_extern_bits(insert_rec, + rec_set_field_extern_bits(insert_rec, index, ext_vec, n_ext_vec, &mtr); } } @@ -1962,14 +2014,18 @@ function_exit: mtr_commit(&mtr); if (big_rec) { + rec_t* rec; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields(index, rec, + offsets, big_rec, &mtr); - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(&cursor), - big_rec, &mtr); if (modify) { dtuple_big_rec_free(big_rec); } else { @@ -1979,6 +2035,9 @@ function_exit: mtr_commit(&mtr); } + if (heap) { + mem_heap_free(heap); + } return(err); } diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index b7cd730828a..6aaa0cbcf1b 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -784,7 +784,7 @@ row_lock_table_for_mysql( table handle */ dict_table_t* table, /* in: table to lock, or NULL if prebuilt->table should be - locked as LOCK_TABLE_EXP | + locked or a prebuilt->select_lock_type */ ulint mode) /* in: lock mode of table */ { @@ -822,8 +822,14 @@ run_again: if (table) { err = lock_table(0, table, mode, thr); } else { - err = lock_table(LOCK_TABLE_EXP, prebuilt->table, - prebuilt->select_lock_type, thr); + if (mode == LOCK_TABLE_TRANSACTIONAL) { + err = lock_table(LOCK_TABLE_TRANSACTIONAL, + prebuilt->table, + prebuilt->select_lock_type, thr); + } else { + err = lock_table(LOCK_TABLE_EXP, prebuilt->table, + prebuilt->select_lock_type, thr); + } } trx->error_state = err; @@ -945,9 +951,10 @@ run_again: if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); - + thr->lock_state= QUE_THR_LOCK_ROW; was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; if (was_lock_wait) { goto run_again; } @@ 
-1193,9 +1200,11 @@ run_again: return((int) err); } - + + thr->lock_state= QUE_THR_LOCK_ROW; was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; if (was_lock_wait) { goto run_again; } @@ -1224,6 +1233,57 @@ run_again: return((int) err); } +/************************************************************************* +Does an unlock of a row for MySQL. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + rec_t* rec; + btr_pcur_t* cur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "unlock_row"; + + if (srv_locks_unsafe_for_binlog) { + if (trx->trx_create_lock == TRUE) { + + mtr_start(&mtr); + + /* Restore a cursor position and find a record */ + btr_pcur_restore_position(BTR_SEARCH_LEAF, cur, &mtr); + rec = btr_pcur_get_rec(cur); + + if (rec) { + + lock_rec_reset_and_release_wait(rec); + } else { + fputs("InnoDB: Error: " + "Record for the lock not found\n", + stderr); + mem_analyze_corruption((byte*) trx); + ut_error; + } + + trx->trx_create_lock = FALSE; + mtr_commit(&mtr); + } + + } + + trx->op_info = ""; + + return(DB_SUCCESS); +} + /************************************************************************** Does a cascaded delete or set null in a foreign key operation. */ @@ -2362,6 +2422,294 @@ funct_exit: } /************************************************************************* +Truncates a table for MySQL. */ + +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + ulint err; + ibool locked_dictionary = FALSE; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + dulint new_id; + char* sql; + que_thr_t* thr; + que_t* graph = NULL; + +/* How do we prevent crashes caused by ongoing operations on the table? Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do TRUNCATE TABLE. Then there are no running +queries on the table. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, so we do not +have to remove insert buffer records, as the insert buffer works at a low +level. If a freed page is later reallocated, the allocator will remove +the ibuf entries for it. + +TODO: when we truncate *.ibd files (analogous to DISCARD TABLESPACE), we +will have to remove all entries for the table in the insert +buffer tree! + +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. (This will only be relevant for TRUNCATE TABLE +by DISCARD TABLESPACE.) +5) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, we +do not allow the TRUNCATE. We also reserve the data dictionary latch.
*/ + + static const char renumber_tablespace_proc[] = + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "old_id_low INT;\n" + "old_id_high INT;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "BEGIN\n" + "old_id_high := %lu;\n" + "old_id_low := %lu;\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "old_id := CONCAT(TO_BINARY(old_id_high, 4), TO_BINARY(old_id_low, 4));\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n"; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(table); + + if (srv_created_new_raw) { + fputs( + "InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "truncating table"; + + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + /* Prevent foreign key checks etc. while we are truncating the + table */ + + row_mysql_lock_data_dictionary(trx); + + locked_dictionary = TRUE; + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, table->name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + if (table->n_mysql_handles_opened > 1) { + ut_print_timestamp(stderr); +fputs(" InnoDB: Warning: MySQL is trying to truncate table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: though there are still open handles to it.\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. 
*/ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: You are trying to truncate table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: though there is a foreign key check running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Remove any locks there are on the table or its records */ + + lock_reset_all_on_table(table); + + trx->dict_operation = TRUE; + trx->table_id = table->id; + + /* scan SYS_INDEXES for all indexes of the table */ + heap = mem_heap_create(800); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + sys_index = dict_table_get_first_index(dict_sys->sys_indexes); + dict_index_copy_types(tuple, sys_index, 1); + + mtr_start(&mtr); + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + for (;;) { + rec_t* rec; + const byte* field; + ulint len; + + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + /* The end of SYS_INDEXES has been reached. */ + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + if (memcmp(buf, field, len) != 0) { + /* End of indexes for the table (TABLE_ID mismatch). */ + break; + } + + if (rec_get_deleted_flag(rec, FALSE)) { + /* The index has been dropped. */ + continue; + } + + dict_truncate_index_tree(table, rec, &mtr); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + mem_heap_empty(heap); + sql = mem_heap_alloc(heap, (sizeof renumber_tablespace_proc) + 40); + sprintf(sql, renumber_tablespace_proc, + (ulong) ut_dulint_get_high(table->id), + (ulong) ut_dulint_get_low(table->id), + (ulong) ut_dulint_get_high(new_id), + (ulong) ut_dulint_get_low(new_id)); + + graph = pars_sql(sql); + + ut_a(graph); + + mem_heap_free(heap); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + thr = que_fork_start_command(graph); + ut_a(thr); + + que_run_threads(thr); + + que_graph_free(graph); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); +fputs(" InnoDB: Unable to assign a new identifier to table ", stderr); + ut_print_name(stderr, trx, table->name); + fputs("\n" +"InnoDB: after truncating it. Background processes may corrupt the table!\n", + stderr); + err = DB_ERROR; + } else { + dict_table_change_id_in_cache(table, new_id); + } + + dict_table_autoinc_initialize(table, 0); + dict_update_statistics(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + if (locked_dictionary) { + row_mysql_unlock_data_dictionary(trx); + } + + trx->op_info = ""; + + srv_wake_master_thread(); + + return((int) err); +} + +/************************************************************************* Drops a table for MySQL. If the name of the table to be dropped is equal with one of the predefined magic table names, then this also stops printing the corresponding monitor output by the master thread. 
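
The sprintf() above fills the four %lu placeholders of renumber_tablespace_proc with the high and low 32-bit halves of the old and new table ids, sizing the buffer from sizeof the template plus a little slack for the expanded numbers. A standalone sketch of that formatting step, assuming a plain 64-bit id, an abbreviated template, and snprintf/malloc in place of the mem_heap allocation (names illustrative):

	#include <stdio.h>
	#include <stdlib.h>

	/* Split a 64-bit id into 32-bit halves and substitute them into an
	SQL procedure template. Template and names abbreviated for
	illustration. */

	static const char template_proc[] =
		"PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
		"BEGIN\n"
		"old_id_high := %lu;\n"
		"old_id_low := %lu;\n"
		"new_id_high := %lu;\n"
		"new_id_low := %lu;\n"
		"END;\n";

	static char*
	build_renumber_sql(unsigned long long old_id, unsigned long long new_id)
	{
		/* Each 3-character %lu expands to at most 10 digits, so the
		template length plus a small slack is always enough. */
		size_t	len = sizeof(template_proc) + 40;
		char*	sql = malloc(len);

		if (sql) {
			snprintf(sql, len, template_proc,
				 (unsigned long) (old_id >> 32),
				 (unsigned long) (old_id & 0xFFFFFFFFUL),
				 (unsigned long) (new_id >> 32),
				 (unsigned long) (new_id & 0xFFFFFFFFUL));
		}

		return(sql);	/* caller frees */
	}

	int
	main(void)
	{
		char*	sql = build_renumber_sql(42ULL, 43ULL);

		if (sql) {
			fputs(sql, stdout);
			free(sql);
		}
		return(0);
	}
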
*/ @@ -3292,18 +3640,20 @@ row_scan_and_check_index( ulint* n_rows) /* out: number of entries seen in the current consistent read */ { - mem_heap_t* heap; - dtuple_t* prev_entry = NULL; + dtuple_t* prev_entry = NULL; ulint matched_fields; ulint matched_bytes; byte* buf; ulint ret; rec_t* rec; - ibool is_ok = TRUE; + ibool is_ok = TRUE; int cmp; ibool contains_null; ulint i; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + *n_rows = 0; buf = mem_alloc(UNIV_PAGE_SIZE); @@ -3343,8 +3693,10 @@ loop: if (prev_entry != NULL) { matched_fields = 0; matched_bytes = 0; - - cmp = cmp_dtuple_rec_with_match(prev_entry, rec, + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, &matched_fields, &matched_bytes); contains_null = FALSE; @@ -3373,7 +3725,7 @@ loop: dtuple_print(stderr, prev_entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', stderr); is_ok = FALSE; } else if ((index->type & DICT_UNIQUE) @@ -3387,6 +3739,7 @@ loop: } mem_heap_empty(heap); + offsets = offsets_; prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index f7e01169b9d..8897a1a872f 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -99,6 +99,9 @@ row_purge_remove_clust_if_poss_low( ibool success; ulint err; mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; index = dict_table_get_first_index(node->table); @@ -117,15 +120,24 @@ row_purge_remove_clust_if_poss_low( return(TRUE); } + rec = btr_pcur_get_rec(pcur); + if (0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) { - + row_get_rec_roll_ptr(rec, index, rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap)))) { + if (heap) { + mem_heap_free(heap); + } /* Someone else has modified the record later: do not remove */ btr_pcur_commit_specify_mtr(pcur, &mtr); return(TRUE); } + if (heap) { + mem_heap_free(heap); + } + if (mode == BTR_MODIFY_LEAF) { success = btr_cur_optimistic_delete(btr_cur, &mtr); } else { diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index 38714b0c49b..43d0cd41b0a 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -37,17 +37,18 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { - ulint pos; - byte* field; - ulint len; + ulint pos; + byte* field; + ulint len; ut_ad(index->type & DICT_CLUSTERED); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_TRX_ID) { @@ -70,6 +71,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val) /* in: value to set */ { ulint pos; @@ -77,10 +79,11 @@ row_set_rec_sys_field( ulint len; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, 
&len); if (type == DATA_TRX_ID) { @@ -182,6 +185,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap) /* in: memory heap from which the memory needed is allocated */ { @@ -196,14 +202,25 @@ row_build( ulint row_len; byte* buf; ulint i; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[100] = { 100, }; + ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, (ulint*) offsets); } table = index->table; @@ -211,11 +228,9 @@ row_build( row = dtuple_create(heap, row_len); - dtuple_set_info_bits(row, rec_get_info_bits(rec)); - - n_fields = dict_index_get_n_fields(index); + dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp)); - ut_ad(n_fields == rec_get_n_fields(rec)); + n_fields = rec_offs_n_fields(offsets); dict_table_copy_types(row, table); @@ -227,13 +242,13 @@ row_build( col = dict_field_get_col(ind_field); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); if (type == ROW_COPY_ALSO_EXTERNALS - && rec_get_nth_field_extern_bit(rec, i)) { + && rec_offs_nth_extern(offsets, i)) { field = btr_rec_copy_externally_stored_field( - rec, i, &len, heap); + rec, offsets, i, &len, heap); } dfield_set_data(dfield, field, len); @@ -242,6 +257,10 @@ row_build( ut_ad(dtuple_check_typed(row)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(row); } @@ -276,16 +295,24 @@ row_rec_to_index_entry( ulint len; ulint rec_len; byte* buf; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_ad(rec && heap && index); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). 
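
The ulint offsets_[100] = { 100, } arrays introduced throughout this patch are small stack buffers whose first element records their own capacity: rec_get_offsets() uses the caller's array when it is large enough and otherwise allocates from a heap created on demand, which the caller frees only if it was actually created. A simplified standalone sketch of that contract, with malloc standing in for InnoDB's mem_heap, illustrative names, and error handling omitted:

	#include <stdio.h>
	#include <stdlib.h>

	/* Element 0 of the caller-supplied array holds its capacity; the
	helper falls back to an allocation only when that capacity is too
	small. malloc stands in for a mem_heap here. */

	static unsigned long*
	get_offsets(unsigned long n_fields, unsigned long* offsets, void** heap)
	{
		unsigned long	i;
		unsigned long	n_alloc = n_fields + 1;	/* +1 for the size slot */

		if (offsets == NULL || offsets[0] < n_alloc) {
			/* Caller's buffer missing or too small: allocate. */
			*heap = malloc(n_alloc * sizeof *offsets);
			offsets = *heap;
			offsets[0] = n_alloc;
		}

		for (i = 0; i < n_fields; i++) {
			offsets[1 + i] = 4 * i;	/* dummy offsets for illustration */
		}

		return(offsets);
	}

	int
	main(void)
	{
		unsigned long	offsets_[10] = { 10, };	/* capacity in element 0 */
		unsigned long*	offsets = offsets_;
		void*		heap = NULL;

		offsets = get_offsets(200, offsets, &heap);	/* forces the fallback */
		printf("field 3 starts at %lu\n", offsets[1 + 3]);

		if (heap) {		/* free only if the fallback was taken */
			free(heap);
		}
		return(0);
	}
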
*/ + rec_offs_make_valid(rec, index, offsets); } - rec_len = rec_get_n_fields(rec); + rec_len = rec_offs_n_fields(offsets); entry = dtuple_create(heap, rec_len); @@ -295,17 +322,21 @@ row_rec_to_index_entry( dict_index_copy_types(entry, index, rec_len); - dtuple_set_info_bits(entry, rec_get_info_bits(rec)); + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); for (i = 0; i < rec_len; i++) { dfield = dtuple_get_nth_field(entry, i); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); dfield_set_data(dfield, field, len); } ut_ad(dtuple_check_typed(entry)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } return(entry); } @@ -345,15 +376,23 @@ row_build_row_ref( byte* buf; ulint clust_col_prefix_len; ulint i; - + mem_heap_t* tmp_heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_ad(index && rec && heap); - + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); - rec = rec_copy(buf, rec); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); } table = index->table; @@ -373,7 +412,7 @@ row_build_row_ref( ut_a(pos != ULINT_UNDEFINED); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -397,6 +436,9 @@ row_build_row_ref( } ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } return(ref); } @@ -427,7 +469,10 @@ row_build_row_ref_in_tuple( ulint pos; ulint clust_col_prefix_len; ulint i; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_a(ref && index && rec); if (!index->table) { @@ -446,7 +491,9 @@ row_build_row_ref_in_tuple( fputs("InnoDB: clust index for table ", stderr); goto notfound; } - + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + ref_len = dict_index_get_n_unique(clust_index); ut_ad(ref_len == dtuple_get_n_fields(ref)); @@ -459,8 +506,8 @@ row_build_row_ref_in_tuple( pos = dict_index_get_nth_field_pos(index, clust_index, i); ut_a(pos != ULINT_UNDEFINED); - - field = rec_get_nth_field(rec, pos, &len); + + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -484,6 +531,9 @@ row_build_row_ref_in_tuple( } ut_ad(dtuple_check_typed(ref)); + if (heap) { + mem_heap_free(heap); + } } /*********************************************************************** diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 52228caccb0..8512e796a72 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -78,8 +78,19 @@ row_sel_sec_rec_is_for_clust_rec( ulint n; ulint i; dtype_t* cur_type; - - UT_NOT_USED(clust_index); + mem_heap_t* heap = NULL; + ulint clust_offsets_[100] + = { 100, }; + ulint sec_offsets_[10] + = { 10, }; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); n = dict_index_get_n_ordering_defined_by_user(sec_index); @@ -87,10 +98,10 @@ row_sel_sec_rec_is_for_clust_rec( ifield = dict_index_get_nth_field(sec_index, i); col = dict_field_get_col(ifield); - 
clust_field = rec_get_nth_field(clust_rec, + clust_field = rec_get_nth_field(clust_rec, clust_offs, dict_col_get_clust_pos(col), &clust_len); - sec_field = rec_get_nth_field(sec_rec, i, &sec_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); if (ifield->prefix_len > 0 && clust_len != UNIV_SQL_NULL) { @@ -107,11 +118,16 @@ row_sel_sec_rec_is_for_clust_rec( if (0 != cmp_data_data(dict_col_get_type(col), clust_field, clust_len, sec_field, sec_len)) { - return(FALSE); + is_equal = FALSE; + goto func_exit; } } - return(TRUE); +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(is_equal); } /************************************************************************* @@ -266,6 +282,7 @@ row_sel_fetch_columns( dict_index_t* index, /* in: record index */ rec_t* rec, /* in: record in a clustered or non-clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -275,6 +292,8 @@ row_sel_fetch_columns( byte* data; ulint len; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { index_type = SYM_CLUST_FIELD_NO; } else { @@ -286,7 +305,7 @@ row_sel_fetch_columns( if (field_no != ULINT_UNDEFINED) { - data = rec_get_nth_field(rec, field_no, &len); + data = rec_get_nth_field(rec, offsets, field_no, &len); if (column->copy_val) { eval_node_copy_and_alloc_val(column, data, @@ -601,8 +620,15 @@ row_sel_get_clust_rec( rec_t* clust_rec; rec_t* old_vers; ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); - row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec); + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); index = dict_table_get_first_index(plan->table); @@ -619,7 +645,7 @@ row_sel_get_clust_rec( || btr_pcur_get_low_match(&(plan->clust_pcur)) < dict_index_get_n_unique(index)) { - ut_a(rec_get_deleted_flag(rec)); + ut_a(rec_get_deleted_flag(rec, plan->table->comp)); ut_a(node->read_view); /* In a rare case it is possible that no clust rec is found @@ -636,29 +662,30 @@ row_sel_get_clust_rec( goto func_exit; } + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (!node->read_view) { /* Try to place a lock on the index record */ /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. 
*/ + ulint lock_type; if (srv_locks_unsafe_for_binlog) { - err = lock_clust_rec_read_check_and_lock(0, - clust_rec, - index, node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = lock_clust_rec_read_check_and_lock(0, - clust_rec, - index, node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = lock_clust_rec_read_check_and_lock(0, + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { - return(err); + goto err_exit; } } else { /* This is a non-locking consistent read: if necessary, fetch @@ -666,22 +693,20 @@ row_sel_get_clust_rec( old_vers = NULL; - if (!lock_clust_rec_cons_read_sees(clust_rec, index, + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, plan, clust_rec, &old_vers, mtr); if (err != DB_SUCCESS) { - return(err); + goto err_exit; } clust_rec = old_vers; if (clust_rec == NULL) { - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } @@ -698,24 +723,26 @@ row_sel_get_clust_rec( visit through secondary index records that would not really exist in our snapshot. */ - if ((old_vers || rec_get_deleted_flag(rec)) + if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, clust_rec, index)) { clust_rec = NULL; - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } /* Fetch the columns needed in test conditions */ - - row_sel_fetch_columns(index, clust_rec, + + row_sel_fetch_columns(index, clust_rec, offsets, UT_LIST_GET_FIRST(plan->columns)); func_exit: *out_rec = clust_rec; - - return(DB_SUCCESS); + err = DB_SUCCESS; +err_exit: + if (heap) { + mem_heap_free(heap); + } + return(err); } /************************************************************************* @@ -727,6 +754,7 @@ sel_set_rec_lock( /* out: DB_SUCCESS or error code */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: lock mode */ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ @@ -744,11 +772,11 @@ sel_set_rec_lock( } if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } return(err); @@ -956,6 +984,10 @@ row_sel_try_search_shortcut( { dict_index_t* index; rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ulint ret; index = plan->index; @@ -989,36 +1021,46 @@ row_sel_try_search_shortcut( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { - return(SEL_RETRY); + ret = SEL_RETRY; + goto func_exit; } } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { - return(SEL_RETRY); + ret = SEL_RETRY; + goto func_exit; } /* Test deleted flag. Fetch the columns needed in test conditions. 
*/ - - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); - if (rec_get_deleted_flag(rec)) { + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); - return(SEL_EXHAUSTED); + if (rec_get_deleted_flag(rec, plan->table->comp)) { + + ret = SEL_EXHAUSTED; + goto func_exit; } /* Test the rest of search conditions */ if (!row_sel_test_other_conds(plan)) { - return(SEL_EXHAUSTED); + ret = SEL_EXHAUSTED; + goto func_exit; } ut_ad(plan->pcur.latch_mode == node->latch_mode); plan->n_rows_fetched++; - +func_exit: + if (heap) { + mem_heap_free(heap); + } return(SEL_FOUND); } @@ -1067,7 +1109,10 @@ row_sel( to the next non-clustered record */ ulint found_flag; ulint err; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_ad(thr->run_node == node); search_latch_locked = FALSE; @@ -1218,22 +1263,23 @@ rec_loop: if (!consistent_read) { /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(page_rec_get_next(rec), - index, - node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = sel_set_rec_lock(page_rec_get_next(rec), - index, - node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = sel_set_rec_lock(next_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { /* Note that in this case we will store in pcur the PREDECESSOR of the record we are waiting @@ -1260,18 +1306,22 @@ rec_loop: /* Try to place a lock on the index record */ /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ + we lock only the record, i.e., next-key locking is + not used. 
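
The refactoring around this comment makes the choice explicit: the lock type is selected once (record-only when innodb_locks_unsafe_for_binlog is set, next-key otherwise) and a single sel_set_rec_lock() call follows, instead of duplicating the call in each branch. A condensed standalone sketch of the selection; the enum values are illustrative stand-ins for InnoDB's LOCK_REC_NOT_GAP and LOCK_ORDINARY constants:

	#include <stdio.h>

	/* With the unsafe-for-binlog option, only the record itself is
	locked; otherwise a next-key lock covers the record and the gap
	before it. */

	enum row_lock_type {
		REC_LOCK_ORDINARY,	/* next-key lock: record + gap before it */
		REC_LOCK_NOT_GAP	/* lock on the record only */
	};

	static enum row_lock_type
	select_row_lock_type(int locks_unsafe_for_binlog)
	{
		return(locks_unsafe_for_binlog
		       ? REC_LOCK_NOT_GAP
		       : REC_LOCK_ORDINARY);
	}

	int
	main(void)
	{
		printf("unsafe_for_binlog=1 -> %d (record only)\n",
		       select_row_lock_type(1));
		printf("unsafe_for_binlog=0 -> %d (next-key)\n",
		       select_row_lock_type(0));
		return(0);
	}
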
*/ + + ulint lock_type; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_ORDINARY, thr); + lock_type = LOCK_ORDINARY; } + err = sel_set_rec_lock(rec, index, offsets, + node->row_lock_mode, lock_type, thr); + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -1334,6 +1384,7 @@ rec_loop: /* PHASE 3: Get previous version in a consistent read */ cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); if (consistent_read) { /* This is a non-locking consistent read: if necessary, fetch @@ -1341,7 +1392,7 @@ rec_loop: if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, @@ -1354,6 +1405,7 @@ rec_loop: if (old_vers == NULL) { row_sel_fetch_columns(index, rec, + offsets, UT_LIST_GET_FIRST(plan->columns)); if (!row_sel_test_end_conds(plan)) { @@ -1365,6 +1417,8 @@ rec_loop: } rec = old_vers; + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); } } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { @@ -1376,7 +1430,8 @@ rec_loop: /* Fetch the columns needed in test conditions */ - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); /* Test the selection end conditions: these can only contain columns which already are found in the index, even though the index might be @@ -1391,7 +1446,8 @@ rec_loop: goto table_exhausted; } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (rec_get_deleted_flag(rec, plan->table->comp) + && !cons_read_requires_clust_rec) { /* The record is delete marked: we can skip it if this is not a consistent read which might see an earlier version @@ -1434,7 +1490,7 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, plan->table->comp)) { /* The record is delete marked: we can skip it */ @@ -1592,8 +1648,9 @@ next_table_no_mtr: if (search_latch_locked) { rw_lock_s_unlock(&btr_search_latch); } - - return(DB_SUCCESS); + + err = DB_SUCCESS; + goto func_exit; } node->fetch_table++; @@ -1626,6 +1683,7 @@ table_exhausted: table_exhausted_no_mtr: if (node->fetch_table == 0) { + err = DB_SUCCESS; if (node->is_aggregate && !node->aggregate_already_fetched) { @@ -1639,7 +1697,7 @@ table_exhausted_no_mtr: rw_lock_s_unlock(&btr_search_latch); } - return(DB_SUCCESS); + goto func_exit; } node->state = SEL_NODE_NO_MORE_ROWS; @@ -1650,7 +1708,7 @@ table_exhausted_no_mtr: rw_lock_s_unlock(&btr_search_latch); } - return(DB_SUCCESS); + goto func_exit; } node->fetch_table--; @@ -1674,8 +1732,8 @@ stop_for_a_while: mtr_commit(&mtr); ut_ad(sync_thread_levels_empty_gen(TRUE)); - - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; commit_mtr_for_a_while: /* Stores the cursor position and commits &mtr; this is used if @@ -1710,6 +1768,10 @@ lock_wait_or_error: ut_ad(sync_thread_levels_empty_gen(TRUE)); +func_exit: + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -2133,11 +2195,16 @@ row_sel_store_row_id_to_prebuilt( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt */ rec_t* index_rec, 
/* in: record */ - dict_index_t* index) /* in: index of the record */ + dict_index_t* index, /* in: index of the record */ + const ulint* offsets) /* in: rec_get_offsets + (index_rec, index) */ { byte* data; ulint len; - data = rec_get_nth_field(index_rec, + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field(index_rec, offsets, dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); if (len != DATA_ROW_ID_LEN) { @@ -2147,7 +2214,7 @@ row_sel_store_row_id_to_prebuilt( fprintf(stderr, "\n" "InnoDB: Field number %lu, record:\n", (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); - rec_print(stderr, index_rec); + rec_print_new(stderr, index_rec, offsets); putc('\n', stderr); ut_error; } @@ -2233,9 +2300,11 @@ row_sel_store_mysql_rec( case) */ byte* mysql_rec, /* out: row in the MySQL format */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: Innobase record in the index + rec_t* rec, /* in: Innobase record in the index which was described in prebuilt's template */ + const ulint* offsets) /* in: array returned by + rec_get_offsets() */ { mysql_row_templ_t* templ; mem_heap_t* extern_field_heap = NULL; @@ -2244,26 +2313,29 @@ row_sel_store_mysql_rec( byte* blob_buf; int pad_char; ulint i; + dict_index_t* index; ut_ad(prebuilt->mysql_template); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + index = prebuilt->index; + if (prebuilt->need_to_access_clustered) { + index = dict_table_get_first_index(index->table); + } if (prebuilt->blob_heap != NULL) { mem_heap_free(prebuilt->blob_heap); prebuilt->blob_heap = NULL; } - /* MySQL assumes that all columns have the SQL NULL bit set unless it - is a nullable column with a non-NULL value */ - - memset(mysql_rec, 0xFF, prebuilt->null_bitmap_len); - for (i = 0; i < prebuilt->n_template; i++) { templ = prebuilt->mysql_template + i; - data = rec_get_nth_field(rec, templ->rec_field_no, &len); + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); - if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) { + if (rec_offs_nth_extern(offsets, templ->rec_field_no)) { /* Copy an externally stored field to the temporary heap */ @@ -2277,7 +2349,7 @@ row_sel_store_mysql_rec( causes an assert */ data = btr_rec_copy_externally_stored_field(rec, - templ->rec_field_no, &len, + offsets, templ->rec_field_no, &len, extern_field_heap); ut_a(len != UNIV_SQL_NULL); @@ -2389,6 +2461,8 @@ row_sel_store_mysql_rec( bug number 154 in the MySQL bug database: GROUP BY and DISTINCT could treat NULL values inequal. */ + mysql_rec[templ->mysql_null_byte_offset] |= + (byte) (templ->mysql_null_bit_mask); if (templ->type == DATA_VARCHAR || templ->type == DATA_CHAR || templ->type == DATA_BINARY @@ -2492,6 +2566,9 @@ row_sel_get_clust_rec_for_mysql( rec_t* old_vers; ulint err; trx_t* trx; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; *out_rec = NULL; trx = thr_get_trx(thr); @@ -2522,9 +2599,8 @@ row_sel_get_clust_rec_for_mysql( clustered index record did not exist in the read view of trx. 
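
In row_sel_store_mysql_rec() above, the up-front memset of the NULL bitmap to 0xFF is dropped and the NULL bit of a SQL NULL column is instead set explicitly through the template's mysql_null_byte_offset and mysql_null_bit_mask. A minimal standalone sketch of that byte-offset/bit-mask addressing, with a toy struct and illustrative names:

	#include <stdio.h>
	#include <string.h>

	/* Each nullable column knows the byte offset and bit mask of its
	NULL flag inside the MySQL row buffer. */

	typedef struct {
		size_t		null_byte_offset;	/* byte holding the NULL bit */
		unsigned char	null_bit_mask;		/* mask within that byte */
	} col_templ_t;

	static void
	set_col_null(unsigned char* mysql_rec, const col_templ_t* templ)
	{
		mysql_rec[templ->null_byte_offset] |= templ->null_bit_mask;
	}

	static void
	set_col_not_null(unsigned char* mysql_rec, const col_templ_t* templ)
	{
		mysql_rec[templ->null_byte_offset]
			&= (unsigned char) ~templ->null_bit_mask;
	}

	int
	main(void)
	{
		unsigned char	rec[4];
		col_templ_t	c2 = { 0, 0x04 };	/* third nullable column */

		memset(rec, 0, sizeof rec);
		set_col_null(rec, &c2);
		printf("null byte after set:   0x%02x\n", rec[0]);	/* 0x04 */
		set_col_not_null(rec, &c2);
		printf("null byte after clear: 0x%02x\n", rec[0]);	/* 0x00 */
		return(0);
	}
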
*/ - if (!rec_get_deleted_flag(rec) + if (!rec_get_deleted_flag(rec, sec_index->table->comp) || prebuilt->select_lock_type != LOCK_NONE) { - ut_print_timestamp(stderr); fputs(" InnoDB: error clustered record" " for sec rec not found\n" @@ -2532,10 +2608,10 @@ row_sel_get_clust_rec_for_mysql( dict_index_name_print(stderr, trx, sec_index); fputs("\n" "InnoDB: sec index record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, sec_index); fputs("\n" "InnoDB: clust index record ", stderr); - rec_print(stderr, clust_rec); + rec_print(stderr, clust_rec, clust_index); putc('\n', stderr); trx_print(stderr, trx); @@ -2548,18 +2624,21 @@ row_sel_get_clust_rec_for_mysql( goto func_exit; } + offsets = rec_get_offsets(clust_rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + if (prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record; we are searching the clust rec with a unique condition, hence we set a LOCK_REC_NOT_GAP type lock */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, + clust_index, offsets, prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr); if (err != DB_SUCCESS) { - return(err); + goto err_exit; } } else { /* This is a non-locking consistent read: if necessary, fetch @@ -2572,7 +2651,7 @@ row_sel_get_clust_rec_for_mysql( if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, - trx->read_view)) { + offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, @@ -2581,7 +2660,7 @@ row_sel_get_clust_rec_for_mysql( if (err != DB_SUCCESS) { - return(err); + goto err_exit; } clust_rec = old_vers; @@ -2600,7 +2679,8 @@ row_sel_get_clust_rec_for_mysql( visit through secondary index records that would not really exist in our snapshot. 
*/ - if (clust_rec && (old_vers || rec_get_deleted_flag(rec)) + if (clust_rec && (old_vers + || rec_get_deleted_flag(rec, sec_index->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index, clust_rec, clust_index)) { clust_rec = NULL; @@ -2622,7 +2702,12 @@ func_exit: btr_pcur_store_position(prebuilt->clust_pcur, mtr); } - return(DB_SUCCESS); + err = DB_SUCCESS; +err_exit: + if (heap) { + mem_heap_free(heap); + } + return(err); } /************************************************************************ @@ -2699,10 +2784,40 @@ row_sel_pop_cached_row_for_mysql( row */ row_prebuilt_t* prebuilt) /* in: prebuilt struct */ { - ut_ad(prebuilt->n_fetch_cached > 0); - - ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], - prebuilt->mysql_row_len); + ulint i; + mysql_row_templ_t* templ; + byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + + if (prebuilt->keep_other_fields_on_keyread) + { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + cached_rec = + prebuilt->fetch_cache[prebuilt->fetch_cache_first]; + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + ut_memcpy( + buf + templ->mysql_col_offset, + cached_rec + templ->mysql_col_offset, + templ->mysql_col_len); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) + { + buf[templ->mysql_null_byte_offset] ^= + (buf[templ->mysql_null_byte_offset] ^ + cached_rec[templ->mysql_null_byte_offset]) & + (byte)templ->mysql_null_bit_mask; + } + } + } + else + { + ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_row_len); + } prebuilt->n_fetch_cached--; prebuilt->fetch_cache_first++; @@ -2718,12 +2833,14 @@ void row_sel_push_cache_row_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: record to push */ + rec_t* rec, /* in: record to push */ + const ulint* offsets) /* in: rec_get_offsets() */ { byte* buf; ulint i; ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_a(!prebuilt->templ_contains_blob); if (prebuilt->fetch_cache[0] == NULL) { @@ -2749,7 +2866,7 @@ row_sel_push_cache_row_for_mysql( ut_a(row_sel_store_mysql_rec( prebuilt->fetch_cache[prebuilt->n_fetch_cached], - prebuilt, rec)); + prebuilt, rec, offsets)); prebuilt->n_fetch_cached++; } @@ -2766,6 +2883,8 @@ row_sel_try_search_shortcut_for_mysql( /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ rec_t** out_rec,/* out: record if found */ row_prebuilt_t* prebuilt,/* in: prebuilt struct */ + ulint** offsets,/* in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ mtr_t* mtr) /* in: started mtr */ { dict_index_t* index = prebuilt->index; @@ -2803,13 +2922,17 @@ row_sel_try_search_shortcut_for_mysql( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - - if (!lock_clust_rec_cons_read_sees(rec, index, trx->read_view)) { + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { return(SEL_RETRY); } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, index->table->comp)) { return(SEL_EXHAUSTED); } @@ -2866,7 +2989,6 @@ row_search_for_mysql( ibool moved; ibool cons_read_requires_clust_rec; ibool was_lock_wait; - ulint ret; ulint shortcut; ibool unique_search = FALSE; ibool 
unique_search_from_clust_index = FALSE; @@ -2878,9 +3000,13 @@ row_search_for_mysql( level is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ ibool success; + ibool comp; ulint cnt = 0; ulint next_offs; mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; ut_ad(index && pcur && search_tuple); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); @@ -2985,9 +3111,8 @@ row_search_for_mysql( prebuilt->n_rows_fetched++; srv_n_rows_read++; - trx->op_info = ""; - - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; } if (prebuilt->fetch_cache_first > 0 @@ -2996,9 +3121,9 @@ row_search_for_mysql( /* The previous returned row was popped from the fetch cache, but the cache was not full at the time of the popping: no more rows can exist in the result set */ - - trx->op_info = ""; - return(DB_RECORD_NOT_FOUND); + + err = DB_RECORD_NOT_FOUND; + goto func_exit; } prebuilt->n_rows_fetched++; @@ -3042,8 +3167,8 @@ row_search_for_mysql( if (direction != 0 && !prebuilt->used_in_HANDLER) { - trx->op_info = ""; - return(DB_RECORD_NOT_FOUND); + err = DB_RECORD_NOT_FOUND; + goto func_exit; } } @@ -3093,13 +3218,14 @@ row_search_for_mysql( } #endif shortcut = row_sel_try_search_shortcut_for_mysql(&rec, - prebuilt, &mtr); + prebuilt, &offsets, &heap, &mtr); if (shortcut == SEL_FOUND) { #ifdef UNIV_SEARCH_DEBUG - ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); #endif if (!row_sel_store_mysql_rec(buf, prebuilt, - rec)) { + rec, offsets)) { err = DB_TOO_BIG_RECORD; /* We let the main loop to do the @@ -3123,12 +3249,10 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = ""; - /* NOTE that we do NOT store the cursor position */ - - return(DB_SUCCESS); + err = DB_SUCCESS; + goto func_exit; } else if (shortcut == SEL_EXHAUSTED) { @@ -3146,12 +3270,11 @@ row_search_for_mysql( trx->has_search_latch = FALSE; } - trx->op_info = ""; - /* NOTE that we do NOT store the cursor position */ - return(DB_RECORD_NOT_FOUND); + err = DB_RECORD_NOT_FOUND; + goto func_exit; } shortcut_fails_too_big_rec: mtr_commit(&mtr); @@ -3264,6 +3387,8 @@ rec_loop: /* PHASE 4: Look for matching records in a loop */ rec = btr_pcur_get_rec(pcur); + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); /* fputs("Using ", stderr); dict_index_name_print(stderr, index); @@ -3291,8 +3416,10 @@ rec_loop: we do not lock gaps. Supremum record is really a gap and therefore we do not set locks there. */ - if (srv_locks_unsafe_for_binlog == FALSE) { - err = sel_set_rec_lock(rec, index, + if (!srv_locks_unsafe_for_binlog) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(rec, index, offsets, prebuilt->select_lock_type, LOCK_ORDINARY, thr); } @@ -3312,9 +3439,11 @@ rec_loop: /* Do sanity checks in case our cursor has bumped into page corruption */ - next_offs = rec_get_next_offs(rec); + next_offs = rec_get_next_offs(rec, comp); - if (next_offs >= UNIV_PAGE_SIZE || next_offs < PAGE_SUPREMUM) { + if (next_offs >= UNIV_PAGE_SIZE + || next_offs < + (ulint) (comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)) { if (srv_force_recovery == 0 || moves_up == FALSE) { ut_print_timestamp(stderr); @@ -3359,9 +3488,11 @@ rec_loop: } } + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + if (srv_force_recovery > 0) { - if (!rec_validate(rec) || !btr_index_rec_validate(rec, index, - FALSE)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", @@ -3389,7 +3520,7 @@ rec_loop: /* fputs("Comparing rec and search tuple\n", stderr); */ - if (0 != cmp_dtuple_rec(search_tuple, rec)) { + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { if (prebuilt->select_lock_type != LOCK_NONE && set_also_gap_locks) { @@ -3401,6 +3532,7 @@ rec_loop: if (srv_locks_unsafe_for_binlog == FALSE) { err = sel_set_rec_lock(rec, index, + offsets, prebuilt->select_lock_type, LOCK_GAP, thr); } @@ -3413,7 +3545,7 @@ rec_loop: btr_pcur_store_position(pcur, &mtr); - ret = DB_RECORD_NOT_FOUND; + err = DB_RECORD_NOT_FOUND; /* ut_print_name(stderr, index->name); fputs(" record not found 3\n", stderr); */ @@ -3422,7 +3554,7 @@ rec_loop: } else if (match_mode == ROW_SEL_EXACT_PREFIX) { - if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { if (prebuilt->select_lock_type != LOCK_NONE && set_also_gap_locks) { @@ -3434,6 +3566,7 @@ rec_loop: if (srv_locks_unsafe_for_binlog == FALSE) { err = sel_set_rec_lock(rec, index, + offsets, prebuilt->select_lock_type, LOCK_GAP, thr); } @@ -3446,7 +3579,7 @@ rec_loop: btr_pcur_store_position(pcur, &mtr); - ret = DB_RECORD_NOT_FOUND; + err = DB_RECORD_NOT_FOUND; /* ut_print_name(stderr, index->name); fputs(" record not found 4\n", stderr); */ @@ -3465,27 +3598,27 @@ rec_loop: is a non-delete marked record, then it is enough to lock its existence with LOCK_REC_NOT_GAP. */ + ulint lock_type; + if (!set_also_gap_locks - || (unique_search && !rec_get_deleted_flag(rec))) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); + || (unique_search && !rec_get_deleted_flag(rec, comp))) { + lock_type = LOCK_REC_NOT_GAP; } else { /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is + we lock only the record, i.e., next-key locking is not used. 
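
The sanity check above treats a next-record offset as page corruption when it points outside the page or below the supremum record, whose fixed offset differs between the old and the new compact record formats. A standalone sketch of the bounds test; the constants are placeholders, not InnoDB's exact PAGE_OLD_SUPREMUM / PAGE_NEW_SUPREMUM values:

	#include <stdio.h>

	/* A valid next-record pointer must stay inside the page and cannot
	point below the supremum record. Placeholder constants. */

	#define PAGE_SIZE_BYTES		16384UL
	#define SUPREMUM_OFFSET_OLD	112UL	/* stands in for PAGE_OLD_SUPREMUM */
	#define SUPREMUM_OFFSET_NEW	120UL	/* stands in for PAGE_NEW_SUPREMUM */

	static int
	next_offs_is_corrupt(unsigned long next_offs, int comp)
	{
		unsigned long	min_offs = comp ? SUPREMUM_OFFSET_NEW
						: SUPREMUM_OFFSET_OLD;

		return(next_offs >= PAGE_SIZE_BYTES || next_offs < min_offs);
	}

	int
	main(void)
	{
		printf("offs 200   -> corrupt=%d\n", next_offs_is_corrupt(200, 0));
		printf("offs 70000 -> corrupt=%d\n", next_offs_is_corrupt(70000, 1));
		return(0);
	}
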
*/ - if (srv_locks_unsafe_for_binlog) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; } else { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_ORDINARY, thr); - } + lock_type = LOCK_ORDINARY; + } } - + + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3508,7 +3641,7 @@ rec_loop: if (srv_force_recovery < 5 && !lock_clust_rec_cons_read_sees(rec, index, - trx->read_view)) { + offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, @@ -3541,7 +3674,8 @@ rec_loop: } } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (rec_get_deleted_flag(rec, comp) + && !cons_read_requires_clust_rec) { /* The record is delete-marked: we can skip it if this is not a consistent read which might see an earlier version @@ -3577,7 +3711,7 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, comp)) { /* The record is delete marked: we can skip it */ @@ -3589,6 +3723,15 @@ rec_loop: } } + if (prebuilt->need_to_access_clustered) { + ut_ad(rec == clust_rec || index == clust_index); + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + /* We found a qualifying row */ if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD @@ -3608,7 +3751,7 @@ rec_loop: not cache rows because there the cursor is a scrollable cursor. */ - row_sel_push_cache_row_for_mysql(prebuilt, rec); + row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets); if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { @@ -3618,11 +3761,13 @@ rec_loop: goto next_rec; } else { if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { - ut_memcpy(buf + 4, rec - rec_get_extra_size(rec), - rec_get_size(rec)); - mach_write_to_4(buf, rec_get_extra_size(rec) + 4); + memcpy(buf + 4, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); } else { - if (!row_sel_store_mysql_rec(buf, prebuilt, rec)) { + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { err = DB_TOO_BIG_RECORD; goto lock_wait_or_error; @@ -3630,8 +3775,10 @@ rec_loop: } if (prebuilt->clust_index_was_generated) { + offsets = rec_get_offsets(index_rec, index, offsets, + ULINT_UNDEFINED, &heap); row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, - index); + index, offsets); } } got_row: @@ -3651,7 +3798,7 @@ got_row: btr_pcur_store_position(pcur, &mtr); } - ret = DB_SUCCESS; + err = DB_SUCCESS; goto normal_return; @@ -3690,9 +3837,9 @@ next_rec: btr_pcur_store_position(pcur, &mtr); if (match_mode != 0) { - ret = DB_RECORD_NOT_FOUND; + err = DB_RECORD_NOT_FOUND; } else { - ret = DB_END_OF_INDEX; + err = DB_END_OF_INDEX; } goto normal_return; @@ -3716,8 +3863,10 @@ lock_wait_or_error: que_thr_stop_for_mysql(thr); + thr->lock_state= QUE_THR_LOCK_ROW; was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); - + thr->lock_state= QUE_THR_LOCK_NOLOCK; + if (was_lock_wait) { mtr_start(&mtr); @@ -3731,9 +3880,7 @@ lock_wait_or_error: /* fputs("Using ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ - trx->op_info = ""; - - return(err); + goto func_exit; normal_return: 
/*-------------------------------------------------------------*/ @@ -3744,19 +3891,22 @@ normal_return: if (prebuilt->n_fetch_cached > 0) { row_sel_pop_cached_row_for_mysql(buf, prebuilt); - ret = DB_SUCCESS; + err = DB_SUCCESS; } /* fputs("Using ", stderr); dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ - if (ret == DB_SUCCESS) { + if (err == DB_SUCCESS) { srv_n_rows_read++; } +func_exit: trx->op_info = ""; - - return(ret); + if (heap) { + mem_heap_free(heap); + } + return(err); } /*********************************************************************** diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index e16d696314b..1cade0f304f 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -438,7 +438,7 @@ row_undo_mod_del_unmark_sec_and_undo_update( dtuple_print(stderr, entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, btr_pcur_get_rec(&pcur)); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); putc('\n', stderr); trx_print(stderr, trx); fputs("\n" diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index bc3cc8ea9f3..d994eab9873 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -151,6 +151,9 @@ row_undo_search_clust_to_pcur( mtr_t mtr; ibool ret; rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; mtr_start(&mtr); @@ -161,8 +164,11 @@ row_undo_search_clust_to_pcur( rec = btr_pcur_get_rec(&(node->pcur)); + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(rec, clust_index))) { + row_get_rec_roll_ptr(rec, clust_index, offsets))) { /* We must remove the reservation on the undo log record BEFORE releasing the latch on the clustered index page: this @@ -175,7 +181,7 @@ row_undo_search_clust_to_pcur( ret = FALSE; } else { node->row = row_build(ROW_COPY_DATA, clust_index, rec, - node->heap); + offsets, node->heap); btr_pcur_store_position(&(node->pcur), &mtr); ret = TRUE; @@ -183,6 +189,9 @@ row_undo_search_clust_to_pcur( btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + if (heap) { + mem_heap_free(heap); + } return(ret); } diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 9192f6dc692..173912d6956 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -301,19 +301,20 @@ recovery. 
*/ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr)/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ { byte* field; ulint len; - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); ut_ad(len == DATA_TRX_ID_LEN); trx_write_trx_id(field, trx_id); - field = rec_get_nth_field(rec, pos + 1, &len); + field = rec_get_nth_field(rec, offsets, pos + 1, &len); ut_ad(len == DATA_ROLL_PTR_LEN); trx_write_roll_ptr(field, roll_ptr); } @@ -361,8 +362,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update) /* in: update vector */ { upd_field_t* upd_field; @@ -372,6 +373,7 @@ row_upd_changes_field_size_or_external( ulint n_fields; ulint i; + ut_ad(rec_offs_validate(NULL, index, offsets)); n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { @@ -380,7 +382,7 @@ row_upd_changes_field_size_or_external( new_val = &(upd_field->new_val); new_len = new_val->len; - if (new_len == UNIV_SQL_NULL) { + if (new_len == UNIV_SQL_NULL && !rec_offs_comp(offsets)) { /* A bug fixed on Dec 31st, 2004: we looked at the SQL NULL size from the wrong field! We may backport this fix also to 4.0. 
The merge to 5.0 will be made @@ -391,14 +393,14 @@ row_upd_changes_field_size_or_external( upd_field->field_no)); } - old_len = rec_get_nth_field_size(rec, upd_field->field_no); - + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + if (old_len != new_len) { return(TRUE); } - if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) { + if (rec_offs_nth_extern(offsets, upd_field->field_no)) { return(TRUE); } @@ -420,15 +422,18 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update) /* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector */ { upd_field_t* upd_field; dfield_t* new_val; ulint n_fields; ulint i; - rec_set_info_bits(rec, update->info_bits); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits); n_fields = upd_get_n_fields(update); @@ -436,7 +441,7 @@ row_upd_rec_in_place( upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); - rec_set_nth_field(rec, upd_field->field_no, + rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); } @@ -701,6 +706,8 @@ row_upd_build_sec_rec_difference_binary( upd_t* update; ulint n_diff; ulint i; + ulint offsets_[10] = { 10, }; + const ulint* offsets; /* This function is used only for a secondary index */ ut_a(0 == (index->type & DICT_CLUSTERED)); @@ -708,10 +715,12 @@ row_upd_build_sec_rec_difference_binary( update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -774,6 +783,8 @@ row_upd_build_difference_binary( ulint trx_id_pos; ibool extern_bit; ulint i; + ulint offsets_[100] = { 100, }; + const ulint* offsets; /* This function is used only for a clustered index */ ut_a(index->type & DICT_CLUSTERED); @@ -785,9 +796,12 @@ row_upd_build_difference_binary( roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -799,7 +813,7 @@ row_upd_build_difference_binary( goto skip_compare; } - extern_bit = rec_get_nth_field_extern_bit(rec, i); + extern_bit = rec_offs_nth_extern(offsets, i); if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i) || !dfield_data_is_binary_equal(dfield, len, data)) { @@ -1123,6 +1137,7 @@ void row_upd_copy_columns( /*=================*/ rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -1130,7 +1145,7 @@ row_upd_copy_columns( ulint len; while (column) { - data = rec_get_nth_field(rec, + data = rec_get_nth_field(rec, offsets, column->field_nos[SYM_CLUST_FIELD_NO], &len); eval_node_copy_and_alloc_val(column, data, len); @@ -1177,7 +1192,10 @@ row_upd_store_row( dict_index_t* clust_index; upd_t* update; rec_t* rec; - + mem_heap_t* 
heap = NULL; + ulint offsets_[100] = { 100, }; + const ulint* offsets; + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); if (node->row != NULL) { @@ -1189,10 +1207,12 @@ row_upd_store_row( rec = btr_pcur_get_rec(node->pcur); - node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap); - + offsets = rec_get_offsets(rec, clust_index, offsets_, + ULINT_UNDEFINED, &heap); + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + node->heap); node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint) - * rec_get_n_fields(rec)); + * rec_offs_n_fields(offsets)); if (node->is_delete) { update = NULL; } else { @@ -1200,7 +1220,10 @@ row_upd_store_row( } node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec, - rec, update); + offsets, update); + if (heap) { + mem_heap_free(heap); + } } /*************************************************************** @@ -1253,7 +1276,7 @@ row_upd_sec_index_entry( dtuple_print(stderr, entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, index); putc('\n', stderr); trx_print(stderr, trx); @@ -1265,7 +1288,7 @@ row_upd_sec_index_entry( delete marked if we return after a lock wait in row_ins_index_entry below */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr, &mtr); if (err == DB_SUCCESS && check_ref) { @@ -1353,7 +1376,7 @@ row_upd_clust_rec_by_insert( a foreign key constraint */ mtr_t* mtr) /* in: mtr; gets committed here */ { - mem_heap_t* heap; + mem_heap_t* heap = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; trx_t* trx; @@ -1370,12 +1393,12 @@ row_upd_clust_rec_by_insert( btr_cur = btr_pcur_get_btr_cur(pcur); if (node->state != UPD_NODE_INSERT_CLUSTERED) { + ulint offsets_[100] = { 100, }; err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur, TRUE, thr, mtr); if (err != DB_SUCCESS) { mtr_commit(mtr); - return(err); } @@ -1385,7 +1408,9 @@ row_upd_clust_rec_by_insert( record is removed from the index tree, or updated. */ btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur), - node->update, mtr); + rec_get_offsets(btr_cur_get_rec(btr_cur), + dict_table_get_first_index(table), offsets_, + ULINT_UNDEFINED, &heap), node->update, mtr); if (check_ref) { /* NOTE that the following call loses the position of pcur ! 
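
The note above is an instance of the general persistent-cursor idiom in this file: a physical record pointer is only valid while the page latch is held, so the cursor's logical position is stored before any call that may release it and the cursor is re-positioned afterwards (as row_unlock_for_mysql() does with btr_pcur_restore_position()). A toy standalone sketch of the store/re-seek idea; the types are illustrative, not the btr_pcur API:

	#include <stdio.h>
	#include <string.h>

	/* Saving a logical position (a key value) instead of a slot number
	lets the cursor be re-positioned even after records have moved. */

	typedef struct {
		int	keys[8];	/* a tiny sorted "index" */
		int	n;
		int	pos;		/* current physical slot */
		int	stored_key;	/* saved logical position */
	} toy_cursor_t;

	static void
	cursor_store_position(toy_cursor_t* cur)
	{
		cur->stored_key = cur->keys[cur->pos];
	}

	static void
	cursor_restore_position(toy_cursor_t* cur)
	{
		int	i;

		/* Re-seek by key: physical slots may have moved meanwhile. */
		for (i = 0; i < cur->n; i++) {
			if (cur->keys[i] >= cur->stored_key) {
				cur->pos = i;
				return;
			}
		}
		cur->pos = cur->n;
	}

	int
	main(void)
	{
		toy_cursor_t	cur = { {10, 20, 30, 40}, 4, 2, 0 };

		cursor_store_position(&cur);	/* remembers key 30 */

		/* An insert of key 15 shifts the later records right. */
		memmove(&cur.keys[2], &cur.keys[1], 3 * sizeof(int));
		cur.keys[1] = 15;
		cur.n = 5;

		cursor_restore_position(&cur);
		printf("restored to slot %d (key %d)\n", cur.pos, cur.keys[cur.pos]);
		return(0);
	}
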
*/ @@ -1394,7 +1419,9 @@ row_upd_clust_rec_by_insert( index, thr, mtr); if (err != DB_SUCCESS) { mtr_commit(mtr); - + if (heap) { + mem_heap_free(heap); + } return(err); } } @@ -1403,10 +1430,11 @@ row_upd_clust_rec_by_insert( mtr_commit(mtr); + if (!heap) { + heap = mem_heap_create(500); + } node->state = UPD_NODE_INSERT_CLUSTERED; - heap = mem_heap_create(500); - entry = row_build_index_entry(node->row, index, heap); row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); @@ -1458,7 +1486,8 @@ row_upd_clust_rec( pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the @@ -1494,7 +1523,8 @@ row_upd_clust_rec( ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, &big_rec, node->update, @@ -1502,12 +1532,20 @@ row_upd_clust_rec( mtr_commit(mtr); if (err == DB_SUCCESS && big_rec) { + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + rec_t* rec; mtr_start(mtr); + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(btr_cur), - big_rec, mtr); + rec = btr_cur_get_rec(btr_cur); + err = btr_store_big_rec_extern_fields(index, rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + big_rec, mtr); + if (heap) { + mem_heap_free(heap); + } mtr_commit(mtr); } @@ -1591,7 +1629,11 @@ row_upd_clust_step( ulint err; mtr_t* mtr; mtr_t mtr_buf; - + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + const ulint* offsets; + index = dict_table_get_first_index(node->table); check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); @@ -1647,14 +1689,16 @@ row_upd_clust_step( } } + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + if (!node->has_clust_rec_x_lock) { err = lock_clust_rec_modify_check_and_lock(0, - btr_pcur_get_rec(pcur), - index, thr); + rec, index, offsets, thr); if (err != DB_SUCCESS) { mtr_commit(mtr); - - return(err); + goto exit_func; } } @@ -1663,14 +1707,14 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec(node, index, thr, check_ref, mtr); - if (err != DB_SUCCESS) { - - return(err); + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } + exit_func: + if (heap) { + mem_heap_free(heap); } - - node->state = UPD_NODE_UPDATE_ALL_SEC; - node->index = dict_table_get_next_index(index); - return(err); } @@ -1680,16 +1724,18 @@ row_upd_clust_step( if (!node->in_mysql_interface) { /* Copy the necessary columns from clust_rec and calculate the new values to set */ - - row_upd_copy_columns(btr_pcur_get_rec(pcur), + row_upd_copy_columns(rec, offsets, UT_LIST_GET_FIRST(node->columns)); row_upd_eval_new_vals(node->update); } + + if (heap) { + mem_heap_free(heap); + } if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { err = row_upd_clust_rec(node, index, thr, mtr); - return(err); } @@ -1941,6 +1987,8 @@ row_upd_in_place_in_select( btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 
100, }; ut_ad(sel_node->select_will_do_update); ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF); @@ -1956,11 +2004,17 @@ row_upd_in_place_in_select( /* Copy the necessary columns from clust_rec and calculate the new values to set */ - row_upd_copy_columns(btr_pcur_get_rec(pcur), - UT_LIST_GET_FIRST(node->columns)); + row_upd_copy_columns(btr_pcur_get_rec(pcur), rec_get_offsets( + btr_pcur_get_rec(pcur), btr_cur->index, offsets_, + ULINT_UNDEFINED, &heap), + UT_LIST_GET_FIRST(node->columns)); + if (heap) { + mem_heap_free(heap); + } row_upd_eval_new_vals(node->update); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + btr_cur->index->table->comp)); ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE); ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c index bc17ede89e3..9ccaf32f2c2 100644 --- a/innobase/row/row0vers.c +++ b/innobase/row/row0vers.c @@ -41,10 +41,12 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index) /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dict_index_t* clust_index; rec_t* clust_rec; + ulint* clust_offsets; rec_t* version; rec_t* prev_version; dulint trx_id; @@ -59,6 +61,7 @@ row_vers_impl_x_locked_off_kernel( ibool rec_del; ulint err; mtr_t mtr; + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -96,29 +99,33 @@ row_vers_impl_x_locked_off_kernel( return(NULL); } - trx_id = row_get_rec_trx_id(clust_rec, clust_index); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); mtr_s_lock(&(purge_sys->latch), &mtr); mutex_enter(&kernel_mutex); + trx = NULL; if (!trx_is_active(trx_id)) { /* The transaction that modified or inserted clust_rec is no longer active: no implicit lock on rec */ - - mtr_commit(&mtr); - - return(NULL); + goto exit_func; } - if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, TRUE)) { + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, + clust_offsets, TRUE)) { /* Corruption noticed: try to avoid a crash by returning */ - - mtr_commit(&mtr); - - return(NULL); + goto exit_func; } + comp = index->table->comp; + ut_ad(index->table == clust_index->table); + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + ut_ad(comp == page_is_comp(buf_frame_align(clust_rec))); + /* We look up if some earlier version, which was modified by the trx_id transaction, of the clustered index record would require rec to be in a different state (delete marked or unmarked, or have different field @@ -128,11 +135,10 @@ row_vers_impl_x_locked_off_kernel( different state, then the trx_id transaction has not yet had time to modify rec, and does not necessarily have an implicit x-lock on rec. 
*/ - rec_del = rec_get_deleted_flag(rec); + rec_del = rec_get_deleted_flag(rec, comp); trx = NULL; version = clust_rec; - heap = NULL; for (;;) { mutex_exit(&kernel_mutex); @@ -146,18 +152,17 @@ row_vers_impl_x_locked_off_kernel( heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(clust_rec, &mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ if (prev_version) { + clust_offsets = rec_get_offsets(prev_version, + clust_index, NULL, + ULINT_UNDEFINED, &heap); row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); } @@ -189,11 +194,11 @@ row_vers_impl_x_locked_off_kernel( if prev_version would require rec to be in a different state. */ - vers_del = rec_get_deleted_flag(prev_version); + vers_del = rec_get_deleted_flag(prev_version, comp); /* We check if entry and rec are identified in the alphabetical ordering */ - if (0 == cmp_dtuple_rec(entry, rec)) { + if (0 == cmp_dtuple_rec(entry, rec, offsets)) { /* The delete marks of rec and prev_version should be equal for rec to be in the state required by prev_version */ @@ -211,7 +216,7 @@ row_vers_impl_x_locked_off_kernel( dtuple_set_types_binary(entry, dtuple_get_n_fields(entry)); - if (0 != cmp_dtuple_rec(entry, rec)) { + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { trx = trx_get_on_id(trx_id); @@ -226,7 +231,8 @@ row_vers_impl_x_locked_off_kernel( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, clust_index); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { /* The versions modified by the trx_id transaction end @@ -238,6 +244,7 @@ row_vers_impl_x_locked_off_kernel( version = prev_version; }/* for (;;) */ +exit_func: mtr_commit(&mtr); mem_heap_free(heap); @@ -297,12 +304,14 @@ row_vers_old_has_index_entry( rec_t* version; rec_t* prev_version; dict_index_t* clust_index; + ulint* clust_offsets; mem_heap_t* heap; mem_heap_t* heap2; dtuple_t* row; dtuple_t* entry; ulint err; - + ibool comp; + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_S_FIX)); @@ -313,10 +322,15 @@ row_vers_old_has_index_entry( clust_index = dict_table_get_first_index(index->table); - if (also_curr && !rec_get_deleted_flag(rec)) { + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); - heap = mem_heap_create(1024); - row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap); + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -331,24 +345,17 @@ row_vers_old_has_index_entry( return(TRUE); } - - mem_heap_free(heap); } version = rec; - heap = NULL; for (;;) { heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + 
mem_heap_free(heap2); /* free version and clust_offsets */ if (err != DB_SUCCESS || !prev_version) { /* Versions end here */ @@ -358,9 +365,12 @@ row_vers_old_has_index_entry( return(FALSE); } - if (!rec_get_deleted_flag(prev_version)) { + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -412,6 +422,7 @@ row_vers_build_for_consistent_read( mem_heap_t* heap2; byte* buf; ulint err; + ulint* offsets; ut_ad(index->type & DICT_CLUSTERED); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) @@ -420,22 +431,23 @@ row_vers_build_for_consistent_read( #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))); + + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + ut_ad(!read_view_sees_trx_id(view, + row_get_rec_trx_id(rec, index, offsets))); rw_lock_s_lock(&(purge_sys->latch)); version = rec; - heap = NULL; for (;;) { heap2 = heap; heap = mem_heap_create(1024); err = trx_undo_prev_version_build(rec, mtr, version, index, - heap, &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + offsets, heap, &prev_version); + mem_heap_free(heap2); /* free version and offsets */ if (err != DB_SUCCESS) { break; @@ -449,16 +461,17 @@ row_vers_build_for_consistent_read( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, index); + offsets = rec_get_offsets(prev_version, index, NULL, + ULINT_UNDEFINED, &heap); + prev_trx_id = row_get_rec_trx_id(prev_version, index, offsets); if (read_view_sees_trx_id(view, prev_trx_id)) { /* The view already sees this version: we can copy it to in_heap and return */ - buf = mem_heap_alloc(in_heap, rec_get_size( - prev_version)); - *old_vers = rec_copy(buf, prev_version); + buf = mem_heap_alloc(in_heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, prev_version, offsets); err = DB_SUCCESS; break; diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index f61cc569f6b..7da2ee10d27 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -44,6 +44,7 @@ Created 10/8/1995 Heikki Tuuri #include "buf0flu.h" #include "btr0sea.h" #include "dict0load.h" +#include "dict0boot.h" #include "srv0start.h" #include "row0mysql.h" @@ -186,6 +187,61 @@ that during a time of heavy update/insert activity. 
*/ ulint srv_max_buf_pool_modified_pct = 90; +/* this variable counts the amount of data read in total (in bytes) */ +ulint srv_data_read = 0; + +/* here we count the amount of data written in total (in bytes) */ +ulint srv_data_written = 0; + +/* the number of log write requests done */ +ulint srv_log_write_requests = 0; + +/* the number of physical writes to the log performed */ +ulint srv_log_writes = 0; + +/* amount of data written to the log files in bytes */ +ulint srv_os_log_written = 0; + +/* number of pending writes to the log files */ +ulint srv_os_log_pending_writes = 0; + +/* we increase this counter when we do not have enough space in the +log buffer and have to flush it */ +ulint srv_log_waits = 0; + +/* this variable counts the number of times the doublewrite buffer +was flushed */ +ulint srv_dblwr_writes = 0; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +ulint srv_dblwr_pages_written = 0; + +/* in this variable we store the number of buffer pool write requests issued */ +ulint srv_buf_pool_write_requests = 0; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +ulint srv_buf_pool_wait_free = 0; + +/* variable to count the number of pages that were written from the buffer +pool to disk */ +ulint srv_buf_pool_flushed = 0; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +ulint srv_buf_pool_reads = 0; + +/* variable to count the number of sequential read-aheads */ +ulint srv_read_ahead_seq = 0; + +/* variable to count the number of random read-aheads */ +ulint srv_read_ahead_rnd = 0; + +/* structure to pass status variables to MySQL */ +export_struc export_vars; + /* If the following is != 0 we do not allow inserts etc. 
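The srv_* counters introduced above are plain ulint globals that the I/O, log, and buffer-pool code bumps at the point where the event happens; srv_export_innodb_status() further down copies them into export_vars under srv_innodb_monitor_mutex so MySQL reads a consistent snapshot. A rough sketch of the two sides (the increment site is illustrative, not quoted from this patch; n_bytes_read is a hypothetical local):

	/* producer side, e.g. after a completed file read */
	srv_data_read += n_bytes_read;

	/* consumer side: snapshot under the monitor mutex */
	mutex_enter(&srv_innodb_monitor_mutex);
	export_vars.innodb_data_read = srv_data_read;
	mutex_exit(&srv_innodb_monitor_mutex);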
This protects the user from forgetting the innodb_force_recovery keyword to my.cnf */ @@ -241,8 +297,8 @@ srv_conc_slot_t* srv_conc_slots; /* array of wait /* Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket at srv_conc_enter_innodb */ -#define SRV_FREE_TICKETS_TO_ENTER 500 - +#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter +#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay /*-----------------------*/ /* If the following is set TRUE then we do not run purge and insert buffer merge to completion before shutdown */ @@ -257,6 +313,7 @@ ibool srv_very_fast_shutdown = FALSE; /* if this TRUE, do not flush the ibool srv_innodb_status = FALSE; ibool srv_use_doublewrite_buf = TRUE; +ibool srv_use_checksums = TRUE; ibool srv_set_thread_priorities = TRUE; int srv_query_thread_priority = 0; @@ -271,6 +328,8 @@ ulint srv_max_purge_lag = 0; /*-------------------------------------------*/ ulint srv_n_spin_wait_rounds = 20; +ulint srv_n_free_tickets_to_enter = 500; +ulint srv_thread_sleep_delay = 10000; ulint srv_spin_wait_delay = 5; ibool srv_priority_boost = TRUE; @@ -289,6 +348,12 @@ static ulint srv_n_rows_updated_old = 0; static ulint srv_n_rows_deleted_old = 0; static ulint srv_n_rows_read_old = 0; +ulint srv_n_lock_wait_count = 0; +ulint srv_n_lock_wait_current_count = 0; +ib_longlong srv_n_lock_wait_time = 0; +ulint srv_n_lock_max_wait_time = 0; + + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown @@ -790,6 +855,7 @@ srv_init(void) { srv_conc_slot_t* conc_slot; srv_slot_t* slot; + dict_table_t* table; ulint i; srv_sys = mem_alloc(sizeof(srv_sys_t)); @@ -839,6 +905,31 @@ srv_init(void) UT_LIST_INIT(srv_sys->tasks); + /* create dummy table and index for old-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY1", + DICT_HDR_SPACE, 1, FALSE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + + srv_sys->dummy_ind1 = dict_mem_index_create("SYS_DUMMY1", + "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind1, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind1->table = table; + /* create dummy table and index for new-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY2", + DICT_HDR_SPACE, 1, TRUE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + srv_sys->dummy_ind2 = dict_mem_index_create("SYS_DUMMY2", + "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind2, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind2->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE; + /* Init the server concurrency restriction data structures */ os_fast_mutex_init(&srv_conc_mutex); @@ -936,8 +1027,8 @@ retry: return; } - /* If the transaction is not holding resources, let it sleep for 50 - milliseconds, and try again then */ + /* If the transaction is not holding resources, + let it sleep for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */ if (!has_slept && !trx->has_search_latch && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { @@ -956,8 +1047,10 @@ retry: situations of lots of thread switches. Simply put some threads aside for a while to reduce the number of thread switches. 
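The srv_n_lock_wait_* counters above back the new Innodb_row_lock_* status variables; srv_suspend_mysql_thread() in the hunks that follow brackets each row-lock wait with ut_usectime() and accumulates the elapsed microseconds. Condensed from those hunks for readability, not additional behaviour:

	ulint		sec;
	ulint		ms;
	ib_longlong	start_time;
	ib_longlong	finish_time;
	ulint		diff_time;

	srv_n_lock_wait_count++;
	srv_n_lock_wait_current_count++;
	ut_usectime(&sec, &ms);
	start_time = (ib_longlong)sec * 1000000 + ms;

	/* ... suspend until the row lock is granted or times out ... */

	ut_usectime(&sec, &ms);
	finish_time = (ib_longlong)sec * 1000000 + ms;
	diff_time = (ulint)(finish_time - start_time);

	srv_n_lock_wait_current_count--;
	srv_n_lock_wait_time += diff_time;
	if (diff_time > srv_n_lock_max_wait_time) {
		srv_n_lock_max_wait_time = diff_time;
	}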
*/ - - os_thread_sleep(10000); + if (SRV_THREAD_SLEEP_DELAY > 0) + { + os_thread_sleep(SRV_THREAD_SLEEP_DELAY); + } trx->op_info = ""; @@ -1295,7 +1388,12 @@ srv_suspend_mysql_thread( trx_t* trx; ibool had_dict_lock = FALSE; ibool was_declared_inside_innodb = FALSE; - + ib_longlong start_time = 0; + ib_longlong finish_time; + ulint diff_time; + ulint sec; + ulint ms; + #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -1338,6 +1436,13 @@ srv_suspend_mysql_thread( slot->suspend_time = ut_time(); + if (thr->lock_state == QUE_THR_LOCK_ROW) { + srv_n_lock_wait_count++; + srv_n_lock_wait_current_count++; + + ut_usectime(&sec, &ms); + start_time = (ib_longlong)sec * 1000000 + ms; + } /* Wake the lock timeout monitor thread, if it is suspended */ os_event_set(srv_lock_timeout_thread_event); @@ -1388,7 +1493,20 @@ srv_suspend_mysql_thread( slot->in_use = FALSE; wait_time = ut_difftime(ut_time(), slot->suspend_time); - + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + ut_usectime(&sec, &ms); + finish_time = (ib_longlong)sec * 1000000 + ms; + + diff_time = finish_time - start_time; + + srv_n_lock_wait_current_count--; + srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time; + if (diff_time > srv_n_lock_max_wait_time) { + srv_n_lock_max_wait_time = diff_time; + } + } + if (trx->was_chosen_as_deadlock_victim) { trx->error_state = DB_DEADLOCK; @@ -1605,19 +1723,77 @@ srv_printf_innodb_monitor( (srv_n_rows_read - srv_n_rows_read_old) / time_elapsed); - srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_inserted_old = srv_n_rows_inserted; srv_n_rows_updated_old = srv_n_rows_updated; srv_n_rows_deleted_old = srv_n_rows_deleted; srv_n_rows_read_old = srv_n_rows_read; - fputs("----------------------------\n" + fputs("----------------------------\n" "END OF INNODB MONITOR OUTPUT\n" "============================\n", file); - mutex_exit(&srv_innodb_monitor_mutex); fflush(file); } +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ + +void +srv_export_innodb_status(void) +{ + + mutex_enter(&srv_innodb_monitor_mutex); + export_vars.innodb_data_pending_reads= os_n_pending_reads; + export_vars.innodb_data_pending_writes= os_n_pending_writes; + export_vars.innodb_data_pending_fsyncs= + fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes; + export_vars.innodb_data_fsyncs= os_n_fsyncs; + export_vars.innodb_data_read= srv_data_read; + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; + export_vars.innodb_buffer_pool_pages_flushed= srv_buf_pool_flushed; + export_vars.innodb_buffer_pool_reads= srv_buf_pool_reads; + export_vars.innodb_buffer_pool_read_ahead_rnd= srv_read_ahead_rnd; + export_vars.innodb_buffer_pool_read_ahead_seq= srv_read_ahead_seq; + export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); + export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_misc= 
buf_pool->max_size - + UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_page_size= UNIV_PAGE_SIZE; + export_vars.innodb_log_waits= srv_log_waits; + export_vars.innodb_os_log_written= srv_os_log_written; + export_vars.innodb_os_log_fsyncs= fil_n_log_flushes; + export_vars.innodb_os_log_pending_fsyncs= fil_n_pending_log_flushes; + export_vars.innodb_os_log_pending_writes= srv_os_log_pending_writes; + export_vars.innodb_log_write_requests= srv_log_write_requests; + export_vars.innodb_log_writes= srv_log_writes; + export_vars.innodb_dblwr_pages_written= srv_dblwr_pages_written; + export_vars.innodb_dblwr_writes= srv_dblwr_writes; + export_vars.innodb_pages_created= buf_pool->n_pages_created; + export_vars.innodb_pages_read= buf_pool->n_pages_read; + export_vars.innodb_pages_written= buf_pool->n_pages_written; + export_vars.innodb_row_lock_waits= srv_n_lock_wait_count; + export_vars.innodb_row_lock_current_waits= srv_n_lock_wait_current_count; + export_vars.innodb_row_lock_time= srv_n_lock_wait_time / 10000; + export_vars.innodb_row_lock_time_avg= + (srv_n_lock_wait_count > 0) ? + (srv_n_lock_wait_time / 10000 / srv_n_lock_wait_count) : 0; + export_vars.innodb_row_lock_time_max= srv_n_lock_max_wait_time / 10000; + export_vars.innodb_rows_read= srv_n_rows_read; + export_vars.innodb_rows_inserted= srv_n_rows_inserted; + export_vars.innodb_rows_updated= srv_n_rows_updated; + export_vars.innodb_rows_deleted= srv_n_rows_deleted; + mutex_exit(&srv_innodb_monitor_mutex); + +} + /************************************************************************* A thread which wakes up threads whose lock wait may have lasted too long. This also prints the info output by various InnoDB monitors. */ diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index fe05f07df21..983a8306773 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -1458,15 +1458,13 @@ NetWare. 
*/ fsp_header_inc_size(0, sum_of_new_sizes, &mtr); mtr_commit(&mtr); - } - if (recv_needed_recovery) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Flushing modified pages from the buffer pool...\n"); - } + /* Immediately write the log record about increased tablespace + size to disk, so that it is durable even if mysqld would crash + quickly */ - log_make_checkpoint_at(ut_dulint_max, TRUE); + log_buffer_flush_to_disk(); + } #ifdef UNIV_LOG_ARCHIVE /* Archiving is always off under MySQL */ diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index 77757685208..359945594be 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -90,7 +90,8 @@ rw_lock_create_func( /*================*/ rw_lock_t* lock, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline) /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name) /* in: mutex name */ { /* If this is the very first time a synchronization object is created, then the following call initializes @@ -101,7 +102,9 @@ rw_lock_create_func( lock->mutex.cfile_name = cfile_name; lock->mutex.cline = cline; - + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; + rw_lock_set_waiters(lock, 0); rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); lock->writer_count = 0; diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 86306e49cac..788965f82ef 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -129,11 +129,6 @@ sync_array_t* sync_primary_wait_array; /* This variable is set to TRUE when sync_init is called */ ibool sync_initialized = FALSE; -/* Global list of database mutexes (not OS mutexes) created. */ -UT_LIST_BASE_NODE_T(mutex_t) mutex_list; - -/* Mutex protecting the mutex_list variable */ -mutex_t mutex_list_mutex; typedef struct sync_level_struct sync_level_t; typedef struct sync_thread_struct sync_thread_t; @@ -202,7 +197,8 @@ mutex_create_func( /*==============*/ mutex_t* mutex, /* in: pointer to memory */ const char* cfile_name, /* in: file name where created */ - ulint cline) /* in: file line where created */ + ulint cline, /* in: file line where created */ + const char* cmutex_name) /* in: mutex name */ { #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) mutex_reset_lock_word(mutex); @@ -219,6 +215,16 @@ mutex_create_func( mutex->level = SYNC_LEVEL_NONE; mutex->cfile_name = cfile_name; mutex->cline = cline; + mutex->cmutex_name= cmutex_name; + mutex->count_using= 0; + mutex->mutex_type= 0; + mutex->lspent_time= 0; + mutex->lmax_spent_time= 0; + mutex->count_spin_loop= 0; + mutex->count_spin_rounds= 0; + mutex->count_os_wait= 0; + mutex->count_os_yield= 0; + /* Check that lock_word is aligned; this is important on Intel */ ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); @@ -355,135 +361,180 @@ for the mutex before suspending the thread. 
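mutex_spin_wait() below is reindented wholesale, which makes the new timed_mutexes bookkeeping easy to miss. Stripped of the spin/wait logic, the per-mutex timing it adds amounts to roughly this (a condensation of the function below, not separate code):

	ulint		sec;
	ulint		ms;
	ib_longlong	lstart_time = 0;
	ib_longlong	lfinish_time;
	ulint		ltime_diff;
	uint		timer_started = 0;

	if (timed_mutexes == 1 && timer_started == 0) {
		/* started just before the first yield or OS wait */
		ut_usectime(&sec, &ms);
		lstart_time = (ib_longlong)sec * 1000000 + ms;
		timer_started = 1;
	}

	/* ... os_thread_yield() and/or sync_array_wait_event() ... */

	if (timed_mutexes == 1 && timer_started == 1) {
		ut_usectime(&sec, &ms);
		lfinish_time = (ib_longlong)sec * 1000000 + ms;

		ltime_diff = (ulint)(lfinish_time - lstart_time);
		mutex->lspent_time += ltime_diff;
		if (mutex->lmax_spent_time < ltime_diff) {
			mutex->lmax_spent_time = ltime_diff;
		}
	}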
*/ void mutex_spin_wait( /*============*/ - mutex_t* mutex, /* in: pointer to mutex */ - const char* file_name, /* in: file name where - mutex requested */ - ulint line) /* in: line where requested */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where + mutex requested */ + ulint line) /* in: line where requested */ { - ulint index; /* index of the reserved wait cell */ - ulint i; /* spin round count */ - - ut_ad(mutex); + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ + ib_longlong lstart_time = 0, lfinish_time; /* for timing os_wait */ + ulint ltime_diff; + ulint sec; + ulint ms; -mutex_loop: + uint timer_started = 0; - i = 0; + ut_ad(mutex); - /* Spin waiting for the lock word to become zero. Note that we do not - have to assume that the read access to the lock word is atomic, as the - actual locking is always committed with atomic test-and-set. In - reality, however, all processors probably have an atomic read of a - memory word. */ - -spin_loop: - mutex_spin_wait_count++; +mutex_loop: - while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { + i = 0; - if (srv_spin_wait_delay) { - ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); - } - - i++; - } +/* Spin waiting for the lock word to become zero. Note that we do not + have to assume that the read access to the lock word is atomic, as the + actual locking is always committed with atomic test-and-set. In + reality, however, all processors probably have an atomic read of a + memory word. */ - if (i == SYNC_SPIN_ROUNDS) { - os_thread_yield(); - } +spin_loop: + mutex_spin_wait_count++; + mutex->count_spin_loop++; + + while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) + { + if (srv_spin_wait_delay) + { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) + { + mutex->count_os_yield++; + if (timed_mutexes == 1 && timer_started==0) + { + ut_usectime(&sec, &ms); + lstart_time= (ib_longlong)sec * 1000000 + ms; + timer_started = 1; + } + os_thread_yield(); + } + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu spin wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait mutex at %p cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, (ulong) mutex->cline, (ulong) i); - } + mutex_spin_round_count += i; - mutex_spin_round_count += i; + mutex->count_spin_rounds += i; - if (mutex_test_and_set(mutex) == 0) { - /* Succeeded! */ + if (mutex_test_and_set(mutex) == 0) + { + /* Succeeded! */ #ifdef UNIV_SYNC_DEBUG - mutex_set_debug_info(mutex, file_name, line); + mutex_set_debug_info(mutex, file_name, line); #endif - return; - } + goto finish_timing; + } - /* We may end up with a situation where lock_word is - 0 but the OS fast mutex is still reserved. On FreeBSD - the OS does not seem to schedule a thread which is constantly - calling pthread_mutex_trylock (in mutex_test_and_set - implementation). Then we could end up spinning here indefinitely. - The following 'i++' stops this infinite spin. */ + /* We may end up with a situation where lock_word is + 0 but the OS fast mutex is still reserved. On FreeBSD + the OS does not seem to schedule a thread which is constantly + calling pthread_mutex_trylock (in mutex_test_and_set + implementation). 
Then we could end up spinning here indefinitely. + The following 'i++' stops this infinite spin. */ - i++; - - if (i < SYNC_SPIN_ROUNDS) { + i++; - goto spin_loop; - } + if (i < SYNC_SPIN_ROUNDS) + { + goto spin_loop; + } - sync_array_reserve_cell(sync_primary_wait_array, mutex, - SYNC_MUTEX, - file_name, line, - &index); + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); - mutex_system_call_count++; + mutex_system_call_count++; - /* The memory order of the array reservation and the change in the - waiters field is important: when we suspend a thread, we first - reserve the cell and then set waiters field to 1. When threads are - released in mutex_exit, the waiters field is first set to zero and - then the event is set to the signaled state. */ - - mutex_set_waiters(mutex, 1); + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are + released in mutex_exit, the waiters field is first set to zero and + then the event is set to the signaled state. */ + + mutex_set_waiters(mutex, 1); - /* Try to reserve still a few times */ - for (i = 0; i < 4; i++) { - if (mutex_test_and_set(mutex) == 0) { + /* Try to reserve still a few times */ + for (i = 0; i < 4; i++) + { + if (mutex_test_and_set(mutex) == 0) + { + /* Succeeded! Free the reserved wait cell */ - /* Succeeded! Free the reserved wait cell */ + sync_array_free_cell(sync_primary_wait_array, index); - sync_array_free_cell(sync_primary_wait_array, index); - #ifdef UNIV_SYNC_DEBUG - mutex_set_debug_info(mutex, file_name, line); + mutex_set_debug_info(mutex, file_name, line); #endif - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu spin wait succeeds at 2:" - " mutex at %p\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), - mutex); - } - - return; +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, "Thread %lu spin wait succeeds at 2:" + " mutex at %p\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + mutex); +#endif - /* Note that in this case we leave the waiters field - set to 1. We cannot reset it to zero, as we do not know - if there are other waiters. */ - } - } + goto finish_timing; - /* Now we know that there has been some thread holding the mutex - after the change in the wait array and the waiters field was made. - Now there is no risk of infinite wait on the event. */ + /* Note that in this case we leave the waiters field + set to 1. We cannot reset it to zero, as we do not know + if there are other waiters. */ + } + } - if (srv_print_latch_waits) { - fprintf(stderr, - "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", - (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, - mutex->cfile_name, (ulong) mutex->cline, (ulong) i); - } - - mutex_system_call_count++; - mutex_os_wait_count++; + /* Now we know that there has been some thread holding the mutex + after the change in the wait array and the waiters field was made. +Now there is no risk of infinite wait on the event. */ - sync_array_wait_event(sync_primary_wait_array, index); +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif - goto mutex_loop; + mutex_system_call_count++; + mutex_os_wait_count++; + + mutex->count_os_wait++; + + /* + !!!!! 
Sometimes os_wait can be called without os_thread_yield + */ + + if (timed_mutexes == 1 && timer_started==0) + { + ut_usectime(&sec, &ms); + lstart_time= (ib_longlong)sec * 1000000 + ms; + timer_started = 1; + } + + + sync_array_wait_event(sync_primary_wait_array, index); + goto mutex_loop; + +finish_timing: + if (timed_mutexes == 1 && timer_started==1) + { + ut_usectime(&sec, &ms); + lfinish_time= (ib_longlong)sec * 1000000 + ms; + + ltime_diff= lfinish_time - lstart_time; + mutex->lspent_time += ltime_diff; + if (mutex->lmax_spent_time < ltime_diff) + { + mutex->lmax_spent_time= ltime_diff; + } + } + return; } /********************************************************************** @@ -555,6 +606,7 @@ mutex_set_level( mutex->level = level; } + #ifdef UNIV_SYNC_DEBUG /********************************************************************** Checks that the current thread owns the mutex. Works only in the debug diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index fe429d1cc62..90ecb217c1d 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -38,16 +38,18 @@ trx_undof_page_add_undo_rec_log( ulint new_free, /* in: end offset of the entry */ mtr_t* mtr) /* in: mtr */ { - byte* log_ptr; - ulint len; + byte* log_ptr; + const byte* log_end; + ulint len; - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); if (log_ptr == NULL) { return; } + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; log_ptr = mlog_write_initial_log_record_fast(undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); len = new_free - old_free - 4; @@ -55,14 +57,11 @@ trx_undof_page_add_undo_rec_log( mach_write_to_2(log_ptr, len); log_ptr += 2; - if (len < 256) { - ut_memcpy(log_ptr, undo_page + old_free + 2, len); - log_ptr += len; - } - - mlog_close(mtr, log_ptr); - - if (len >= MLOG_BUF_MARGIN) { + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); mlog_catenate_string(mtr, undo_page + old_free + 2, len); } } @@ -404,6 +403,7 @@ trx_undo_page_report_modify( delete marking is done */ rec_t* rec, /* in: clustered index record which has NOT yet been modified */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector which tells the columns to be updated; in the case of a delete, this should be set to NULL */ @@ -430,6 +430,7 @@ trx_undo_page_report_modify( ulint i; ut_a(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; @@ -454,7 +455,7 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (update) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { type_cmpl = TRX_UNDO_UPD_DEL_REC; } else { type_cmpl = TRX_UNDO_UPD_EXIST_REC; @@ -479,14 +480,20 @@ trx_undo_page_report_modify( /*----------------------------------------*/ /* Store the state of the info bits */ - bits = rec_get_info_bits(rec); + bits = rec_get_info_bits(rec, table->comp); mach_write_to_1(ptr, bits); ptr += 1; /* Store the values of the system columns */ - trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec); + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), &len); + ut_ad(len == DATA_TRX_ID_LEN); + trx_id = trx_read_trx_id(field); + field = rec_get_nth_field(rec, offsets, + 
dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), &len); + ut_ad(len == DATA_ROLL_PTR_LEN); + roll_ptr = trx_read_roll_ptr(field); - roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); len = mach_dulint_write_compressed(ptr, trx_id); ptr += len; @@ -499,7 +506,7 @@ trx_undo_page_report_modify( for (i = 0; i < dict_index_get_n_unique(index); i++) { - field = rec_get_nth_field(rec, i, &flen); + field = rec_get_nth_field(rec, offsets, i, &flen); if (trx_undo_left(undo_page, ptr) < 4) { @@ -547,14 +554,14 @@ trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } - if (rec_get_nth_field_extern_bit(rec, pos)) { + if (rec_offs_nth_extern(offsets, pos)) { /* If a field has external storage, we add to flen the flag */ @@ -631,7 +638,7 @@ trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { @@ -1008,7 +1015,10 @@ trx_undo_report_row_operation( ibool is_insert; trx_rseg_t* rseg; mtr_t mtr; - + mem_heap_t* heap = NULL; + ulint offsets_[100] = { 100, }; + ulint* offsets = offsets_; + ut_a(index->type & DICT_CLUSTERED); if (flags & BTR_NO_UNDO_LOG_FLAG) { @@ -1019,7 +1029,6 @@ trx_undo_report_row_operation( } ut_ad(thr); - ut_a(index->type & DICT_CLUSTERED); ut_ad((op_type != TRX_UNDO_INSERT_OP) || (clust_entry && !update && !rec)); @@ -1079,9 +1088,10 @@ trx_undo_report_row_operation( index, clust_entry, &mtr); } else { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); offset = trx_undo_page_report_modify(undo_page, trx, - index, rec, update, - cmpl_info, &mtr); + index, rec, offsets, update, cmpl_info, &mtr); } if (offset == 0) { @@ -1123,7 +1133,9 @@ trx_undo_report_row_operation( mutex_exit(&(trx->undo_mutex)); mtr_commit(&mtr); - + if (heap) { + mem_heap_free(heap); + } return(DB_OUT_OF_FILE_SPACE); } } @@ -1140,6 +1152,9 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr(is_insert, rseg->id, page_no, offset); + if (heap) { + mem_heap_free(heap); + } return(DB_SUCCESS); } @@ -1236,6 +1251,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers)/* out, own: previous version, or NULL if @@ -1258,7 +1274,6 @@ trx_undo_prev_version_build( ibool dummy_extern; byte* buf; ulint err; - #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ @@ -1266,21 +1281,23 @@ trx_undo_prev_version_build( MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains(index_mtr, buf_block_align(index_rec), MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!(index->type & DICT_CLUSTERED)) { fprintf(stderr, "InnoDB: Error: trying to access" " update undo rec for non-clustered index %s\n" "InnoDB: Submit a detailed bug report to" " http://bugs.mysql.com\n" "InnoDB: index record ", index->name); - rec_print(stderr, index_rec); + rec_print(stderr, index_rec, index); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); putc('\n', stderr); 
return(DB_ERROR); } - roll_ptr = row_get_rec_roll_ptr(rec, index); + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); old_roll_ptr = roll_ptr; *old_vers = NULL; @@ -1292,7 +1309,7 @@ trx_undo_prev_version_build( return(DB_SUCCESS); } - rec_trx_id = row_get_rec_trx_id(rec, index); + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); @@ -1341,10 +1358,10 @@ trx_undo_prev_version_build( ut_print_buf(stderr, undo_rec, 150); fputs("\n" "InnoDB: index record ", stderr); - rec_print(stderr, index_rec); + rec_print(stderr, index_rec, index); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print_new(stderr, rec, offsets); fprintf(stderr, "\n" "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n" "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", @@ -1358,11 +1375,10 @@ trx_undo_prev_version_build( (ulong) ut_dulint_get_low(roll_ptr)); trx_purge_sys_print(); - return(DB_ERROR); } - if (row_upd_changes_field_size_or_external(rec, index, update)) { + if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; ulint n_ext_vect; @@ -1372,27 +1388,28 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info to ext_vect: */ - ext_vect = mem_alloc(sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, + ext_vect = mem_alloc(sizeof(ulint) + * rec_offs_n_fields(offsets)); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals(entry, index, update, heap); - buf = mem_heap_alloc(heap, rec_get_converted_size(entry)); + buf = mem_heap_alloc(heap, + rec_get_converted_size(index, entry)); - *old_vers = rec_convert_dtuple_to_rec(buf, entry); + *old_vers = rec_convert_dtuple_to_rec(buf, index, entry); /* Now set the extern bits in the old version of the record */ - rec_set_field_extern_bits(*old_vers, ext_vect, n_ext_vect, - NULL); + rec_set_field_extern_bits(*old_vers, index, + ext_vect, n_ext_vect, NULL); mem_free(ext_vect); } else { - buf = mem_heap_alloc(heap, rec_get_size(rec)); - - *old_vers = rec_copy(buf, rec); - - row_upd_rec_in_place(*old_vers, update); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, offsets, update); } return(DB_SUCCESS); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index eb7c7f43f03..e5cffd2a4f3 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -30,9 +30,13 @@ Created 3/26/1996 Heikki Tuuri /* This many pages must be undone before a truncate is tried within rollback */ #define TRX_ROLL_TRUNC_THRESHOLD 1 +/* In crash recovery, the current trx to be rolled back */ +trx_t* trx_roll_crash_recv_trx = NULL; + /* In crash recovery we set this to the undo n:o of the current trx to be rolled back. Then we can print how many % the rollback has progressed. */ ib_longlong trx_roll_max_undo_no; + /* Auxiliary variable which tells the previous progress % we printed */ ulint trx_roll_progress_printed_pct; @@ -331,11 +335,20 @@ trx_savept_take( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. 
If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. */ -void -trx_rollback_or_clean_all_without_sess(void) -/*========================================*/ +#ifndef __WIN__ +void* +#else +ulint +#endif +trx_rollback_or_clean_all_without_sess( +/*===================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))) + /* in: a dummy parameter required by + os_thread_create */ { mem_heap_t* heap; que_fork_t* fork; @@ -360,9 +373,9 @@ trx_rollback_or_clean_all_without_sess(void) if (UT_LIST_GET_FIRST(trx_sys->trx_list)) { fprintf(stderr, - "InnoDB: Starting rollback of uncommitted transactions\n"); +"InnoDB: Starting in background the rollback of uncommitted transactions\n"); } else { - return; + goto leave_function; } loop: heap = mem_heap_create(512); @@ -371,24 +384,30 @@ loop: trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - while (trx && (trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { - - trx = UT_LIST_GET_NEXT(trx_list, trx); + while (trx) { + if ((trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else if (trx->conc_state == TRX_PREPARED) { + trx->sess = trx_dummy_sess; + } else { + break; + } } mutex_exit(&kernel_mutex); if (trx == NULL) { + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Rollback of uncommitted transactions completed\n"); + " InnoDB: Rollback of uncommitted transactions completed\n"); mem_heap_free(heap); - - return; + + goto leave_function; } trx->sess = trx_dummy_sess; - + if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) { fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n", (ulong) ut_dulint_get_high(trx->id), @@ -417,21 +436,28 @@ loop: ut_a(thr == que_fork_start_command(fork)); + trx_roll_crash_recv_trx = trx; trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no); trx_roll_progress_printed_pct = 0; rows_to_undo = trx_roll_max_undo_no; + if (rows_to_undo > 1000000000) { rows_to_undo = rows_to_undo / 1000000; unit = "M"; } + ut_print_timestamp(stderr); fprintf(stderr, -"InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo", +" InnoDB: Rolling back trx with id %lu %lu, %lu%s rows to undo\n", (ulong) ut_dulint_get_high(trx->id), (ulong) ut_dulint_get_low(trx->id), (ulong) rows_to_undo, unit); mutex_exit(&kernel_mutex); + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + if (trx->dict_operation) { row_mysql_lock_data_dictionary(trx); } @@ -446,7 +472,7 @@ loop: fprintf(stderr, "InnoDB: Waiting for rollback of trx id %lu to end\n", - (ulong) ut_dulint_get_low(trx->id)); + (ulong) ut_dulint_get_low(trx->id)); os_thread_sleep(100000); mutex_enter(&kernel_mutex); @@ -485,7 +511,23 @@ loop: (ulong) ut_dulint_get_low(trx->id)); mem_heap_free(heap); + trx_roll_crash_recv_trx = NULL; + goto loop; + +leave_function: + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. 
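Since the rollback of incomplete transactions now runs as its own thread, the caller has to start it through os_thread_create() instead of calling the function directly. The call site is not part of this hunk, so the following is only a sketch of what it could look like, assuming os_thread_create(start_function, arg, thread_id_ptr):

	os_thread_id_t	thread_id;

	os_thread_create(&trx_rollback_or_clean_all_without_sess,
			NULL,		/* the dummy argument the function ignores */
			&thread_id);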
*/ + + os_thread_exit(NULL); + + /* The following is dummy code to keep the compiler happy: */ + +#ifndef __WIN__ + return(NULL); +#else + return(0); +#endif } /*********************************************************************** @@ -846,16 +888,17 @@ try_again: ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0); /* We print rollback progress info if we are in a crash recovery - and the transaction has at least 1000 row operations to undo */ + and the transaction has at least 1000 row operations to undo. */ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { - if (srv_is_being_started && trx_roll_max_undo_no > 1000) { - progress_pct = 100 - (ulint) + progress_pct = 100 - (ulint) ((ut_conv_dulint_to_longlong(undo_no) * 100) / trx_roll_max_undo_no); if (progress_pct != trx_roll_progress_printed_pct) { if (trx_roll_progress_printed_pct == 0) { fprintf(stderr, - "\nInnoDB: Progress in percents: %lu", (ulong) progress_pct); +"\nInnoDB: Progress in percents: %lu\n", (ulong) progress_pct); } else { fprintf(stderr, " %lu", (ulong) progress_pct); diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 54bd5be01a1..57166e98f45 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -125,6 +125,22 @@ trx_doublewrite_init( } /******************************************************************** +Frees the doublewrite buffer. */ +static +void +trx_doublewrite_free(void) +/*======================*/ +{ + mutex_free(&(trx_doublewrite->mutex)); + + mem_free(trx_doublewrite->buf_block_arr); + ut_free(trx_doublewrite->write_buf_unaligned); + + mem_free(trx_doublewrite); + trx_doublewrite = NULL; +} + +/******************************************************************** Marks the trx sys header when we have successfully upgraded to the >= 4.1.x multiple tablespace format. */ @@ -512,6 +528,9 @@ trx_sys_doublewrite_init_or_restore_pages( fil_flush_file_spaces(FIL_TABLESPACE); + if (!srv_use_doublewrite_buf) + trx_doublewrite_free(); + leave_func: ut_free(unaligned_read_buf); } @@ -887,8 +906,12 @@ trx_sys_init_at_db_start(void) trx = UT_LIST_GET_FIRST(trx_sys->trx_list); for (;;) { - rows_to_undo += + + if ( trx->conc_state != TRX_PREPARED) { + rows_to_undo += ut_conv_dulint_to_longlong(trx->undo_no); + } + trx = UT_LIST_GET_NEXT(trx_list, trx); if (!trx) { diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index af4f1979858..6619286ee71 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -24,6 +24,7 @@ Created 3/26/1996 Heikki Tuuri #include "thr0loc.h" #include "btr0sea.h" #include "os0proc.h" +#include "trx0xa.h" /* Copy of the prototype for innobase_mysql_print_thd: this copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! 
*/ @@ -155,10 +156,15 @@ trx_create( trx->auto_inc_lock = NULL; trx->n_lock_table_exp = 0; + trx->n_lock_table_transactional = 0; trx->read_view_heap = mem_heap_create(256); trx->read_view = NULL; + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid,0,sizeof(trx->xid)); + trx->xid.formatID = -1; + return(trx); } @@ -288,6 +294,7 @@ trx_free( ut_a(!trx->has_search_latch); ut_a(!trx->auto_inc_lock); ut_a(!trx->n_lock_table_exp); + ut_a(!trx->n_lock_table_transactional); ut_a(trx->dict_operation_lock_mode == 0); @@ -416,13 +423,22 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; - + trx->xid = undo->xid; trx->insert_undo = undo; trx->rseg = rseg; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = TRX_COMMITTED_IN_MEMORY; + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + trx->conc_state = TRX_PREPARED; + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } /* We give a dummy value for the trx no; this should have no relevance since purge @@ -465,10 +481,22 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; + trx->xid = undo->xid; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = - TRX_COMMITTED_IN_MEMORY; + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + trx->conc_state = + TRX_PREPARED; + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } + /* We give a dummy value for the trx number */ @@ -734,7 +762,8 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } - ut_ad(trx->conc_state == TRX_ACTIVE); + ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED); + #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -1626,10 +1655,15 @@ trx_print( putc('\n', f); if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } - fprintf(f, "mysql tables in use %lu, locked %lu\n", - (ulong) trx->n_mysql_tables_in_use, - (ulong) trx->mysql_n_tables_locked); + if (trx->n_lock_table_transactional > 0 || trx->n_lock_table_exp > 0) { +fprintf(f, "mysql explicit table locks %lu, transactional table locks %lu\n", + (ulong) trx->n_lock_table_exp, + (ulong) trx->n_lock_table_transactional); } newline = TRUE; @@ -1675,3 +1709,239 @@ trx_print( innobase_mysql_print_thd(f, trx->mysql_thd); } } + +/******************************************************************** +Prepares a transaction. */ + +void +trx_prepare_off_kernel( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + dulint lsn; + trx_rseg_t* rseg; + trx_undo_t* undo; + ibool must_flush_log = FALSE; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + must_flush_log = TRUE; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as prepared in the file + based world, at the serialization point of the log sequence + number lsn obtained below. 
*/ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + undo = trx->update_undo; + + if (undo) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. */ + + update_hdr_page = trx_undo_set_state_at_prepare(trx, undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + /*--------------------------------------*/ + trx->conc_state = TRX_PREPARED; + /*--------------------------------------*/ + + if (trx->read_view) { + read_view_close(trx->read_view); + + mem_heap_empty(trx->read_view_heap); + trx->read_view = NULL; + } + + if (must_flush_log) { + + mutex_exit(&kernel_mutex); + + /* Write the log to the log files AND flush them to disk */ + + /*-------------------------------------*/ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + + /*-------------------------------------*/ + + mutex_enter(&kernel_mutex); + } +} + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. 
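Together with trx_prepare_for_mysql() above, the function below gives the server its XA recovery hooks: first collect the transactions left in the prepared state, then decide the fate of each one. A hypothetical caller might look like the sketch below; trx_commit_for_mysql() is the existing per-transaction commit entry point and is assumed here, it is not introduced by this patch:

	XID	xid_list[10];	/* illustrative capacity */
	int	count;
	int	i;

	count = trx_recover_for_mysql(xid_list, 10);

	for (i = 0; i < count; i++) {
		trx_t*	trx = trx_get_trx_by_xid(&xid_list[i]);

		if (trx) {
			/* commit or roll back depending on what the
			upper layer (e.g. the binlog) decides */
			trx_commit_for_mysql(trx);
		}
	}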
*/ + +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions + stored in xid_list */ + XID* xid_list, /* in/out: prepared transactions */ + uint len) /* in: number of slots in xid_list */ +{ + trx_t* trx; + int num_of_transactions = 0; + + ut_ad(xid_list); + ut_ad(len); + + fprintf(stderr, + "InnoDB: Starting recovery for XA transactions...\n"); + + + /* We copy those transactions which are in + the prepared state into xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->conc_state == TRX_PREPARED) { + xid_list[num_of_transactions] = trx->xid; + + fprintf(stderr, +"InnoDB: Transaction %lu %lu in prepared state after recovery\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + fprintf(stderr, +"InnoDB: Transaction contains changes to %lu rows\n", + (ulong)ut_conv_dulint_to_longlong(trx->undo_no)); + + num_of_transactions++; + + if ((uint) num_of_transactions == len) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + fprintf(stderr, + "InnoDB: %d transactions in prepared state after recovery\n", + num_of_transactions); + + return (num_of_transactions); +} + +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state */ + +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + if (xid == NULL) { + return (NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction id's: their + lengths should be the same and the first + gtrid_length+bqual_length bytes of the data should be + equal */ + + if (xid->gtrid_length == trx->xid.gtrid_length && + xid->bqual_length == trx->xid.bqual_length && + memcmp(xid->data, trx->xid.data, + xid->gtrid_length + + xid->bqual_length) == 0) { + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (trx) { + if (trx->conc_state != TRX_PREPARED) { + return(NULL); + } + + return(trx); + } else { + return(NULL); + } +} + diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index 8d1518753dd..4bfa9c20a54 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -19,6 +19,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "trx0rec.h" #include "trx0purge.h" +#include "trx0xa.h" /* How should the old versions in the history list be managed? 
---------------------------------------------------------- @@ -97,6 +98,7 @@ trx_undo_mem_create( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification */ ulint page_no,/* in: undo log header page number */ ulint offset); /* in: undo log header byte offset on page */ /******************************************************************* @@ -109,6 +111,7 @@ trx_undo_insert_header_reuse( page_t* undo_page, /* in: insert undo log segment header page, x-latched */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA transaction identification */ mtr_t* mtr); /* in: mtr */ /************************************************************************** If an update undo log can be discarded immediately, this function frees the @@ -484,6 +487,7 @@ trx_undo_header_create( TRX_UNDO_LOG_HDR_SIZE bytes free space on it */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA XID */ mtr_t* mtr) /* in: mtr */ { trx_upagef_t* page_hdr; @@ -530,11 +534,25 @@ trx_undo_header_create( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); - + /* If an X/Open XID exists, we store a + flag for it in the upper byte of the dict operation field. */ + + if (xid != NULL && xid->formatID != -1) { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, TRUE); + } else { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + } + + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); - + + /* Write the X/Open XA transaction identification if it exists */ + + if (xid && xid->formatID != -1) { + trx_undo_write_xid(log_hdr, xid); + } + trx_undo_header_create_log(undo_page, trx_id, mtr); return(free); @@ -569,6 +587,11 @@ trx_undo_parse_page_header( mtr_t* mtr) /* in: mtr or NULL */ { dulint trx_id; + XID xid; + + /* Set X/Open XA transaction identification to NULL */ + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id); @@ -579,10 +602,10 @@ trx_undo_parse_page_header( if (page) { if (type == MLOG_UNDO_HDR_CREATE) { - trx_undo_header_create(page, trx_id, mtr); + trx_undo_header_create(page, trx_id, &xid, mtr); } else { ut_ad(type == MLOG_UNDO_HDR_REUSE); - trx_undo_insert_header_reuse(page, trx_id, mtr); + trx_undo_insert_header_reuse(page, trx_id, &xid, mtr); } } @@ -599,6 +622,7 @@ trx_undo_insert_header_reuse( page_t* undo_page, /* in: insert undo log segment header page, x-latched */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA transaction identification */ mtr_t* mtr) /* in: mtr */ { trx_upagef_t* page_hdr; @@ -636,8 +660,18 @@ trx_undo_insert_header_reuse( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); + /* If an X/Open XID exists, we store it + in the log header. 
+
+	if (xid && xid->formatID != -1) {
+		mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, TRUE);
+		trx_undo_write_xid(log_hdr, xid);
+	} else {
+		mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+	}
+
+	mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
 	trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
 	return(free);
@@ -718,6 +752,52 @@ trx_undo_discard_latest_update_undo(
 }
 /************************************************************************
+Write X/Open XA Transaction Identification (XID) to undo log header */
+
+void
+trx_undo_write_xid(
+/*===============*/
+	trx_ulogf_t*	log_hdr,/* in: undo log header */
+	XID*		xid)	/* in: X/Open XA Transaction Identification */
+{
+	ulint	i;
+
+	mach_write_to_4(log_hdr + TRX_UNDO_XA_FORMAT, xid->formatID);
+
+	mach_write_to_4(log_hdr + TRX_UNDO_XA_TRID_LEN, xid->gtrid_length);
+
+	mach_write_to_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN, xid->bqual_length);
+
+	for (i = 0; i < XIDDATASIZE; i++) {
+		mach_write_to_1(log_hdr + TRX_UNDO_XA_XID + i,
+				(ulint)(xid->data[i]));
+	}
+}
+
+/************************************************************************
+Read X/Open XA Transaction Identification (XID) from undo log header */
+
+void
+trx_undo_read_xid(
+/*==============*/
+	trx_ulogf_t*	log_hdr,/* in: undo log header */
+	XID*		xid)	/* out: X/Open XA Transaction Identification */
+{
+	ulint	i;
+
+	xid->formatID = mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+
+	xid->gtrid_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+
+	xid->bqual_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+	for (i = 0; i < XIDDATASIZE; i++) {
+		xid->data[i] = (char) mach_read_from_1(log_hdr +
+						TRX_UNDO_XA_XID + i);
+	}
+}
+
+/************************************************************************
 Tries to add a page to the undo log segment where the undo log is placed. */
 ulint
@@ -800,7 +880,6 @@ trx_undo_free_page(
 			list */
 	ulint	space,		/* in: space */
 	ulint	hdr_page_no,	/* in: header page number */
-	ulint	hdr_offset,	/* in: header offset */
 	ulint	page_no,	/* in: page number to free: must not be the
 				header page */
 	mtr_t*	mtr)	/* in: mtr which does not have a latch to any
@@ -813,7 +892,6 @@ trx_undo_free_page(
 	trx_rsegf_t*	rseg_header;
 	ulint		hist_size;
-
-	UT_NOT_USED(hdr_offset);
 	ut_a(hdr_page_no != page_no);
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(!mutex_own(&kernel_mutex));
@@ -870,8 +948,7 @@ trx_undo_free_page_in_rollback(
 #endif /* UNIV_SYNC_DEBUG */
 	last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space,
-					undo->hdr_page_no, undo->hdr_offset,
-					page_no, mtr);
+					undo->hdr_page_no, page_no, mtr);
 	undo->last_page_no = last_page_no;
 	undo->size--;
@@ -1039,7 +1116,7 @@ loop:
 		trx_undo_empty_header_page(space, hdr_page_no, hdr_offset,
 								&mtr);
 	} else {
-		trx_undo_free_page(rseg, TRUE, space, hdr_page_no, hdr_offset,
+		trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
 							page_no, &mtr);
 	}
@@ -1123,7 +1200,9 @@ trx_undo_mem_create_at_db_start(
 	fil_addr_t	last_addr;
 	page_t*		last_page;
 	trx_undo_rec_t*	rec;
-
+	XID		xid;
+	ibool		xid_exists = FALSE;
+
 	if (id >= TRX_RSEG_N_SLOTS) {
 		fprintf(stderr,
 		"InnoDB: Error: undo->id is %lu\n", (ulong) id);
@@ -1145,15 +1224,31 @@ trx_undo_mem_create_at_db_start(
 	undo_header = undo_page + offset;
 	trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr);
+
+	xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+				MLOG_1BYTE, mtr);
+
+	/* Read the X/Open XA transaction identification if it exists, or
+	set it to NULL.
*/ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + mutex_enter(&(rseg->mutex)); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); mutex_exit(&(rseg->mutex)); - undo->dict_operation = mtr_read_ulint( - undo_header + TRX_UNDO_DICT_OPERATION, - MLOG_2BYTES, mtr); + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, + MLOG_1BYTE, mtr); + undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr); undo->state = state; undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); @@ -1272,7 +1367,8 @@ trx_undo_mem_create( ulint type, /* in: type of the log: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log - is created */ + is created */ + XID* xid, /* in: X/Open transaction identification */ ulint page_no,/* in: undo log header page number */ ulint offset) /* in: undo log header byte offset on page */ { @@ -1295,6 +1391,7 @@ trx_undo_mem_create( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1322,6 +1419,7 @@ trx_undo_mem_init_for_reuse( trx_undo_t* undo, /* in: undo log to init */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification*/ ulint offset) /* in: undo log header byte offset on page */ { #ifdef UNIV_SYNC_DEBUG @@ -1339,6 +1437,7 @@ trx_undo_mem_init_for_reuse( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1376,6 +1475,7 @@ trx_undo_create( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open transaction identification*/ mtr_t* mtr) /* in: mtr */ { trx_rsegf_t* rseg_header; @@ -1410,9 +1510,10 @@ trx_undo_create( page_no = buf_frame_get_page_no(undo_page); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + offset = trx_undo_header_create(undo_page, trx_id, xid, mtr); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); + undo = trx_undo_mem_create(rseg, id, type, trx_id, xid , + page_no, offset); return(undo); } @@ -1432,6 +1533,7 @@ trx_undo_reuse_cached( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is used */ + XID* xid, /* in: X/Open XA transaction identification*/ mtr_t* mtr) /* in: mtr */ { trx_undo_t* undo; @@ -1475,16 +1577,17 @@ trx_undo_reuse_cached( undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); if (type == TRX_UNDO_INSERT) { - offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + offset = trx_undo_insert_header_reuse(undo_page, trx_id, + xid, mtr); } else { ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + offset = trx_undo_header_create(undo_page, trx_id, xid, mtr); } - trx_undo_mem_init_for_reuse(undo, trx_id, offset); + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); return(undo); } @@ -1506,9 +1609,10 @@ trx_undo_mark_as_dict_operation( hdr_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - mlog_write_ulint(hdr_page + undo->hdr_offset + TRX_UNDO_DICT_OPERATION, - trx->dict_operation, MLOG_2BYTES, mtr); - + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + 
trx->dict_operation, MLOG_1BYTE, mtr);
+
+
 	mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID,
 			trx->table_id, mtr);
@@ -1548,10 +1652,10 @@ trx_undo_assign_undo(
 #endif /* UNIV_SYNC_DEBUG */
 	mutex_enter(&(rseg->mutex));
-	undo = trx_undo_reuse_cached(rseg, type, trx->id, &mtr);
+	undo = trx_undo_reuse_cached(rseg, type, trx->id, &trx->xid, &mtr);
 	if (undo == NULL) {
-		undo = trx_undo_create(rseg, type, trx->id, &mtr);
+		undo = trx_undo_create(rseg, type, trx->id, &trx->xid, &mtr);
 		if (undo == NULL) {
 			/* Did not succeed */
@@ -1632,6 +1736,56 @@ trx_undo_set_state_at_finish(
 	return(undo_page);
 }
+/**********************************************************************
+Sets the state of the undo log segment at a transaction prepare. */
+
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+			/* out: undo log segment header page,
+			x-latched */
+	trx_t*		trx,	/* in: transaction */
+	trx_undo_t*	undo,	/* in: undo log memory copy */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	trx_usegf_t*	seg_hdr;
+	trx_upagef_t*	page_hdr;
+	trx_ulogf_t*	undo_header;
+	page_t*		undo_page;
+	ulint		offset;
+
+	ut_ad(trx && undo && mtr);
+
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption((byte*)undo);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr);
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+	/*------------------------------*/
+	undo->state = TRX_UNDO_PREPARED;
+	undo->xid = trx->xid;
+	/*------------------------------*/
+
+	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+			MLOG_2BYTES, mtr);
+
+	offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+	undo_header = undo_page + offset;
+
+	mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+			TRUE, MLOG_1BYTE, mtr);
+
+	trx_undo_write_xid(undo_header, &undo->xid);
+	return(undo_page);
+}
+
 /**************************************************************************
 Adds the update undo log header as the first in the history list, and frees
 the memory object, or puts it to the list of cached update undo log
diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c
index 732380bcb1f..f35b4dea5e0 100644
--- a/innobase/ut/ut0ut.c
+++ b/innobase/ut/ut0ut.c
@@ -74,6 +74,21 @@ ut_time(void)
 }
 /**************************************************************
+Returns system time in seconds and microseconds since the Epoch. */
+
+void
+ut_usectime(
+/*========*/
+	ulint*	sec,	/* out: seconds since the Epoch */
+	ulint*	ms)	/* out: microseconds since the Epoch + *sec */
+{
+	struct timeval	tv;
+	gettimeofday(&tv, NULL);
+	*sec = (ulint) tv.tv_sec;
+	*ms = (ulint) tv.tv_usec;
+}
+
+/**************************************************************
 Returns the difference of two times in seconds. */
 double |
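The ut_usectime() added above only wraps gettimeofday(); the sketch below is purely illustrative and not part of the patch, showing one way its two out-parameters could be combined into an elapsed time. The helper name ut_usectime_elapsed_demo is hypothetical.

/* Illustrative sketch only, not part of the patch: combining the two
out-parameters of ut_usectime() into an elapsed time in seconds. Note
that the second out-parameter holds microseconds despite being named
'ms' in the patch. */

static double
ut_usectime_elapsed_demo(void)
{
	ulint	start_sec;
	ulint	start_usec;
	ulint	end_sec;
	ulint	end_usec;

	ut_usectime(&start_sec, &start_usec);

	/* ... the work to be timed would go here ... */

	ut_usectime(&end_sec, &end_usec);

	return((double) end_sec - (double) start_sec
		+ ((double) end_usec - (double) start_usec) / 1000000.0);
}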