Diffstat (limited to 'innobase')
95 files changed, 7365 insertions, 2684 deletions
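The hunks below repeatedly convert call sites from the old single-argument record accessors (rec_get_nth_field(rec, n, &len), rec_get_size(rec)) to an offsets-based API, passing an array produced by rec_get_offsets() together with the index descriptor. The fragment below is a minimal sketch of that calling pattern as it appears throughout this patch, not part of the patch itself; rec, next_rec and index stand for whatever record and index descriptor the caller already holds, and the call signatures are taken from the calls visible in the diff rather than from any header.

	/* Sketch only: the offsets calling pattern these hunks introduce. */
	mem_heap_t*	heap	= mem_heap_create(100);
	ulint*		offsets	= rec_get_offsets(rec, index,
						ULINT_UNDEFINED, heap);
	byte*		field;
	ulint		len;

	/* A node pointer record stores the child page number
	in its last field. */
	field = rec_get_nth_field(rec, offsets,
				rec_offs_n_fields(offsets) - 1, &len);

	/* Reuse the same offsets buffer for another record on the page
	instead of allocating a new one. */
	offsets = rec_reget_offsets(next_rec, index, offsets,
				ULINT_UNDEFINED, heap);

	mem_heap_free(heap);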
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index ae967e0525e..c911124e705 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -86,15 +86,6 @@ btr_page_create( page_t* page, /* in: page to be created */ dict_tree_t* tree, /* in: index tree */ mtr_t* mtr); /* in: mtr */ -/****************************************************************** -Sets the child node file address in a node pointer. */ -UNIV_INLINE -void -btr_node_ptr_set_child_page_no( -/*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr); /* in: mtr */ /**************************************************************** Returns the upper level node pointer to a page. It is assumed that mtr holds an x-latch on the tree. */ @@ -128,7 +119,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple should be first */ - dtuple_t* tuple); /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap); /* in: temporary memory heap */ /****************************************************************** Gets the root node of a tree and x-latches it. */ @@ -143,11 +137,13 @@ btr_root_get( ulint space; ulint root_page_no; page_t* root; + ibool comp = UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp; space = dict_tree_get_space(tree); root_page_no = dict_tree_get_page(tree); root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(root) == comp); return(root); } @@ -194,6 +190,7 @@ btr_get_prev_user_rec( MTR_MEMO_PAGE_S_FIX)) || (mtr_memo_contains(mtr, buf_block_align(prev_page), MTR_MEMO_PAGE_X_FIX))); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); prev_rec = page_rec_get_prev(page_get_supremum_rec(prev_page)); @@ -246,6 +243,7 @@ btr_get_next_user_rec( || (mtr_memo_contains(mtr, buf_block_align(next_page), MTR_MEMO_PAGE_X_FIX))); + ut_a(page_is_comp(next_page) == page_is_comp(page)); next_rec = page_rec_get_next(page_get_infimum_rec(next_page)); return(next_rec); @@ -267,7 +265,8 @@ btr_page_create( { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - page_create(page, mtr); + page_create(page, mtr, + UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, tree->id, mtr); @@ -503,20 +502,21 @@ UNIV_INLINE void btr_node_ptr_set_child_page_no( /*===========================*/ - rec_t* rec, /* in: node pointer record */ - ulint page_no, /* in: child node address */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint page_no,/* in: child node address */ + mtr_t* mtr) /* in: mtr */ { - ulint n_fields; byte* field; ulint len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr)); - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); @@ -529,16 +529,18 @@ static page_t* btr_node_ptr_get_child( /*===================*/ - /* out: child page, x-latched */ - rec_t* node_ptr, /* in: node pointer */ - mtr_t* mtr) /* in: mtr */ + /* out: child page, x-latched 
*/ + rec_t* node_ptr,/* in: node pointer */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr */ { ulint page_no; ulint space; page_t* page; - + + ut_ad(rec_offs_validate(node_ptr, NULL, offsets)); space = buf_frame_get_space_id(node_ptr); - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); page = btr_page_get(space, page_no, RW_X_LATCH, mtr); @@ -564,6 +566,8 @@ btr_page_get_father_for_rec( dtuple_t* tuple; btr_cur_t cursor; rec_t* node_ptr; + dict_index_t* index; + ulint* offsets; ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -576,18 +580,20 @@ btr_page_get_father_for_rec( tuple = dict_tree_build_node_ptr(tree, user_rec, 0, heap, btr_page_get_level(page, mtr)); + index = UT_LIST_GET_FIRST(tree->tree_indexes); /* In the following, we choose just any index from the tree as the first parameter for btr_cur_search_to_nth_level. */ - - btr_cur_search_to_nth_level(UT_LIST_GET_FIRST(tree->tree_indexes), + + btr_cur_search_to_nth_level(index, btr_page_get_level(page, mtr) + 1, tuple, PAGE_CUR_LE, BTR_CONT_MODIFY_TREE, &cursor, 0, mtr); node_ptr = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, ULINT_UNDEFINED, heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page)) { fputs("InnoDB: Dump of the child page:\n", stderr); buf_page_print(buf_frame_align(page)); @@ -595,17 +601,22 @@ btr_page_get_father_for_rec( buf_page_print(buf_frame_align(node_ptr)); fputs("InnoDB: Corruption of an index tree: table ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->table_name); + ut_print_name(stderr, NULL, index->table_name); fputs(", index ", stderr); - ut_print_name(stderr, NULL, - UT_LIST_GET_FIRST(tree->tree_indexes)->name); + ut_print_name(stderr, NULL, index->name); fprintf(stderr, ",\n" "InnoDB: father ptr page no %lu, child page no %lu\n", - (ulong) btr_node_ptr_get_child_page_no(node_ptr), + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), (ulong) buf_frame_get_page_no(page)); - page_rec_print(page_rec_get_next(page_get_infimum_rec(page))); - page_rec_print(node_ptr); + offsets = rec_reget_offsets(page_rec_get_next( + page_get_infimum_rec(page)), index, + offsets, ULINT_UNDEFINED, heap); + page_rec_print(page_rec_get_next(page_get_infimum_rec(page)), + offsets); + offsets = rec_reget_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, heap); + page_rec_print(node_ptr, offsets); fputs( "InnoDB: You should dump + drop + reimport the table to fix the\n" @@ -614,7 +625,7 @@ btr_page_get_father_for_rec( "InnoDB: forcing recovery. 
Then dump + drop + reimport.\n", stderr); } - ut_a(btr_node_ptr_get_child_page_no(node_ptr) == + ut_a(btr_node_ptr_get_child_page_no(node_ptr, offsets) == buf_frame_get_page_no(page)); mem_heap_free(heap); @@ -649,6 +660,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr) /* in: mini-transaction handle */ { ulint page_no; @@ -716,7 +728,7 @@ btr_create( } /* Create a new index page on the the allocated segment page */ - page = page_create(frame, mtr); + page = page_create(frame, mtr, comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ @@ -821,12 +833,14 @@ static void btr_page_reorganize_low( /*====================*/ - ibool recovery,/* in: TRUE if called in recovery: locks should not - be updated, i.e., there cannot exist locks on the - page, and a hash index should not be dropped: it - cannot exist */ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + ibool recovery,/* in: TRUE if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_t* new_page; ulint log_mode; @@ -841,7 +855,9 @@ btr_page_reorganize_low( max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); /* Write the log record */ - mlog_write_initial_log_record(page, MLOG_PAGE_REORGANIZE, mtr); + mlog_open_and_write_index(mtr, page, index, index->table->comp + ? MLOG_COMP_PAGE_REORGANIZE + : MLOG_PAGE_REORGANIZE, 0); /* Turn logging off */ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); @@ -858,14 +874,14 @@ btr_page_reorganize_low( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, index->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ page_copy_rec_list_end_no_locks(page, new_page, - page_get_infimum_rec(new_page), mtr); + page_get_infimum_rec(new_page), index, mtr); /* Copy max trx id to recreated page */ page_set_max_trx_id(page, page_get_max_trx_id(new_page)); @@ -901,10 +917,11 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - btr_page_reorganize_low(FALSE, page, mtr); + btr_page_reorganize_low(FALSE, page, index, mtr); } /*************************************************************** @@ -913,18 +930,20 @@ Parses a redo log record of reorganizing a page. 
*/ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr __attribute__((unused)), /* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), + /* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ut_ad(ptr && end_ptr); /* The record is empty, except for the record initial part */ if (page) { - btr_page_reorganize_low(TRUE, page, mtr); + btr_page_reorganize_low(TRUE, page, index, mtr); } return(ptr); @@ -946,7 +965,7 @@ btr_page_empty( /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) is preserved intact */ - page_create(page, mtr); + page_create(page, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; } @@ -1011,7 +1030,7 @@ btr_root_raise_and_insert( /* Move the records from root to the new page */ page_move_rec_list_end(new_page, root, page_get_infimum_rec(root), - mtr); + cursor->index, mtr); /* If this is a pessimistic insert which is actually done to perform a pessimistic update then we have stored the lock information of the record to be inserted on the infimum of the @@ -1031,7 +1050,7 @@ btr_root_raise_and_insert( node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level); /* Reorganize the root to get free space */ - btr_page_reorganize(root, mtr); + btr_page_reorganize(root, cursor->index, mtr); page_cursor = btr_cur_get_page_cur(cursor); @@ -1039,7 +1058,8 @@ btr_root_raise_and_insert( page_cur_set_before_first(root, page_cursor); - node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, mtr); + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + cursor->index, mtr); ut_ad(node_ptr_rec); @@ -1047,7 +1067,7 @@ btr_root_raise_and_insert( as there is no lower alphabetical limit to records in the leftmost node of a level: */ - btr_set_min_rec_mark(node_ptr_rec, mtr); + btr_set_min_rec_mark(node_ptr_rec, cursor->index->table->comp, mtr); /* Free the memory heap */ mem_heap_free(heap); @@ -1060,7 +1080,8 @@ btr_root_raise_and_insert( ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), new_page); /* Reposition the cursor to the child node */ - page_cur_search(new_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(new_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); /* Split the child and insert tuple */ return(btr_page_split_and_insert(cursor, tuple, mtr)); @@ -1190,11 +1211,13 @@ btr_page_get_sure_split_rec( rec_t* rec; rec_t* next_rec; ulint n; - + mem_heap_t* heap; + ulint* offsets; + page = btr_cur_get_page(cursor); - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + insert_size = rec_get_converted_size(cursor->index, tuple); + free_space = page_get_free_space_of_empty(cursor->index->table->comp); /* free_space is now the free space of a created new page */ @@ -1208,6 +1231,9 @@ btr_page_get_sure_split_rec( ins_rec = btr_cur_get_rec(cursor); rec = page_get_infimum_rec(page); + heap = mem_heap_create(100); + offsets = NULL; + /* We start to include records to the left half, and when the space reserved by them exceeds half of total_space, then if the included records fit on the left page, they will be put there @@ -1230,7 +1256,9 @@ btr_page_get_sure_split_rec( /* Include tuple */ 
incl_data += insert_size; } else { - incl_data += rec_get_size(rec); + offsets = rec_reget_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, heap); + incl_data += rec_offs_size(offsets); } n++; @@ -1252,11 +1280,12 @@ btr_page_get_sure_split_rec( next_rec = page_rec_get_next(rec); } if (next_rec != page_get_supremum_rec(page)) { - + mem_heap_free(heap); return(next_rec); } } + mem_heap_free(heap); return(rec); } } @@ -1275,7 +1304,10 @@ btr_page_insert_fits( rec_t* split_rec, /* in: suggestion for first record on upper half-page, or NULL if tuple to be inserted should be first */ - dtuple_t* tuple) /* in: tuple to insert */ + const ulint* offsets, /* in: rec_get_offsets( + split_rec, cursor->index) */ + dtuple_t* tuple, /* in: tuple to insert */ + mem_heap_t* heap) /* in: temporary memory heap */ { page_t* page; ulint insert_size; @@ -1284,11 +1316,19 @@ btr_page_insert_fits( ulint total_n_recs; rec_t* rec; rec_t* end_rec; + ulint* offs; page = btr_cur_get_page(cursor); - - insert_size = rec_get_converted_size(tuple); - free_space = page_get_free_space_of_empty(); + + ut_ad(!split_rec == !offsets); + ut_ad(!offsets + || cursor->index->table->comp == rec_offs_comp(offsets)); + ut_ad(!offsets + || rec_offs_validate(split_rec, cursor->index, offsets)); + ut_ad(page_is_comp(page) == cursor->index->table->comp); + + insert_size = rec_get_converted_size(cursor->index, tuple); + free_space = page_get_free_space_of_empty(cursor->index->table->comp); /* free_space is now the free space of a created new page */ @@ -1303,7 +1343,7 @@ btr_page_insert_fits( rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); - } else if (cmp_dtuple_rec(tuple, split_rec) >= 0) { + } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { rec = page_rec_get_next(page_get_infimum_rec(page)); end_rec = split_rec; @@ -1321,11 +1361,16 @@ btr_page_insert_fits( return(TRUE); } + offs = NULL; + while (rec != end_rec) { /* In this loop we calculate the amount of reserved space after rec is removed from page. 
*/ - total_data -= rec_get_size(rec); + offs = rec_reget_offsets(rec, cursor->index, offs, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(offs); total_n_recs--; if (total_data + page_dir_calc_reserved_space(total_n_recs) @@ -1411,6 +1456,10 @@ btr_attach_half_pages( MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page), MTR_MEMO_PAGE_X_FIX)); + ut_a(page_is_comp(page) == page_is_comp(new_page)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(100); /* Based on split direction, decide upper and lower pages */ if (direction == FSP_DOWN) { @@ -1426,7 +1475,12 @@ btr_attach_half_pages( /* Replace the address of the old child node (= page) with the address of the new lower half */ - btr_node_ptr_set_child_page_no(node_ptr, lower_page_no, mtr); + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, + UT_LIST_GET_FIRST(tree->tree_indexes), + ULINT_UNDEFINED, heap), + lower_page_no, mtr); + mem_heap_empty(heap); } else { lower_page_no = buf_frame_get_page_no(page); upper_page_no = buf_frame_get_page_no(new_page); @@ -1434,9 +1488,6 @@ btr_attach_half_pages( upper_page = new_page; } - /* Create a memory heap where the data tuple is stored */ - heap = mem_heap_create(100); - /* Get the level of the split pages */ level = btr_page_get_level(page, mtr); @@ -1465,6 +1516,7 @@ btr_attach_half_pages( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, lower_page_no, mtr); } @@ -1472,6 +1524,7 @@ btr_attach_half_pages( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, upper_page_no, mtr); } @@ -1522,7 +1575,15 @@ btr_page_split_and_insert( ibool insert_will_fit; ulint n_iterations = 0; rec_t* rec; + mem_heap_t* heap; + ulint n_uniq; + ulint* offsets; + + heap = mem_heap_create(1024); + n_uniq = dict_index_get_n_unique_in_tree(cursor->index); func_start: + mem_heap_empty(heap); + offsets = NULL; tree = btr_cur_get_tree(cursor); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), @@ -1574,9 +1635,10 @@ func_start: first_rec = split_rec; move_limit = split_rec; } else { - buf = mem_alloc(rec_get_converted_size(tuple)); + buf = mem_alloc(rec_get_converted_size(cursor->index, tuple)); - first_rec = rec_convert_dtuple_to_rec(buf, tuple); + first_rec = rec_convert_dtuple_to_rec(buf, + cursor->index, tuple); move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); } @@ -1593,7 +1655,16 @@ func_start: We can then move the records after releasing the tree latch, thus reducing the tree latch contention. 
*/ - insert_will_fit = btr_page_insert_fits(cursor, split_rec, tuple); + if (split_rec) { + offsets = rec_reget_offsets(split_rec, cursor->index, + offsets, n_uniq, heap); + + insert_will_fit = btr_page_insert_fits(cursor, + split_rec, offsets, tuple, heap); + } else { + insert_will_fit = btr_page_insert_fits(cursor, + NULL, NULL, tuple, heap); + } if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) { @@ -1605,7 +1676,8 @@ func_start: if (direction == FSP_DOWN) { /* fputs("Split left\n", stderr); */ - page_move_rec_list_start(new_page, page, move_limit, mtr); + page_move_rec_list_start(new_page, page, move_limit, + cursor->index, mtr); left_page = new_page; right_page = page; @@ -1613,7 +1685,8 @@ func_start: } else { /* fputs("Split right\n", stderr); */ - page_move_rec_list_end(new_page, page, move_limit, mtr); + page_move_rec_list_end(new_page, page, move_limit, + cursor->index, mtr); left_page = page; right_page = new_page; @@ -1626,19 +1699,25 @@ func_start: if (split_rec == NULL) { insert_page = right_page; - } else if (cmp_dtuple_rec(tuple, first_rec) >= 0) { - - insert_page = right_page; } else { - insert_page = left_page; + offsets = rec_reget_offsets(first_rec, cursor->index, + offsets, n_uniq, heap); + + if (cmp_dtuple_rec(tuple, first_rec, offsets) >= 0) { + + insert_page = right_page; + } else { + insert_page = left_page; + } } /* 7. Reposition the cursor for insert and try insertion */ page_cursor = btr_cur_get_page_cur(cursor); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec != NULL) { /* Insert fit on the page: update the free bits for the @@ -1650,15 +1729,17 @@ func_start: /* fprintf(stderr, "Split and insert done %lu %lu\n", buf_frame_get_page_no(left_page), buf_frame_get_page_no(right_page)); */ + mem_heap_free(heap); return(rec); } /* 8. If insert did not fit, try page reorganization */ - btr_page_reorganize(insert_page, mtr); + btr_page_reorganize(insert_page, cursor->index, mtr); - page_cur_search(insert_page, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + page_cur_search(insert_page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (rec == NULL) { /* The insert did not fit on the page: loop back to the @@ -1688,6 +1769,7 @@ func_start: ut_ad(page_validate(left_page, UT_LIST_GET_FIRST(tree->tree_indexes))); ut_ad(page_validate(right_page, UT_LIST_GET_FIRST(tree->tree_indexes))); + mem_heap_free(heap); return(rec); } @@ -1721,6 +1803,7 @@ btr_level_list_remove( if (prev_page_no != FIL_NULL) { prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(prev_page) == page_is_comp(page)); btr_page_set_next(prev_page, next_page_no, mtr); } @@ -1728,6 +1811,7 @@ btr_level_list_remove( if (next_page_no != FIL_NULL) { next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); btr_page_set_prev(next_page, prev_page_no, mtr); } @@ -1741,9 +1825,11 @@ void btr_set_min_rec_mark_log( /*=====================*/ rec_t* rec, /* in: record */ + ibool comp, /* TRUE=compact record format */ mtr_t* mtr) /* in: mtr */ { - mlog_write_initial_log_record(rec, MLOG_REC_MIN_MARK, mtr); + mlog_write_initial_log_record(rec, + comp ? 
MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr); /* Write rec offset as a 2-byte ulint */ mlog_catenate_ulint(mtr, rec - buf_frame_align(rec), MLOG_2BYTES); @@ -1759,6 +1845,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -1772,7 +1859,7 @@ btr_parse_set_min_rec_mark( if (page) { rec = page + mach_read_from_2(ptr); - btr_set_min_rec_mark(rec, mtr); + btr_set_min_rec_mark(rec, comp, mtr); } return(ptr + 2); @@ -1785,15 +1872,16 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr) /* in: mtr */ { ulint info_bits; - info_bits = rec_get_info_bits(rec); + info_bits = rec_get_info_bits(rec, comp); - rec_set_info_bits(rec, info_bits | REC_INFO_MIN_REC_FLAG); + rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG); - btr_set_min_rec_mark_log(rec, mtr); + btr_set_min_rec_mark_log(rec, comp, mtr); } /***************************************************************** @@ -1842,18 +1930,19 @@ btr_lift_page_up( record from the page should be removed */ mtr_t* mtr) /* in: mtr */ { - rec_t* node_ptr; - page_t* father_page; - ulint page_level; - + page_t* father_page; + ulint page_level; + dict_index_t* index; + ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); - father_page = buf_frame_align(node_ptr); + father_page = buf_frame_align( + btr_page_get_father_node_ptr(tree, page, mtr)); page_level = btr_page_get_level(page, mtr); + index = UT_LIST_GET_FIRST(tree->tree_indexes); btr_search_drop_page_hash_index(page); @@ -1862,7 +1951,7 @@ btr_lift_page_up( /* Move records to the father */ page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page), - mtr); + index, mtr); lock_update_copy_and_discard(father_page, page); btr_page_set_level(father_page, page_level, mtr); @@ -1871,10 +1960,8 @@ btr_lift_page_up( btr_page_free(tree, page, mtr); /* We play safe and reset the free bits for the father */ - ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), - father_page); - ut_ad(page_validate(father_page, - UT_LIST_GET_FIRST(tree->tree_indexes))); + ibuf_reset_free_bits(index, father_page); + ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(tree, father_page, mtr)); } @@ -1914,9 +2001,11 @@ btr_compress( ulint max_ins_size; ulint max_ins_size_reorg; ulint level; - + ibool comp = cursor->index->table->comp; + page = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); + ut_a(comp == page_is_comp(page)); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -1932,7 +2021,9 @@ btr_compress( right_page_no); */ node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); + ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); father_page = buf_frame_align(node_ptr); + ut_a(comp == page_is_comp(father_page)); /* Decide the page to which we try to merge and which will inherit the locks */ @@ -1957,6 +2048,7 @@ btr_compress( n_recs = page_get_n_recs(page); data_size = page_get_data_size(page); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); max_ins_size_reorg = page_get_max_insert_size_after_reorganize( merge_page, n_recs); @@ -1975,7 +2067,7 @@ btr_compress( /* 
We have to reorganize merge_page */ - btr_page_reorganize(merge_page, mtr); + btr_page_reorganize(merge_page, cursor->index, mtr); max_ins_size = page_get_max_insert_size(merge_page, n_recs); @@ -1999,11 +2091,14 @@ btr_compress( if (is_left) { btr_node_ptr_delete(tree, page, mtr); } else { + mem_heap_t* heap = mem_heap_create(100); /* Replace the address of the old child node (= page) with the address of the merge page to the right */ - btr_node_ptr_set_child_page_no(node_ptr, right_page_no, mtr); - + btr_node_ptr_set_child_page_no(node_ptr, + rec_get_offsets(node_ptr, cursor->index, + ULINT_UNDEFINED, heap), right_page_no, mtr); + mem_heap_free(heap); btr_node_ptr_delete(tree, merge_page, mtr); } @@ -2012,14 +2107,14 @@ btr_compress( orig_pred = page_rec_get_prev( page_get_supremum_rec(merge_page)); page_copy_rec_list_start(merge_page, page, - page_get_supremum_rec(page), mtr); + page_get_supremum_rec(page), cursor->index, mtr); lock_update_merge_left(merge_page, orig_pred, page); } else { orig_succ = page_rec_get_next( page_get_infimum_rec(merge_page)); page_copy_rec_list_end(merge_page, page, - page_get_infimum_rec(page), mtr); + page_get_infimum_rec(page), cursor->index, mtr); lock_update_merge_right(orig_succ, page); } @@ -2133,6 +2228,7 @@ btr_discard_page( return; } + ut_a(page_is_comp(merge_page) == page_is_comp(page)); btr_search_drop_page_hash_index(page); if (left_page_no == FIL_NULL && btr_page_get_level(page, mtr) > 0) { @@ -2144,7 +2240,8 @@ btr_discard_page( ut_ad(node_ptr != page_get_supremum_rec(merge_page)); - btr_set_min_rec_mark(node_ptr, mtr); + btr_set_min_rec_mark(node_ptr, + cursor->index->table->comp, mtr); } btr_node_ptr_delete(tree, page, mtr); @@ -2215,6 +2312,8 @@ btr_print_recursive( page_t* page, /* in: index page */ ulint width, /* in: print this many entries from start and end */ + mem_heap_t* heap, /* in: heap for rec_reget_offsets() */ + ulint** offsets,/* in/out: buffer for rec_reget_offsets() */ mtr_t* mtr) /* in: mtr */ { page_cur_t cursor; @@ -2223,14 +2322,16 @@ btr_print_recursive( mtr_t mtr2; rec_t* node_ptr; page_t* child; - + dict_index_t* index; + ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", (ulong) btr_page_get_level(page, mtr), (ulong) buf_frame_get_page_no(page)); - page_print(page, width, width); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + page_print(page, index, width, width); n_recs = page_get_n_recs(page); @@ -2249,15 +2350,20 @@ btr_print_recursive( node_ptr = page_cur_get_rec(&cursor); - child = btr_node_ptr_get_child(node_ptr, &mtr2); - - btr_print_recursive(tree, child, width, &mtr2); + *offsets = rec_reget_offsets(node_ptr, index, + *offsets, ULINT_UNDEFINED, heap); + child = btr_node_ptr_get_child(node_ptr, + *offsets, &mtr2); + btr_print_recursive(tree, child, width, + heap, offsets, &mtr2); mtr_commit(&mtr2); } page_cur_move_to_next(&cursor); i++; } + + mem_heap_free(heap); } /****************************************************************** @@ -2270,8 +2376,10 @@ btr_print_tree( ulint width) /* in: print this many entries from start and end */ { - mtr_t mtr; - page_t* root; + mtr_t mtr; + page_t* root; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; fputs("--------------------------\n" "INDEX TREE PRINT\n", stderr); @@ -2280,7 +2388,8 @@ btr_print_tree( root = btr_root_get(tree, &mtr); - btr_print_recursive(tree, root, width, &mtr); + btr_print_recursive(tree, root, width, heap, &offsets, &mtr); + mem_heap_free(heap); 
mtr_commit(&mtr); @@ -2323,7 +2432,10 @@ btr_check_node_ptr( page_rec_get_next(page_get_infimum_rec(page)), 0, heap, btr_page_get_level(page, mtr)); - ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr) == 0); + ut_a(cmp_dtuple_rec(node_ptr_tuple, node_ptr, + rec_get_offsets(node_ptr, + dict_tree_find_index(tree, node_ptr), + ULINT_UNDEFINED, heap)) == 0); mem_heap_free(heap); @@ -2360,10 +2472,12 @@ btr_index_rec_validate( should print hex dump of record and page on error */ { - ulint len; - ulint n; - ulint i; - page_t* page; + ulint len; + ulint n; + ulint i; + page_t* page; + mem_heap_t* heap; + ulint* offsets; page = buf_frame_align(rec); @@ -2377,10 +2491,10 @@ btr_index_rec_validate( n = dict_index_get_n_fields(index); - if (rec_get_n_fields(rec) != n) { + if (!index->table->comp && rec_get_n_fields_old(rec) != n) { btr_index_rec_validate_report(page, rec, index); fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", - (ulong) rec_get_n_fields(rec), (ulong) n); + (ulong) rec_get_n_fields_old(rec), (ulong) n); if (!dump_on_error) { @@ -2390,23 +2504,27 @@ btr_index_rec_validate( buf_page_print(page); fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); + rec_print_old(stderr, rec); putc('\n', stderr); return(FALSE); } + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + for (i = 0; i < n; i++) { dtype_t* type = dict_index_get_nth_type(index, i); + ulint fixed_size = dtype_get_fixed_size(type); - rec_get_nth_field(rec, i, &len); + rec_get_nth_field(rec, offsets, i, &len); /* Note that prefix indexes are not fixed size even when their type is CHAR. */ if ((dict_index_get_nth_field(index, i)->prefix_len == 0 - && len != UNIV_SQL_NULL && dtype_is_fixed_size(type) - && len != dtype_get_fixed_size(type)) + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) || (dict_index_get_nth_field(index, i)->prefix_len > 0 && len != UNIV_SQL_NULL @@ -2419,20 +2537,22 @@ btr_index_rec_validate( (ulong) i, (ulong) len, (ulong) dtype_get_fixed_size(type)); if (!dump_on_error) { - + mem_heap_free(heap); return(FALSE); } buf_page_print(page); fputs("InnoDB: corrupt record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); putc('\n', stderr); + mem_heap_free(heap); return(FALSE); } } + mem_heap_free(heap); return(TRUE); } @@ -2527,15 +2647,18 @@ btr_validate_level( page_t* right_father_page; rec_t* node_ptr; rec_t* right_node_ptr; + rec_t* rec; ulint right_page_no; ulint left_page_no; page_cur_t cursor; - mem_heap_t* heap; dtuple_t* node_ptr_tuple; ibool ret = TRUE; dict_index_t* index; mtr_t mtr; - + mem_heap_t* heap = mem_heap_create(256); + ulint* offsets = NULL; + ulint* offsets2= NULL; + mtr_start(&mtr); mtr_x_lock(dict_tree_get_lock(tree), &mtr); @@ -2544,6 +2667,8 @@ btr_validate_level( space = buf_frame_get_space_id(page); + index = UT_LIST_GET_FIRST(tree->tree_indexes); + while (level != btr_page_get_level(page, &mtr)) { ut_a(btr_page_get_level(page, &mtr) > 0); @@ -2552,14 +2677,16 @@ btr_validate_level( page_cur_move_to_next(&cursor); node_ptr = page_cur_get_rec(&cursor); - page = btr_node_ptr_get_child(node_ptr, &mtr); + offsets = rec_reget_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, heap); + page = btr_node_ptr_get_child(node_ptr, offsets, &mtr); } - index = UT_LIST_GET_FIRST(tree->tree_indexes); - /* Now we are on the desired level. Loop through the pages on that level. 
*/ loop: + mem_heap_empty(heap); + offsets = offsets2 = NULL; mtr_x_lock(dict_tree_get_lock(tree), &mtr); /* Check ordering etc. of records */ @@ -2588,12 +2715,20 @@ loop: (buf_frame_get_page_no(page) == dict_tree_get_page(tree)))); if (right_page_no != FIL_NULL) { - + rec_t* right_rec; right_page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); - if (cmp_rec_rec(page_rec_get_prev(page_get_supremum_rec(page)), - page_rec_get_next(page_get_infimum_rec(right_page)), - UT_LIST_GET_FIRST(tree->tree_indexes)) >= 0) { + ut_a(page_is_comp(right_page) == page_is_comp(page)); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + offsets2 = rec_reget_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, heap); + if (cmp_rec_rec(rec, right_rec, offsets, offsets2, + dict_index_get_n_fields(index), + index) >= 0) { btr_validate_report2(index, level, page, right_page); @@ -2604,12 +2739,17 @@ loop: buf_page_print(right_page); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_prev( - page_get_supremum_rec(page))); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + rec_print(stderr, rec, offsets); putc('\n', stderr); fputs("InnoDB: record ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(right_page))); + rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + rec_print(stderr, rec, offsets); putc('\n', stderr); ret = FALSE; @@ -2618,7 +2758,8 @@ loop: if (level > 0 && left_page_no == FIL_NULL) { ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( - page_rec_get_next(page_get_infimum_rec(page)))); + page_rec_get_next(page_get_infimum_rec(page)), + index->table->comp)); } if (buf_frame_get_page_no(page) != dict_tree_get_page(tree)) { @@ -2627,12 +2768,14 @@ loop: node_ptr = btr_page_get_father_node_ptr(tree, page, &mtr); father_page = buf_frame_align(node_ptr); + offsets = rec_reget_offsets(node_ptr, index, + offsets, ULINT_UNDEFINED, heap); - if (btr_node_ptr_get_child_page_no(node_ptr) != + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != buf_frame_get_page_no(page) || node_ptr != btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)) { + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr)) { btr_validate_report1(index, level, page); fputs("InnoDB: node pointer to the page is wrong\n", @@ -2642,17 +2785,20 @@ loop: buf_page_print(page); fputs("InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print(stderr, node_ptr, offsets); fprintf(stderr, "\n" "InnoDB: node ptr child page n:o %lu\n", - (unsigned long) btr_node_ptr_get_child_page_no(node_ptr)); + (unsigned long) btr_node_ptr_get_child_page_no( + node_ptr, offsets)); fputs("InnoDB: record on page ", stderr); - rec_print(stderr, - btr_page_get_father_for_rec(tree, page, - page_rec_get_prev(page_get_supremum_rec(page)), - &mtr)); + rec = btr_page_get_father_for_rec(tree, page, + page_rec_get_prev(page_get_supremum_rec(page)), + &mtr); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + rec_print(stderr, rec, offsets); putc('\n', stderr); ret = FALSE; @@ -2660,7 +2806,8 @@ loop: } if (btr_page_get_level(page, &mtr) > 0) { - heap = mem_heap_create(256); + offsets = rec_reget_offsets(node_ptr, index, + 
offsets, ULINT_UNDEFINED, heap); node_ptr_tuple = dict_tree_build_node_ptr( tree, @@ -2669,7 +2816,10 @@ loop: 0, heap, btr_page_get_level(page, &mtr)); - if (cmp_dtuple_rec(node_ptr_tuple, node_ptr) != 0) { + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); btr_validate_report1(index, level, page); @@ -2679,18 +2829,16 @@ loop: fputs("InnoDB: Error: node ptrs differ" " on levels > 0\n" "InnoDB: node ptr ", stderr); - rec_print(stderr, node_ptr); + rec_print(stderr, node_ptr, offsets); fputs("InnoDB: first rec ", stderr); - rec_print(stderr, page_rec_get_next( - page_get_infimum_rec(page))); + offsets = rec_reget_offsets(first_rec, index, + offsets, ULINT_UNDEFINED, heap); + rec_print(stderr, first_rec, offsets); putc('\n', stderr); ret = FALSE; - mem_heap_free(heap); goto node_ptr_fails; } - - mem_heap_free(heap); } if (left_page_no == FIL_NULL) { @@ -2701,7 +2849,7 @@ loop: if (right_page_no == FIL_NULL) { ut_a(node_ptr == page_rec_get_prev( - page_get_supremum_rec(father_page))); + page_get_supremum_rec(father_page))); ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); } @@ -2771,13 +2919,16 @@ node_ptr_fails: mtr_commit(&mtr); if (right_page_no != FIL_NULL) { + ibool comp = page_is_comp(page); mtr_start(&mtr); page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr); + ut_a(page_is_comp(page) == comp); goto loop; } + mem_heap_free(heap); return(ret); } diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 48de5644908..f5e146172ed 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -73,8 +73,9 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*********************************************************************** Adds path information to the cursor for the current page, for which the binary search has been performed. 
*/ @@ -96,6 +97,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -108,9 +110,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec); /* in: record */ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*==================== B-TREE SEARCH =========================*/ @@ -137,11 +140,13 @@ btr_cur_latch_leaves( if (latch_mode == BTR_SEARCH_LEAF) { get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_LEAF) { get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_TREE) { @@ -152,11 +157,13 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { get_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; right_page_no = btr_page_get_next(page, mtr); @@ -176,11 +183,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_PREV) { @@ -191,11 +201,14 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(cursor->left_page) == + page_is_comp(page)); buf_block_align( cursor->left_page)->check_index_page_at_flush = TRUE; } get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(get_page) == page_is_comp(page)); buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else { ut_error; @@ -261,6 +274,8 @@ btr_cur_search_to_nth_level( #ifdef BTR_CUR_ADAPT btr_search_t* info; #endif + mem_heap_t* heap; + ulint* offsets; /* Currently, PAGE_CUR_LE is the only search mode used for searches ending to upper levels */ @@ -379,7 +394,9 @@ btr_cur_search_to_nth_level( page_mode = mode; break; } - + + heap = mem_heap_create(100); + offsets = NULL; /* Loop and search until we arrive at the desired level */ for (;;) { @@ -414,7 +431,7 @@ retry_page_get: cursor->thr)) { /* Insertion to the insert buffer succeeded */ cursor->flag = BTR_CUR_INSERT_TO_IBUF; - + mem_heap_free(heap); return; } @@ -470,9 +487,9 @@ retry_page_get: page_mode = mode; } - page_cur_search_with_match(page, tuple, page_mode, &up_match, - &up_bytes, &low_match, &low_bytes, - page_cursor); + 
page_cur_search_with_match(page, index, tuple, page_mode, + &up_match, &up_bytes, + &low_match, &low_bytes, page_cursor); if (estimate) { btr_cur_add_path_info(cursor, height, root_height); } @@ -486,7 +503,9 @@ retry_page_get: if (level > 0) { /* x-latch the page */ - btr_page_get(space, page_no, RW_X_LATCH, mtr); + ut_a(page_is_comp(btr_page_get(space, + page_no, RW_X_LATCH, mtr)) + == index->table->comp); } break; @@ -498,11 +517,14 @@ retry_page_get: guess = NULL; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_reget_offsets(node_ptr, cursor->index, + offsets, ULINT_UNDEFINED, heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); } + mem_heap_free(heap); + if (level == 0) { cursor->low_match = low_match; cursor->low_bytes = low_bytes; @@ -552,6 +574,8 @@ btr_cur_open_at_index_side( rec_t* node_ptr; ulint estimate; ulint savepoint; + mem_heap_t* heap; + ulint* offsets = NULL; estimate = latch_mode & BTR_ESTIMATE; latch_mode = latch_mode & ~BTR_ESTIMATE; @@ -576,7 +600,8 @@ btr_cur_open_at_index_side( page_no = dict_tree_get_page(tree); height = ULINT_UNDEFINED; - + heap = mem_heap_create(100); + for (;;) { page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL, BUF_GET, @@ -645,10 +670,13 @@ btr_cur_open_at_index_side( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_reget_offsets(node_ptr, cursor->index, + offsets, ULINT_UNDEFINED, heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); } + + mem_heap_free(heap); } /************************************************************************** @@ -669,6 +697,8 @@ btr_cur_open_at_rnd_pos( ulint space; ulint height; rec_t* node_ptr; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; tree = index->tree; @@ -717,10 +747,13 @@ btr_cur_open_at_rnd_pos( height--; node_ptr = page_cur_get_rec(page_cursor); - + offsets = rec_reget_offsets(node_ptr, cursor->index, + offsets, ULINT_UNDEFINED, heap); /* Go to the child node */ - page_no = btr_node_ptr_get_child_page_no(node_ptr); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); } + + mem_heap_free(heap); } /*==================== B-TREE INSERT =========================*/ @@ -758,18 +791,20 @@ btr_cur_insert_if_possible( page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); if (!rec) { /* If record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, cursor->index, mtr); *reorg = TRUE; - page_cur_search(page, tuple, PAGE_CUR_LE, page_cursor); + page_cur_search(page, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, mtr); + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, mtr); } return(rec); @@ -887,8 +922,6 @@ btr_cur_optimistic_insert( ibool reorg; ibool inherit; ulint rec_size; - ulint data_size; - ulint extra_size; ulint type; ulint err; @@ -914,13 +947,11 @@ btr_cur_optimistic_insert( calculate_sizes_again: /* Calculate the record size when entry is converted to a record */ - data_size = dtuple_get_data_size(entry); - extra_size = rec_get_converted_extra_size(data_size, - dtuple_get_n_fields(entry)); - rec_size = data_size + extra_size; + rec_size = rec_get_converted_size(index, entry); - if ((rec_size 
>= page_get_free_space_of_empty() / 2) - || (rec_size >= REC_MAX_DATA_SIZE)) { + if (rec_size >= + ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -983,19 +1014,18 @@ calculate_sizes_again: /* Now, try the insert */ - *rec = page_cur_insert_rec_low(page_cursor, entry, data_size, - NULL, mtr); + *rec = page_cur_insert_rec_low(page_cursor, entry, index, NULL, mtr); if (!(*rec)) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); ut_ad(page_get_max_insert_size(page, 1) == max_size); reorg = TRUE; - page_cur_search(page, entry, PAGE_CUR_LE, page_cursor); + page_cur_search(page, index, entry, PAGE_CUR_LE, page_cursor); - *rec = page_cur_tuple_insert(page_cursor, entry, mtr); + *rec = page_cur_tuple_insert(page_cursor, entry, index, mtr); if (!*rec) { fputs("InnoDB: Error: cannot insert tuple ", stderr); @@ -1123,9 +1153,9 @@ btr_cur_pessimistic_insert( } } - if ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE)) { + if (rec_get_converted_size(index, entry) >= + ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { /* The record is so big that we have to store some fields externally on separate database pages */ @@ -1212,8 +1242,11 @@ btr_cur_upd_lock_and_undo( err = DB_SUCCESS; if (!(flags & BTR_NO_LOCKING_FLAG)) { + mem_heap_t* heap = mem_heap_create(100); err = lock_clust_rec_modify_check_and_lock(flags, rec, index, - thr); + rec_get_offsets(rec, index, ULINT_UNDEFINED, heap), + thr); + mem_heap_free(heap); if (err != DB_SUCCESS) { return(err); @@ -1243,14 +1276,17 @@ btr_cur_update_in_place_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(flags < 256); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); - - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_UPDATE_IN_PLACE, log_ptr, mtr); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + MLOG_BUF_MARGIN); - mach_write_to_1(log_ptr, flags); - log_ptr++; + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } /* The code below assumes index is a clustered index: change index to the clustered index if we are updating a secondary index record (or we @@ -1259,6 +1295,9 @@ btr_cur_update_in_place_log( index = dict_table_get_first_index(index->table); + mach_write_to_1(log_ptr, flags); + log_ptr++; + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, mtr); mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); @@ -1273,10 +1312,11 @@ Parses a redo log record of updating a record in-place. 
*/ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index) /* in: index corresponding to page */ { ulint flags; rec_t* rec; @@ -1286,6 +1326,7 @@ btr_cur_parse_update_in_place( dulint roll_ptr; ulint rec_offset; mem_heap_t* heap; + ulint* offsets; if (end_ptr < ptr + 1) { @@ -1333,11 +1374,14 @@ btr_cur_parse_update_in_place( /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. */ + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, roll_ptr); + row_upd_rec_sys_fields_in_recovery(rec, offsets, + pos, trx_id, roll_ptr); } - row_upd_rec_in_place(rec, update); + row_upd_rec_in_place(rec, offsets, update); mem_heap_free(heap); @@ -1369,14 +1413,18 @@ btr_cur_update_in_place( dulint roll_ptr = ut_dulint_zero; trx_t* trx; ibool was_delete_marked; + mem_heap_t* heap; + const ulint* offsets; rec = btr_cur_get_rec(cursor); index = cursor->index; trx = thr_get_trx(thr); - + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(trx, index, "update "); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); } /* Do lock checking and undo logging */ @@ -1384,6 +1432,7 @@ btr_cur_update_in_place( thr, &roll_ptr); if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } @@ -1405,15 +1454,15 @@ btr_cur_update_in_place( } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } /* FIXME: in a mixed tree, all records may not have enough ordering fields for btr search: */ - was_delete_marked = rec_get_deleted_flag(rec); - - row_upd_rec_in_place(rec, update); + was_delete_marked = rec_get_deleted_flag(rec, index->table->comp); + + row_upd_rec_in_place(rec, offsets, update); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -1421,13 +1470,14 @@ btr_cur_update_in_place( btr_cur_update_in_place_log(flags, rec, index, update, trx, roll_ptr, mtr); - if (was_delete_marked && !rec_get_deleted_flag(rec)) { + if (was_delete_marked && !rec_get_deleted_flag(rec, index->table->comp)) { /* The new updated record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } + mem_heap_free(heap); return(DB_SUCCESS); } @@ -1469,24 +1519,28 @@ btr_cur_optimistic_update( mem_heap_t* heap; ibool reorganized = FALSE; ulint i; - + ulint* offsets; + page = btr_cur_get_page(cursor); rec = btr_cur_get_rec(cursor); index = cursor->index; + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "update "); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); } ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - if (!row_upd_changes_field_size_or_external(rec, index, update)) { + if (!row_upd_changes_field_size_or_external(index, offsets, update)) { /* The simplest and the most common case: the 
update does not change the size of any field and none of the updated fields is externally stored in rec or update */ - + mem_heap_free(heap); return(btr_cur_update_in_place(flags, cursor, update, cmpl_info, thr, mtr)); } @@ -1497,29 +1551,30 @@ btr_cur_optimistic_update( /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } } - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { + if (rec_offs_any_extern(offsets)) { /* Externally stored fields are treated in pessimistic update */ + mem_heap_free(heap); return(DB_OVERFLOW); } page_cursor = btr_cur_get_page_cur(cursor); - heap = mem_heap_create(1024); - new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, NULL); - old_rec_size = rec_get_size(rec); - new_rec_size = rec_get_converted_size(new_entry); + old_rec_size = rec_offs_size(offsets); + new_rec_size = rec_get_converted_size(index, new_entry); - if (new_rec_size >= page_get_free_space_of_empty() / 2) { + if (new_rec_size >= + page_get_free_space_of_empty(index->table->comp) / 2) { mem_heap_free(heap); @@ -1570,7 +1625,7 @@ btr_cur_optimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, mtr); page_cur_move_to_prev(page_cursor); @@ -1587,11 +1642,13 @@ btr_cur_optimistic_update( ut_a(rec); /* <- We calculated above the insert would fit */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } /* Restore the old explicit lock state on the record */ @@ -1690,6 +1747,7 @@ btr_cur_pessimistic_update( ulint* ext_vect; ulint n_ext_vect; ulint reserve_flag; + ulint* offsets = NULL; *big_rec = NULL; @@ -1743,6 +1801,7 @@ btr_cur_pessimistic_update( } heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); trx = thr_get_trx(thr); @@ -1767,28 +1826,29 @@ btr_cur_pessimistic_update( ut_a(big_rec_vec == NULL); - btr_rec_free_updated_extern_fields(index, rec, update, - TRUE, mtr); + btr_rec_free_updated_extern_fields(index, rec, offsets, + update, TRUE, mtr); } /* We have to set appropriate extern storage bits in the new record to be inserted: we have to remember which fields were such */ - ext_vect = mem_heap_alloc(heap, sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, update); - - if ((rec_get_converted_size(new_entry) >= - page_get_free_space_of_empty() / 2) - || (rec_get_converted_size(new_entry) >= REC_MAX_DATA_SIZE)) { + ext_vect = mem_heap_alloc(heap, sizeof(ulint) + * dict_index_get_n_fields(index)); + ut_ad(!cursor->index->table->comp || !rec_get_node_ptr_flag(rec)); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); + + if (rec_get_converted_size(index, new_entry) >= + ut_min(page_get_free_space_of_empty(index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { big_rec_vec = dtuple_convert_big_rec(index, new_entry, ext_vect, n_ext_vect); if (big_rec_vec == NULL) { - mem_heap_free(heap); - err = DB_TOO_BIG_RECORD; - goto return_after_reservations; } } @@ -1808,7 +1868,7 @@ 
btr_cur_pessimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, mtr); + page_cur_delete_rec(page_cursor, index, mtr); page_cur_move_to_prev(page_cursor); @@ -1817,21 +1877,22 @@ btr_cur_pessimistic_update( ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + lock_rec_restore_from_page_infimum(rec, page); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, + ext_vect, n_ext_vect, mtr); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } btr_cur_compress_if_useful(cursor, mtr); err = DB_SUCCESS; - mem_heap_free(heap); - goto return_after_reservations; } @@ -1856,13 +1917,15 @@ btr_cur_pessimistic_update( ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); - rec_set_field_extern_bits(rec, ext_vect, n_ext_vect, mtr); + rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { /* The new inserted record owns its possible externally stored fields */ - btr_cur_unmark_extern_fields(rec, mtr); + btr_cur_unmark_extern_fields(rec, mtr, offsets); } lock_rec_restore_from_page_infimum(rec, page); @@ -1876,9 +1939,8 @@ btr_cur_pessimistic_update( btr_cur_pess_upd_restore_supremum(rec, mtr); } - mem_heap_free(heap); - return_after_reservations: + mem_heap_free(heap); if (n_extents > 0) { fil_space_release_free_extents(cursor->index->space, @@ -1908,11 +1970,18 @@ btr_cur_del_mark_set_clust_rec_log( mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(flags < 256); + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + 14 + 2); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_CLUST_DELETE_MARK, log_ptr, mtr); + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } mach_write_to_1(log_ptr, flags); log_ptr++; @@ -1934,10 +2003,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page) /* in: page or NULL */ { ulint flags; ibool val; @@ -1978,15 +2048,19 @@ btr_cur_parse_del_mark_set_clust_rec( rec = page + offset; if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, pos, trx_id, - roll_ptr); + mem_heap_t* heap = mem_heap_create(100); + row_upd_rec_sys_fields_in_recovery(rec, + rec_get_offsets(rec, index, + ULINT_UNDEFINED, heap), + pos, trx_id, roll_ptr); + mem_heap_free(heap); } /* We do not need to reserve btr_search_latch, as the page is only being recovered, and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); } return(ptr); @@ -2015,22 +2089,28 @@ btr_cur_del_mark_set_clust_rec( ulint err; rec_t* rec; trx_t* trx; + mem_heap_t* heap; + const ulint* offsets; rec = btr_cur_get_rec(cursor); index = cursor->index; - + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (btr_cur_print_record_ops && thr) { btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); } ut_ad(index->type & DICT_CLUSTERED); - ut_ad(rec_get_deleted_flag(rec) == FALSE); + ut_ad(rec_get_deleted_flag(rec, index->table->comp) == FALSE); - err = lock_clust_rec_modify_check_and_lock(flags, rec, index, thr); + err = lock_clust_rec_modify_check_and_lock(flags, + rec, index, offsets, thr); if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } @@ -2039,6 +2119,7 @@ btr_cur_del_mark_set_clust_rec( &roll_ptr); if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } @@ -2048,13 +2129,12 @@ btr_cur_del_mark_set_clust_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); trx = thr_get_trx(thr); if (!(flags & BTR_KEEP_SYS_FLAG)) { - - row_upd_rec_sys_fields(rec, index, trx, roll_ptr); + row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); } if (block->is_hashed) { @@ -2063,6 +2143,7 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, roll_ptr, mtr); + mem_heap_free(heap); return(DB_SUCCESS); } @@ -2073,16 +2154,24 @@ UNIV_INLINE void btr_cur_del_mark_set_sec_rec_log( /*=============================*/ - rec_t* rec, /* in: record */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; + ut_ad(val <= 1); - log_ptr = mlog_open(mtr, 30); + log_ptr = mlog_open_and_write_index(mtr, rec, index, index->table->comp + ? MLOG_COMP_REC_SEC_DELETE_MARK + : MLOG_REC_SEC_DELETE_MARK, + 1 + 2); - log_ptr = mlog_write_initial_log_record_fast(rec, - MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } mach_write_to_1(log_ptr, val); log_ptr++; @@ -2100,10 +2189,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page) /* in: page or NULL */ { ibool val; ulint offset; @@ -2129,7 +2219,7 @@ btr_cur_parse_del_mark_set_sec_rec( is only being recovered, and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, index->table->comp, val); } return(ptr); @@ -2156,9 +2246,12 @@ btr_cur_del_mark_set_sec_rec( rec = btr_cur_get_rec(cursor); if (btr_cur_print_record_ops && thr) { + mem_heap_t* heap = mem_heap_create(100); btr_cur_trx_report(thr_get_trx(thr), cursor->index, "del mark "); - rec_print(stderr, rec); + rec_print(stderr, rec, rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap)); + mem_heap_free(heap); } err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index, @@ -2174,13 +2267,13 @@ btr_cur_del_mark_set_sec_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, val); + rec_set_deleted_flag(rec, cursor->index->table->comp, val); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); } - btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); + btr_cur_del_mark_set_sec_rec_log(rec, cursor->index, val, mtr); return(DB_SUCCESS); } @@ -2192,15 +2285,16 @@ used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { /* We do not need to reserve btr_search_latch, as the page has just been read to the buffer pool and there cannot be a hash index to it. */ - rec_set_deleted_flag(rec, FALSE); + rec_set_deleted_flag(rec, index->table->comp, FALSE); - btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); + btr_cur_del_mark_set_sec_rec_log(rec, index, FALSE, mtr); } /*==================== B-TREE RECORD REMOVE =========================*/ @@ -2279,8 +2373,11 @@ btr_cur_optimistic_delete( successor of the deleted record */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - ulint max_ins_size; + page_t* page; + ulint max_ins_size; + mem_heap_t* heap; + rec_t* rec; + const ulint* offsets; ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); @@ -2290,26 +2387,30 @@ btr_cur_optimistic_delete( ut_ad(btr_page_get_level(page, mtr) == 0); - if (rec_contains_externally_stored_field(btr_cur_get_rec(cursor))) { - - return(FALSE); - } + heap = mem_heap_create(100); + rec = btr_cur_get_rec(cursor); + offsets = rec_get_offsets(rec, cursor->index, ULINT_UNDEFINED, heap); - if (btr_cur_can_delete_without_compress(cursor, mtr)) { + if (!rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr)) { - lock_update_delete(btr_cur_get_rec(cursor)); + lock_update_delete(rec); btr_search_update_hash_on_delete(cursor); max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, mtr); ibuf_update_free_bits_low(cursor->index, page, max_ins_size, mtr); + mem_heap_free(heap); return(TRUE); } + mem_heap_free(heap); return(FALSE); } @@ -2375,8 +2476,20 @@ btr_cur_pessimistic_delete( } } - btr_rec_free_externally_stored_fields(cursor->index, - btr_cur_get_rec(cursor), in_rollback, mtr); + heap = mem_heap_create(256); + rec = btr_cur_get_rec(cursor); + + /* Free externally stored fields if the record is neither + a node pointer nor in two-byte format. + This avoids unnecessary calls to rec_get_offsets(). */ + if (cursor->index->table->comp + ? 
!rec_get_node_ptr_flag(rec) + : !rec_get_1byte_offs_flag(rec)) { + btr_rec_free_externally_stored_fields(cursor->index, + rec, rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap), + in_rollback, mtr); + } if ((page_get_n_recs(page) < 2) && (dict_tree_get_page(btr_cur_get_tree(cursor)) @@ -2393,8 +2506,6 @@ btr_cur_pessimistic_delete( goto return_after_reservations; } - rec = btr_cur_get_rec(cursor); - lock_update_delete(rec); if ((btr_page_get_level(page, mtr) > 0) @@ -2406,7 +2517,8 @@ btr_cur_pessimistic_delete( non-leaf level, we must mark the new leftmost node pointer as the predefined minimum record */ - btr_set_min_rec_mark(page_rec_get_next(rec), mtr); + btr_set_min_rec_mark(page_rec_get_next(rec), + cursor->index->table->comp, mtr); } else { /* Otherwise, if we delete the leftmost node pointer on a page, we have to change the father node pointer @@ -2415,8 +2527,6 @@ btr_cur_pessimistic_delete( btr_node_ptr_delete(tree, page, mtr); - heap = mem_heap_create(256); - node_ptr = dict_tree_build_node_ptr( tree, page_rec_get_next(rec), buf_frame_get_page_no(page), @@ -2425,20 +2535,19 @@ btr_cur_pessimistic_delete( btr_insert_on_non_leaf_level(tree, btr_page_get_level(page, mtr) + 1, node_ptr, mtr); - - mem_heap_free(heap); } } btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(btr_cur_get_page_cur(cursor), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, mtr); ut_ad(btr_check_node_ptr(tree, page, mtr)); *err = DB_SUCCESS; return_after_reservations: + mem_heap_free(heap); if (ret == FALSE) { ret = btr_cur_compress_if_useful(cursor, mtr); @@ -2663,9 +2772,13 @@ btr_estimate_number_of_different_key_vals( ulint j; ulint add_on; mtr_t mtr; + mem_heap_t* heap; + ulint* offsets1 = 0; + ulint* offsets2 = 0; n_cols = dict_index_get_n_unique(index); + heap = mem_heap_create(100); n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong)); for (j = 0; j <= n_cols; j++) { @@ -2697,11 +2810,17 @@ btr_estimate_number_of_different_key_vals( while (rec != page_get_supremum_rec(page) && page_rec_get_next(rec) != page_get_supremum_rec(page)) { + rec_t* next_rec = page_rec_get_next(rec); matched_fields = 0; matched_bytes = 0; - - cmp_rec_rec_with_match(rec, page_rec_get_next(rec), - index, &matched_fields, + offsets1 = rec_reget_offsets(rec, index, + offsets1, ULINT_UNDEFINED, heap); + offsets2 = rec_reget_offsets(next_rec, index, + offsets2, n_cols, heap); + + cmp_rec_rec_with_match(rec, next_rec, + offsets1, offsets2, + index, n_cols, &matched_fields, &matched_bytes); for (j = matched_fields + 1; j <= n_cols; j++) { @@ -2712,7 +2831,8 @@ btr_estimate_number_of_different_key_vals( } total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len( + rec, offsets1); rec = page_rec_get_next(rec); } @@ -2736,8 +2856,11 @@ btr_estimate_number_of_different_key_vals( } } + offsets1 = rec_reget_offsets(rec, index, + offsets1, ULINT_UNDEFINED, heap); total_external_size += - btr_rec_get_externally_stored_len(rec); + btr_rec_get_externally_stored_len(rec, + offsets1); mtr_commit(&mtr); } @@ -2778,6 +2901,7 @@ btr_estimate_number_of_different_key_vals( } mem_free(n_diff); + mem_heap_free(heap); } /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ @@ -2788,9 +2912,10 @@ static ulint btr_rec_get_externally_stored_len( /*==============================*/ - /* out: externally stored part, in units of a - database page */ - rec_t* rec) /* in: record */ + /* out: externally stored part, + in units of a database page */ + 
rec_t* rec, /* in: record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; byte* data; @@ -2799,17 +2924,13 @@ btr_rec_get_externally_stored_len( ulint total_extern_len = 0; ulint i; - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return(0); - } - - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); local_len -= BTR_EXTERN_FIELD_REF_SIZE; @@ -2830,16 +2951,17 @@ static void btr_cur_set_ownership_of_extern_field( /*==================================*/ - rec_t* rec, /* in: clustered index record */ - ulint i, /* in: field number */ - ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: clustered index record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint i, /* in: field number */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ { byte* data; ulint local_len; ulint byte_val; - data = rec_get_nth_field(rec, i, &local_len); + data = rec_get_nth_field(rec, offsets, i, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); @@ -2866,19 +2988,22 @@ to free the field. */ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr) /* in: mtr */ { ibool is_updated; ulint n; ulint j; ulint i; - - n = rec_get_n_fields(rec); + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -2894,8 +3019,8 @@ btr_cur_mark_extern_inherited_fields( } if (!is_updated) { - btr_cur_set_ownership_of_extern_field(rec, i, - FALSE, mtr); + btr_cur_set_ownership_of_extern_field(rec, + offsets, i, FALSE, mtr); } } } @@ -2967,18 +3092,20 @@ static void btr_cur_unmark_extern_fields( /*=========================*/ - rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + mtr_t* mtr, /* in: mtr */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n; ulint i; - n = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { - - btr_cur_set_ownership_of_extern_field(rec, i, + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field(rec, offsets, i, TRUE, mtr); } } @@ -3028,10 +3155,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated + ulint* ext_vect,/* in: array of ulints, must be preallocated to have space for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update) /* in: update vector or NULL */ + const ulint* 
offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector or NULL */ { ulint n_pushed = 0; ibool is_updated; @@ -3054,10 +3181,10 @@ btr_push_update_extern_fields( } } - n = rec_get_n_fields(rec); + n = rec_offs_n_fields(offsets); for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ is_updated = FALSE; @@ -3119,6 +3246,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr __attribute__((unused))) /* in: mtr @@ -3139,6 +3267,7 @@ btr_store_big_rec_extern_fields( ulint i; mtr_t mtr; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), @@ -3152,8 +3281,8 @@ btr_store_big_rec_extern_fields( for (i = 0; i < big_rec_vec->n_fields; i++) { - data = rec_get_nth_field(rec, big_rec_vec->fields[i].field_no, - &local_len); + data = rec_get_nth_field(rec, offsets, + big_rec_vec->fields[i].field_no, &local_len); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; extern_len = big_rec_vec->fields[i].len; @@ -3254,7 +3383,7 @@ btr_store_big_rec_extern_fields( /* Set the bit denoting that this field in rec is stored externally */ - rec_set_nth_field_extern_bit(rec, + rec_set_nth_field_extern_bit(rec, index, big_rec_vec->fields[i].field_no, TRUE, &mtr); } @@ -3407,6 +3536,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -3419,21 +3549,18 @@ btr_rec_free_externally_stored_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - - return; - } - /* Free possible externally stored fields in the record */ - n_fields = rec_get_n_fields(rec); + ut_ad(index->table->comp == rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); for (i = 0; i < n_fields; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { + if (rec_offs_nth_extern(offsets, i)) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3450,6 +3577,7 @@ btr_rec_free_updated_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free @@ -3463,13 +3591,10 @@ btr_rec_free_updated_extern_fields( ulint len; ulint i; + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX)); - if (rec_get_data_size(rec) <= REC_1BYTE_OFFS_LIMIT) { - return; - } - /* Free possible externally stored fields in the record */ n_fields = upd_get_n_fields(update); 
@@ -3477,9 +3602,10 @@ btr_rec_free_updated_extern_fields( for (i = 0; i < n_fields; i++) { ufield = upd_get_nth_field(update, i); - if (rec_get_nth_field_extern_bit(rec, ufield->field_no)) { + if (rec_offs_nth_extern(offsets, ufield->field_no)) { - data = rec_get_nth_field(rec, ufield->field_no, &len); + data = rec_get_nth_field(rec, offsets, + ufield->field_no, &len); btr_free_externally_stored_field(index, data, len, do_not_free_inherited, mtr); } @@ -3583,7 +3709,8 @@ byte* btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ - rec_t* rec, /* in: record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap) /* in: mem heap */ @@ -3591,7 +3718,8 @@ btr_rec_copy_externally_stored_field( ulint local_len; byte* data; - ut_a(rec_get_nth_field_extern_bit(rec, no)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(rec_offs_nth_extern(offsets, no)); /* An externally stored field can contain some initial data from the field, and in the last 20 bytes it has the @@ -3602,7 +3730,7 @@ btr_rec_copy_externally_stored_field( limit so that field offsets are stored in two bytes, and the extern bit is available in those two bytes. */ - data = rec_get_nth_field(rec, no, &local_len); + data = rec_get_nth_field(rec, offsets, no, &local_len); return(btr_copy_externally_stored_field(len, data, local_len, heap)); } diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index cf8a612ef28..7df8e53cd07 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -45,12 +45,12 @@ btr_pcur_free_for_mysql( mem_free(cursor->old_rec_buf); - cursor->old_rec = NULL; cursor->old_rec_buf = NULL; } cursor->btr_cur.page_cur.rec = NULL; cursor->old_rec = NULL; + cursor->old_n_fields = 0; cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; cursor->latch_mode = BTR_NO_LATCHES; @@ -133,9 +133,10 @@ btr_pcur_store_position( cursor->old_stored = BTR_PCUR_OLD_STORED; cursor->old_rec = dict_tree_copy_rec_order_prefix(tree, rec, - &(cursor->old_rec_buf), - &(cursor->buf_size)); - + &cursor->old_n_fields, + &cursor->old_rec_buf, + &cursor->buf_size); + cursor->block_when_stored = buf_block_align(page); cursor->modify_clock = buf_frame_get_modify_clock(page); } @@ -166,6 +167,8 @@ btr_pcur_copy_stored_position( pcur_receive->old_rec = pcur_receive->old_rec_buf + (pcur_donate->old_rec - pcur_donate->old_rec_buf); } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; } /****************************************************************** @@ -228,6 +231,7 @@ btr_pcur_restore_position( } ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); page = btr_cur_get_page(btr_pcur_get_btr_cur(cursor)); @@ -242,17 +246,32 @@ btr_pcur_restore_position( buf_page_dbg_add_level(page, SYNC_TREE_NODE); #endif /* UNIV_SYNC_DEBUG */ if (cursor->rel_pos == BTR_PCUR_ON) { - +#ifdef UNIV_DEBUG + rec_t* rec; + ulint* offsets1; + ulint* offsets2; + dict_index_t* index; +#endif /* UNIV_DEBUG */ cursor->latch_mode = latch_mode; - - ut_ad(cmp_rec_rec(cursor->old_rec, - btr_pcur_get_rec(cursor), - dict_tree_find_index( - btr_cur_get_tree( +#ifdef UNIV_DEBUG + rec = btr_pcur_get_rec(cursor); + index = dict_tree_find_index( + btr_cur_get_tree( btr_pcur_get_btr_cur(cursor)), - btr_pcur_get_rec(cursor))) - == 0); + rec); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets(cursor->old_rec, + index, ULINT_UNDEFINED, heap); + offsets2 = 
rec_get_offsets(rec, + index, ULINT_UNDEFINED, heap); + ut_ad(cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + cursor->old_n_fields, + index) == 0); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ return(TRUE); } @@ -265,7 +284,8 @@ btr_pcur_restore_position( heap = mem_heap_create(256); tree = btr_cur_get_tree(btr_pcur_get_btr_cur(cursor)); - tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, heap); + tuple = dict_tree_build_data_tuple(tree, cursor->old_rec, + cursor->old_n_fields, heap); /* Save the old search mode of the cursor */ old_mode = cursor->search_mode; @@ -287,7 +307,10 @@ btr_pcur_restore_position( if (cursor->rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(cursor, mtr) - && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor))) { + && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets(btr_pcur_get_rec(cursor), + btr_pcur_get_btr_cur(cursor)->index, + ULINT_UNDEFINED, heap))) { /* We have to store the NEW value for the modify clock, since the cursor can now be on a different page! But we can retain @@ -376,6 +399,7 @@ btr_pcur_move_to_next_page( ut_ad(next_page_no != FIL_NULL); next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); buf_block_align(next_page)->check_index_page_at_flush = TRUE; btr_leaf_page_release(page, cursor->latch_mode, mtr); diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index ad74f9704da..40ccf56492f 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -416,7 +416,7 @@ btr_search_update_hash_ref( && (block->curr_n_fields == info->n_fields) && (block->curr_n_bytes == info->n_bytes) && (block->curr_side == info->side)) { - + mem_heap_t* heap; rec = btr_cur_get_rec(cursor); if (!page_rec_is_user_rec(rec)) { @@ -425,10 +425,11 @@ btr_search_update_hash_ref( } tree_id = ((cursor->index)->tree)->id; - - fold = rec_fold(rec, block->curr_n_fields, - block->curr_n_bytes, tree_id); - + heap = mem_heap_create(100); + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap), block->curr_n_fields, + block->curr_n_bytes, tree_id); + mem_heap_free(heap); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ @@ -535,15 +536,17 @@ btr_search_check_guess( or PAGE_CUR_GE */ mtr_t* mtr) /* in: mtr */ { - page_t* page; - rec_t* rec; - rec_t* prev_rec; - rec_t* next_rec; - ulint n_unique; - ulint match; - ulint bytes; - int cmp; - + page_t* page; + rec_t* rec; + rec_t* prev_rec; + rec_t* next_rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; + n_unique = dict_index_get_n_unique_in_tree(cursor->index); rec = btr_cur_get_rec(cursor); @@ -554,23 +557,25 @@ btr_search_check_guess( match = 0; bytes = 0; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &match, &bytes); + offsets = rec_get_offsets(rec, cursor->index, n_unique, heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); if (mode == PAGE_CUR_GE) { if (cmp == 1) { - + mem_heap_free(heap); return(FALSE); } cursor->up_match = match; if (match >= n_unique) { - + mem_heap_free(heap); return(TRUE); } } else if (mode == PAGE_CUR_LE) { if (cmp == -1) { - + mem_heap_free(heap); return(FALSE); } @@ -578,12 +583,12 @@ btr_search_check_guess( } else if (mode == PAGE_CUR_G) { if (cmp != -1) { - + mem_heap_free(heap); return(FALSE); } } else if (mode == PAGE_CUR_L) { if (cmp != 1) { - + mem_heap_free(heap); 
return(FALSE); } } @@ -591,7 +596,7 @@ btr_search_check_guess( if (can_only_compare_to_cursor_rec) { /* Since we could not determine if our guess is right just by looking at the record under the cursor, return FALSE */ - + mem_heap_free(heap); return(FALSE); } @@ -605,17 +610,15 @@ btr_search_check_guess( prev_rec = page_rec_get_prev(rec); if (prev_rec == page_get_infimum_rec(page)) { - - if (btr_page_get_prev(page, mtr) != FIL_NULL) { - - return(FALSE); - } - - return(TRUE); + mem_heap_free(heap); + return(btr_page_get_prev(page, mtr) == FIL_NULL); } + offsets = rec_reget_offsets(prev_rec, cursor->index, + offsets, n_unique, heap); cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, - &match, &bytes); + offsets, &match, &bytes); + mem_heap_free(heap); if (mode == PAGE_CUR_GE) { if (cmp != 1) { @@ -636,6 +639,7 @@ btr_search_check_guess( next_rec = page_rec_get_next(rec); if (next_rec == page_get_supremum_rec(page)) { + mem_heap_free(heap); if (btr_page_get_next(page, mtr) == FIL_NULL) { @@ -647,8 +651,12 @@ btr_search_check_guess( return(FALSE); } - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &match, &bytes); - + offsets = rec_reget_offsets(next_rec, cursor->index, + offsets, n_unique, heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, &bytes); + mem_heap_free(heap); + if (mode == PAGE_CUR_LE) { if (cmp != -1) { @@ -1003,8 +1011,7 @@ static void btr_search_build_page_hash_index( /*=============================*/ - dict_index_t* index, /* in: index for which to build, or NULL if - not known */ + dict_index_t* index, /* in: index for which to build */ page_t* page, /* in: index page, s- or x-latched */ ulint n_fields,/* in: hash this many full fields */ ulint n_bytes,/* in: hash this many bytes from the next @@ -1024,7 +1031,11 @@ btr_search_build_page_hash_index( ulint* folds; rec_t** recs; ulint i; - + mem_heap_t* heap; + ulint* offsets; + + ut_ad(index); + block = buf_block_align(page); table = btr_search_sys->hash_index; @@ -1061,9 +1072,9 @@ btr_search_build_page_hash_index( return; } - if (index && (dict_index_get_n_unique_in_tree(index) < n_fields + if (dict_index_get_n_unique_in_tree(index) < n_fields || (dict_index_get_n_unique_in_tree(index) == n_fields - && n_bytes > 0))) { + && n_bytes > 0)) { return; } @@ -1072,6 +1083,7 @@ btr_search_build_page_hash_index( folds = mem_alloc(n_recs * sizeof(ulint)); recs = mem_alloc(n_recs * sizeof(rec_t*)); + heap = mem_heap_create(100); n_cached = 0; @@ -1082,18 +1094,19 @@ btr_search_build_page_hash_index( rec = page_get_infimum_rec(page); rec = page_rec_get_next(rec); + offsets = rec_get_offsets(rec, index, n_fields + (n_bytes > 0), heap); + if (rec != sup) { - ut_a(n_fields <= rec_get_n_fields(rec)); + ut_a(n_fields <= rec_offs_n_fields(offsets)); if (n_bytes > 0) { - ut_a(n_fields < rec_get_n_fields(rec)); + ut_a(n_fields < rec_offs_n_fields(offsets)); } } /* FIXME: in a mixed tree, all records may not have enough ordering fields: */ - - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); if (side == BTR_SEARCH_LEFT_SIDE) { @@ -1117,7 +1130,10 @@ btr_search_build_page_hash_index( break; } - next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + offsets = rec_reget_offsets(next_rec, index, + offsets, n_fields + (n_bytes > 0), heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); if (fold != next_fold) { /* Insert an entry into the hash index */ @@ -1145,13 +1161,7 @@ btr_search_build_page_hash_index( if 
(block->is_hashed && ((block->curr_n_fields != n_fields) || (block->curr_n_bytes != n_bytes) || (block->curr_side != side))) { - - rw_lock_x_unlock(&btr_search_latch); - - mem_free(folds); - mem_free(recs); - - return; + goto exit_func; } block->is_hashed = TRUE; @@ -1166,10 +1176,12 @@ btr_search_build_page_hash_index( ha_insert_for_fold(table, folds[i], recs[i]); } +exit_func: rw_lock_x_unlock(&btr_search_latch); mem_free(folds); mem_free(recs); + mem_heap_free(heap); } /************************************************************************ @@ -1181,10 +1193,13 @@ parameters as page (this often happens when a page is split). */ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page) /* in: index page from which records were - copied, and the copied records will be deleted - from this page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index) /* in: record descriptor */ { buf_block_t* block; buf_block_t* new_block; @@ -1194,6 +1209,7 @@ btr_search_move_or_delete_hash_entries( block = buf_block_align(page); new_block = buf_block_align(new_page); + ut_a(page_is_comp(page) == page_is_comp(new_page)); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); @@ -1224,8 +1240,8 @@ btr_search_move_or_delete_hash_entries( rw_lock_s_unlock(&btr_search_latch); ut_a(n_fields + n_bytes > 0); - - btr_search_build_page_hash_index(NULL, new_page, n_fields, + + btr_search_build_page_hash_index(index, new_page, n_fields, n_bytes, side); ut_a(n_fields == block->curr_n_fields); ut_a(n_bytes == block->curr_n_bytes); @@ -1253,6 +1269,7 @@ btr_search_update_hash_on_delete( ulint fold; dulint tree_id; ibool found; + mem_heap_t* heap; rec = btr_cur_get_rec(cursor); @@ -1272,9 +1289,11 @@ btr_search_update_hash_on_delete( table = btr_search_sys->hash_index; tree_id = cursor->index->tree->id; - - fold = rec_fold(rec, block->curr_n_fields, block->curr_n_bytes, - tree_id); + heap = mem_heap_create(100); + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap), block->curr_n_fields, + block->curr_n_bytes, tree_id); + mem_heap_free(heap); rw_lock_x_lock(&btr_search_latch); found = ha_search_and_delete_if_found(table, fold, rec); @@ -1355,6 +1374,8 @@ btr_search_update_hash_on_insert( ulint n_bytes; ulint side; ibool locked = FALSE; + mem_heap_t* heap; + ulint* offsets; table = btr_search_sys->hash_index; @@ -1383,15 +1404,22 @@ btr_search_update_hash_on_insert( next_rec = page_rec_get_next(ins_rec); page = buf_frame_align(rec); - - ins_fold = rec_fold(ins_rec, n_fields, n_bytes, tree_id); + heap = mem_heap_create(100); + offsets = rec_get_offsets(ins_rec, cursor->index, + ULINT_UNDEFINED, heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, tree_id); if (next_rec != page_get_supremum_rec(page)) { - next_fold = rec_fold(next_rec, n_fields, n_bytes, tree_id); + offsets = rec_reget_offsets(next_rec, cursor->index, + offsets, n_fields + (n_bytes > 0), heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, tree_id); } if (rec != page_get_infimum_rec(page)) { - fold = rec_fold(rec, n_fields, n_bytes, tree_id); + offsets = rec_reget_offsets(rec, cursor->index, + offsets, n_fields + (n_bytes > 0), heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); } else { if (side == 
BTR_SEARCH_LEFT_SIDE) { @@ -1461,6 +1489,7 @@ check_next_rec: } function_exit: + mem_heap_free(heap); if (locked) { rw_lock_x_unlock(&btr_search_latch); } @@ -1470,9 +1499,10 @@ function_exit: Validates the search system. */ ibool -btr_search_validate(void) -/*=====================*/ +btr_search_validate( +/*================*/ /* out: TRUE if ok */ + dict_index_t* index) /* in: record descriptor */ { buf_block_t* block; page_t* page; @@ -1480,6 +1510,8 @@ btr_search_validate(void) ulint n_page_dumps = 0; ibool ok = TRUE; ulint i; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; rw_lock_x_lock(&btr_search_latch); @@ -1489,9 +1521,13 @@ btr_search_validate(void) while (node != NULL) { block = buf_block_align(node->data); page = buf_frame_align(node->data); + offsets = rec_reget_offsets((rec_t*) node->data, index, + offsets, block->curr_n_fields + + (block->curr_n_bytes > 0), heap); if (!block->is_hashed || node->fold != rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))) { @@ -1507,12 +1543,13 @@ btr_search_validate(void) (ulong) ut_dulint_get_low(btr_page_get_index_id(page)), (ulong) node->fold, (ulong) rec_fold((rec_t*)(node->data), + offsets, block->curr_n_fields, block->curr_n_bytes, btr_page_get_index_id(page))); fputs("InnoDB: Record ", stderr); - rec_print(stderr, (rec_t*)(node->data)); + rec_print(stderr, (rec_t*)node->data, offsets); fprintf(stderr, "\nInnoDB: on that page." "Page mem address %p, is hashed %lu, n fields %lu, n bytes %lu\n" "side %lu\n", @@ -1536,6 +1573,7 @@ btr_search_validate(void) } rw_lock_x_unlock(&btr_search_latch); + mem_heap_free(heap); return(ok); } diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 4b1f2d0ab99..2f8ce7507ba 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -548,8 +548,9 @@ buf_pool_init( } /*----------------------------------------*/ } else { - buf_pool->frame_mem = ut_malloc( - UNIV_PAGE_SIZE * (n_frames + 1)); + buf_pool->frame_mem = ut_malloc_low( + UNIV_PAGE_SIZE * (n_frames + 1), + TRUE, FALSE); } if (buf_pool->frame_mem == NULL) { @@ -2137,6 +2138,31 @@ buf_print(void) } /************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void) +{ + buf_block_t* block; + ulint i; + ulint fixed_pages_number = 0; + + mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + + block = buf_pool_get_nth_block(buf_pool, i); + + if (((block->buf_fix_count != 0) || (block->io_fix != 0)) && + block->magic_n == BUF_BLOCK_MAGIC_N ) + fixed_pages_number++; + } + + mutex_exit(&(buf_pool->mutex)); + return fixed_pages_number; +} + +/************************************************************************* Returns the number of pending buf pool ios. 
*/ ulint diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 964c396dd08..aff4fe92a71 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -273,6 +273,10 @@ buf_flush_buffered_writes(void) } } + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written+= trx_doublewrite->first_free; + srv_dblwr_writes++; + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { @@ -901,6 +905,9 @@ buf_flush_batch( (ulong) page_count); } + if (page_count != ULINT_UNDEFINED) + srv_buf_pool_flushed+= page_count; + return(page_count); } diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index 21dd0e304eb..985426a9e2b 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -42,6 +42,10 @@ initial segment in buf_LRU_get_recent_limit */ #define BUF_LRU_INITIAL_RATIO 8 +/* If we switch on the InnoDB monitor because there are too few available +frames in the buffer pool, we set this to TRUE */ +ibool buf_lru_switched_on_innodb_mon = FALSE; + /********************************************************************** Takes a block out of the LRU list and page hash table and sets the block state to BUF_BLOCK_REMOVE_HASH. */ @@ -288,6 +292,32 @@ buf_LRU_try_free_flushed_blocks(void) } /********************************************************************** +Returns TRUE if less than 15 % of the buffer pool is available. This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. */ + +ibool +buf_LRU_buf_pool_running_out(void) +/*==============================*/ + /* out: TRUE if less than 15 % of buffer pool + left */ +{ + ibool ret = FALSE; + + mutex_enter(&(buf_pool->mutex)); + + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 7) { + + ret = TRUE; + } + + mutex_exit(&(buf_pool->mutex)); + + return(ret); +} + +/********************************************************************** Returns a free block from buf_pool. The block is taken off the free list. If it is empty, blocks are moved from the end of the LRU list to the free list. */ @@ -325,7 +355,8 @@ loop: } else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 5) { - if (!srv_print_innodb_monitor) { + + if (!buf_lru_switched_on_innodb_mon) { /* Over 80 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory @@ -342,16 +373,18 @@ loop: "InnoDB: lock heap and hash index sizes.\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); + buf_lru_switched_on_innodb_mon = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } - } else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) - + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) { + } else if (buf_lru_switched_on_innodb_mon) { /* Switch off the InnoDB Monitor; this is a simple way to stop the monitor if the situation becomes less urgent, - but may also surprise users! */ + but may also surprise users if the user also switched on the + monitor! 
*/ + buf_lru_switched_on_innodb_mon = FALSE; srv_print_innodb_monitor = FALSE; } @@ -432,6 +465,7 @@ loop: /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); + ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index 11107d777c8..f34920549fe 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -20,6 +20,10 @@ Created 11/5/1995 Heikki Tuuri #include "os0file.h" #include "srv0start.h" +extern ulint srv_read_ahead_rnd; +extern ulint srv_read_ahead_seq; +extern ulint srv_buf_pool_reads; + /* The size in blocks of the area where the random read-ahead algorithm counts the accessed pages when deciding whether to read-ahead */ #define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA @@ -291,6 +295,7 @@ buf_read_ahead_random( (ulong) count); } + ++srv_read_ahead_rnd; return(count); } @@ -323,6 +328,7 @@ buf_read_page( count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, tablespace_version, offset); + srv_buf_pool_reads+= count2; if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, @@ -575,6 +581,7 @@ buf_read_ahead_linear( (ulong) space, (ulong) offset, (ulong) count); } + ++srv_read_ahead_seq; return(count); } diff --git a/innobase/configure.in b/innobase/configure.in index 652291f1f38..baf11272ab9 100644 --- a/innobase/configure.in +++ b/innobase/configure.in @@ -41,7 +41,9 @@ AC_CHECK_SIZEOF(long, 4) AC_CHECK_SIZEOF(void*, 4) AC_CHECK_FUNCS(sched_yield) AC_CHECK_FUNCS(fdatasync) -#AC_CHECK_FUNCS(localtime_r) # Already checked by MySQL +AC_CHECK_FUNCS(localtime_r) +#AC_CHECK_FUNCS(readdir_r) MySQL checks that it has also the right args. +# Some versions of Unix only take 2 arguments. #AC_C_INLINE Already checked in MySQL AC_C_BIGENDIAN @@ -110,6 +112,9 @@ esac case "$target" in i[[4567]]86-*-*) CFLAGS="$CFLAGS -DUNIV_INTEL_X86";; + # The compiler on Linux/S390 does not seem to have inlining + s390-*-*) + CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE";; esac AC_OUTPUT(Makefile os/Makefile ut/Makefile btr/Makefile dnl diff --git a/innobase/data/data0data.c b/innobase/data/data0data.c index 97ec1a1acd9..25ba19d0296 100644 --- a/innobase/data/data0data.c +++ b/innobase/data/data0data.c @@ -500,7 +500,7 @@ dtuple_convert_big_rec( ut_a(dtuple_check_typed_no_assert(entry)); - size = rec_get_converted_size(entry); + size = rec_get_converted_size(index, entry); if (size > 1000000000) { fprintf(stderr, @@ -524,9 +524,10 @@ dtuple_convert_big_rec( n_fields = 0; - while ((rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) - || rec_get_converted_size(entry) >= REC_MAX_DATA_SIZE) { + while (rec_get_converted_size(index, entry) + >= ut_min(page_get_free_space_of_empty( + index->table->comp) / 2, + REC_MAX_DATA_SIZE)) { longest = 0; for (i = dict_index_get_n_unique_in_tree(index); diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c index 714cf92bc65..9b8fb084e33 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -195,7 +195,7 @@ dtype_validate( ut_a((type->mtype >= DATA_VARCHAR) && (type->mtype <= DATA_MYSQL)); if (type->mtype == DATA_SYS) { - ut_a(type->prtype <= DATA_MIX_ID); + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); } return(TRUE); diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index f156cf67a18..e500b92252f 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -158,7 +158,7 @@ dict_hdr_create( /*--------------------------*/ root_page_no = 
btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_TABLES_ID, mtr); + DICT_HDR_SPACE, DICT_TABLES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -168,7 +168,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, - DICT_TABLE_IDS_ID, mtr); + DICT_TABLE_IDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -178,7 +178,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_COLUMNS_ID, mtr); + DICT_HDR_SPACE, DICT_COLUMNS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -188,7 +188,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_INDEXES_ID, mtr); + DICT_HDR_SPACE, DICT_INDEXES_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -198,7 +198,7 @@ dict_hdr_create( MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, - DICT_HDR_SPACE, DICT_FIELDS_ID, mtr); + DICT_HDR_SPACE, DICT_FIELDS_ID, FALSE, mtr); if (root_page_no == FIL_NULL) { return(FALSE); @@ -254,7 +254,7 @@ dict_boot(void) /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ - table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE,8); + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, FALSE); dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -290,7 +290,7 @@ dict_boot(void) index->id = DICT_TABLE_IDS_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create("SYS_COLUMNS",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); @@ -316,7 +316,7 @@ dict_boot(void) index->id = DICT_COLUMNS_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create("SYS_INDEXES",DICT_HDR_SPACE,7); + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, FALSE); dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); @@ -349,7 +349,7 @@ dict_boot(void) index->id = DICT_INDEXES_ID; ut_a(dict_index_add_to_cache(table, index)); /*-------------------------*/ - table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE,3); + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, FALSE); dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0); dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index 137964b26c1..747a99ebdc9 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -84,7 +84,8 @@ dict_create_sys_tables_tuple( dfield = dtuple_get_nth_field(entry, 5); ptr = mem_heap_alloc(heap, 4); - mach_write_to_4(ptr, table->mix_len); + mach_write_to_4(ptr, (table->mix_len & 0x7fffffff) | + ((ulint) table->comp << 31)); dfield_set_data(dfield, ptr, 4); /* 8: CLUSTER_NAME ---------------------*/ @@ -624,7 +625,7 @@ dict_create_index_tree_step( btr_pcur_move_to_next_user_rec(&pcur, &mtr); index->page_no = btr_create(index->type, index->space, index->id, - &mtr); + table->comp, &mtr); /* 
printf("Created a new index tree in space %lu root page %lu\n", index->space, index->page_no); */ @@ -660,8 +661,9 @@ dict_drop_index_tree( #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(dict_sys->mutex))); #endif /* UNIV_SYNC_DEBUG */ - - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_a(!dict_sys->sys_indexes->comp); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); ut_ad(len == 4); @@ -673,8 +675,9 @@ dict_drop_index_tree( return; } - ptr = rec_get_nth_field(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); - + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + ut_ad(len == 4); space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); @@ -699,8 +702,8 @@ dict_drop_index_tree( root_page_no); */ btr_free_root(space, root_page_no, mtr); - page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, - FIL_NULL, mtr); + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, FIL_NULL, mtr); } /************************************************************************* @@ -1067,6 +1070,12 @@ dict_create_or_check_foreign_constraint_tables(void) there are 2 secondary indexes on SYS_FOREIGN, and they are defined just like below */ + /* NOTE: when designing InnoDB's foreign key support in 2001, we made + an error and made the table names and the foreign key id of type + 'CHAR' (internally, really a VARCHAR). We should have made the type + VARBINARY, like in other InnoDB system tables, to get a clean + design. */ + str = "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" "BEGIN\n" @@ -1284,9 +1293,17 @@ loop: fputs(".\nA foreign key constraint of name ", ef); ut_print_name(ef, trx, foreign->id); fputs("\nalready exists." - " (Note that internally InnoDB adds 'databasename/'\n" + " (Note that internally InnoDB adds 'databasename/'\n" "in front of the user-defined constraint name).\n", ef); + fputs("Note that InnoDB's FOREIGN KEY system tables store\n" + "constraint names as case-insensitive, with the\n" + "MySQL standard latin1_swedish_ci collation. If you\n" + "create tables or databases whose names differ only in\n" + "the character case, then collisions in constraint\n" + "names can occur. Workaround: name your constraints\n" + "explicitly with unique names.\n", + ef); mutex_exit(&dict_foreign_err_mutex); diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index c3d0d8d9ac1..0aaa3a9a721 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -156,7 +156,7 @@ dict_index_build_internal_non_clust( dict_index_t* index); /* in: user representation of a non-clustered index */ /************************************************************************** -Removes a foreign constraint struct from the dictionet cache. */ +Removes a foreign constraint struct from the dictionary cache. */ static void dict_foreign_remove_from_cache( @@ -606,7 +606,7 @@ dict_table_get_on_id( dict_table_t* table; if (ut_dulint_cmp(table_id, DICT_FIELDS_ID) <= 0 - || trx->dict_operation) { + || trx->dict_operation_lock_mode == RW_X_LATCH) { /* It is a system table which will always exist in the table cache: we avoid acquiring the dictionary mutex, because if we are doing a rollback to handle an error in TABLE @@ -814,23 +814,22 @@ dict_table_add_to_cache( system columns. 
*/ dict_mem_table_add_col(table, "DB_ROW_ID", DATA_SYS, - DATA_ROW_ID, 0, 0); + DATA_ROW_ID | DATA_NOT_NULL, DATA_ROW_ID_LEN, 0); #if DATA_ROW_ID != 0 #error "DATA_ROW_ID != 0" #endif dict_mem_table_add_col(table, "DB_TRX_ID", DATA_SYS, - DATA_TRX_ID, 0, 0); + DATA_TRX_ID | DATA_NOT_NULL, DATA_TRX_ID_LEN, 0); #if DATA_TRX_ID != 1 #error "DATA_TRX_ID != 1" #endif dict_mem_table_add_col(table, "DB_ROLL_PTR", DATA_SYS, - DATA_ROLL_PTR, 0, 0); + DATA_ROLL_PTR | DATA_NOT_NULL, DATA_ROLL_PTR_LEN, 0); #if DATA_ROLL_PTR != 2 #error "DATA_ROLL_PTR != 2" #endif - dict_mem_table_add_col(table, "DB_MIX_ID", DATA_SYS, - DATA_MIX_ID, 0, 0); + DATA_MIX_ID | DATA_NOT_NULL, DATA_MIX_ID_LEN, 0); #if DATA_MIX_ID != 3 #error "DATA_MIX_ID != 3" #endif @@ -1588,7 +1587,7 @@ dict_index_find_cols( /*********************************************************************** Adds a column to index. */ -UNIV_INLINE + void dict_index_add_col( /*===============*/ @@ -1604,6 +1603,34 @@ dict_index_add_col( field = dict_index_get_nth_field(index, index->n_def - 1); field->col = col; + field->fixed_len = dtype_get_fixed_size(&col->type); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = prefix_len; + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. */ + + if (field->fixed_len > DICT_MAX_COL_PREFIX_LEN) { + field->fixed_len = 0; + } + + if (!(dtype_get_prtype(&col->type) & DATA_NOT_NULL)) { + index->n_nullable++; + } + + if (index->n_def > 1) { + const dict_field_t* field2 = + dict_index_get_nth_field(index, index->n_def - 2); + field->fixed_offs = (!field2->fixed_len || + field2->fixed_offs == ULINT_UNDEFINED) + ? ULINT_UNDEFINED + : field2->fixed_len + field2->fixed_offs; + } else { + field->fixed_offs = 0; + } } /*********************************************************************** @@ -2266,8 +2293,8 @@ dict_foreign_add_to_cache( /************************************************************************* Scans from pointer onwards. Stops if is at the start of a copy of -'string' where characters are compared without case sensitivity. Stops -also at '\0'. */ +'string' where characters are compared without case sensitivity, and +only outside `` or "" quotes. Stops also at '\0'. */ const char* dict_scan_to( @@ -2276,31 +2303,34 @@ dict_scan_to( const char* ptr, /* in: scan from */ const char* string) /* in: look for this */ { - ibool success; - ulint i; -loop: - if (*ptr == '\0') { - return(ptr); - } - - success = TRUE; - - for (i = 0; i < ut_strlen(string); i++) { - if (toupper((ulint)(ptr[i])) != toupper((ulint)(string[i]))) { - success = FALSE; + char quote = '\0'; + for (; *ptr; ptr++) { + if (*ptr == quote) { + /* Closing quote character: do not look for + starting quote or the keyword. */ + quote = '\0'; + } else if (quote) { + /* Within quotes: do nothing. */ + } else if (*ptr == '`' || *ptr == '"') { + /* Starting quote: remember the quote character. */ + quote = *ptr; + } else { + /* Outside quotes: look for the keyword. 
*/ + ulint i; + for (i = 0; string[i]; i++) { + if (toupper((ulint)(ptr[i])) + != toupper((ulint)(string[i]))) { + goto nomatch; + } + } break; + nomatch: + ; } } - if (success) { - - return(ptr); - } - - ptr++; - - goto loop; + return(ptr); } /************************************************************************* @@ -2877,13 +2907,13 @@ loop: ut_a(success); - if (!isspace(*ptr)) { + if (!isspace(*ptr) && *ptr != '"' && *ptr != '`') { goto loop; } - do { + while (isspace(*ptr)) { ptr++; - } while (isspace(*ptr)); + } /* read constraint name unless got "CONSTRAINT FOREIGN" */ if (ptr != ptr2) { @@ -3577,9 +3607,10 @@ dict_tree_find_index_low( && (table->type != DICT_TABLE_ORDINARY)) { /* Get the mix id of the record */ + ut_a(!table->comp); mix_id = mach_dulint_read_compressed( - rec_get_nth_field(rec, table->mix_len, &len)); + rec_get_nth_field_old(rec, table->mix_len, &len)); while (ut_dulint_cmp(table->mix_id, mix_id) != 0) { @@ -3712,7 +3743,8 @@ dict_tree_build_node_ptr( on non-leaf levels we remove the last field, which contains the page number of the child page */ - n_unique = rec_get_n_fields(rec); + ut_a(!ind->table->comp); + n_unique = rec_get_n_fields_old(rec); if (level > 0) { ut_a(n_unique > 1); @@ -3741,9 +3773,11 @@ dict_tree_build_node_ptr( field = dtuple_get_nth_field(tuple, n_unique); dfield_set_data(field, buf, 4); - dtype_set(dfield_get_type(field), DATA_SYS_CHILD, 0, 0, 0); + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4, 0); - rec_copy_prefix_to_dtuple(tuple, rec, n_unique, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) | + REC_STATUS_NODE_PTR); ut_ad(dtuple_check_typed(tuple)); @@ -3760,27 +3794,26 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size)/* in/out: buffer size */ { - dict_index_t* ind; - rec_t* order_rec; - ulint n_fields; - - ind = dict_tree_find_index_low(tree, rec); + dict_index_t* index; + ulint n; - n_fields = dict_index_get_n_unique_in_tree(ind); - - if (tree->type & DICT_UNIVERSAL) { + index = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + if (tree->type & DICT_UNIVERSAL) { + ut_a(!index->table->comp); + n = rec_get_n_fields_old(rec); + } else { + n = dict_index_get_n_unique_in_tree(index); } - order_rec = rec_copy_prefix_to_buf(rec, n_fields, buf, buf_size); - - return(order_rec); -} + *n_fields = n; + return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); +} /************************************************************************** Builds a typed data tuple out of a physical record. 
*/ @@ -3791,21 +3824,21 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap) /* in: memory heap where tuple created */ { dtuple_t* tuple; dict_index_t* ind; - ulint n_fields; ind = dict_tree_find_index_low(tree, rec); - n_fields = rec_get_n_fields(rec); + ut_ad(ind->table->comp || n_fields <= rec_get_n_fields_old(rec)); tuple = dtuple_create(heap, n_fields); dict_index_copy_types(tuple, ind, n_fields); - rec_copy_prefix_to_dtuple(tuple, rec, n_fields, heap); + rec_copy_prefix_to_dtuple(tuple, rec, ind, n_fields, heap); ut_ad(dtuple_check_typed(tuple)); @@ -3823,6 +3856,27 @@ dict_index_calc_min_rec_len( ulint sum = 0; ulint i; + if (index->table->comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + dtype_t*t = dict_index_get_nth_type(index, i); + ulint size = dtype_get_fixed_size(t); + sum += size; + if (!size) { + size = dtype_get_len(t); + sum += size < 128 ? 1 : 2; + } + if (!(dtype_get_prtype(t) & DATA_NOT_NULL)) + nullable++; + } + + /* round the NULL flags up to full bytes */ + sum += (nullable + 7) / 8; + + return(sum); + } + for (i = 0; i < dict_index_get_n_fields(index); i++) { sum += dtype_get_fixed_size(dict_index_get_nth_type(index, i)); } @@ -3833,7 +3887,7 @@ dict_index_calc_min_rec_len( sum += dict_index_get_n_fields(index); } - sum += REC_N_EXTRA_BYTES; + sum += REC_N_OLD_EXTRA_BYTES; return(sum); } diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index d430eadc97b..c80f8346abf 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -19,7 +19,9 @@ Created 4/24/1996 Heikki Tuuri #include "mach0data.h" #include "dict0dict.h" #include "dict0boot.h" +#include "rem0cmp.h" #include "srv0start.h" +#include "srv0srv.h" /************************************************************************ Finds the first table name in the given database. 
*/ @@ -53,6 +55,7 @@ dict_get_first_table_name_in_db( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -75,7 +78,7 @@ loop: return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); if (len < strlen(name) || ut_memcmp(name, field, strlen(name)) != 0) { @@ -88,7 +91,7 @@ loop: return(NULL); } - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -123,6 +126,13 @@ dict_print(void) ulint len; mtr_t mtr; + /* Enlarge the fatal semaphore wait timeout during the InnoDB table + monitor printout */ + + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + mutex_enter(&(dict_sys->mutex)); mtr_start(&mtr); @@ -145,12 +155,18 @@ loop: mutex_exit(&(dict_sys->mutex)); + /* Restore the fatal semaphore wait timeout */ + + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ @@ -214,6 +230,7 @@ dict_check_tablespaces_or_store_max_id( sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); @@ -240,15 +257,15 @@ loop: return; } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, sys_tables->comp)) { /* We found one */ char* name = mem_strdupl((char*) field, len); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); ut_a(len == 4); space_id = mach_read_from_4(field); @@ -313,6 +330,7 @@ dict_load_columns( sys_columns = dict_table_get_low("SYS_COLUMNS"); sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); + ut_a(!sys_columns->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -331,28 +349,27 @@ dict_load_columns( ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_columns->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_dulint_cmp(table->id, mach_read_from_8(field)) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 4); ut_a(i == mach_read_from_4(field)); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); name = mem_heap_strdupl(heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); mtype = mach_read_from_4(field); - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); prtype = mach_read_from_4(field); if (dtype_is_non_binary_string_type(mtype, prtype) @@ -364,15 +381,14 @@ dict_load_columns( data_mysql_default_charset_coll); } - field = rec_get_nth_field(rec, 7, &len); + field = 
rec_get_nth_field_old(rec, 7, &len); col_len = mach_read_from_4(field); ut_a(0 == ut_strcmp("PREC", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_columns), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); prec = mach_read_from_4(field); dict_mem_table_add_col(table, name, mtype, prtype, col_len, @@ -437,6 +453,7 @@ dict_load_fields( sys_fields = dict_table_get_low("SYS_FIELDS"); sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); + ut_a(!sys_fields->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -454,15 +471,15 @@ dict_load_fields( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_fields->comp)) { dict_load_report_deleted_index(table->name, i); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); ut_a(ut_memcmp(buf, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); /* The next field stores the field position in the index @@ -488,10 +505,9 @@ dict_load_fields( ut_a(0 == ut_strcmp("COL_NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_fields), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); dict_mem_index_add_field(index, mem_heap_strdupl(heap, (char*) field, len), 0, prefix_len); @@ -550,6 +566,7 @@ dict_load_indexes( sys_indexes = dict_table_get_low("SYS_INDEXES"); sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); + ut_a(!sys_indexes->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -570,14 +587,14 @@ dict_load_indexes( rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); if (ut_memcmp(buf, field, len) != 0) { break; } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { dict_load_report_deleted_index(table->name, ULINT_UNDEFINED); @@ -587,33 +604,31 @@ dict_load_indexes( return(FALSE); } - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 8); id = mach_read_from_8(field); ut_a(0 == ut_strcmp("NAME", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_indexes), 4))->name)); - - field = rec_get_nth_field(rec, 4, &name_len); + dict_index_get_nth_field(sys_index, 4))->name)); + + field = rec_get_nth_field_old(rec, 4, &name_len); name_buf = mem_heap_strdupl(heap, (char*) field, name_len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); n_fields = mach_read_from_4(field); - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); type = mach_read_from_4(field); - field = rec_get_nth_field(rec, 7, &len); + field = rec_get_nth_field_old(rec, 7, &len); space = mach_read_from_4(field); ut_a(0 == ut_strcmp("PAGE_NO", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_indexes), 8))->name)); + dict_index_get_nth_field(sys_index, 8))->name)); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); page_no = mach_read_from_4(field); if (page_no == FIL_NULL) { @@ -716,6 +731,7 @@ dict_load_table( 
sys_tables = dict_table_get_low("SYS_TABLES"); sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!sys_tables->comp); tuple = dtuple_create(heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -728,7 +744,7 @@ dict_load_table( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -738,7 +754,7 @@ dict_load_table( return(NULL); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the table name in record is the searched one */ if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) { @@ -752,10 +768,9 @@ dict_load_table( ut_a(0 == ut_strcmp("SPACE", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 9))->name)); + dict_index_get_nth_field(sys_index, 9))->name)); - field = rec_get_nth_field(rec, 9, &len); + field = rec_get_nth_field_old(rec, 9, &len); space = mach_read_from_4(field); /* Check if the tablespace exists and has the right name */ @@ -777,43 +792,45 @@ dict_load_table( ut_a(0 == ut_strcmp("N_COLS", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 4))->name)); + dict_index_get_nth_field(sys_index, 4))->name)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); n_cols = mach_read_from_4(field); - table = dict_mem_table_create(name, space, n_cols); + /* table->comp will be initialized later, in this function */ + table = dict_mem_table_create(name, space, n_cols, FALSE); table->ibd_file_missing = ibd_file_missing; ut_a(0 == ut_strcmp("ID", dict_field_get_col( - dict_index_get_nth_field( - dict_table_get_first_index(sys_tables), 3))->name)); + dict_index_get_nth_field(sys_index, 3))->name)); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); table->id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); table->type = mach_read_from_4(field); if (table->type == DICT_TABLE_CLUSTER_MEMBER) { ut_error; #if 0 /* clustered tables have not been implemented yet */ - field = rec_get_nth_field(rec, 6, &len); + field = rec_get_nth_field_old(rec, 6, &len); table->mix_id = mach_read_from_8(field); - field = rec_get_nth_field(rec, 8, &len); + field = rec_get_nth_field_old(rec, 8, &len); table->cluster_name = mem_heap_strdupl(heap, (char*) field, len); #endif } + /* The high-order bit of MIX_LEN is the "compact format" flag */ + field = rec_get_nth_field_old(rec, 7, &len); + table->comp = !!(mach_read_from_1(field) & 0x80); + if ((table->type == DICT_TABLE_CLUSTER) || (table->type == DICT_TABLE_CLUSTER_MEMBER)) { - - field = rec_get_nth_field(rec, 7, &len); - table->mix_len = mach_read_from_4(field); + + table->mix_len = mach_read_from_4(field) & 0x7fffffff; } btr_pcur_close(&pcur); @@ -891,6 +908,7 @@ dict_load_table_on_id( sys_tables = dict_sys->sys_tables; sys_table_ids = dict_table_get_next_index( dict_table_get_first_index(sys_tables)); + ut_a(!sys_tables->comp); heap = mem_heap_create(256); tuple = dtuple_create(heap, 1); @@ -907,7 +925,7 @@ dict_load_table_on_id( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_tables->comp)) { /* Not found */ btr_pcur_close(&pcur); @@ -922,7 +940,7 @@ dict_load_table_on_id( table ID and NAME */ rec = btr_pcur_get_rec(&pcur); - field = 
rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_ad(len == 8); /* Check if the table id in record is the one searched for */ @@ -936,7 +954,7 @@ dict_load_table_on_id( } /* Now we get the table name from the record */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); /* Load the table definition to memory */ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len)); @@ -1004,6 +1022,7 @@ dict_load_foreign_cols( sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); + ut_a(!sys_foreign_cols->comp); tuple = dtuple_create(foreign->heap, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1018,21 +1037,21 @@ dict_load_foreign_cols( rec = btr_pcur_get_rec(&pcur); ut_a(btr_pcur_is_on_user_rec(&pcur, &mtr)); - ut_a(!rec_get_deleted_flag(rec)); - - field = rec_get_nth_field(rec, 0, &len); + ut_a(!rec_get_deleted_flag(rec, sys_foreign_cols->comp)); + + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == ut_strlen(id)); ut_a(ut_memcmp(id, field, len) == 0); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); ut_a(len == 4); ut_a(i == mach_read_from_4(field)); - field = rec_get_nth_field(rec, 4, &len); + field = rec_get_nth_field_old(rec, 4, &len); foreign->foreign_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); - field = rec_get_nth_field(rec, 5, &len); + field = rec_get_nth_field_old(rec, 5, &len); foreign->referenced_col_names[i] = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1076,6 +1095,7 @@ dict_load_foreign( sys_foreign = dict_table_get_low("SYS_FOREIGN"); sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes); + ut_a(!sys_foreign->comp); tuple = dtuple_create(heap2, 1); dfield = dtuple_get_nth_field(tuple, 0); @@ -1088,7 +1108,7 @@ dict_load_foreign( rec = btr_pcur_get_rec(&pcur); if (!btr_pcur_is_on_user_rec(&pcur, &mtr) - || rec_get_deleted_flag(rec)) { + || rec_get_deleted_flag(rec, sys_foreign->comp)) { /* Not found */ fprintf(stderr, @@ -1102,7 +1122,7 @@ dict_load_foreign( return(DB_ERROR); } - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); /* Check if the id in record is the searched one */ if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) { @@ -1125,7 +1145,8 @@ dict_load_foreign( foreign = dict_mem_foreign_create(); - foreign->n_fields = mach_read_from_4(rec_get_nth_field(rec, 5, &len)); + foreign->n_fields = + mach_read_from_4(rec_get_nth_field_old(rec, 5, &len)); ut_a(len == 4); @@ -1136,11 +1157,11 @@ dict_load_foreign( foreign->id = mem_heap_strdup(foreign->heap, id); - field = rec_get_nth_field(rec, 3, &len); + field = rec_get_nth_field_old(rec, 3, &len); foreign->foreign_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); - - field = rec_get_nth_field(rec, 4, &len); + + field = rec_get_nth_field_old(rec, 4, &len); foreign->referenced_table_name = mem_heap_strdupl(foreign->heap, (char*) field, len); @@ -1209,6 +1230,7 @@ dict_load_foreigns( return(DB_ERROR); } + ut_a(!sys_foreign->comp); mtr_start(&mtr); /* Get the secondary index based on FOR_NAME from table @@ -1240,22 +1262,36 @@ loop: name and a foreign constraint ID */ rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); - /* Check if the table name in record is the one searched for */ - if (len != ut_strlen(table_name) - || 0 != ut_memcmp(field, table_name, len)) { + /* Check if 
the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + if (0 != cmp_data_data(dfield_get_type(dfield), + dfield_get_data(dfield), dfield_get_len(dfield), + field, len)) { + goto load_next_index; } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. */ + + if (0 != ut_memcmp(field, table_name, len)) { + + goto next_rec; + } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, sys_foreign->comp)) { goto next_rec; } /* Now we get a foreign key constraint id */ - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); id = mem_heap_strdupl(heap, (char*) field, len); btr_pcur_store_position(&pcur, &mtr); diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index 1d45585aac1..48b9f28d292 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -35,7 +35,8 @@ dict_mem_table_create( the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols) /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp) /* in: TRUE=compact page format */ { dict_table_t* table; mem_heap_t* heap; @@ -54,6 +55,7 @@ dict_mem_table_create( table->space = space; table->ibd_file_missing = FALSE; table->tablespace_discarded = FALSE; + table->comp = comp; table->n_def = 0; table->n_cols = n_cols + DATA_N_SYS_COLS; table->mem_fix = 0; @@ -110,7 +112,8 @@ dict_mem_cluster_create( { dict_table_t* cluster; - cluster = dict_mem_table_create(name, space, n_cols); + /* Clustered tables cannot work with the compact record format. */ + cluster = dict_mem_table_create(name, space, n_cols, FALSE); cluster->type = DICT_TABLE_CLUSTER; cluster->mix_len = mix_len; @@ -197,7 +200,7 @@ dict_mem_index_create( index->name = mem_heap_strdup(heap, index_name); index->table_name = table_name; index->table = NULL; - index->n_def = 0; + index->n_def = index->n_nullable = 0; index->n_fields = n_fields; index->fields = mem_heap_alloc(heap, 1 + n_fields * sizeof(dict_field_t)); diff --git a/innobase/eval/eval0eval.c b/innobase/eval/eval0eval.c index ebb6cb1b7d9..5b2d1f857b1 100644 --- a/innobase/eval/eval0eval.c +++ b/innobase/eval/eval0eval.c @@ -627,7 +627,11 @@ eval_concat( } /********************************************************************* -Evaluates a predefined function node. */ +Evaluates a predefined function node. If the first argument is an integer, +this function looks at the second argument which is the integer length in +bytes, and converts the integer to a VARCHAR. +If the first argument is of some other type, this function converts it to +BINARY. 
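The dict_load_foreigns() hunk above now matches the FOR_NAME key in two steps: first with cmp_data_data() under the case-insensitive latin1_swedish_ci collation of the SYS_FOREIGN index, then with ut_memcmp(), because on Unix MySQL allows table names that differ only in letter case. A rough standalone sketch of the same two-step decision; strncasecmp() stands in here for the collation-aware comparison and is only an approximation of it.

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Outcome of matching one SYS_FOREIGN record against the searched name. */
enum name_match {
	NAME_MATCH,		/* process this record */
	NAME_SKIP_RECORD,	/* same name under the collation, different
				binary spelling: try the next record */
	NAME_END_OF_RANGE	/* different name: move on to the next index */
};

static enum name_match
match_for_name(const char* searched, const char* field, size_t len)
{
	if (len != strlen(searched)
	    || 0 != strncasecmp(searched, field, len)) {
		/* the collation-insensitive key no longer matches */
		return(NAME_END_OF_RANGE);
	}

	if (0 != memcmp(searched, field, len)) {
		/* equal ignoring case, but not byte for byte */
		return(NAME_SKIP_RECORD);
	}

	return(NAME_MATCH);
}

int
main(void)
{
	const char*	stored = "test/Child";	/* hypothetical SYS_FOREIGN entry */

	printf("%d\n", match_for_name("test/child", stored, strlen(stored)));
	printf("%d\n", match_for_name("test/Child", stored, strlen(stored)));
	printf("%d\n", match_for_name("test/other", stored, strlen(stored)));
	return(0);
}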
*/ UNIV_INLINE void eval_to_binary( @@ -638,12 +642,24 @@ eval_to_binary( que_node_t* arg2; dfield_t* dfield; byte* str1; + ulint len; ulint len1; arg1 = func_node->args; str1 = dfield_get_data(que_node_get_val(arg1)); + if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) { + + len = dfield_get_len(que_node_get_val(arg1)); + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1, len); + + return; + } + arg2 = que_node_get_next(arg1); len1 = (ulint)eval_node_get_int_val(arg2); diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 7d57468f632..dea48117e00 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -88,6 +88,9 @@ but in the MySQL Embedded Server Library and ibbackup it is not the default directory, and we must set the base file path explicitly */ const char* fil_path_to_mysql_datadir = "."; +/* The number of fsyncs done to the log */ +ulint fil_n_log_flushes = 0; + ulint fil_n_pending_log_flushes = 0; ulint fil_n_pending_tablespace_flushes = 0; @@ -106,7 +109,7 @@ struct fil_node_struct { device or a raw disk partition */ ulint size; /* size of the file in database pages, 0 if not known yet; the possible last incomplete - megabyte is ignored if space == 0 */ + megabyte may be ignored if space == 0 */ ulint n_pending; /* count of pending i/o's on this file; closing of the file is not allowed if @@ -160,7 +163,9 @@ struct fil_space_struct { UT_LIST_BASE_NODE_T(fil_node_t) chain; /* base node for the file chain */ ulint size; /* space size in pages; 0 if a single-table - tablespace whose size we do not know yet */ + tablespace whose size we do not know yet; + last incomplete megabytes in data files may be + ignored if space == 0 */ ulint n_reserved_extents; /* number of reserved free extents for ongoing operations like B-tree page split */ @@ -1574,30 +1579,38 @@ fil_op_write_log( mtr_t* mtr) /* in: mini-transaction handle */ { byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } - log_ptr = mlog_open(mtr, 30); - log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0, log_ptr, mtr); /* Let us store the strings as null-terminated for easier readability and handling */ - mach_write_to_2(log_ptr, ut_strlen(name) + 1); + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) name, ut_strlen(name) + 1); + mlog_catenate_string(mtr, (byte*) name, len); if (type == MLOG_FILE_RENAME) { - log_ptr = mlog_open(mtr, 30); - mach_write_to_2(log_ptr, ut_strlen(new_name) + 1); + ulint len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); log_ptr += 2; - mlog_close(mtr, log_ptr); - mlog_catenate_string(mtr, (byte*) new_name, - ut_strlen(new_name) + 1); + mlog_catenate_string(mtr, (byte*) new_name, len); } } #endif @@ -3255,7 +3268,7 @@ fil_extend_space_to_desired_size( ulint* actual_size, /* out: size of the space after extension; if we ran out of disk space this may be lower than the desired size */ - ulint space_id, /* in: space id, must be != 0 */ + ulint space_id, /* in: space id */ ulint size_after_extend)/* in: desired size in pages after the extension; if the current space size is bigger than this already, the function does nothing */ @@ -3352,6 +3365,17 @@ fil_extend_space_to_desired_size( fil_node_complete_io(node, system, OS_FILE_WRITE); 
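In the fil_op_write_log() hunk above, the mini-transaction log record now stores each file name as a 2-byte big-endian length (including the terminating NUL, kept for readability) followed by the NUL-terminated string, and a rename simply repeats the pair for the new name. A small standalone sketch of that payload layout; the helper below merely imitates mach_write_to_2() and mlog_catenate_string() and is not the real logging code.

#include <stdio.h>
#include <string.h>

/* Append one "length + NUL-terminated name" pair and return the new offset. */
static size_t
append_file_name(unsigned char* buf, size_t at, const char* name)
{
	size_t	len = strlen(name) + 1;		/* include the NUL byte */

	buf[at] = (unsigned char) (len >> 8);	/* big-endian, as mach_write_to_2() */
	buf[at + 1] = (unsigned char) (len & 0xff);

	memcpy(buf + at + 2, name, len);	/* the catenated string */

	return(at + 2 + len);
}

int
main(void)
{
	unsigned char	buf[512];
	size_t		end = 0;

	/* a rename logs both the old and the new name */
	end = append_file_name(buf, end, "test/t1.ibd");
	end = append_file_name(buf, end, "test/t2.ibd");

	printf("%zu bytes of name payload\n", end);
	return(0);
}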
*actual_size = space->size; + + if (space_id == 0) { + ulint pages_per_mb = (1024 * 1024) / UNIV_PAGE_SIZE; + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + srv_data_file_sizes[srv_n_data_files - 1] = + (node->size / pages_per_mb) * pages_per_mb; + } + /* printf("Extended %s to %lu, actual size %lu pages\n", space->name, size_after_extend, *actual_size); */ @@ -3671,6 +3695,12 @@ fil_io( mode = OS_AIO_NORMAL; } + if (type == OS_FILE_READ) { + srv_data_read+= len; + } else if (type == OS_FILE_WRITE) { + srv_data_written+= len; + } + /* Reserve the fil_system mutex and make sure that we can open at least one file while holding it, if the file is not already open */ @@ -3956,6 +3986,7 @@ fil_flush( fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; + fil_n_log_flushes++; } #ifdef __WIN__ if (node->is_raw_disk) { diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index e1621cc2765..ef8e70646c6 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -910,7 +910,7 @@ fsp_header_init( if (space == 0) { fsp_fill_free_list(FALSE, space, header, mtr); btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, - ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr); + ut_dulint_add(DICT_IBUF_ID_MIN, space), FALSE, mtr); } else { fsp_fill_free_list(TRUE, space, header, mtr); } diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index b3c8ade2414..c7ca03f9901 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -46,7 +46,7 @@ Note that contary to what we planned in the 1990's, there will only be one insert buffer tree, and that is in the system tablespace of InnoDB. 1. The first field is the space id. -2. The second field is a one-byte marker which differentiates records from +2. The second field is a one-byte marker (0) which differentiates records from the < 4.1.x storage format. 3. The third field is the page number. 4. The fourth field contains the type info, where we have also added 2 bytes to @@ -55,7 +55,14 @@ insert buffer tree, and that is in the system tablespace of InnoDB. can use in the binary search on the index page in the ibuf merge phase. 5. The rest of the fields contain the fields of the actual index record. -*/ +In versions >= 5.0.3: + +The first byte of the fourth field is an additional marker (0) if the record +is in the compact format. The presence of this marker can be detected by +looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. + +The high-order bit of the character set field in the type info is the +"nullable" flag for the field. 
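When the system tablespace (space 0) grows, the fil_extend_space_to_desired_size() hunk above keeps srv_data_file_sizes[] current by rounding the last data file's size down to whole megabytes. A toy version of that rounding, assuming the usual 16 KB page size (64 pages per megabyte):

#include <stdio.h>

#define UNIV_PAGE_SIZE	(16 * 1024)	/* assumed 16 KB pages */

int
main(void)
{
	unsigned long	node_size_pages = 6437;	/* 100 MB plus a partial megabyte */
	unsigned long	pages_per_mb = (1024 * 1024) / UNIV_PAGE_SIZE;	/* 64 */
	unsigned long	rounded;

	/* keep the last data file size info up to date, rounded down to
	full megabytes */
	rounded = (node_size_pages / pages_per_mb) * pages_per_mb;

	printf("%lu pages -> %lu pages (%lu MB)\n",
	       node_size_pages, rounded, rounded / pages_per_mb);
	return(0);
}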
*/ /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM @@ -525,8 +532,8 @@ ibuf_data_init_for_space( ibuf_exit(); sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space); - - table = dict_mem_table_create(buf, space, 2); + /* use old-style record format for the insert buffer */ + table = dict_mem_table_create(buf, space, 2, FALSE); dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0); dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0); @@ -1049,20 +1056,20 @@ ibuf_rec_get_page_no( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 2, &len); + field = rec_get_nth_field_old(rec, 2, &len); } else { ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); } ut_a(len == 4); @@ -1084,15 +1091,15 @@ ibuf_rec_get_space( ulint len; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(rec) > 2); + ut_ad(rec_get_n_fields_old(rec) > 2); - field = rec_get_nth_field(rec, 1, &len); + field = rec_get_nth_field_old(rec, 1, &len); if (len == 1) { /* This is of the >= 4.1.x record format */ ut_a(trx_sys_multiple_tablespace_format); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); return(mach_read_from_4(field)); @@ -1105,6 +1112,161 @@ ibuf_rec_get_space( } /************************************************************************ +Creates a dummy index for inserting a record to a non-clustered index. +*/ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + /* out: dummy index */ + ulint n, /* in: number of fields */ + ibool comp) /* in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + table = dict_mem_table_create("IBUF_DUMMY", + DICT_HDR_SPACE, n, comp); + index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + return(index); +} +/************************************************************************ +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*====================*/ + dict_index_t* index, /* in: dummy index */ + dtype_t* type) /* in: the data type of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, "DUMMY", + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type), + dtype_get_prec(type)); + dict_index_add_col(index, + dict_table_get_nth_col(index->table, i), 0, 0); +} +/************************************************************************ +Deallocates a dummy index for inserting a record to a non-clustered index. +*/ +static +void +ibuf_dummy_index_free( +/*====================*/ + dict_index_t* index) /* in: dummy index */ +{ + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); +} + +/************************************************************************* +Builds the entry to insert into a non-clustered index when we have the +corresponding record in an ibuf index. 
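The record-format comment and the ibuf_rec_get_page_no()/ibuf_rec_get_space() hunks above rely on field 1 being a single 0 marker byte only in the >= 4.1.x multi-tablespace layout; in the old layout that position holds the multi-byte type-info blob, so its length alone distinguishes the two. A trivial standalone illustration of that dispatch:

#include <stdio.h>

/* Field layout of an insert buffer record, keyed off the length of field 1,
exactly as ibuf_rec_get_page_no() and ibuf_rec_get_space() do. */
static void
print_ibuf_layout(unsigned long len_of_field_1)
{
	if (len_of_field_1 == 1) {
		puts(">= 4.1.x: 0=space id, 1=marker (0), 2=page no,"
		     " 3=type info, 4..=user columns");
	} else {
		puts("<  4.1.x: 0=page no, 1=type info, 2..=user columns");
	}
}

int
main(void)
{
	print_ibuf_layout(1);	/* marker byte present */
	print_ibuf_layout(12);	/* old-format type-info blob */
	return(0);
}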
*/ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec( +/*===========================*/ + /* out, own: entry to insert to + a non-clustered index; NOTE that + as we copy pointers to fields in + ibuf_rec, the caller must hold a + latch to the ibuf_rec page as long + as the entry is used! */ + rec_t* ibuf_rec, /* in: record in an insert buffer */ + mem_heap_t* heap, /* in: heap where built */ + dict_index_t** pindex) /* out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + byte* types; + const byte* data; + ulint len; + ulint i; + dict_index_t* index; + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); + + if (len > 1) { + /* This a < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + *pindex = ibuf_dummy_index_create(n_fields, FALSE); + return(tuple); + } + + /* This a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > 4); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + index = ibuf_dummy_index_create(n_fields, + len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + len--; + ut_a(*types == 0); + types++; + } + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field)); + } + + *pindex = index; + return(tuple); +} + +/************************************************************************ Returns the space taken by a stored non-clustered index entry if converted to an index record. 
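ibuf_build_entry_from_ibuf_rec() above walks the user columns at different offsets depending on the layout: old-format records keep column i in field i + 2 with 4-byte type buffers, new-format records keep it in field i + 4 with 6-byte type buffers (shifted by one more byte when the leading compact-format marker is present). A small sketch of that addressing; the old buffer size of 4 is an assumption used only for illustration.

#include <stdio.h>

#define DATA_ORDER_NULL_TYPE_BUF_SIZE		4	/* assumed old size */
#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE	6

/* Locate the data field and the type-byte offset of user column i. The
type_offset is relative to the start of the per-column type bytes, i.e.
after any leading compact-format marker byte. */
static void
locate_user_column(int new_format, unsigned i,
		   unsigned* data_field, unsigned* type_offset)
{
	if (new_format) {
		*data_field = i + 4;	/* after space id, marker, page no, types */
		*type_offset = i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
	} else {
		*data_field = i + 2;	/* after page no and types */
		*type_offset = i * DATA_ORDER_NULL_TYPE_BUF_SIZE;
	}
}

int
main(void)
{
	unsigned	f, t;

	locate_user_column(1, 2, &f, &t);
	printf("new format, column 2: field %u, type bytes at %u\n", f, t);

	locate_user_column(0, 2, &f, &t);
	printf("old format, column 2: field %u, type bytes at %u\n", f, t);
	return(0);
}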
*/ static @@ -1125,43 +1287,60 @@ ibuf_rec_get_volume( ulint i; ut_ad(ibuf_inside()); - ut_ad(rec_get_n_fields(ibuf_rec) > 2); - - data = rec_get_nth_field(ibuf_rec, 1, &len); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); if (len > 1) { - /* < 4.1.x format record */ + /* < 4.1.x format record */ ut_a(trx_doublewrite_must_reset_space_ids); ut_a(!trx_sys_multiple_tablespace_format); - n_fields = rec_get_n_fields(ibuf_rec) - 2; + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; - types = rec_get_nth_field(ibuf_rec, 1, &len); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); } else { - /* >= 4.1.x format record */ + /* >= 4.1.x format record */ ut_a(trx_sys_multiple_tablespace_format); - new_format = TRUE; + ut_a(*data == 0); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + dtuple_t* entry = + ibuf_build_entry_from_ibuf_rec( + ibuf_rec, heap, &dummy_index); + volume = rec_get_converted_size(dummy_index, entry); + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + return(volume + page_dir_calc_reserved_space(1)); + } - n_fields = rec_get_n_fields(ibuf_rec) - 4; + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; - types = rec_get_nth_field(ibuf_rec, 3, &len); + new_format = TRUE; } for (i = 0; i < n_fields; i++) { if (new_format) { - data = rec_get_nth_field(ibuf_rec, i + 4, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); dtype_new_read_for_order_and_null_size(&dtype, - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); } else { - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); dtype_read_for_order_and_null_size(&dtype, - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); } if (len == UNIV_SQL_NULL) { @@ -1187,6 +1366,7 @@ ibuf_entry_build( must be kept because we copy pointers to its fields */ dtuple_t* entry, /* in: entry for a non-clustered index */ + ibool comp, /* in: flag: TRUE=compact record format */ ulint space, /* in: space id */ ulint page_no,/* in: index page number where entry should be inserted */ @@ -1202,11 +1382,14 @@ ibuf_entry_build( /* Starting from 4.1.x, we have to build a tuple whose (1) first field is the space id, - (2) the second field a single marker byte to tell that this + (2) the second field a single marker byte (0) to tell that this is a new format record, (3) the third contains the page number, and (4) the fourth contains the relevent type information of each data - field, + field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is + (a) 0 for b-trees in the old format, and + (b) 1 for b-trees in the compact format, the first byte of the field + being the marker (0); (5) and the rest of the fields are copied from entry. All fields in the tuple are ordered like the type binary in our insert buffer tree. 
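As the ibuf_entry_build() comment above spells out, the type-info field is n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes long for old-format target indexes, plus one leading 0 byte when the target index uses the compact format, so the flag travels in the field length modulo the buffer size. A standalone sketch of the encode side; only the length bookkeeping is shown, the per-column type bytes are left zeroed.

#include <stdio.h>
#include <string.h>

#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE	6

/* Build the type-info field of an ibuf entry; comp must be 0 or 1.
The reader recovers comp as length % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. */
static unsigned long
build_type_info(unsigned char* buf, unsigned n_fields, int comp)
{
	unsigned long	len = n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + comp;

	memset(buf, 0, len);

	if (comp) {
		buf[0] = 0;	/* the compact format indicator */
	}

	/* the per-column type bytes would be filled in here */

	return(len);
}

int
main(void)
{
	unsigned char	buf[256];
	unsigned long	len = build_type_info(buf, 3, 1);

	printf("len %lu, remainder %lu\n", len,
	       len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);	/* 19, 1 */
	return(0);
}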
*/ @@ -1247,10 +1430,15 @@ ibuf_entry_build( dfield_set_data(field, buf, 4); + ut_ad(comp == 0 || comp == 1); /* Store the type info in buf2, and add the fields from entry to tuple */ buf2 = mem_heap_alloc(heap, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); + if (comp) { + *buf2++ = 0; /* write the compact format indicator */ + } for (i = 0; i < n_fields; i++) { /* We add 4 below because we have the 4 extra fields at the start of an ibuf record */ @@ -1268,8 +1456,13 @@ ibuf_entry_build( field = dtuple_get_nth_field(tuple, 3); + if (comp) { + buf2--; + } + dfield_set_data(field, buf2, n_fields - * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + comp); /* Set all the types in the new tuple binary */ dtuple_set_types_binary(tuple, n_fields + 4); @@ -1278,88 +1471,6 @@ ibuf_entry_build( } /************************************************************************* -Builds the entry to insert into a non-clustered index when we have the -corresponding record in an ibuf index. */ -static -dtuple_t* -ibuf_build_entry_from_ibuf_rec( -/*===========================*/ - /* out, own: entry to insert to - a non-clustered index; NOTE that - as we copy pointers to fields in - ibuf_rec, the caller must hold a - latch to the ibuf_rec page as long - as the entry is used! */ - rec_t* ibuf_rec, /* in: record in an insert buffer */ - mem_heap_t* heap) /* in: heap where built */ -{ - dtuple_t* tuple; - dfield_t* field; - ulint n_fields; - byte* types; - byte* data; - ulint len; - ulint i; - - data = rec_get_nth_field(ibuf_rec, 1, &len); - - if (len > 1) { - /* This a < 4.1.x format record */ - - ut_a(trx_doublewrite_must_reset_space_ids); - ut_a(!trx_sys_multiple_tablespace_format); - - n_fields = rec_get_n_fields(ibuf_rec) - 2; - tuple = dtuple_create(heap, n_fields); - types = rec_get_nth_field(ibuf_rec, 1, &len); - - ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 2, &len); - - dfield_set_data(field, data, len); - - dtype_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); - } - - /* This a >= 4.1.x format record */ - - ut_a(trx_sys_multiple_tablespace_format); - - ut_a(rec_get_n_fields(ibuf_rec) > 4); - - n_fields = rec_get_n_fields(ibuf_rec) - 4; - - tuple = dtuple_create(heap, n_fields); - - types = rec_get_nth_field(ibuf_rec, 3, &len); - - ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - - for (i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); - - data = rec_get_nth_field(ibuf_rec, i + 4, &len); - - dfield_set_data(field, data, len); - - dtype_new_read_for_order_and_null_size( - dfield_get_type(field), - types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - } - - return(tuple); -} - -/************************************************************************* Builds a search tuple used to search buffered inserts for an index page. 
This is for < 4.1.x format records */ static @@ -2047,8 +2158,7 @@ loop: mutex_exit(&ibuf_mutex); sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, page_nos, &n_stored); #ifdef UNIV_IBUF_DEBUG /* fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", sync, n_stored, sum_sizes); */ @@ -2344,6 +2454,7 @@ ibuf_update_max_tablespace_id(void) ibuf_data = fil_space_get_ibuf_data(0); ibuf_index = ibuf_data->index; + ut_a(!ibuf_index->table->comp); ibuf_enter(); @@ -2360,7 +2471,7 @@ ibuf_update_max_tablespace_id(void) } else { rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field_old(rec, 0, &len); ut_a(len == 4); @@ -2479,7 +2590,7 @@ ibuf_insert_low( ibuf_enter(); } - entry_size = rec_get_converted_size(entry); + entry_size = rec_get_converted_size(index, entry); heap = mem_heap_create(512); @@ -2487,7 +2598,8 @@ ibuf_insert_low( the first fields and the type information for other fields, and which will be inserted to the insert buffer. */ - ibuf_entry = ibuf_entry_build(entry, space, page_no, heap); + ibuf_entry = ibuf_entry_build(entry, index->table->comp, + space, page_no, heap); /* Open a cursor to the insert buffer tree to calculate if we can add the new entry to it without exceeding the free space limit for the @@ -2532,8 +2644,8 @@ ibuf_insert_low( do_merge = TRUE; ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - space_ids, space_versions, page_nos, - &n_stored); + space_ids, space_versions, + page_nos, &n_stored); goto function_exit; } @@ -2656,8 +2768,8 @@ ibuf_insert( ut_a(!(index->type & DICT_CLUSTERED)); - if (rec_get_converted_size(entry) - >= page_get_free_space_of_empty() / 2) { + if (rec_get_converted_size(index, entry) + >= page_get_free_space_of_empty(index->table->comp) / 2) { return(FALSE); } @@ -2692,6 +2804,7 @@ ibuf_insert_to_index_page( dtuple_t* entry, /* in: buffered entry to insert */ page_t* page, /* in: index page where the buffered entry should be placed */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { page_cur_t page_cur; @@ -2699,17 +2812,28 @@ ibuf_insert_to_index_page( rec_t* rec; page_t* bitmap_page; ulint old_bits; + mem_heap_t* heap; ut_ad(ibuf_inside()); ut_ad(dtuple_check_typed(entry)); - if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page))) - != dtuple_get_n_fields(entry)) { - - fprintf(stderr, + if (index->table->comp != page_is_comp(page)) { + fputs( "InnoDB: Trying to insert a record from the insert buffer to an index page\n" -"InnoDB: but the number of fields does not match!\n"); +"InnoDB: but the 'compact' flag does not match!\n", stderr); + goto dump; + } + + heap = mem_heap_create(100); + rec = page_rec_get_next(page_get_infimum_rec(page)); + if (rec_offs_n_fields(rec_get_offsets(rec, index, ULINT_UNDEFINED, + heap)) != dtuple_get_n_fields(entry)) { + mem_heap_free(heap); + fputs( +"InnoDB: Trying to insert a record from the insert buffer to an index page\n" +"InnoDB: but the number of fields does not match!\n", stderr); + dump: buf_page_print(page); dtuple_print(stderr, entry); @@ -2723,31 +2847,35 @@ ibuf_insert_to_index_page( return; } - low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + mem_heap_free(heap); + low_match = page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); if (low_match == dtuple_get_n_fields(entry)) { rec = page_cur_get_rec(&page_cur); - btr_cur_del_unmark_for_ibuf(rec, mtr); + 
btr_cur_del_unmark_for_ibuf(rec, index, mtr); } else { - rec = page_cur_tuple_insert(&page_cur, entry, mtr); + rec = page_cur_tuple_insert(&page_cur, entry, index, mtr); if (rec == NULL) { /* If the record did not fit, reorganize */ - btr_page_reorganize(page, mtr); + btr_page_reorganize(page, index, mtr); - page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); + page_cur_search(page, index, entry, + PAGE_CUR_LE, &page_cur); /* This time the record must fit */ - if (!page_cur_tuple_insert(&page_cur, entry, mtr)) { + if (!page_cur_tuple_insert(&page_cur, entry, + index, mtr)) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n", (ulong) page_get_max_insert_size(page, 1), - (ulong) rec_get_converted_size(entry)); + (ulong) rec_get_converted_size(index, entry)); fputs("InnoDB: Cannot insert index record ", stderr); dtuple_print(stderr, entry); @@ -2836,11 +2964,12 @@ ibuf_delete_rec( "InnoDB: ibuf record inserted to page %lu\n", (ulong) page_no); fflush(stderr); - rec_print(stderr, btr_pcur_get_rec(pcur)); - rec_print(stderr, pcur->old_rec); + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); dtuple_print(stderr, search_tuple); - rec_print(stderr, page_rec_get_next(btr_pcur_get_rec(pcur))); + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); fflush(stderr); btr_pcur_commit_specify_mtr(pcur, mtr); @@ -2866,6 +2995,8 @@ ibuf_delete_rec( #ifdef UNIV_IBUF_DEBUG ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); +#else + UT_NOT_USED(space); #endif ibuf_data_sizes_update(ibuf_data, root, mtr); @@ -3073,7 +3204,7 @@ loop: if (corruption_noticed) { fputs("InnoDB: Discarding record\n ", stderr); - rec_print(stderr, ibuf_rec); + rec_print_old(stderr, ibuf_rec); fputs("\n from the insert buffer!\n\n", stderr); } else if (page) { /* Now we have at pcur a record which should be @@ -3081,19 +3212,22 @@ loop: copies pointers to fields in ibuf_rec, and we must keep the latch to the ibuf_rec page until the insertion is finished! 
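The ibuf_insert_to_index_page() hunk above takes one of three paths when a buffered entry is merged: skip the entry (after dumping diagnostics) if the page's 'compact' flag or field count does not match the dummy index, clear the delete mark if a fully matching record is already on the page, otherwise insert the entry, reorganizing the page and retrying once if the first insert does not fit. A compact standalone sketch of that decision, with the reorganize-and-retry detail left to the surrounding hunk:

#include <stdio.h>

enum merge_action {
	MERGE_SKIP,		/* dump diagnostics and drop the entry */
	MERGE_UNMARK_DELETE,	/* record already present: clear delete mark */
	MERGE_INSERT		/* insert; on failure reorganize and retry */
};

static enum merge_action
merge_decision(int page_is_comp, int index_is_comp,
	       unsigned long n_page_fields, unsigned long n_entry_fields,
	       unsigned long low_match)
{
	if (page_is_comp != index_is_comp) {
		return(MERGE_SKIP);		/* 'compact' flag mismatch */
	}

	if (n_page_fields != n_entry_fields) {
		return(MERGE_SKIP);		/* field count mismatch */
	}

	if (low_match == n_entry_fields) {
		return(MERGE_UNMARK_DELETE);
	}

	return(MERGE_INSERT);
}

int
main(void)
{
	printf("%d\n", merge_decision(1, 1, 5, 5, 5));	/* unmark delete */
	printf("%d\n", merge_decision(1, 1, 5, 5, 3));	/* insert */
	printf("%d\n", merge_decision(0, 1, 5, 5, 3));	/* skip */
	return(0);
}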
*/ - - dulint max_trx_id = page_get_max_trx_id( + dict_index_t* dummy_index; + dulint max_trx_id = page_get_max_trx_id( buf_frame_align(ibuf_rec)); page_update_max_trx_id(page, max_trx_id); - entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap); + entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, + heap, &dummy_index); #ifdef UNIV_IBUF_DEBUG - volume += rec_get_converted_size(entry) + volume += rec_get_converted_size(dummy_index, entry) + page_dir_calc_reserved_space(1); ut_a(volume <= 4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif - ibuf_insert_to_index_page(entry, page, &mtr); + ibuf_insert_to_index_page(entry, page, + dummy_index, &mtr); + ibuf_dummy_index_free(dummy_index); } n_inserts++; @@ -3267,11 +3401,11 @@ leave_loop: ibuf_data->n_merged_recs += n_inserts; mutex_exit(&ibuf_mutex); - + /* fprintf(stderr, "InnoDB: Discarded %lu ibuf entries for space %lu\n", (ulong) n_inserts, (ulong) space); - + */ ibuf_exit(); mem_heap_free(heap); diff --git a/innobase/include/btr0btr.h b/innobase/include/btr0btr.h index 8606fcd2a5c..0b19e64d4e0 100644 --- a/innobase/include/btr0btr.h +++ b/innobase/include/btr0btr.h @@ -155,7 +155,8 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec); /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /**************************************************************** Creates the root node for a new index tree. */ @@ -167,6 +168,7 @@ btr_create( ulint type, /* in: type of the index */ ulint space, /* in: space where created */ dulint index_id,/* in: index id */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr); /* in: mini-transaction handle */ /**************************************************************** Frees a B-tree except the root page, which MUST be freed after this @@ -210,8 +212,9 @@ Reorganizes an index page. */ void btr_page_reorganize( /*================*/ - page_t* page, /* in: page to be reorganized */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Decides if the page should be split at the convergence point of inserts converging to left. */ @@ -273,6 +276,7 @@ void btr_set_min_rec_mark( /*=================*/ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes on the upper level the node pointer to a page. */ @@ -332,6 +336,7 @@ btr_parse_set_min_rec_mark( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** @@ -340,11 +345,12 @@ Parses a redo log record of reorganizing a page. 
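The btr0btr.h declaration above and its btr0btr.ic implementation below now locate the child pointer through the offsets array returned by rec_get_offsets(), but the pointer itself is unchanged: the last field of the node pointer record, a 4-byte big-endian page number. A standalone sketch of that read; read_be4() is a plain stand-in for mach_read_from_4().

#include <stdio.h>

/* Big-endian 4-byte read, the byte order used for stored page numbers. */
static unsigned long
read_be4(const unsigned char* b)
{
	return(((unsigned long) b[0] << 24) | ((unsigned long) b[1] << 16)
	       | ((unsigned long) b[2] << 8) | (unsigned long) b[3]);
}

int
main(void)
{
	/* the last field of a node pointer record: child page 1234 */
	unsigned char	child_ptr[4] = {0x00, 0x00, 0x04, 0xd2};

	printf("child page no %lu\n", read_be4(child_ptr));
	return(0);
}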
*/ byte* btr_parse_page_reorganize( /*======================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /****************************************************************** Gets the number of pages in a B-tree. */ diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic index b0aa0756307..1d1f97d3668 100644 --- a/innobase/include/btr0btr.ic +++ b/innobase/include/btr0btr.ic @@ -183,17 +183,18 @@ ulint btr_node_ptr_get_child_page_no( /*===========================*/ /* out: child node address */ - rec_t* rec) /* in: node pointer record */ + rec_t* rec, /* in: node pointer record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; byte* field; ulint len; ulint page_no; - n_fields = rec_get_n_fields(rec); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); /* The child address is in the last field */ - field = rec_get_nth_field(rec, n_fields - 1, &len); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); ut_ad(len == 4); diff --git a/innobase/include/btr0cur.h b/innobase/include/btr0cur.h index f1334656d53..0a8d8ceaeb7 100644 --- a/innobase/include/btr0cur.h +++ b/innobase/include/btr0cur.h @@ -34,7 +34,7 @@ page_cur_t* btr_cur_get_page_cur( /*=================*/ /* out: pointer to page cursor component */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the record pointer of a tree cursor. */ UNIV_INLINE @@ -42,14 +42,14 @@ rec_t* btr_cur_get_rec( /*============*/ /* out: pointer to record */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Invalidates a tree cursor by setting record pointer to NULL. */ UNIV_INLINE void btr_cur_invalidate( /*===============*/ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the page of a tree cursor. */ UNIV_INLINE @@ -57,7 +57,7 @@ page_t* btr_cur_get_page( /*=============*/ /* out: pointer to page */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Returns the tree of a cursor. */ UNIV_INLINE @@ -65,7 +65,7 @@ dict_tree_t* btr_cur_get_tree( /*=============*/ /* out: tree */ - btr_cur_t* cursor); /* in: tree cursor */ + btr_cur_t* cursor);/* in: tree cursor */ /************************************************************* Positions a tree cursor at a given record. */ UNIV_INLINE @@ -283,8 +283,9 @@ only used by the insert buffer insert merge mechanism. */ void btr_cur_del_unmark_for_ibuf( /*========================*/ - rec_t* rec, /* in: record to delete unmark */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record to delete unmark */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Tries to compress a page of the tree on the leaf level. 
It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid @@ -361,10 +362,11 @@ Parses a redo log record of updating a record in-place. */ byte* btr_cur_parse_update_in_place( /*==========================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + dict_index_t* index); /* in: index corresponding to page */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a clustered index record. */ @@ -372,10 +374,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_clust_rec( /*=================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page); /* in: page or NULL */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a secondary index record. */ @@ -383,10 +386,11 @@ index record. */ byte* btr_cur_parse_del_mark_set_sec_rec( /*===============================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: index corresponding to page */ + page_t* page); /* in: page or NULL */ /*********************************************************************** Estimates the number of rows in a given index range. */ @@ -417,9 +421,10 @@ to free the field. */ void btr_cur_mark_extern_inherited_fields( /*=================================*/ - rec_t* rec, /* in: record in a clustered index */ - upd_t* update, /* in: update vector */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update, /* in: update vector */ + mtr_t* mtr); /* in: mtr */ /*********************************************************************** The complement of the previous function: in an update entry may inherit some externally stored fields from a record. 
We must mark them as inherited @@ -456,6 +461,7 @@ btr_store_big_rec_extern_fields( dict_index_t* index, /* in: index of rec; the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ big_rec_t* big_rec_vec, /* in: vector containing fields to be stored externally */ mtr_t* local_mtr); /* in: mtr containing the latch to @@ -496,6 +502,7 @@ btr_rec_free_externally_stored_fields( dict_index_t* index, /* in: index of the data, the index tree MUST be X-latched */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ibool do_not_free_inherited,/* in: TRUE if called in a rollback and we do not want to free inherited fields */ @@ -510,6 +517,7 @@ btr_rec_copy_externally_stored_field( /*=================================*/ /* out: the field copied to heap */ rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint no, /* in: field number */ ulint* len, /* out: length of the field */ mem_heap_t* heap); /* in: mem heap */ @@ -540,10 +548,10 @@ ulint btr_push_update_extern_fields( /*==========================*/ /* out: number of values stored in ext_vect */ - ulint* ext_vect, /* in: array of ulints, must be preallocated - to have place for all fields in rec */ - rec_t* rec, /* in: record */ - upd_t* update); /* in: update vector */ + ulint* ext_vect,/* in: array of ulints, must be preallocated + to have space for all fields in rec */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector or NULL */ /*######################################################################*/ diff --git a/innobase/include/btr0cur.ic b/innobase/include/btr0cur.ic index a3a04b60c45..dcad3e9e14d 100644 --- a/innobase/include/btr0cur.ic +++ b/innobase/include/btr0cur.ic @@ -134,17 +134,15 @@ btr_cur_can_delete_without_compress( /* out: TRUE if can be deleted without recommended compression */ btr_cur_t* cursor, /* in: btr cursor */ + ulint rec_size,/* in: rec_get_size(btr_cur_get_rec(cursor))*/ mtr_t* mtr) /* in: mtr */ { - ulint rec_size; page_t* page; ut_ad(mtr_memo_contains(mtr, buf_block_align( btr_cur_get_page(cursor)), MTR_MEMO_PAGE_X_FIX)); - rec_size = rec_get_size(btr_cur_get_rec(cursor)); - page = btr_cur_get_page(cursor); if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) diff --git a/innobase/include/btr0pcur.h b/innobase/include/btr0pcur.h index 81f19af4d40..6384222be51 100644 --- a/innobase/include/btr0pcur.h +++ b/innobase/include/btr0pcur.h @@ -462,6 +462,7 @@ struct btr_pcur_struct{ contains an initial segment of the latest record cursor was positioned either on, before, or after */ + ulint old_n_fields; /* number of fields in old_rec */ ulint rel_pos; /* BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on whether cursor was on, before, or after the diff --git a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h index ce4140ecf92..73cd95d1464 100644 --- a/innobase/include/btr0sea.h +++ b/innobase/include/btr0sea.h @@ -77,8 +77,10 @@ parameters as page (this often happens when a page is split). 
*/ void btr_search_move_or_delete_hash_entries( /*===================================*/ - page_t* new_page, /* in: records are copied to this page */ - page_t* page); /* in: index page */ + page_t* new_page, /* in: records are copied + to this page */ + page_t* page, /* in: index page */ + dict_index_t* index); /* in: record descriptor */ /************************************************************************ Drops a page hash index. */ @@ -128,9 +130,10 @@ btr_search_update_hash_on_delete( Validates the search system. */ ibool -btr_search_validate(void); -/*=====================*/ - +btr_search_validate( +/*================*/ + /* out: TRUE if ok */ + dict_index_t* index); /* in: record descriptor */ /* Search info directions */ #define BTR_SEA_NO_DIRECTION 1 diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index f72207be29c..b46b8ce40be 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -58,6 +58,8 @@ extern buf_pool_t* buf_pool; /* The buffer pool of the database */ extern ibool buf_debug_prints;/* If this is set TRUE, the program prints info whenever read or flush occurs */ +extern ulint srv_buf_pool_write_requests; /* variable to count write request + issued */ /************************************************************************ Creates the buffer pool. */ @@ -497,6 +499,12 @@ void buf_print(void); /*============*/ /************************************************************************* +Returns the number of latched pages in the buffer pool. */ + +ulint +buf_get_latched_pages_number(void); +/*==============================*/ +/************************************************************************* Returns the number of pending buf pool ios. */ ulint diff --git a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic index d6dbdcc0865..9a8a021e029 100644 --- a/innobase/include/buf0flu.ic +++ b/innobase/include/buf0flu.ic @@ -61,6 +61,8 @@ buf_flush_note_modification( ut_ad(ut_dulint_cmp(block->oldest_modification, mtr->start_lsn) <= 0); } + + ++srv_buf_pool_write_requests; } /************************************************************************ diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index 69a376f8cab..45164dd561e 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -25,6 +25,16 @@ wasted. */ void buf_LRU_try_free_flushed_blocks(void); /*==================================*/ +/********************************************************************** +Returns TRUE if less than 15 % of the buffer pool is available. This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. */ + +ibool +buf_LRU_buf_pool_running_out(void); +/*==============================*/ + /* out: TRUE if less than 15 % of buffer pool + left */ /*####################################################################### These are low-level functions diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index 946b646ffbf..0b92ffbe7f1 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -8,6 +8,17 @@ Created 1/16/1996 Heikki Tuuri #include "mach0data.h" +/********************************************************************** +Determines whether the given character set is of variable length. + +NOTE: the prototype of this function is copied from ha_innodb.cc! If you change +this function, you MUST change also the prototype here! 
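The dtype_new_store_for_order_and_null_size() and dtype_new_read_for_order_and_null_size() hunks below pack the type into a 6-byte buffer: main type in byte 0 with the high bit marking a binary collation, the low byte of the precise type in byte 1, the length in bytes 2-3, and the charset-collation code (always below 256) in bytes 4-5, whose spare high bit of byte 4 now carries the NOT NULL flag. A standalone encode/decode sketch of that layout, using plain booleans instead of the DATA_BINARY_TYPE and DATA_NOT_NULL bits:

#include <stdio.h>

static void
store_type(unsigned char buf[6], unsigned mtype, unsigned prtype_low,
	   unsigned len, unsigned charset_coll, int is_binary, int not_null)
{
	buf[0] = (unsigned char) (mtype & 0xFF);
	if (is_binary) {
		buf[0] |= 128;
	}
	buf[1] = (unsigned char) (prtype_low & 0xFF);
	buf[2] = (unsigned char) (len >> 8);
	buf[3] = (unsigned char) (len & 0xFF);
	buf[4] = (unsigned char) (charset_coll >> 8);	/* code < 256, so this */
	buf[5] = (unsigned char) (charset_coll & 0xFF);	/* byte starts out 0 */
	if (not_null) {
		buf[4] |= 128;
	}
}

static void
read_type(const unsigned char buf[6])
{
	unsigned	mtype = buf[0] & 63;
	int		is_binary = (buf[0] & 128) != 0;
	int		not_null = (buf[4] & 128) != 0;
	unsigned	len = ((unsigned) buf[2] << 8) | buf[3];
	unsigned	charset_coll =
		(((unsigned) buf[4] << 8) | buf[5]) & 0x7fff;

	printf("mtype %u len %u coll %u binary %d not_null %d\n",
	       mtype, len, charset_coll, is_binary, not_null);
}

int
main(void)
{
	unsigned char	buf[6];

	store_type(buf, 1, 0, 255, 8, 0, 1);	/* illustrative values only */
	read_type(buf);
	return(0);
}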
*/ +extern +ibool +innobase_is_mb_cset( +/*================*/ + ulint cset); /* in: MySQL charset-collation code */ + /************************************************************************* Sets a data type structure. */ UNIV_INLINE @@ -149,8 +160,10 @@ dtype_new_store_for_order_and_null_size( bytes where we store the info */ dtype_t* type) /* in: type struct */ { - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + buf[0] = (byte)(type->mtype & 0xFFUL); if (type->prtype & DATA_BINARY_TYPE) { @@ -166,10 +179,12 @@ dtype_new_store_for_order_and_null_size( mach_write_to_2(buf + 2, type->len & 0xFFFFUL); + ut_ad(dtype_get_charset_coll(type->prtype) < 256); mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); - /* Note that the second last byte is left unused, because the - charset-collation code is always < 256 */ + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } } /************************************************************************** @@ -211,20 +226,26 @@ dtype_new_read_for_order_and_null_size( { ulint charset_coll; - ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif type->mtype = buf[0] & 63; type->prtype = buf[1]; if (buf[0] & 128) { - type->prtype = type->prtype | DATA_BINARY_TYPE; + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; } type->len = mach_read_from_2(buf + 2); mach_read_from_2(buf + 4); - charset_coll = mach_read_from_2(buf + 4); + charset_coll = mach_read_from_2(buf + 4) & 0x7fff; if (dtype_is_string_type(type->mtype)) { ut_a(charset_coll < 256); @@ -257,23 +278,39 @@ dtype_get_fixed_size( mtype = dtype_get_mtype(type); switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (type->prtype & DATA_MYSQL_TYPE_MASK) { + default: + ut_ad(0); + return(0); + case DATA_ROW_ID: + ut_ad(type->len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(type->len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(type->len == DATA_ROLL_PTR_LEN); + break; + case DATA_MIX_ID: + ut_ad(type->len == DATA_MIX_ID_LEN); + break; + } +#endif /* UNIV_DEBUG */ case DATA_CHAR: case DATA_FIXBINARY: case DATA_INT: case DATA_FLOAT: case DATA_DOUBLE: case DATA_MYSQL: - return(dtype_get_len(type)); - - case DATA_SYS: if (type->prtype == DATA_ROW_ID) { - return(DATA_ROW_ID_LEN); - } else if (type->prtype == DATA_TRX_ID) { - return(DATA_TRX_ID_LEN); - } else if (type->prtype == DATA_ROLL_PTR) { - return(DATA_ROLL_PTR_LEN); - } else { - return(0); + if ((type->prtype & DATA_BINARY_TYPE) + || !innobase_is_mb_cset( + dtype_get_charset_coll( + type->prtype))) { + return(dtype_get_len(type)); } + /* fall through for variable-length charsets */ case DATA_VARCHAR: case DATA_BINARY: case DATA_DECIMAL: diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h index be7667bfd0c..de5ac44e73f 100644 --- a/innobase/include/db0err.h +++ b/innobase/include/db0err.h @@ -53,7 +53,11 @@ Created 5/24/1996 Heikki Tuuri name already exists */ #define DB_TABLESPACE_DELETED 44 /* tablespace does not exist or is being dropped right now */ - +#define DB_LOCK_TABLE_FULL 45 /* lock structs have exhausted the + buffer pool (for big transactions, + InnoDB stores the lock structs in the + buffer pool) */ + /* The following are partial failure codes */ #define DB_FAIL 1000 #define DB_OVERFLOW 1001 diff --git a/innobase/include/dict0dict.h 
b/innobase/include/dict0dict.h index ca632691450..a2399a81ca9 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -639,6 +639,16 @@ dict_index_get_sys_col_pos( dict_index_t* index, /* in: index */ ulint type); /* in: DATA_ROW_ID, ... */ /*********************************************************************** +Adds a column to index. */ + +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /* in: index */ + dict_col_t* col, /* in: column */ + ulint order, /* in: order criterion */ + ulint prefix_len); /* in: column prefix length */ +/*********************************************************************** Copies types of fields contained in index to tuple. */ void @@ -657,6 +667,7 @@ dict_index_rec_get_sys_col( /*=======================*/ /* out: system column value */ dict_index_t* index, /* in: clustered index describing the record */ + const ulint* offsets,/* in: offsets returned by rec_get_offsets() */ ulint type, /* in: column type: DATA_ROLL_PTR, ... */ rec_t* rec); /* in: record */ /************************************************************************* @@ -770,6 +781,7 @@ dict_tree_copy_rec_order_prefix( /* out: pointer to the prefix record */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to copy prefix */ + ulint* n_fields,/* out: number of fields copied */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size);/* in/out: buffer size */ @@ -782,6 +794,7 @@ dict_tree_build_data_tuple( /* out, own: data tuple */ dict_tree_t* tree, /* in: index tree */ rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ mem_heap_t* heap); /* in: memory heap where tuple created */ /************************************************************************* Gets the space id of the root of the index tree. */ diff --git a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic index 0f7cc8973db..7f754e316b3 100644 --- a/innobase/include/dict0dict.ic +++ b/innobase/include/dict0dict.ic @@ -168,7 +168,7 @@ dict_table_get_sys_col( col = dict_table_get_nth_col(table, table->n_cols - DATA_N_SYS_COLS + sys); ut_ad(col->type.mtype == DATA_SYS); - ut_ad(col->type.prtype == sys); + ut_ad(col->type.prtype == (sys | DATA_NOT_NULL)); return(col); } @@ -322,6 +322,7 @@ dict_index_rec_get_sys_col( /*=======================*/ /* out: system column value */ dict_index_t* index, /* in: clustered index describing the record */ + const ulint* offsets,/* in: offsets returned by rec_get_offsets() */ ulint type, /* in: column type: DATA_ROLL_PTR, ... 
*/ rec_t* rec) /* in: record */ { @@ -331,24 +332,28 @@ dict_index_rec_get_sys_col( ut_ad(index); ut_ad(index->type & DICT_CLUSTERED); - + ut_ad(rec_offs_validate(rec, index, offsets)); + pos = dict_index_get_sys_col_pos(index, type); ut_ad(pos != ULINT_UNDEFINED); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_ROLL_PTR) { ut_ad(len == 7); return(trx_read_roll_ptr(field)); - } else if ((type == DATA_ROW_ID) || (type == DATA_MIX_ID)) { + } else if (type == DATA_TRX_ID) { + + return(trx_read_trx_id(field)); + } else if (type == DATA_MIX_ID) { return(mach_dulint_read_compressed(field)); } else { - ut_ad(type == DATA_TRX_ID); + ut_a(type == DATA_ROW_ID); - return(trx_read_trx_id(field)); + return(mach_read_from_6(field)); } } @@ -674,7 +679,10 @@ dict_is_mixed_table_rec( byte* mix_id_field; ulint len; - mix_id_field = rec_get_nth_field(rec, table->mix_len, &len); + ut_ad(!table->comp); + + mix_id_field = rec_get_nth_field_old(rec, + table->mix_len, &len); if ((len != table->mix_id_len) || (0 != ut_memcmp(table->mix_id_buf, mix_id_field, len))) { diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 1e496a25477..670b3445a55 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -54,7 +54,8 @@ dict_mem_table_create( of the table is placed; this parameter is ignored if the table is made a member of a cluster */ - ulint n_cols); /* in: number of columns */ + ulint n_cols, /* in: number of columns */ + ibool comp); /* in: TRUE=compact page format */ /************************************************************************** Creates a cluster memory object. */ @@ -171,6 +172,13 @@ struct dict_field_struct{ DICT_MAX_COL_PREFIX_LEN; NOTE that in the UTF-8 charset, MySQL sets this to 3 * the prefix len in UTF-8 chars */ + ulint fixed_len; /* 0 or the fixed length of the + column if smaller than + DICT_MAX_COL_PREFIX_LEN */ + ulint fixed_offs; /* offset to the field, or + ULINT_UNDEFINED if it is not fixed + within the record (due to preceding + variable-length fields) */ }; /* Data structure for an index tree */ @@ -225,6 +233,7 @@ struct dict_index_struct{ ulint n_def; /* number of fields defined so far */ ulint n_fields;/* number of fields in the index */ dict_field_t* fields; /* array of field descriptions */ + ulint n_nullable;/* number of nullable fields */ UT_LIST_NODE_T(dict_index_t) indexes;/* list of indexes of the table */ dict_tree_t* tree; /* index tree struct */ @@ -320,6 +329,7 @@ struct dict_table_struct{ ibool tablespace_discarded;/* this flag is set TRUE when the user calls DISCARD TABLESPACE on this table, and reset to FALSE in IMPORT TABLESPACE */ + ibool comp; /* flag: TRUE=compact page format */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index 5a5db77073a..75b32937e0b 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -89,6 +89,8 @@ extern fil_addr_t fil_addr_null; #define FIL_TABLESPACE 501 #define FIL_LOG 502 +extern ulint fil_n_log_flushes; + extern ulint fil_n_pending_log_flushes; extern ulint fil_n_pending_tablespace_flushes; @@ -478,7 +480,7 @@ fil_extend_space_to_desired_size( ulint* actual_size, /* out: size of the space after extension; if we ran out of disk space this may be lower than the desired size */ - ulint space_id, /* in: space id, must be != 0 */ + ulint 
space_id, /* in: space id */ ulint size_after_extend);/* in: desired size in pages after the extension; if the current space size is bigger than this already, the function does nothing */ diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 9f525042dcc..b99359fe998 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -47,7 +47,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Checks if some transaction has an implicit x-lock on a record in a clustered index. */ @@ -58,7 +59,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /***************************************************************** Resets the lock bits for a single record. Releases transactions waiting for lock requests here. */ @@ -275,6 +277,7 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr); /* in: query thread */ /************************************************************************* Checks if locks of other transactions prevent an immediate modify @@ -308,6 +311,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -333,6 +337,7 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -350,6 +355,7 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view); /* in: consistent read view */ /************************************************************************* Checks that a non-clustered index record is seen in a consistent read. */ @@ -463,6 +469,33 @@ lock_rec_hash( ulint space, /* in: space */ ulint page_no);/* in: page number */ /************************************************************************* +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. 
*/ + +dict_table_t* +lock_get_src_table( +/*===============*/ + /* out: the source table of transaction, + if it is covered by an IX or IS table lock; + dest if there is no source table, and + NULL if the transaction is locking more than + two tables or an inconsistency is found */ + trx_t* trx, /* in: transaction */ + dict_table_t* dest, /* in: destination of ALTER TABLE */ + ulint* mode); /* out: lock mode of the source table */ +/************************************************************************* +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. */ + +ibool +lock_is_table_exclusive( +/*====================*/ + /* out: TRUE if table is only locked by trx, + with LOCK_IX, and possibly LOCK_AUTO_INC */ + dict_table_t* table, /* in: table */ + trx_t* trx); /* in: transaction */ +/************************************************************************* Checks that a transaction id is sensible, i.e., not in the future. */ ibool @@ -472,6 +505,7 @@ lock_check_trx_id_sanity( dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex);/* in: TRUE if the caller owns the kernel mutex */ /************************************************************************* @@ -482,7 +516,8 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index); /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Prints info of a table lock. */ diff --git a/innobase/include/lock0lock.ic b/innobase/include/lock0lock.ic index fabc9256401..c7a71bb45d8 100644 --- a/innobase/include/lock0lock.ic +++ b/innobase/include/lock0lock.ic @@ -60,7 +60,8 @@ lock_clust_rec_some_has_impl( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dulint trx_id; @@ -70,7 +71,7 @@ lock_clust_rec_some_has_impl( ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); if (trx_is_active(trx_id)) { /* The modifying or inserting transaction is active */ diff --git a/innobase/include/mtr0log.h b/innobase/include/mtr0log.h index 9c9c6f696e8..c0636ea1e1e 100644 --- a/innobase/include/mtr0log.h +++ b/innobase/include/mtr0log.h @@ -11,6 +11,7 @@ Created 12/7/1995 Heikki Tuuri #include "univ.i" #include "mtr0mtr.h" +#include "dict0types.h" /************************************************************ Writes 1 - 4 bytes to a file page buffered in the buffer pool. @@ -173,6 +174,38 @@ mlog_parse_string( byte* page); /* in: page where to apply the log record, or NULL */ +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). 
*/ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size); /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. */ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index); /* out, own: dummy index */ + /* Insert, update, and maybe other functions may use this value to define an extra mlog buffer size for variable size data */ #define MLOG_BUF_MARGIN 256 diff --git a/innobase/include/mtr0mtr.h b/innobase/include/mtr0mtr.h index e8c68a91dad..071279d5259 100644 --- a/innobase/include/mtr0mtr.h +++ b/innobase/include/mtr0mtr.h @@ -102,7 +102,31 @@ flag value must give the length also! */ file rename */ #define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd file deletion */ -#define MLOG_BIGGEST_TYPE ((byte)35) /* biggest value (used in +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /* mark a compact index record + as the predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /* create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /* compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /* mark compact clustered index + record deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/* mark compact secondary index + record deleted */ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/* update of a compact record, + preserves record field sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /* delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /* delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /* delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /* copy compact record list end + to a new created index page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ + +#define MLOG_BIGGEST_TYPE ((byte)46) /* biggest value (used in asserts) */ /******************************************************************* diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index d1439faf29f..599e78bab48 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -24,6 +24,9 @@ extern ibool os_aio_print_debug; extern ulint os_file_n_pending_preads; extern ulint os_file_n_pending_pwrites; +extern ulint os_n_pending_reads; +extern ulint os_n_pending_writes; + #ifdef __WIN__ /* We define always WIN_ASYNC_IO, and check at run-time whether diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h index c85669ed4df..a693931968e 100644 --- a/innobase/include/page0cur.h +++ b/innobase/include/page0cur.h @@ -128,7 +128,8 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: 
mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -142,6 +143,7 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /*************************************************************** Inserts a record next to page cursor. Returns pointer to inserted record if @@ -155,9 +157,9 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ mtr_t* mtr); /* in: mini-transaction handle */ /***************************************************************** Copies records from page to a newly created page, from a given record onward, @@ -166,10 +168,11 @@ including that record. Infimum and supremum records are not copied. */ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /*************************************************************** Deletes a record at the page cursor. The cursor is moved to the next record after the deleted one. */ @@ -178,6 +181,7 @@ void page_cur_delete_rec( /*================*/ page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ /******************************************************************** Searches the right position for a page cursor. */ @@ -187,6 +191,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -198,6 +203,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -229,34 +235,37 @@ Parses a log record of a record insert on a page. 
*/ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /************************************************************** Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses log record of a record delete on a page. */ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /* Index page cursor */ diff --git a/innobase/include/page0cur.ic b/innobase/include/page0cur.ic index 39f8ab11513..03010fbd766 100644 --- a/innobase/include/page0cur.ic +++ b/innobase/include/page0cur.ic @@ -143,7 +143,7 @@ UNIV_INLINE void page_cur_move_to_prev( /*==================*/ - page_cur_t* cur) /* in: cursor; must not before first */ + page_cur_t* cur) /* in: page cursor, not before first */ { ut_ad(!page_cur_is_before_first(cur)); @@ -158,6 +158,7 @@ page_cur_search( /*============*/ /* out: number of matched fields on the left */ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -170,7 +171,7 @@ page_cur_search( ut_ad(dtuple_check_typed(tuple)); - page_cur_search_with_match(page, tuple, mode, + page_cur_search_with_match(page, index, tuple, mode, &up_matched_fields, &up_matched_bytes, &low_matched_fields, @@ -190,16 +191,11 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple */ + dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - ulint data_size; - - ut_ad(dtuple_check_typed(tuple)); - - data_size = dtuple_get_data_size(tuple); - - return(page_cur_insert_rec_low(cursor, tuple, data_size, NULL, mtr)); + return(page_cur_insert_rec_low(cursor, tuple, index, NULL, mtr)); } /*************************************************************** @@ -214,8 +210,9 @@ page_cur_rec_insert( otherwise */ page_cur_t* cursor, /* in: a page 
cursor */ rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - return(page_cur_insert_rec_low(cursor, NULL, 0, rec, mtr)); + return(page_cur_insert_rec_low(cursor, NULL, index, rec, mtr)); } diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 969313614e3..d3ef8214eb6 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -37,7 +37,8 @@ typedef byte page_header_t; /*-----------------------------*/ #define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ #define PAGE_HEAP_TOP 2 /* pointer to record heap top */ -#define PAGE_N_HEAP 4 /* number of records in the heap */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ #define PAGE_FREE 6 /* pointer to start of page free record list */ #define PAGE_GARBAGE 8 /* number of bytes in deleted records */ #define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or @@ -79,15 +80,24 @@ typedef byte page_header_t; #define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) /* start of data on the page */ -#define PAGE_INFIMUM (PAGE_DATA + 1 + REC_N_EXTRA_BYTES) - /* offset of the page infimum record on the - page */ -#define PAGE_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_EXTRA_BYTES + 8) - /* offset of the page supremum record on the - page */ -#define PAGE_SUPREMUM_END (PAGE_SUPREMUM + 9) +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) /* offset of the page supremum record end on - the page */ + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ /*-----------------------------*/ /* Directions of cursor movement */ @@ -233,6 +243,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -259,6 +270,22 @@ page_rec_get_n_recs_before( /* out: number of records */ rec_t* rec); /* in: the physical record */ /***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page); /* in: index page */ +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap);/* in: number of records */ +/***************************************************************** Gets the number of dir slots in directory. 
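For orientation, the new infimum/supremum offsets introduced above can be checked numerically. This is an editorial aside, not part of the patch: it assumes the usual values FIL_PAGE_DATA = 38 and FSEG_HEADER_SIZE = 10 (hence PAGE_HEADER = 38 and PAGE_DATA = 94), that page0page.h and rem0rec.h are both visible, and it borrows the #if/#error idiom the patch itself uses in data0type.ic:

	/* Editorial sketch, not part of the patch.  Assumes
	FIL_PAGE_DATA == 38 and FSEG_HEADER_SIZE == 10. */
	#if PAGE_DATA != 94		/* 38 + 36 + 2 * 10 */
	#error "unexpected PAGE_DATA"
	#endif
	#if PAGE_OLD_INFIMUM != 101	/* 94 + 1 + 6 */
	#error "unexpected PAGE_OLD_INFIMUM"
	#endif
	#if PAGE_OLD_SUPREMUM != 116	/* 94 + 2 + 2 * 6 + 8; end at 125 */
	#error "unexpected PAGE_OLD_SUPREMUM"
	#endif
	#if PAGE_NEW_INFIMUM != 99	/* 94 + 5 */
	#error "unexpected PAGE_NEW_INFIMUM"
	#endif
	#if PAGE_NEW_SUPREMUM != 112	/* 94 + 2 * 5 + 8; end at 120 */
	#error "unexpected PAGE_NEW_SUPREMUM"
	#endif

In other words, a compact page places the fixed system records a few bytes earlier than an old-style page, which is consistent with REC_N_NEW_EXTRA_BYTES being one byte smaller than REC_N_OLD_EXTRA_BYTES in the rem0rec.h hunks below.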
*/ UNIV_INLINE ulint @@ -267,6 +294,15 @@ page_dir_get_n_slots( /* out: number of slots */ page_t* page); /* in: index page */ /***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots);/* in: number of slots */ +/***************************************************************** Gets pointer to nth directory slot. */ UNIV_INLINE page_dir_slot_t* @@ -333,7 +369,16 @@ ulint page_dir_find_owner_slot( /*=====================*/ /* out: the directory slot number */ - rec_t* rec); /* in: the physical record */ + rec_t* rec); /* in: the physical record */ +/**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ibool +page_is_comp( +/*=========*/ + /* out: TRUE if the page is in compact format + FALSE if it is in old-style format */ + page_t* page); /* in: index page */ /**************************************************************** Gets the pointer to the next record on the page. */ UNIV_INLINE @@ -359,9 +404,10 @@ UNIV_INLINE rec_t* page_rec_get_prev( /*==============*/ - /* out: pointer to previous record */ - rec_t* rec); /* in: pointer to record, must not be page - infimum */ + /* out: pointer to previous record */ + rec_t* rec); /* in: pointer to record, + must not be page infimum */ + /**************************************************************** TRUE if the record is a user record on the page. */ UNIV_INLINE @@ -446,9 +492,11 @@ page_get_max_insert_size_after_reorganize( Calculates free space if a page is emptied. */ UNIV_INLINE ulint -page_get_free_space_of_empty(void); -/*==============================*/ - /* out: free space */ +page_get_free_space_of_empty( +/*=========================*/ + /* out: free space */ + ibool comp) /* in: TRUE=compact page format */ + __attribute__((const)); /**************************************************************** Returns the sum of the sizes of the records in the record list excluding the infimum and supremum records. */ @@ -464,20 +512,23 @@ Allocates a block of memory from an index page. */ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no);/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no);/* out: this contains the heap number + of the allocated record + if allocation succeeds */ /**************************************************************** Puts a record to free list. */ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec); /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index); /* in: record descriptor */ /************************************************************** The index page creation function. 
*/ @@ -487,7 +538,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr); /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp); /* in: TRUE=compact page format */ /***************************************************************** Differs from page_copy_rec_list_end, because this function does not touch the lock table and max trx id on page. */ @@ -495,10 +547,11 @@ touch the lock table and max trx id on page. */ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, from the given record onward, including that record. Infimum and supremum records are not copied. @@ -507,10 +560,11 @@ The records are copied to the start of the record list on new_page. */ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. @@ -519,10 +573,11 @@ The records are copied to the end of the record list on new_page. */ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from a page from a given record onward, including that record. The infimum and supremum records are not deleted. */ @@ -530,14 +585,15 @@ The infimum and supremum records are not deleted. 
*/ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes records from page, up to the given record, NOT including that record. Infimum and supremum records are not deleted. */ @@ -545,9 +601,10 @@ that record. Infimum and supremum records are not deleted. */ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list end to another page. Moved records include split_rec. */ @@ -555,10 +612,11 @@ split_rec. */ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Moves record list start to another page. Moved records do not include split_rec. */ @@ -566,10 +624,11 @@ split_rec. */ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr); /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ /******************************************************************** Splits a directory slot which owns too many records. */ @@ -595,13 +654,16 @@ Parses a log record of a record list end or start deletion. 
*/ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr); /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses a redo log record of creating a page. */ @@ -611,6 +673,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /**************************************************************** @@ -620,7 +683,8 @@ the index page context. */ void page_rec_print( /*===========*/ - rec_t* rec); + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: record descriptor */ /******************************************************************* This is used to print the contents of the directory for debugging purposes. */ @@ -637,8 +701,9 @@ debugging purposes. */ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n); /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n); /* in: print n first and n last entries */ /******************************************************************* Prints the info in a page header. */ @@ -653,9 +718,12 @@ debugging purposes. */ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn); /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn); /* in: print rn first and last records + in directory */ /******************************************************************* The following is used to validate a record on a page. This function differs from rec_validate as it can also check the n_owned field and @@ -664,8 +732,9 @@ the heap_no field. */ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Checks that the first directory slot points to the infimum record and the last to the supremum. 
This function is intended to track if the diff --git a/innobase/include/page0page.ic b/innobase/include/page0page.ic index 3d2bf3b090e..1d5ea337031 100644 --- a/innobase/include/page0page.ic +++ b/innobase/include/page0page.ic @@ -73,7 +73,8 @@ page_header_set_field( { ut_ad(page); ut_ad(field <= PAGE_N_RECS); - ut_ad(val < UNIV_PAGE_SIZE); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); mach_write_to_2(page + PAGE_HEADER + field, val); } @@ -162,7 +163,11 @@ page_get_infimum_rec( { ut_ad(page); - return(page + PAGE_INFIMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_INFIMUM); + } else { + return(page + PAGE_OLD_INFIMUM); + } } /**************************************************************** @@ -176,7 +181,11 @@ page_get_supremum_rec( { ut_ad(page); - return(page + PAGE_SUPREMUM); + if (page_is_comp(page)) { + return(page + PAGE_NEW_SUPREMUM); + } else { + return(page + PAGE_OLD_SUPREMUM); + } } /**************************************************************** @@ -309,6 +318,7 @@ page_cmp_dtuple_rec_with_match( be page infimum or supremum, in which case matched-parameter values below are not affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns contains the value for current comparison */ @@ -320,6 +330,7 @@ page_cmp_dtuple_rec_with_match( page_t* page; ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); page = buf_frame_align(rec); @@ -328,7 +339,7 @@ page_cmp_dtuple_rec_with_match( } else if (rec == page_get_supremum_rec(page)) { return(-1); } else { - return(cmp_dtuple_rec_with_match(dtuple, rec, + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields, matched_bytes)); } @@ -358,6 +369,45 @@ page_dir_get_n_slots( { return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); } +/***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + /* out: number of slots */ + page_t* page, /* in: index page */ + ulint n_slots)/* in: number of slots */ +{ + page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots); +} + +/***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in: index page */ + ulint n_heap) /* in: number of records */ +{ + ut_ad(n_heap < 0x8000); + + page_header_set_field(page, PAGE_N_HEAP, n_heap | (0x8000 & + page_header_get_field(page, PAGE_N_HEAP))); +} /***************************************************************** Gets pointer to nth directory slot. 
*/ @@ -369,7 +419,7 @@ page_dir_get_nth_slot( page_t* page, /* in: index page */ ulint n) /* in: position */ { - ut_ad(page_header_get_field(page, PAGE_N_DIR_SLOTS) > n); + ut_ad(page_dir_get_n_slots(page) > n); return(page + UNIV_PAGE_SIZE - PAGE_DIR - (n + 1) * PAGE_DIR_SLOT_SIZE); @@ -431,7 +481,8 @@ page_dir_slot_get_n_owned( /* out: number of records */ page_dir_slot_t* slot) /* in: page directory slot */ { - return(rec_get_n_owned(page_dir_slot_get_rec(slot))); + return(rec_get_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(buf_frame_align(slot)))); } /******************************************************************* @@ -444,7 +495,8 @@ page_dir_slot_set_n_owned( ulint n) /* in: number of records owned by the slot */ { - rec_set_n_owned(page_dir_slot_get_rec(slot), n); + rec_set_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(buf_frame_align(slot)), n); } /**************************************************************** @@ -462,6 +514,19 @@ page_dir_calc_reserved_space( } /**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ibool +page_is_comp( +/*=========*/ + /* out: TRUE if the page is in compact format + FALSE if it is in old-style format */ + page_t* page) /* in: index page */ +{ + return(!!(page_header_get_field(page, PAGE_N_HEAP) & 0x8000)); +} + +/**************************************************************** Gets the pointer to the next record on the page. */ UNIV_INLINE rec_t* @@ -477,7 +542,7 @@ page_rec_get_next( page = buf_frame_align(rec); - offs = rec_get_next_offs(rec); + offs = rec_get_next_offs(rec, page_is_comp(page)); if (offs >= UNIV_PAGE_SIZE) { fprintf(stderr, @@ -513,6 +578,7 @@ page_rec_set_next( infimum */ { page_t* page; + ulint offs; ut_ad(page_rec_check(rec)); ut_a((next == NULL) @@ -523,11 +589,13 @@ page_rec_set_next( ut_ad(rec != page_get_supremum_rec(page)); ut_ad(next != page_get_infimum_rec(page)); - if (next == NULL) { - rec_set_next_offs(rec, 0); + if (next) { + offs = (ulint) (next - page); } else { - rec_set_next_offs(rec, (ulint)(next - page)); + offs = 0; } + + rec_set_next_offs(rec, page_is_comp(page), offs); } /**************************************************************** @@ -545,6 +613,7 @@ page_rec_get_prev( rec_t* rec2; rec_t* prev_rec = NULL; page_t* page; + ibool comp; ut_ad(page_rec_check(rec)); @@ -559,6 +628,7 @@ page_rec_get_prev( slot = page_dir_get_nth_slot(page, slot_no - 1); rec2 = page_dir_slot_get_rec(slot); + comp = page_is_comp(page); while (rec != rec2) { prev_rec = rec2; @@ -579,9 +649,12 @@ page_rec_find_owner_rec( /* out: the owner record */ rec_t* rec) /* in: the physical record */ { + ibool comp; + ut_ad(page_rec_check(rec)); + comp = page_is_comp(buf_frame_align(rec)); - while (rec_get_n_owned(rec) == 0) { + while (rec_get_n_owned(rec, comp) == 0) { rec = page_rec_get_next(rec); } @@ -601,7 +674,9 @@ page_get_data_size( ulint ret; ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) - page_header_get_field(page, PAGE_GARBAGE)); ut_ad(ret < UNIV_PAGE_SIZE); @@ -613,12 +688,13 @@ page_get_data_size( Calculates free space if a page is emptied. 
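To make the page-format convention above concrete: the 16-bit PAGE_N_HEAP header field now carries the compact-format flag in bit 15 and the heap record count in bits 0..14, and callers derive the comp argument for the format-dependent record accessors from the page itself. The following is an editorial sketch only; page_format_sketch is a made-up name, and rec_get_next_offs() with its new comp argument is declared in the rem0rec.h hunks further below:

	/* Editorial sketch, not part of the patch: how the PAGE_N_HEAP
	field is shared between the heap count and the compact-format
	flag, and how the flag is threaded into the record accessors. */
	static void
	page_format_sketch(page_t* page)
	{
		ulint	field	= page_header_get_field(page, PAGE_N_HEAP);
		ibool	comp	= (field & 0x8000UL) != 0;
					/* same as page_is_comp(page) */
		ulint	n_heap	= field & 0x7fffUL;
					/* same as page_dir_get_n_heap(page) */
		rec_t*	rec	= page_get_infimum_rec(page);

		/* infimum and supremum always occupy two heap slots */
		ut_a(n_heap >= 2);
		ut_ad(comp == page_is_comp(page));

		/* format-dependent accessors now take the flag explicitly;
		here we step from the infimum to its successor by hand */
		rec = page + rec_get_next_offs(rec, comp);

		ut_ad(rec == page_rec_get_next(page_get_infimum_rec(page)));
	}

The same derivation pattern appears throughout the patch, for example in page_rec_get_next() and page_dir_slot_get_n_owned() above.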
*/ UNIV_INLINE ulint -page_get_free_space_of_empty(void) -/*==============================*/ +page_get_free_space_of_empty( +/*=========================*/ /* out: free space */ + ibool comp) /* in: TRUE=compact page layout */ { return((ulint)(UNIV_PAGE_SIZE - - PAGE_SUPREMUM_END + - (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END) - PAGE_DIR - 2 * PAGE_DIR_SLOT_SIZE)); } @@ -640,13 +716,16 @@ page_get_max_insert_size( { ulint occupied; ulint free_space; + ibool comp; + + comp = page_is_comp(page); occupied = page_header_get_field(page, PAGE_HEAP_TOP) - - PAGE_SUPREMUM_END + - (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END) + page_dir_calc_reserved_space( - n_recs + (page_header_get_field(page, PAGE_N_HEAP) - 2)); + n_recs + page_dir_get_n_heap(page) - 2); - free_space = page_get_free_space_of_empty(); + free_space = page_get_free_space_of_empty(comp); /* Above the 'n_recs +' part reserves directory space for the new inserted records; the '- 2' excludes page infimum and supremum @@ -673,11 +752,14 @@ page_get_max_insert_size_after_reorganize( { ulint occupied; ulint free_space; + ibool comp; + + comp = page_is_comp(page); occupied = page_get_data_size(page) + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); - free_space = page_get_free_space_of_empty(); + free_space = page_get_free_space_of_empty(comp); if (occupied > free_space) { @@ -693,11 +775,12 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ - rec_t* rec) /* in: pointer to the (origin of) record */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index) /* in: record descriptor */ { - rec_t* free; - ulint garbage; + rec_t* free; + ulint garbage; free = page_header_get_ptr(page, PAGE_FREE); @@ -707,7 +790,7 @@ page_mem_free( garbage = page_header_get_field(page, PAGE_GARBAGE); page_header_set_field(page, PAGE_GARBAGE, - garbage + rec_get_size(rec)); + garbage + rec_get_size(rec, index)); } #ifdef UNIV_MATERIALIZE diff --git a/innobase/include/rem0cmp.h b/innobase/include/rem0cmp.h index 712e263350e..77a5a42c2d5 100644 --- a/innobase/include/rem0cmp.h +++ b/innobase/include/rem0cmp.h @@ -90,6 +90,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -107,7 +108,8 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /****************************************************************** Checks if a dtuple is a prefix of a record. The last field in dtuple is allowed to be a prefix of the corresponding field in the record. */ @@ -116,23 +118,9 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec); /* in: physical record */ -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. 
NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ dtuple_t* dtuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ - ulint n_fields); /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /***************************************************************** This function is used to compare two physical records. Only the common first fields are compared, and if an externally stored field is @@ -146,7 +134,13 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ulint n, /* in: number of fields to compare, + or ULINT_UNDEFINED if both records + contain all fields, and all fields + should be compared */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -167,6 +161,12 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + ulint n, /* in: number of fields to compare, + or ULINT_UNDEFINED if both records + contain all fields, and all fields + should be compared */ dict_index_t* index); /* in: data dictionary index */ diff --git a/innobase/include/rem0cmp.ic b/innobase/include/rem0cmp.ic index 75cb3ef04e8..d4c30f25f03 100644 --- a/innobase/include/rem0cmp.ic +++ b/innobase/include/rem0cmp.ic @@ -57,10 +57,14 @@ cmp_rec_rec( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + ulint n, /* in: number of fields to compare */ dict_index_t* index) /* in: data dictionary index */ { ulint match_f = 0; ulint match_b = 0; - return(cmp_rec_rec_with_match(rec1, rec2, index, &match_f, &match_b)); + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, n, + &match_f, &match_b)); } diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 86bf263170f..d450df82311 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -23,9 +23,18 @@ Created 5/30/1994 Heikki Tuuri info bits of a record */ #define REC_INFO_MIN_REC_FLAG 0x10UL -/* Number of extra bytes in a record, in addition to the data and the -offsets */ -#define REC_N_EXTRA_BYTES 6 +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define REC_STATUS_SUPREMUM 3 /********************************************************** The following function is used to get the offset of the @@ -36,7 +45,8 @@ rec_get_next_offs( /*==============*/ /* out: the page offset of the next chained record */ - rec_t* rec); /* in: physical record */ + 
rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the next record offset field of the record. */ @@ -45,14 +55,15 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint next); /* in: offset of the next record */ /********************************************************** The following function is used to get the number of fields -in the record. */ +in an old-style record. */ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec); /* in: physical record */ /********************************************************** @@ -63,7 +74,8 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the number of owned records. */ @@ -72,6 +84,7 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint n_owned); /* in: the number of owned */ /********************************************************** The following function is used to retrieve the info bits of @@ -81,7 +94,8 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the info bits of a record. */ UNIV_INLINE @@ -89,15 +103,26 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint bits); /* in: info bits */ /********************************************************** -Gets the value of the deleted falg in info bits. */ +The following function retrieves the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits); /* in: info bits from a record */ +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits); /* in: info bits */ + /********************************************************** The following function tells if record is delete marked. */ UNIV_INLINE @@ -105,7 +130,8 @@ ibool rec_get_deleted_flag( /*=================*/ /* out: TRUE if delete marked */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the deleted bit. 
*/ UNIV_INLINE @@ -113,8 +139,25 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ibool flag); /* in: TRUE if delete marked */ /********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to flag a record as a node pointer. */ +UNIV_INLINE +void +rec_set_node_ptr_flag( +/*=================*/ + rec_t* rec, /* in: physical record */ + ibool flag); /* in: TRUE if the record is a node pointer */ +/********************************************************** The following function is used to get the order number of the record in the heap of the index page. */ UNIV_INLINE @@ -122,7 +165,8 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec); /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp); /* in: TRUE=compact page format */ /********************************************************** The following function is used to set the heap number field in the record. */ @@ -131,6 +175,7 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint heap_no);/* in: the heap number */ /********************************************************** The following function is used to test whether the data offsets @@ -141,31 +186,84 @@ rec_get_1byte_offs_flag( /*====================*/ /* out: TRUE if 1-byte form */ rec_t* rec); /* in: physical record */ +/********************************************************** +The following function determines the offsets to each field +in the record. The offsets are returned in an array of +ulint, with [0] being the number of fields (n), [1] being the +extra size (if REC_OFFS_COMPACT is set, the record is in the new +format), and [2]..[n+1] being the offsets past the end of +fields 0..n, or to the beginning of fields 1..n+1. When the +high-order bit of the offset at [n+1] is set (REC_OFFS_SQL_NULL), +the field n is NULL. When the second high-order bit of the offset +at [n+1] is set (REC_OFFS_EXTERNAL), the field n is being stored +externally. */ + +ulint* +rec_get_offsets( +/*============*/ + /* out: the offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t* heap); /* in: memory heap */ +/********************************************************** +The following function determines the offsets to each field +in the record. It differs from rec_get_offsets() by trying to +reuse a previously returned array. */ + +ulint* +rec_reget_offsets( +/*==============*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array of offsets + from rec_get_offsets() + or rec_reget_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t* heap); /* in: memory heap */ + +/**************************************************************** +Validates offsets returned by rec_get_offsets() or rec_reget_offsets(). 
*/ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets);/* in: array returned by rec_get_offsets() + or rec_reget_offsets() */ +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /* in: record */ + const dict_index_t* index,/* in: record descriptor */ + ulint* offsets);/* in: array returned by rec_get_offsets() + or rec_reget_offsets() */ + /**************************************************************** The following function is used to get a pointer to the nth -data field in the record. */ +data field in an old-style record. */ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL null */ /**************************************************************** -Return field length or UNIV_SQL_NULL. */ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n); /* in: index of the field */ -/**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -173,131 +271,194 @@ rec_get_nth_field_size( /* out: field size in bytes */ rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +/**************************************************************** +The following function is used to get a pointer to the nth +data field in an old-style record. */ +UNIV_INLINE +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len); /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +/********************************************************** +Determine if the offsets are for a record in the new +compact format. */ UNIV_INLINE ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i); /* in: ith field */ +rec_offs_comp( +/*==========*/ + /* out: TRUE if compact format */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/********************************************************** +Returns TRUE if the nth field of rec is SQL NULL. */ +UNIV_INLINE +ibool +rec_offs_nth_null( +/*==============*/ + /* out: TRUE if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Returns TRUE if the extern bit is set in nth field of rec. 
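
The intended call pattern for the offsets array appears to be: compute it once per record, then hand the same array to every per-field accessor. A minimal sketch, not part of the patch; the helper name and the heap size 100 are arbitrary:

#include <stdio.h>
#include "rem0rec.h"
#include "mem0mem.h"

/* Prints the length of each field of rec using the offsets API. */
static void
sample_print_field_lengths(
	rec_t*		rec,	/* in: physical record */
	dict_index_t*	index)	/* in: record descriptor */
{
	mem_heap_t*	heap	= mem_heap_create(100);
	ulint*		offsets	= rec_get_offsets(rec, index,
					ULINT_UNDEFINED, heap);
	ulint		i;

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		ulint	len;

		rec_get_nth_field(rec, offsets, i, &len);

		if (len == UNIV_SQL_NULL) {
			fprintf(stderr, "field %lu: SQL NULL\n", (ulong) i);
		} else {
			fprintf(stderr, "field %lu: %lu bytes\n",
				(ulong) i, (ulong) len);
		}
	}

	mem_heap_free(heap);
}

When many records are scanned in a loop, rec_reget_offsets() can be used instead, so that a previously returned array is reused rather than reallocated for every record.
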
*/ +UNIV_INLINE +ibool +rec_offs_nth_extern( +/*================*/ + /* out: TRUE if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ + /********************************************************** Returns TRUE if the extern bit is set in any of the fields of rec. */ UNIV_INLINE ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec); /* in: record */ +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*************************************************************** Sets the value of the ith field extern storage bit. */ - +UNIV_INLINE void rec_set_nth_field_extern_bit( /*=========================*/ - rec_t* rec, /* in: record */ - ulint i, /* in: ith field */ - ibool val, /* in: value to set */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page where - rec is, or NULL; in the NULL case we do not - write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. */ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr); /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ -/**************************************************************** -The following function is used to get a copy of the nth -data field in the record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL - null */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ /*************************************************************** -This is used to modify the value of an already existing field in -a physical record. The previous value must have exactly the same -size as the new value. If len is UNIV_SQL_NULL then the field is -treated as SQL null. */ +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. 
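
A small sketch of how the per-field flag accessors combine (hypothetical helper; offsets is assumed to have been returned by rec_get_offsets() for the same record):

/* Counts the columns of a record that are stored externally
(on BLOB pages), given the offsets array of the record. */
static ulint
sample_n_extern_fields(
	const ulint*	offsets)	/* in: rec_get_offsets(rec, index) */
{
	ulint	n = 0;
	ulint	i;

	if (!rec_offs_any_extern(offsets)) {

		return(0);
	}

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			n++;
		}
	}

	return(n);
}
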
*/ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len); /* in: length of the data or UNIV_SQL_NULL. - If not SQL null, must have the same length as the - previous value. If SQL null, previous value must be - SQL null. */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data if not SQL null */ + ulint len); /* in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec); /* in: physical record */ /************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*===============*/ + /* out: number of fields */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of record minus data size of record. The value returned by the function is the distance from record start to record origin in bytes. */ UNIV_INLINE ulint -rec_get_extra_size( -/*===============*/ - /* out: size */ - rec_t* rec); /* in: physical record */ -/************************************************************** +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** Returns the total size of a physical record. */ UNIV_INLINE ulint +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns the total size of a physical record. */ + +ulint rec_get_size( /*=========*/ - /* out: size */ - rec_t* rec); /* in: physical record */ + /* out: size */ + rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ /************************************************************** Returns a pointer to the start of the record. 
*/ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** Returns a pointer to the end of the record. */ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec); /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Copies a physical record to a buffer. */ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec); /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /****************************************************************** Copies the first n fields of a physical record to a new physical record in a buffer. */ @@ -305,49 +466,43 @@ a buffer. */ rec_t* rec_copy_prefix_to_buf( /*===================*/ - /* out, own: copied record */ - rec_t* rec, /* in: physical record */ - ulint n_fields, /* in: number of fields to copy */ - byte** buf, /* in/out: memory buffer for the copied prefix, - or NULL */ - ulint* buf_size); /* in/out: buffer size */ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size); /* in/out: buffer size */ /**************************************************************** Folds a prefix of a physical record to a ulint. */ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id); /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id); /* in: index tree id */ /************************************************************* Builds a physical record out of a data tuple and stores it beginning from address destination. */ -UNIV_INLINE + rec_t* rec_convert_dtuple_to_rec( /*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple); /* in: data tuple */ -/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. 
*/ - -rec_t* -rec_convert_dtuple_to_rec_low( -/*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size); /* in: data size of dtuple */ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ /************************************************************** -Returns the extra size of a physical record if we know its +Returns the extra size of an old-style physical record if we know its data size and number of fields. */ UNIV_INLINE ulint @@ -355,7 +510,8 @@ rec_get_converted_extra_size( /*=========================*/ /* out: extra size */ ulint data_size, /* in: data size */ - ulint n_fields); /* in: number of fields */ + ulint n_fields) /* in: number of fields */ + __attribute__((const)); /************************************************************** The following function returns the size of a data tuple when converted to a physical record. */ @@ -364,6 +520,7 @@ ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple);/* in: data tuple */ /****************************************************************** Copies the first n fields of a physical record to a data tuple. @@ -374,6 +531,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap); /* in: memory heap */ /******************************************************************* @@ -382,16 +540,27 @@ Validates the consistency of a physical record. */ ibool rec_validate( /*=========*/ - /* out: TRUE if ok */ - rec_t* rec); /* in: physical record */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec); /* in: physical record */ + /******************************************************************* Prints a physical record. */ void rec_print( /*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec); /* in: physical record */ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ #define REC_INFO_BITS 6 /* This is single byte bit-field */ diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index c36bf8f6d6e..8443b5fa07d 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -8,9 +8,19 @@ Created 5/30/1994 Heikki Tuuri #include "mach0data.h" #include "ut0byte.h" +#include "dict0dict.h" -/* Offsets of the bit-fields in the record. NOTE! In the table the most -significant bytes and bits are written below less significant. 
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */ +#define REC_OFFS_COMPACT ((ulint) 1 << 31) +/* SQL NULL flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_SQL_NULL ((ulint) 1 << 31) +/* External flag in offsets returned by rec_get_offsets() */ +#define REC_OFFS_EXTERNAL ((ulint) 1 << 30) +/* Mask for offsets returned by rec_get_offsets() */ +#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1) + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. (1) byte offset (2) bit usage within byte downward from @@ -25,6 +35,25 @@ significant bytes and bits are written below less significant. 4 bits info bits */ +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record (relative) + 2 8 bits pointer to next record (relative) + 3 3 bits status: + 000=conventional record + 001=node pointer record (inside B-tree) + 010=infimum record + 011=supremum record + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + /* We list the byte offsets from the origin of the record, the mask, and the shift needed to obtain each bit-field of the record. */ @@ -32,22 +61,30 @@ and the shift needed to obtain each bit-field of the record. */ #define REC_NEXT_MASK 0xFFFFUL #define REC_NEXT_SHIFT 0 -#define REC_SHORT 3 /* This is single byte bit-field */ -#define REC_SHORT_MASK 0x1UL -#define REC_SHORT_SHIFT 0 +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 -#define REC_N_FIELDS 4 -#define REC_N_FIELDS_MASK 0x7FEUL -#define REC_N_FIELDS_SHIFT 1 +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 -#define REC_HEAP_NO 5 +#define REC_OLD_HEAP_NO 5 +#define REC_NEW_HEAP_NO 4 #define REC_HEAP_NO_MASK 0xFFF8UL #define REC_HEAP_NO_SHIFT 3 -#define REC_N_OWNED 6 /* This is single byte bit-field */ +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ #define REC_N_OWNED_MASK 0xFUL #define REC_N_OWNED_SHIFT 0 +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ #define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 @@ -65,26 +102,24 @@ a field stored to another page: */ #define REC_2BYTE_EXTERN_MASK 0x4000UL -/**************************************************************** -Return field length or UNIV_SQL_NULL. 
*/ -UNIV_INLINE -ulint -rec_get_nth_field_len( -/*==================*/ - /* out: length of the field; UNIV_SQL_NULL if SQL - null */ - rec_t* rec, /* in: record */ - ulint n) /* in: index of the field */ -{ - ulint len; - - rec_get_nth_field(rec, n, &len); - - return(len); -} +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif /*************************************************************** -Sets the value of the ith field SQL null bit. */ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -93,8 +128,8 @@ rec_set_nth_field_null_bit( ulint i, /* in: ith field */ ibool val); /* in: value to set */ /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ void rec_set_nth_field_sql_null( @@ -102,6 +137,32 @@ rec_set_nth_field_sql_null( rec_t* rec, /* in: record */ ulint n); /* in: index of the field */ +/*************************************************************** +Sets the value of the ith field extern storage bit of an old-style record. */ + +void +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page where + rec is, or NULL; in the NULL case we do not + write to log about the change */ +/*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. */ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr); /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ + /********************************************************** Gets a bit field from within 1 byte. 
*/ UNIV_INLINE @@ -131,7 +192,7 @@ rec_set_bit_field_1( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask); ut_ad(mask <= 0xFFUL); ut_ad(((mask >> shift) << shift) == mask); @@ -171,30 +232,14 @@ rec_set_bit_field_2( ulint shift) /* in: shift right applied after masking */ { ut_ad(rec); - ut_ad(offs <= REC_N_EXTRA_BYTES); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); ut_ad(mask > 0xFFUL); ut_ad(mask <= 0xFFFFUL); ut_ad((mask >> shift) & 1); ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); ut_ad(((mask >> shift) << shift) == mask); ut_ad(((val << shift) & mask) == (val << shift)); -#ifdef UNIV_DEBUG - { - ulint m; - - /* The following assertion checks that the masks of currently - defined bit-fields in bytes 3-6 do not overlap. */ - m = (ulint)((REC_SHORT_MASK << (8 * (REC_SHORT - 3))) - + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4))) - + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4))) - + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3))) - + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); - if (m != ut_dbg_zero + 0xFFFFFFFFUL) { - fprintf(stderr, "Sum of masks %lx\n", m); - ut_error; - } - } -#endif + mach_write_to_2(rec - offs, (mach_read_from_2(rec - offs) & ~mask) | (val << shift)); @@ -208,17 +253,26 @@ ulint rec_get_next_offs( /*==============*/ /* out: the page offset of the next chained record */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_2(rec, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); - ut_ad(ret < UNIV_PAGE_SIZE); - - return(ret); + if (comp) { + lint ret = (int16_t) rec_get_bit_field_2(rec, REC_NEXT, + REC_NEXT_MASK, REC_NEXT_SHIFT); +#if UNIV_PAGE_SIZE <= 32768 + /* with 64 KiB page size, the pointer will "wrap around", + and the following assertions are invalid */ + ut_ad(ret + ut_align_offset(rec, UNIV_PAGE_SIZE) < + UNIV_PAGE_SIZE); +#endif + return(ret ? ut_align_offset(rec + ret, UNIV_PAGE_SIZE) : 0); + } + else { + ulint ret = rec_get_bit_field_2(rec, REC_NEXT, + REC_NEXT_MASK, REC_NEXT_SHIFT); + ut_ad(ret < UNIV_PAGE_SIZE); + return(ret); + } } /********************************************************** @@ -229,21 +283,32 @@ void rec_set_next_offs( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint next) /* in: offset of the next record */ { ut_ad(rec); ut_ad(UNIV_PAGE_SIZE > next); - rec_set_bit_field_2(rec, next, REC_NEXT, REC_NEXT_MASK, - REC_NEXT_SHIFT); + if (comp) { + rec_set_bit_field_2(rec, next + ? (next - ut_align_offset(rec, UNIV_PAGE_SIZE)) +#ifdef UNIV_DEBUG /* avoid an assertion failure */ + & (REC_NEXT_MASK >> REC_NEXT_SHIFT) +#endif + : 0, REC_NEXT, REC_NEXT_MASK, REC_NEXT_SHIFT); + } else { + rec_set_bit_field_2(rec, next, + REC_NEXT, REC_NEXT_MASK, REC_NEXT_SHIFT); + } } /********************************************************** -The following function is used to get the number of fields in the record. */ +The following function is used to get the number of fields +in an old-style record. 
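
Although compact records store the next-record pointer as a relative value, rec_get_next_offs() above converts it back to an absolute offset within the page, so record-list traversal looks the same for both formats. A rough sketch, assuming the caller already has the page frame and the format flag (for instance page = buf_frame_align(rec) and comp = index->table->comp); page_get_infimum_rec() and page_get_supremum_rec() are the existing page0page.h helpers:

#include "page0page.h"
#include "rem0rec.h"

/* Visits every user record of page in ascending order by
following the next-record offsets. */
static void
sample_walk_page(
	page_t*	page,	/* in: index page */
	ibool	comp)	/* in: TRUE=compact page format */
{
	rec_t*	rec = page_get_infimum_rec(page);
	rec_t*	sup = page_get_supremum_rec(page);

	/* step past the infimum record */
	rec = page + rec_get_next_offs(rec, comp);

	while (rec != sup) {
		/* ... process the user record rec ... */

		rec = page + rec_get_next_offs(rec, comp);
	}
}
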
*/ UNIV_INLINE ulint -rec_get_n_fields( -/*=============*/ +rec_get_n_fields_old( +/*=================*/ /* out: number of data fields */ rec_t* rec) /* in: physical record */ { @@ -251,8 +316,8 @@ rec_get_n_fields( ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); ut_ad(ret <= REC_MAX_N_FIELDS); ut_ad(ret > 0); @@ -260,12 +325,12 @@ rec_get_n_fields( } /********************************************************** -The following function is used to set the number of fields field in the -record. */ +The following function is used to set the number of fields +in an old-style record. */ UNIV_INLINE void -rec_set_n_fields( -/*=============*/ +rec_set_n_fields_old( +/*=================*/ rec_t* rec, /* in: physical record */ ulint n_fields) /* in: the number of fields */ { @@ -273,8 +338,38 @@ rec_set_n_fields( ut_ad(n_fields <= REC_MAX_N_FIELDS); ut_ad(n_fields > 0); - rec_set_bit_field_2(rec, n_fields, REC_N_FIELDS, REC_N_FIELDS_MASK, - REC_N_FIELDS_SHIFT); + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + if (!index->table->comp) { + return(rec_get_n_fields_old(rec)); + } + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } } /********************************************************** @@ -285,14 +380,16 @@ ulint rec_get_n_owned( /*============*/ /* out: number of owned records */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); ut_ad(ret <= REC_MAX_N_OWNED); return(ret); @@ -305,13 +402,15 @@ void rec_set_n_owned( /*============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint n_owned) /* in: the number of owned */ { ut_ad(rec); ut_ad(n_owned <= REC_MAX_N_OWNED); - rec_set_bit_field_1(rec, n_owned, REC_N_OWNED, REC_N_OWNED_MASK, - REC_N_OWNED_SHIFT); + rec_set_bit_field_1(rec, n_owned, + comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); } /********************************************************** @@ -321,14 +420,16 @@ ulint rec_get_info_bits( /*==============*/ /* out: info bits */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_1(rec, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + ret = rec_get_bit_field_1(rec, + comp ? 
REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); ut_ad((ret & ~REC_INFO_BITS_MASK) == 0); return(ret); @@ -341,30 +442,51 @@ void rec_set_info_bits( /*==============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint bits) /* in: info bits */ { ut_ad(rec); ut_ad((bits & ~REC_INFO_BITS_MASK) == 0); - rec_set_bit_field_1(rec, bits, REC_INFO_BITS, REC_INFO_BITS_MASK, - REC_INFO_BITS_SHIFT); + rec_set_bit_field_1(rec, bits, + comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } /********************************************************** -Gets the value of the deleted flag in info bits. */ +The following function retrieves the status bits of a new-style record. */ UNIV_INLINE -ibool -rec_info_bits_get_deleted_flag( -/*===========================*/ - /* out: TRUE if deleted flag set */ - ulint info_bits) /* in: info bits from a record */ +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + rec_t* rec) /* in: physical record */ { - if (info_bits & REC_INFO_DELETED_FLAG) { + ulint ret; - return(TRUE); - } + ut_ad(rec); - return(FALSE); + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/********************************************************** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in: physical record */ + ulint bits) /* in: info bits */ +{ + ut_ad(rec); + ut_ad((bits & ~REC_NEW_STATUS_MASK) == 0); + + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); } /********************************************************** @@ -374,9 +496,10 @@ ibool rec_get_deleted_flag( /*=================*/ /* out: TRUE if delete marked */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { - if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec)) { + if (REC_INFO_DELETED_FLAG & rec_get_info_bits(rec, comp)) { return(TRUE); } @@ -391,6 +514,7 @@ void rec_set_deleted_flag( /*=================*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ibool flag) /* in: TRUE if delete marked */ { ulint old_val; @@ -399,7 +523,7 @@ rec_set_deleted_flag( ut_ad(TRUE == 1); ut_ad(flag <= TRUE); - old_val = rec_get_info_bits(rec); + old_val = rec_get_info_bits(rec, comp); if (flag) { new_val = REC_INFO_DELETED_FLAG | old_val; @@ -407,7 +531,39 @@ rec_set_deleted_flag( new_val = ~REC_INFO_DELETED_FLAG & old_val; } - rec_set_info_bits(rec, new_val); + rec_set_info_bits(rec, comp, new_val); +} + +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*=================*/ + /* out: TRUE if node pointer */ + rec_t* rec) /* in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/********************************************************** +The following function is used to flag a record as a node pointer. 
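
For debugging it can help to translate the three status bits into text; a trivial sketch (helper name made up; the REC_STATUS_ constants are the ones used by rec_get_n_fields() above). The status bits exist only in new-style records, so this must not be applied to records on old-style pages:

/* Returns a printable name for the status bits of a new-style record. */
static const char*
sample_rec_status_name(
	rec_t*	rec)	/* in: physical record in the compact format */
{
	switch (rec_get_status(rec)) {
	case REC_STATUS_ORDINARY:
		return("ordinary");
	case REC_STATUS_NODE_PTR:
		return("node pointer");
	case REC_STATUS_INFIMUM:
		return("infimum");
	case REC_STATUS_SUPREMUM:
		return("supremum");
	default:
		return("corrupt");
	}
}
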
*/ +UNIV_INLINE +void +rec_set_node_ptr_flag( +/*=================*/ + rec_t* rec, /* in: physical record */ + ibool flag) /* in: TRUE if the record is a node pointer */ +{ + ulint status; + ut_ad(flag <= TRUE); + ut_ad(REC_STATUS_NODE_PTR >= rec_get_status(rec)); + if (flag) { + status = REC_STATUS_NODE_PTR; + } else { + status = REC_STATUS_ORDINARY; + } + rec_set_status(rec, status); } /********************************************************** @@ -418,14 +574,16 @@ ulint rec_get_heap_no( /*=============*/ /* out: heap order number */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + ibool comp) /* in: TRUE=compact page format */ { ulint ret; ut_ad(rec); - ret = rec_get_bit_field_2(rec, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + ret = rec_get_bit_field_2(rec, + comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); ut_ad(ret <= REC_MAX_HEAP_NO); return(ret); @@ -438,12 +596,14 @@ void rec_set_heap_no( /*=============*/ rec_t* rec, /* in: physical record */ + ibool comp, /* in: TRUE=compact page format */ ulint heap_no)/* in: the heap number */ { ut_ad(heap_no <= REC_MAX_HEAP_NO); - rec_set_bit_field_2(rec, heap_no, REC_HEAP_NO, REC_HEAP_NO_MASK, - REC_HEAP_NO_SHIFT); + rec_set_bit_field_2(rec, heap_no, + comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } /********************************************************** @@ -456,10 +616,12 @@ rec_get_1byte_offs_flag( /* out: TRUE if 1-byte form */ rec_t* rec) /* in: physical record */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif - return(rec_get_bit_field_1(rec, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT)); + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); } /********************************************************** @@ -471,11 +633,13 @@ rec_set_1byte_offs_flag( rec_t* rec, /* in: physical record */ ibool flag) /* in: TRUE if 1byte form */ { - ut_ad(TRUE == 1); +#if TRUE != 1 +#error "TRUE != 1" +#endif ut_ad(flag <= TRUE); - rec_set_bit_field_1(rec, flag, REC_SHORT, REC_SHORT_MASK, - REC_SHORT_SHIFT); + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); } /********************************************************** @@ -492,9 +656,9 @@ rec_1_get_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n + 1))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); } /********************************************************** @@ -511,68 +675,234 @@ rec_2_get_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); } -/*************************************************************** -Gets the value of the ith field extern storage bit. If it is TRUE -it means that the field is stored on another page. */ +#ifdef UNIV_DEBUG +# define REC_OFFS_HEADER_SIZE 3 +#else /* UNIV_DEBUG */ +# define REC_OFFS_HEADER_SIZE 1 +#endif /* UNIV_DEBUG */ + +/* Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. 
*/ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/**************************************************************** +Validates offsets returned by rec_get_offsets() or rec_reget_offsets(). */ UNIV_INLINE ibool -rec_get_nth_field_extern_bit( -/*=========================*/ - /* in: TRUE or FALSE */ - rec_t* rec, /* in: record */ - ulint i) /* in: ith field */ +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + rec_t* rec, /* in: record or NULL */ + dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ibool comp = (*rec_offs_base(offsets) & REC_OFFS_COMPACT) != 0; + ut_a(offsets); + if (rec) { + ut_ad((ulint) rec == offsets[1]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[2]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = + dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + ut_a(i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec __attribute__((unused)), + /* in: record */ + const dict_index_t* index __attribute__((unused)), + /* in: record descriptor */ + ulint* offsets __attribute__((unused))) + /* in: array returned by rec_get_offsets() + or rec_reget_offsets() */ { - ulint info; +#ifdef UNIV_DEBUG + offsets[1] = (ulint) rec; + offsets[2] = (ulint) index; +#endif /* UNIV_DEBUG */ +} - if (rec_get_1byte_offs_flag(rec)) { +/**************************************************************** +The following function is used to get a pointer to the nth +data field in an old-style record. */ +UNIV_INLINE +byte* +rec_get_nth_field( +/*==============*/ + /* out: pointer to the field */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len) /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + byte* field; + ulint length; + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); - return(FALSE); + if (n == 0) { + field = rec; + } else { + field = rec + (rec_offs_base(offsets)[n] & REC_OFFS_MASK); } - info = rec_2_get_field_end_info(rec, i); + length = rec_offs_base(offsets)[1 + n]; - if (info & REC_2BYTE_EXTERN_MASK) { - return(TRUE); + if (length & REC_OFFS_SQL_NULL) { + field = NULL; + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= field - rec; } - return(FALSE); + *len = length; + return(field); } /********************************************************** -Returns TRUE if the extern bit is set in any of the fields -of rec. */ +Determine if the offsets are for a record in the new +compact format. 
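
To make the layout of the offsets array concrete, a worked example with hypothetical values (not part of the patch): an old-style record with three non-NULL fields of stored lengths 4, 8 and 20 bytes, in the 1-byte offsets form, would be described by an array like the one below. In UNIV_DEBUG builds the two extra header slots cache the rec and index pointers that rec_offs_validate() checks; rec_offs_make_valid() fills them in.

#include "rem0rec.h"

/* Hypothetical offsets array for a 3-field old-style record
with field lengths 4, 8 and 20 (no NULLs, nothing external). */
static void
sample_offsets_layout(void)
{
	ulint	offsets[REC_OFFS_HEADER_SIZE + 1 + 3];

	offsets[0] = 3;				/* number of fields */

	rec_offs_base(offsets)[0] = REC_N_OLD_EXTRA_BYTES + 3;
						/* extra size; REC_OFFS_COMPACT
						would be ORed in for a
						new-style record */
	rec_offs_base(offsets)[1] = 4;		/* end of field 0 */
	rec_offs_base(offsets)[2] = 4 + 8;	/* end of field 1 */
	rec_offs_base(offsets)[3] = 4 + 8 + 20;	/* end of field 2;
						REC_OFFS_SQL_NULL or
						REC_OFFS_EXTERNAL would be
						ORed in if the field were
						NULL or stored externally */

	ut_a(rec_offs_n_fields(offsets) == 3);
	ut_a(rec_offs_data_size(offsets) == 4 + 8 + 20);
	ut_a(rec_offs_extra_size(offsets) == REC_N_OLD_EXTRA_BYTES + 3);
}
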
*/ UNIV_INLINE ibool -rec_contains_externally_stored_field( -/*=================================*/ - /* out: TRUE if a field is stored externally */ - rec_t* rec) /* in: record */ +rec_offs_comp( +/*==========*/ + /* out: TRUE if compact format */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n; - ulint i; - - if (rec_get_1byte_offs_flag(rec)) { - - return(FALSE); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return((*rec_offs_base(offsets) & REC_OFFS_COMPACT) != 0); +} - n = rec_get_n_fields(rec); +/********************************************************** +Returns TRUE if the nth field of rec is SQL NULL. */ +UNIV_INLINE +ibool +rec_offs_nth_null( +/*==============*/ + /* out: TRUE if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL) != 0); +} +/********************************************************** +Returns TRUE if the extern bit is set in nth field of rec. */ +UNIV_INLINE +ibool +rec_offs_nth_extern( +/*================*/ + /* out: TRUE if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return((rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL) != 0); +} - for (i = 0; i < n; i++) { - if (rec_get_nth_field_extern_bit(rec, i)) { +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK); +} +/********************************************************** +Returns TRUE if the extern bit is set in any of the fields +of an old-style record. */ +UNIV_INLINE +ibool +rec_offs_any_extern( +/*================*/ + /* out: TRUE if a field is stored externally */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i; + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { return(TRUE); } } - return(FALSE); } +/*************************************************************** +Sets the value of the ith field extern storage bit. */ +UNIV_INLINE +void +rec_set_nth_field_extern_bit( +/*=========================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint i, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + if (index->table->comp) { + rec_set_nth_field_extern_bit_new(rec, index, i, val, mtr); + } else { + rec_set_nth_field_extern_bit_old(rec, i, val, mtr); + } +} + /********************************************************** Returns the offset of n - 1th field end if the record is stored in the 1-byte offsets form. 
If the field is SQL null, the flag is ORed in the returned @@ -589,9 +919,9 @@ rec_1_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_1(rec - (REC_N_EXTRA_BYTES + n))); + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); } /********************************************************** @@ -608,9 +938,9 @@ rec_2_get_prev_field_end_info( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); - return(mach_read_from_2(rec - (REC_N_EXTRA_BYTES + 2 * n))); + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); } /********************************************************** @@ -625,9 +955,9 @@ rec_1_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_1(rec - (REC_N_EXTRA_BYTES + n + 1), info); + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); } /********************************************************** @@ -642,9 +972,9 @@ rec_2_set_field_end_info( ulint info) /* in: value to set */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - mach_write_to_2(rec - (REC_N_EXTRA_BYTES + 2 * n + 2), info); + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); } /********************************************************** @@ -659,7 +989,7 @@ rec_1_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -682,7 +1012,7 @@ rec_2_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(!rec_get_1byte_offs_flag(rec)); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -707,7 +1037,7 @@ rec_get_field_start_offs( ulint n) /* in: field index */ { ut_ad(rec); - ut_ad(n <= rec_get_n_fields(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); if (n == 0) { @@ -723,8 +1053,9 @@ rec_get_field_start_offs( } /**************************************************************** -Gets the physical size of a field. Also an SQL null may have a field of -size > 0, if the data type is of a fixed size. */ +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ UNIV_INLINE ulint rec_get_nth_field_size( @@ -744,133 +1075,132 @@ rec_get_nth_field_size( return(next_os - os); } -/**************************************************************** -The following function is used to get a copy of the nth data field in a -record to a buffer. */ -UNIV_INLINE -void -rec_copy_nth_field( -/*===============*/ - void* buf, /* in: pointer to the buffer */ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - ulint* len) /* out: length of the field; UNIV_SQL_NULL if SQL - null */ -{ - byte* ptr; - - ut_ad(buf && rec && len); - - ptr = rec_get_nth_field(rec, n, len); - - if (*len == UNIV_SQL_NULL) { - - return; - } - - ut_memcpy(buf, ptr, *len); -} - /*************************************************************** This is used to modify the value of an already existing field in a record. The previous value must have exactly the same size as the new value. 
If len -is UNIV_SQL_NULL then the field is treated as an SQL null. */ +is UNIV_SQL_NULL then the field is treated as an SQL null for old-style +records. For new-style records, len must not be UNIV_SQL_NULL. */ UNIV_INLINE void rec_set_nth_field( /*==============*/ - rec_t* rec, /* in: record */ - ulint n, /* in: index of the field */ - void* data, /* in: pointer to the data if not SQL null */ - ulint len) /* in: length of the data or UNIV_SQL_NULL */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data + if not SQL null */ + ulint len) /* in: length of the data or UNIV_SQL_NULL. + If not SQL null, must have the same + length as the previous value. + If SQL null, previous value must be + SQL null. */ { byte* data2; ulint len2; - ut_ad((len == UNIV_SQL_NULL) - || (rec_get_nth_field_size(rec, n) == len)); - + ut_ad(rec_offs_validate(rec, NULL, offsets)); + if (len == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_sql_null(rec, n); return; } - data2 = rec_get_nth_field(rec, n, &len2); + data2 = rec_get_nth_field(rec, offsets, n, &len2); + ut_ad(len2 == len); ut_memcpy(data2, data, len); if (len2 == UNIV_SQL_NULL) { - + ut_ad(!rec_offs_comp(offsets)); rec_set_nth_field_null_bit(rec, n, FALSE); } } /************************************************************** -The following function returns the data size of a physical +The following function returns the data size of an old-style physical record, that is the sum of field lengths. SQL null fields are counted as length 0 fields. The value returned by the function is the distance from record origin to record end in bytes. */ UNIV_INLINE ulint -rec_get_data_size( -/*==============*/ - /* out: size */ +rec_get_data_size_old( +/*==================*/ + /* out: size */ rec_t* rec) /* in: physical record */ { ut_ad(rec); - return(rec_get_field_start_offs(rec, rec_get_n_fields(rec))); + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); } /************************************************************** -Returns the total size of record minus data size of record. The value -returned by the function is the distance from record start to record origin -in bytes. */ +The following function returns the number of fields in a record. */ UNIV_INLINE ulint -rec_get_extra_size( +rec_offs_n_fields( /*===============*/ - /* out: size */ - rec_t* rec) /* in: physical record */ + /* out: number of fields */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); + ut_ad(offsets); + n_fields = offsets[0]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + return(n_fields); +} - if (rec_get_1byte_offs_flag(rec)) { +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. 
*/ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; - return(REC_N_EXTRA_BYTES + n_fields); - } + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} - return(REC_N_EXTRA_BYTES + 2 * n_fields); +/************************************************************** +Returns the total size of record minus data size of record. The value +returned by the function is the distance from record start to record origin +in bytes. */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~REC_OFFS_COMPACT; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); } -/************************************************************** +/************************************************************** Returns the total size of a physical record. */ UNIV_INLINE ulint -rec_get_size( -/*=========*/ - /* out: size */ - rec_t* rec) /* in: physical record */ +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - ulint n_fields; - - ut_ad(rec); - - n_fields = rec_get_n_fields(rec); - - if (rec_get_1byte_offs_flag(rec)) { - - return(REC_N_EXTRA_BYTES + n_fields - + rec_1_get_field_start_offs(rec, n_fields)); - } - - return(REC_N_EXTRA_BYTES + 2 * n_fields - + rec_2_get_field_start_offs(rec, n_fields)); + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); } /************************************************************** @@ -879,10 +1209,11 @@ UNIV_INLINE byte* rec_get_end( /*========*/ - /* out: pointer to end */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec + rec_get_data_size(rec)); + return(rec + rec_offs_data_size(offsets)); } /************************************************************** @@ -891,10 +1222,11 @@ UNIV_INLINE byte* rec_get_start( /*==========*/ - /* out: pointer to start */ - rec_t* rec) /* in: pointer to record */ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - return(rec - rec_get_extra_size(rec)); + return(rec - rec_offs_extra_size(offsets)); } /******************************************************************* @@ -903,18 +1235,20 @@ UNIV_INLINE rec_t* rec_copy( /*=====*/ - /* out: pointer to the origin of the copied record */ - void* buf, /* in: buffer */ - rec_t* rec) /* in: physical record */ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint extra_len; ulint data_len; ut_ad(rec && buf); - ut_ad(rec_validate(rec)); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); - extra_len = rec_get_extra_size(rec); - data_len = rec_get_data_size(rec); + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); ut_memcpy(buf, rec - extra_len, extra_len + data_len); @@ -922,8 +1256,8 @@ rec_copy( } 
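
Taken together, rec_offs_size() and rec_copy() give the natural idiom for taking a private copy of a record. A sketch (helper name made up), assuming offsets was computed for this record and heap is a caller-supplied memory heap:

#include "rem0rec.h"
#include "mem0mem.h"

/* Copies rec into memory allocated from heap; returns a pointer to
the origin of the copy.  The copy spans rec_get_start(rec, offsets)
up to rec_get_end(rec, offsets). */
static rec_t*
sample_copy_rec(
	const rec_t*	rec,	/* in: physical record */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	mem_heap_t*	heap)	/* in: memory heap */
{
	byte*	buf = mem_heap_alloc(heap, rec_offs_size(offsets));

	return(rec_copy(buf, rec, offsets));
}
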
/************************************************************** -Returns the extra size of a physical record if we know its data size and -the number of fields. */ +Returns the extra size of an old-style physical record if we know its +data size and number of fields. */ UNIV_INLINE ulint rec_get_converted_extra_size( @@ -934,28 +1268,51 @@ rec_get_converted_extra_size( { if (data_size <= REC_1BYTE_OFFS_LIMIT) { - return(REC_N_EXTRA_BYTES + n_fields); + return(REC_N_OLD_EXTRA_BYTES + n_fields); } - return(REC_N_EXTRA_BYTES + 2 * n_fields); + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); } /************************************************************** The following function returns the size of a data tuple when converted to +a new-style physical record. */ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple);/* in: data tuple */ +/************************************************************** +The following function returns the size of a data tuple when converted to a physical record. */ UNIV_INLINE ulint rec_get_converted_size( /*===================*/ /* out: size */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* dtuple) /* in: data tuple */ { ulint data_size; ulint extra_size; - + + ut_ad(index); ut_ad(dtuple); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(index->type & DICT_UNIVERSAL + || dtuple_get_n_fields(dtuple) == + (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (index->table->comp) { + return(rec_get_converted_size_new(index, dtuple)); + } + data_size = dtuple_get_data_size(dtuple); extra_size = rec_get_converted_extra_size( @@ -971,12 +1328,15 @@ UNIV_INLINE ulint rec_fold( /*=====*/ - /* out: the folded value */ - rec_t* rec, /* in: the physical record */ - ulint n_fields, /* in: number of complete fields to fold */ - ulint n_bytes, /* in: number of bytes to fold in an - incomplete last field */ - dulint tree_id) /* in: index tree id */ + /* out: the folded value */ + rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id) /* in: index tree id */ { ulint i; byte* data; @@ -984,12 +1344,13 @@ rec_fold( ulint fold; ulint n_fields_rec; - ut_ad(rec_validate(rec)); - ut_ad(n_fields <= rec_get_n_fields(rec)); - ut_ad((n_fields < rec_get_n_fields(rec)) || (n_bytes == 0)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate((rec_t*) rec, offsets)); ut_ad(n_fields + n_bytes > 0); - - n_fields_rec = rec_get_n_fields(rec); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); if (n_fields > n_fields_rec) { n_fields = n_fields_rec; @@ -1002,7 +1363,7 @@ rec_fold( fold = ut_fold_dulint(tree_id); for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { fold = ut_fold_ulint_pair(fold, @@ -1011,7 +1372,7 @@ rec_fold( } if (n_bytes > 0) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { if (len > n_bytes) { @@ -1025,19 +1386,3 @@ rec_fold( return(fold); } - 
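
rec_get_converted_size() now needs the index descriptor so that it can choose between the old and the new format. A sketch of turning a data tuple into a free-standing physical record (helper name made up; the tuple is assumed to be an ordinary data tuple built for index, and heap is supplied by the caller):

#include "rem0rec.h"
#include "data0data.h"
#include "mem0mem.h"

/* Builds a physical record for index out of dtuple in memory
allocated from heap, and returns its origin. */
static rec_t*
sample_build_rec(
	dict_index_t*	index,	/* in: record descriptor */
	dtuple_t*	dtuple,	/* in: data tuple */
	mem_heap_t*	heap)	/* in: memory heap */
{
	byte*	buf = mem_heap_alloc(heap,
				rec_get_converted_size(index, dtuple));

	return(rec_convert_dtuple_to_rec(buf, index, dtuple));
}
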
-/************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -the address destination. */ -UNIV_INLINE -rec_t* -rec_convert_dtuple_to_rec( -/*======================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple) /* in: data tuple */ -{ - return(rec_convert_dtuple_to_rec_low(destination, dtuple, - dtuple_get_data_size(dtuple))); -} diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 6e1865dae1d..c23a0e025ad 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -175,8 +175,14 @@ int row_lock_table_for_mysql( /*=====================*/ /* out: error code or DB_SUCCESS */ - row_prebuilt_t* prebuilt); /* in: prebuilt struct in the MySQL + row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL table handle */ + dict_table_t* table, /* in: table to lock, or NULL + if prebuilt->table should be + locked as LOCK_TABLE_EXP | + prebuilt->select_lock_type */ + ulint mode); /* in: lock mode of table */ + /************************************************************************* Does an insert for MySQL. */ @@ -233,6 +239,17 @@ row_update_for_mysql( the MySQL format */ row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL handle */ + +/************************************************************************* +Does an unlock of a row for MySQL. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ + /************************************************************************* Creates an query graph node of 'update' type to be used in the MySQL interface. */ @@ -532,7 +549,10 @@ struct row_prebuilt_struct { format */ ulint hint_need_to_fetch_extra_cols; /* normally this is set to 0; if this - is set to ROW_RETRIEVE_PRIMARY_KEY, + is set to ROW_RETRIEVE_PRIMARY_KEY + (that value is obsolete starting from + 5.0.2, because we always fetch the + primary key cols), then we should at least retrieve all columns in the primary key; if this is set to ROW_RETRIEVE_ALL_COLS, then @@ -605,6 +625,9 @@ struct row_prebuilt_struct { /* Values for hint_need_to_fetch_extra_cols */ #define ROW_RETRIEVE_PRIMARY_KEY 1 + /* value 1 is obsolete starting from + 5.0.2, because we always fetch the + primary key cols */ #define ROW_RETRIEVE_ALL_COLS 2 diff --git a/innobase/include/row0row.h b/innobase/include/row0row.h index 951e211fb37..782973d8f5d 100644 --- a/innobase/include/row0row.h +++ b/innobase/include/row0row.h @@ -27,7 +27,8 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Reads the roll pointer field from a clustered index record. */ UNIV_INLINE @@ -36,7 +37,8 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Writes the trx id field to a clustered index record. 
*/ UNIV_INLINE @@ -45,7 +47,8 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ - dulint trx_id); /* in: value of the field */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + dulint trx_id);/* in: value of the field */ /************************************************************************* Sets the roll pointer field in a clustered index record. */ UNIV_INLINE @@ -54,6 +57,7 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr);/* in: value of the field */ /********************************************************************* When an insert to a table is performed, this function builds the entry which @@ -90,6 +94,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap); /* in: memory heap from which the memory needed is allocated */ /*********************************************************************** @@ -175,14 +182,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec); /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ /******************************************************************* Searches the clustered index record for a row, if we have the row reference. */ diff --git a/innobase/include/row0row.ic b/innobase/include/row0row.ic index 8e5121f5a96..85410beacf0 100644 --- a/innobase/include/row0row.ic +++ b/innobase/include/row0row.ic @@ -20,7 +20,8 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index); /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /************************************************************************* Sets the trx id or roll ptr field in a clustered index record: this function is slower than the specialized inline functions. 
*/ @@ -32,6 +33,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val); /* in: value to set */ /************************************************************************* @@ -42,18 +44,21 @@ row_get_rec_trx_id( /*===============*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_trx_id(rec + offset)); } else { - return(row_get_rec_sys_field(DATA_TRX_ID, rec, index)); + return(row_get_rec_sys_field(DATA_TRX_ID, + rec, index, offsets)); } } @@ -65,18 +70,21 @@ row_get_rec_roll_ptr( /*=================*/ /* out: value of the field */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); } else { - return(row_get_rec_sys_field(DATA_ROLL_PTR, rec, index)); + return(row_get_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets)); } } @@ -88,18 +96,21 @@ row_set_rec_trx_id( /*===============*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint trx_id) /* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_trx_id(rec + offset, trx_id); } else { - row_set_rec_sys_field(DATA_TRX_ID, rec, index, trx_id); + row_set_rec_sys_field(DATA_TRX_ID, + rec, index, offsets, trx_id); } } @@ -111,18 +122,21 @@ row_set_rec_roll_ptr( /*=================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr)/* in: value of the field */ { ulint offset; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); offset = index->trx_id_offset; if (offset) { trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); } else { - row_set_rec_sys_field(DATA_ROLL_PTR, rec, index, roll_ptr); + row_set_rec_sys_field(DATA_ROLL_PTR, + rec, index, offsets, roll_ptr); } } @@ -133,14 +147,15 @@ UNIV_INLINE void row_build_row_ref_fast( /*===================*/ - dtuple_t* ref, /* in: typed data tuple where the reference - is built */ - ulint* map, /* in: array of field numbers in rec telling - how ref should be built from the fields of - rec */ - rec_t* rec) /* in: record in the index; must be preserved - while ref is used, as we do not copy field - values to heap */ + dtuple_t* ref, /* in: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { dfield_t* 
dfield; byte* field; @@ -149,6 +164,7 @@ row_build_row_ref_fast( ulint field_no; ulint i; + ut_ad(rec_offs_validate(rec, NULL, offsets)); ref_len = dtuple_get_n_fields(ref); for (i = 0; i < ref_len; i++) { @@ -158,7 +174,8 @@ row_build_row_ref_fast( if (field_no != ULINT_UNDEFINED) { - field = rec_get_nth_field(rec, field_no, &len); + field = rec_get_nth_field(rec, offsets, + field_no, &len); dfield_set_data(dfield, field, len); } } diff --git a/innobase/include/row0sel.h b/innobase/include/row0sel.h index bb6fb70ca86..8d5187bfc1c 100644 --- a/innobase/include/row0sel.h +++ b/innobase/include/row0sel.h @@ -120,6 +120,7 @@ row_search_for_mysql( /* out: DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, + DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */ byte* buf, /* in/out: buffer for the fetched row in the MySQL format */ diff --git a/innobase/include/row0upd.h b/innobase/include/row0upd.h index 28210364833..673e0511153 100644 --- a/innobase/include/row0upd.h +++ b/innobase/include/row0upd.h @@ -80,6 +80,7 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* @@ -124,8 +125,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update);/* in: update vector */ /*************************************************************** Replaces the new column values stored in the update vector to the record @@ -135,8 +136,9 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update);/* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update);/* in: update vector */ /******************************************************************* Builds an update vector from those fields which in a secondary index entry differ from a record that has the equal ordering fields. NOTE: we compare @@ -274,10 +276,11 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr);/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr);/* in: roll ptr of the undo log record */ /************************************************************************* Parses the log data written by row_upd_index_write_log. 
*/ diff --git a/innobase/include/row0upd.ic b/innobase/include/row0upd.ic index a124228a0de..e2d81a39cfa 100644 --- a/innobase/include/row0upd.ic +++ b/innobase/include/row0upd.ic @@ -106,15 +106,17 @@ row_upd_rec_sys_fields( /*===================*/ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ dulint roll_ptr)/* in: roll ptr of the undo log record */ { ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); #ifdef UNIV_SYNC_DEBUG ut_ad(!buf_block_align(rec)->is_hashed || rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - row_set_rec_trx_id(rec, index, trx->id); - row_set_rec_roll_ptr(rec, index, roll_ptr); + row_set_rec_trx_id(rec, index, offsets, trx->id); + row_set_rec_roll_ptr(rec, index, offsets, roll_ptr); } diff --git a/innobase/include/row0vers.h b/innobase/include/row0vers.h index 30cf82144e9..0dd40fda65f 100644 --- a/innobase/include/row0vers.h +++ b/innobase/include/row0vers.h @@ -30,7 +30,8 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index); /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ /********************************************************************* Finds out if we must preserve a delete marked earlier version of a clustered index record, because it is >= the purge view. */ diff --git a/innobase/include/row0vers.ic b/innobase/include/row0vers.ic index 5ece47c35d1..ab1e264635b 100644 --- a/innobase/include/row0vers.ic +++ b/innobase/include/row0vers.ic @@ -11,73 +11,3 @@ Created 2/6/1997 Heikki Tuuri #include "read0read.h" #include "page0page.h" #include "log0recv.h" - -/************************************************************************* -Fetches the trx id of a clustered index record or version. */ -UNIV_INLINE -dulint -row_vers_get_trx_id( -/*================*/ - /* out: trx id or ut_dulint_zero if the - clustered index record not found */ - rec_t* rec, /* in: clustered index record, or an old - version of it */ - dict_table_t* table) /* in: table */ -{ - return(row_get_rec_trx_id(rec, dict_table_get_first_index(table))); -} - -/************************************************************************* -Checks if a consistent read can be performed immediately on the index -record, or if an older version is needed. */ -UNIV_INLINE -ibool -row_vers_clust_rec_sees_older( -/*==========================*/ - /* out: FALSE if can read immediately */ - rec_t* rec, /* in: record which should be read or passed - over by a read cursor */ - dict_index_t* index, /* in: clustered index */ - read_view_t* view) /* in: read view */ -{ - ut_ad(index->type & DICT_CLUSTERED); - - if (read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))) { - - return(FALSE); - } - - return(TRUE); -} - -/************************************************************************* -Checks if a secondary index record can be read immediately by a consistent -read, or if an older version may be needed. To be sure, we will have to -look in the clustered index. 
-UNIV_INLINE -ibool -row_vers_sec_rec_may_see_older( -/*===========================*/ - /* out: FALSE if can be read immediately */ - rec_t* rec, /* in: record which should be read or passed */ - dict_index_t* index __attribute__((unused)),/* in: secondary index */ - read_view_t* view) /* in: read view */ -{ - page_t* page; - - ut_ad(!(index->type & DICT_CLUSTERED)); - - page = buf_frame_align(rec); - - if ((ut_dulint_cmp(page_get_max_trx_id(page), view->up_limit_id) >= 0) - || recv_recovery_is_on()) { - - /* It may be that the record was inserted or modified by a - transaction the view should not see: we have to look in the - clustered index */ - - return(TRUE); - } - - return(FALSE); -} diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 6cfe9cef927..d4cc7d8222f 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -184,6 +184,63 @@ i/o handler thread */ extern const char* srv_io_thread_op_info[]; extern const char* srv_io_thread_function[]; +/* the number of the log write requests done */ +extern ulint srv_log_write_requests; + +/* the number of physical writes to the log performed */ +extern ulint srv_log_writes; + +/* amount of data written to the log files in bytes */ +extern ulint srv_os_log_written; + +/* amount of writes being done to the log files */ +extern ulint srv_os_log_pending_writes; + +/* we increase this counter when we don't have enough space in the +log buffer and have to flush it */ +extern ulint srv_log_waits; + +/* variable that counts the amount of data read in total (in bytes) */ +extern ulint srv_data_read; + +/* here we count the amount of data written in total (in bytes) */ +extern ulint srv_data_written; + +/* this variable counts the number of times the doublewrite buffer +was flushed */ +extern ulint srv_dblwr_writes; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +extern ulint srv_dblwr_pages_written; + +/* in this variable we store the number of write requests issued */ +extern ulint srv_buf_pool_write_requests; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +extern ulint srv_buf_pool_wait_free; + +/* variable to count the number of pages that were written from the +buffer pool to disk */ +extern ulint srv_buf_pool_flushed; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +extern ulint srv_buf_pool_reads; + +/* variable to count the number of sequential read-aheads that were done */ +extern ulint srv_read_ahead_seq; + +/* variable to count the number of random read-aheads that were done */ +extern ulint srv_read_ahead_rnd; + +/* In this structure we store status variables to be passed to MySQL */ +typedef struct export_var_struct export_struc; + +extern export_struc export_vars; + typedef struct srv_sys_struct srv_sys_t; /* The server system */ @@ -400,7 +457,12 @@ void srv_printf_innodb_monitor( /*======================*/ FILE* file); /* in: output stream */ +/************************************************************************ +Function to pass InnoDB status variables to MySQL */ +void +srv_export_innodb_status(void); +/*=====================*/ /* Types for the threads existing in the system. Threads of types 4 - 9 are called utility threads. 
Note that utility threads are mainly disk @@ -426,6 +488,48 @@ typedef struct srv_slot_struct srv_slot_t; /* Thread table is an array of slots */ typedef srv_slot_t srv_table_t; +/* In this structure we store status variables to be passed to MySQL */ +struct export_var_struct{ + ulint innodb_data_pending_reads; + ulint innodb_data_pending_writes; + ulint innodb_data_pending_fsyncs; + ulint innodb_data_fsyncs; + ulint innodb_data_read; + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; + ulint innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; + ulint innodb_buffer_pool_pages_misc; + ulint innodb_buffer_pool_pages_free; + ulint innodb_buffer_pool_pages_latched; + ulint innodb_buffer_pool_read_requests; + ulint innodb_buffer_pool_reads; + ulint innodb_buffer_pool_wait_free; + ulint innodb_buffer_pool_pages_flushed; + ulint innodb_buffer_pool_write_requests; + ulint innodb_buffer_pool_read_ahead_seq; + ulint innodb_buffer_pool_read_ahead_rnd; + ulint innodb_dblwr_pages_written; + ulint innodb_dblwr_writes; + ulint innodb_log_waits; + ulint innodb_log_write_requests; + ulint innodb_log_writes; + ulint innodb_os_log_written; + ulint innodb_os_log_fsyncs; + ulint innodb_os_log_pending_writes; + ulint innodb_os_log_pending_fsyncs; + ulint innodb_page_size; + ulint innodb_pages_created; + ulint innodb_pages_read; + ulint innodb_pages_written; + ulint innodb_rows_read; + ulint innodb_rows_inserted; + ulint innodb_rows_updated; + ulint innodb_rows_deleted; +}; + /* The server system struct */ struct srv_sys_struct{ os_event_t operational; /* created threads must wait for the @@ -434,6 +538,10 @@ struct srv_sys_struct{ srv_table_t* threads; /* server thread table */ UT_LIST_BASE_NODE_T(que_thr_t) tasks; /* task queue */ + dict_index_t* dummy_ind1; /* dummy index for old-style + supremum and infimum records */ + dict_index_t* dummy_ind2; /* dummy index for new-style + supremum and infimum records */ }; extern ulint srv_n_threads_active[]; diff --git a/innobase/include/trx0rec.h b/innobase/include/trx0rec.h index 9d7f41cd94e..4387ce1a61e 100644 --- a/innobase/include/trx0rec.h +++ b/innobase/include/trx0rec.h @@ -246,6 +246,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers);/* out, own: previous version, or NULL if diff --git a/innobase/include/trx0roll.h b/innobase/include/trx0roll.h index 6004551f456..893e5af6c01 100644 --- a/innobase/include/trx0roll.h +++ b/innobase/include/trx0roll.h @@ -104,11 +104,12 @@ trx_rollback( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. +Note: this is done in a background thread */ -void -trx_rollback_or_clean_all_without_sess(void); -/*========================================*/ +void * +trx_rollback_or_clean_all_without_sess(void *); +/*============================================*/ /******************************************************************** Finishes a transaction rollback. 
*/ diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 7eb91048684..8eb71dac763 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -16,6 +16,7 @@ Created 3/26/1996 Heikki Tuuri #include "que0types.h" #include "mem0mem.h" #include "read0types.h" +#include "trx0xa.h" extern ulint trx_n_mysql_transactions; @@ -156,6 +157,36 @@ trx_commit_for_mysql( /*=================*/ /* out: 0 or error number */ trx_t* trx); /* in: trx handle */ + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ + +/************************************************************************** +This function is used to find the number of prepared transactions and +their transaction objects for recovery. */ + +int +trx_recover_for_mysql( +/*=================*/ + /* out: number of prepared transactions */ + XID* xid_list, /* in/out: prepared transactions */ + uint len); /* in: number of slots in xid_list */ + +/*********************************************************************** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state */ +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid); /* in: X/Open XA Transaction Identification */ + /************************************************************************** If required, flushes the log to disk if we called trx_commit_for_mysql() with trx->flush_log_later == TRUE. */ @@ -339,6 +370,9 @@ struct trx_struct{ if we can use the insert buffer for them, we set this FALSE */ dulint id; /* transaction id */ + XID xid; /* X/Open XA transaction + identification to identify a + transaction branch */ dulint no; /* transaction serialization number == max trx id when the transaction is moved to COMMITTED_IN_MEMORY state */ @@ -353,8 +387,10 @@ struct trx_struct{ dulint table_id; /* table id if the preceding field is TRUE */ /*------------------------------*/ - void* mysql_thd; /* MySQL thread handle corresponding - to this trx, or NULL */ + int active_trans; /* whether a transaction in MySQL + is active */ + void* mysql_thd; /* MySQL thread handle corresponding + to this trx, or NULL */ char** mysql_query_str;/* pointer to the field in mysqld_thd which contains the pointer to the current SQL query string */ @@ -423,6 +459,8 @@ struct trx_struct{ lock_t* auto_inc_lock; /* possible auto-inc lock reserved by the transaction; note that it is also in the lock list trx_locks */ + ibool trx_create_lock;/* this is TRUE if we have created a + new lock for a record accessed */ ulint n_lock_table_exp;/* number of explicit table locks (LOCK TABLES) reserved by the transaction, stored in trx_locks */ @@ -541,6 +579,7 @@ struct trx_struct{ #define TRX_NOT_STARTED 1 #define TRX_ACTIVE 2 #define TRX_COMMITTED_IN_MEMORY 3 +#define TRX_PREPARED 4 /* Support for 2PC/XA */ /* Transaction execution states when trx state is TRX_ACTIVE */ #define TRX_QUE_RUNNING 1 /* transaction is running */ diff --git a/innobase/include/trx0undo.h b/innobase/include/trx0undo.h index 20002076cc3..fce62e46046 100644 --- a/innobase/include/trx0undo.h +++ b/innobase/include/trx0undo.h @@ -14,6 +14,7 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" #include "page0types.h" +#include "trx0xa.h" /*************************************************************************** Builds a roll 
pointer dulint. */ @@ -36,7 +37,7 @@ trx_undo_decode_roll_ptr( ibool* is_insert, /* out: TRUE if insert undo log */ ulint* rseg_id, /* out: rollback segment id */ ulint* page_no, /* out: page number */ - ulint* offset); /* out: offset of the undo entry within page */ + ulint* offset); /* out: offset of the undo entry within page */ /*************************************************************************** Returns TRUE if the roll pointer is of the insert type. */ UNIV_INLINE @@ -239,6 +240,18 @@ trx_undo_set_state_at_finish( trx_t* trx, /* in: transaction */ trx_undo_t* undo, /* in: undo log memory copy */ mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. */ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr); /* in: mtr */ + /************************************************************************** Adds the update undo log header as the first in the history list, and frees the memory object, or puts it to the list of cached update undo log @@ -294,7 +307,23 @@ trx_undo_parse_discard_latest( byte* end_ptr,/* in: buffer end */ page_t* page, /* in: page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ +/************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid); /* in: X/Open XA Transaction Identification */ + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ + +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid); /* out: X/Open XA Transaction Identification */ /* Types of an undo log segment */ #define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */ @@ -310,6 +339,8 @@ trx_undo_parse_discard_latest( #define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be reused: it can be freed in purge when all undo data in it is removed */ +#define TRX_UNDO_PREPARED 5 /* contains an undo log of an + prepared transaction */ /* Transaction undo log memory object; this is protected by the undo_mutex in the corresponding transaction object */ @@ -332,6 +363,8 @@ struct trx_undo_struct{ field */ dulint trx_id; /* id of the trx assigned to the undo log */ + XID xid; /* X/Open XA transaction + identification */ ibool dict_operation; /* TRUE if a dict operation trx */ dulint table_id; /* if a dict operation, then the table id */ @@ -436,7 +469,10 @@ page of an update undo log segment. */ log start, and therefore this is not necessarily the same as this log header end offset */ -#define TRX_UNDO_DICT_OPERATION 20 /* TRUE if the transaction is a table +#define TRX_UNDO_XID_EXISTS 20 /* TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /* TRUE if the transaction is a table create, index create, or drop transaction: in recovery the transaction cannot be rolled back @@ -452,7 +488,17 @@ page of an update undo log segment. 
*/ #define TRX_UNDO_HISTORY_NODE 34 /* If the log is put to the history list, the file list node is here */ /*-------------------------------------------------------------*/ -#define TRX_UNDO_LOG_HDR_SIZE (34 + FLST_NODE_SIZE) +/* X/Open XA Transaction Identification (XID) */ + +#define TRX_UNDO_XA_FORMAT (34 + FLST_NODE_SIZE) +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +#define TRX_UNDO_XA_LEN (TRX_UNDO_XA_XID + XIDDATASIZE) + +/*-------------------------------------------------------------*/ +#define TRX_UNDO_LOG_HDR_SIZE (TRX_UNDO_XA_LEN) +/*-------------------------------------------------------------*/ #ifndef UNIV_NONINL #include "trx0undo.ic" diff --git a/innobase/include/trx0xa.h b/innobase/include/trx0xa.h new file mode 100644 index 00000000000..34b7a2f95a8 --- /dev/null +++ b/innobase/include/trx0xa.h @@ -0,0 +1,182 @@ +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +#define XIDDATASIZE 128 /* size in bytes */ +#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */ + +struct xid_t { + long formatID; /* format identifier */ + long gtrid_length; /* value from 1 through 64 */ + long bqual_length; /* value from 1 through 64 */ + char data[XIDDATASIZE]; +}; +typedef struct xid_t XID; +#endif +/* + * A value of -1 in formatID means that the XID is null. + */ + + +#ifdef NOTDEFINED +/* Let us comment this out to remove compiler errors!!!!!!!!!!!! */ + +/* + * Declarations of routines by which RMs call TMs: + */ +extern int ax_reg __P((int, XID *, long)); +extern int ax_unreg __P((int, long)); + +/* + * XA Switch Data Structure + */ +#define RMNAMESZ 32 /* length of resource manager name, */ + /* including the null terminator */ +#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */ + /* strings, including the null + terminator */ + + +struct xa_switch_t { + char name[RMNAMESZ]; /* name of resource manager */ + long flags; /* resource manager specific options */ + long version; /* must be 0 */ + int (*xa_open_entry) /* xa_open function pointer */ + __P((char *, int, long)); + int (*xa_close_entry) /* xa_close function pointer */ + __P((char *, int, long)); + int (*xa_start_entry) /* xa_start function pointer */ + __P((XID *, int, long)); + int (*xa_end_entry) /* xa_end function pointer */ + __P((XID *, int, long)); + int (*xa_rollback_entry) /* xa_rollback function pointer */ + __P((XID *, int, long)); + int (*xa_prepare_entry) /* xa_prepare function pointer */ + __P((XID *, int, long)); + int (*xa_commit_entry) /* xa_commit function pointer */ + __P((XID *, int, long)); + int (*xa_recover_entry) /* xa_recover function pointer */ + __P((XID *, long, int, long)); + int (*xa_forget_entry) /* xa_forget function pointer */ + __P((XID *, int, long)); + int (*xa_complete_entry) /* xa_complete function pointer */ + __P((int *, int *, int, long)); +}; +#endif /* NOTDEFINED */ + + +/* + * Flag definitions for the RM switch + */ +#define TMNOFLAGS 0x00000000L /* no resource manager features + selected */ +#define TMREGISTER 0x00000001L /* resource manager dynamically + registers */ +#define TMNOMIGRATE 0x00000002L /* resource manager does not support + association migration */ +#define TMUSEASYNC 0x00000004L /* resource 
manager supports + asynchronous operations */ +/* + * Flag definitions for xa_ and ax_ routines + */ +/* use TMNOFLAGGS, defined above, when not specifying other flags */ +#define TMASYNC 0x80000000L /* perform routine asynchronously */ +#define TMONEPHASE 0x40000000L /* caller is using one-phase commit + optimisation */ +#define TMFAIL 0x20000000L /* dissociates caller and marks + transaction branch rollback-only */ +#define TMNOWAIT 0x10000000L /* return if blocking condition + exists */ +#define TMRESUME 0x08000000L /* caller is resuming association with + suspended transaction branch */ +#define TMSUCCESS 0x04000000L /* dissociate caller from transaction + branch */ +#define TMSUSPEND 0x02000000L /* caller is suspending, not ending, + association */ +#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */ +#define TMENDRSCAN 0x00800000L /* end a recovery scan */ +#define TMMULTIPLE 0x00400000L /* wait for any asynchronous + operation */ +#define TMJOIN 0x00200000L /* caller is joining existing + transaction branch */ +#define TMMIGRATE 0x00100000L /* caller intends to perform + migration */ + +/* + * ax_() return codes (transaction manager reports to resource manager) + */ +#define TM_JOIN 2 /* caller is joining existing + transaction branch */ +#define TM_RESUME 1 /* caller is resuming association with + suspended transaction branch */ +#define TM_OK 0 /* normal execution */ +#define TMER_TMERR -1 /* an error occurred in the transaction + manager */ +#define TMER_INVAL -2 /* invalid arguments were given */ +#define TMER_PROTO -3 /* routine invoked in an improper + context */ + +/* + * xa_() return codes (resource manager reports to transaction manager) + */ +#define XA_RBBASE 100 /* The inclusive lower bound of the + rollback codes */ +#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an + unspecified reason */ +#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a + communication failure */ +#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */ +#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the + integrity of the resources was + detected */ +#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the + transaction branch for a reason not + on this list */ +#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the + resource manager */ +#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */ +#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */ +#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the + rollback codes */ +#define XA_NOMIGRATE 9 /* resumption must occur where + suspension occurred */ +#define XA_HEURHAZ 8 /* the transaction branch may have + been heuristically completed */ +#define XA_HEURCOM 7 /* the transaction branch has been + heuristically committed */ +#define XA_HEURRB 6 /* the transaction branch has been + heuristically rolled back */ +#define XA_HEURMIX 5 /* the transaction branch has been + heuristically committed and rolled + back */ +#define XA_RETRY 4 /* routine returned with no effect and + may be re-issued */ +#define XA_RDONLY 3 /* the transaction branch was read-only + and has been committed */ +#define XA_OK 0 /* normal execution */ +#define XAER_ASYNC -2 /* asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /* a resource manager error occurred in + the transaction branch */ +#define XAER_NOTA -4 /* the XID is not valid */ +#define XAER_INVAL -5 /* invalid arguments were given */ +#define XAER_PROTO -6 /* 
routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /* resource manager unavailable */ +#define XAER_DUPID -8 /* the XID already exists */ +#define XAER_OUTSIDE -9 /* resource manager doing work outside + transaction */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index a62c2e2e318..22d488abeaf 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -208,7 +208,20 @@ ut_align_down( /*==========*/ /* out: aligned pointer */ void* ptr, /* in: pointer */ - ulint align_no); /* in: align by this number */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*==========*/ + /* out: distance from aligned + pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ + __attribute__((const)); /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic index 5a70dcf12a8..e141de3aa3f 100644 --- a/innobase/include/ut0byte.ic +++ b/innobase/include/ut0byte.ic @@ -335,6 +335,27 @@ ut_align_down( return((void*)((((ulint)ptr)) & ~(align_no - 1))); } +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + /* out: distance from + aligned pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint)ptr) & (align_no - 1)); +} + /********************************************************************* Gets the nth bit of a ulint. */ UNIV_INLINE diff --git a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h index 73ecb25101a..74357f6bf13 100644 --- a/innobase/include/ut0mem.h +++ b/innobase/include/ut0mem.h @@ -38,8 +38,10 @@ ut_malloc_low( /*==========*/ /* out, own: allocated memory */ ulint n, /* in: number of bytes to allocate */ - ibool set_to_zero); /* in: TRUE if allocated memory should be set + ibool set_to_zero, /* in: TRUE if allocated memory should be set to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error); /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ /************************************************************************** Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is defined. */ diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 68073647248..d2d16a1ae4e 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -365,6 +365,21 @@ lock_deadlock_recursive( ulint* cost); /* in/out: number of calculation steps thus far: if this exceeds LOCK_MAX_N_STEPS_... we return TRUE */ + +/************************************************************************* +Gets the type of a lock. */ +UNIV_INLINE +ulint +lock_get_type( +/*==========*/ + /* out: LOCK_TABLE or LOCK_REC */ + lock_t* lock) /* in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_TYPE_MASK); +} + /************************************************************************* Gets the nth bit of a record lock. 
*/ UNIV_INLINE @@ -410,11 +425,14 @@ lock_check_trx_id_sanity( dulint trx_id, /* in: trx id */ rec_t* rec, /* in: user record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ ibool has_kernel_mutex)/* in: TRUE if the caller owns the kernel mutex */ { ibool is_ok = TRUE; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!has_kernel_mutex) { mutex_enter(&kernel_mutex); } @@ -427,7 +445,7 @@ lock_check_trx_id_sanity( fputs(" InnoDB: Error: transaction id associated" " with record\n", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); fputs("InnoDB: in ", stderr); dict_index_name_print(stderr, NULL, index); fprintf(stderr, "\n" @@ -459,18 +477,20 @@ lock_clust_rec_cons_read_sees( rec_t* rec, /* in: user record which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ read_view_t* view) /* in: consistent read view */ { dulint trx_id; ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); /* NOTE that we call this function while holding the search system latch. To obey the latching order we must NOT reserve the kernel mutex here! */ - trx_id = row_get_rec_trx_id(rec, index); + trx_id = row_get_rec_trx_id(rec, index, offsets); if (read_view_sees_trx_id(view, trx_id)) { @@ -569,20 +589,6 @@ lock_get_mode( } /************************************************************************* -Gets the type of a lock. */ -UNIV_INLINE -ulint -lock_get_type( -/*==========*/ - /* out: LOCK_TABLE or LOCK_REC */ - lock_t* lock) /* in: lock */ -{ - ut_ad(lock); - - return(lock->type_mode & LOCK_TYPE_MASK); -} - -/************************************************************************* Gets the wait flag of a lock. */ UNIV_INLINE ibool @@ -602,6 +608,128 @@ lock_get_wait( } /************************************************************************* +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. */ + +dict_table_t* +lock_get_src_table( +/*===============*/ + /* out: the source table of transaction, + if it is covered by an IX or IS table lock; + dest if there is no source table, and + NULL if the transaction is locking more than + two tables or an inconsistency is found */ + trx_t* trx, /* in: transaction */ + dict_table_t* dest, /* in: destination of ALTER TABLE */ + ulint* mode) /* out: lock mode of the source table */ +{ + dict_table_t* src; + lock_t* lock; + + src = NULL; + *mode = LOCK_NONE; + + for (lock = UT_LIST_GET_FIRST(trx->trx_locks); + lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + lock_table_t* tab_lock; + ulint lock_mode; + if (!(lock_get_type(lock) & LOCK_TABLE)) { + /* We are only interested in table locks. */ + continue; + } + tab_lock = &lock->un_member.tab_lock; + if (dest == tab_lock->table) { + /* We are not interested in the destination table. */ + continue; + } else if (!src) { + /* This presumably is the source table. */ + src = tab_lock->table; + if (UT_LIST_GET_LEN(src->locks) != 1 || + UT_LIST_GET_FIRST(src->locks) != lock) { + /* We only support the case when + there is only one lock on this table. */ + return(NULL); + } + } else if (src != tab_lock->table) { + /* The transaction is locking more than + two tables (src and dest): abort */ + return(NULL); + } + + /* Check that the source table is locked by + LOCK_IX or LOCK_IS. 
*/ + lock_mode = lock_get_mode(lock); + switch (lock_mode) { + case LOCK_IX: + case LOCK_IS: + if (*mode != LOCK_NONE && *mode != lock_mode) { + /* There are multiple locks on src. */ + return(NULL); + } + *mode = lock_mode; + break; + } + } + + if (!src) { + /* No source table lock found: flag the situation to caller */ + src = dest; + } + + return(src); +} + +/************************************************************************* +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. */ + +ibool +lock_is_table_exclusive( +/*====================*/ + /* out: TRUE if table is only locked by trx, + with LOCK_IX, and possibly LOCK_AUTO_INC */ + dict_table_t* table, /* in: table */ + trx_t* trx) /* in: transaction */ +{ + lock_t* lock; + bool ok = FALSE; + + ut_ad(table && trx); + + for (lock = UT_LIST_GET_FIRST(table->locks); + lock; + lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) { + if (lock->trx != trx) { + /* A lock on the table is held + by some other transaction. */ + return(FALSE); + } + + if (!(lock_get_type(lock) & LOCK_TABLE)) { + /* We are interested in table locks only. */ + continue; + } + + switch (lock_get_mode(lock)) { + case LOCK_IX: + ok = TRUE; + break; + case LOCK_AUTO_INC: + /* It is allowed for trx to hold an + auto_increment lock. */ + break; + default: + /* Other table locks than LOCK_IX are not allowed. */ + return(FALSE); + } + } + + return(ok); +} + +/************************************************************************* Sets the wait flag of a lock and the back pointer in trx to lock. */ UNIV_INLINE void @@ -1133,6 +1261,7 @@ lock_rec_get_next( /*==============*/ /* out: next lock, NULL if none exists */ rec_t* rec, /* in: record on a page */ + ibool comp, /* in: TRUE=compact page format */ lock_t* lock) /* in: lock */ { #ifdef UNIV_SYNC_DEBUG @@ -1148,7 +1277,7 @@ lock_rec_get_next( return(NULL); } - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { + if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec, comp))) { return(lock); } @@ -1165,15 +1294,17 @@ lock_rec_get_first( rec_t* rec) /* in: record on a page */ { lock_t* lock; + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first_on_page(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock) { - if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec))) { + if (lock_rec_get_nth_bit(lock, rec_get_heap_no(rec, comp))) { break; } @@ -1340,6 +1471,7 @@ lock_rec_has_expl( for a supremum record we regard this always a gap type request */ rec_t* rec, /* in: record */ + ibool comp, /* in: TRUE=compact page format */ trx_t* trx) /* in: transaction */ { lock_t* lock; @@ -1369,7 +1501,7 @@ lock_rec_has_expl( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1388,6 +1520,7 @@ lock_rec_other_has_expl_req( ulint wait, /* in: LOCK_WAIT if also waiting locks are taken into account, or 0 if not */ rec_t* rec, /* in: record to look at */ + ibool comp, /* in: TRUE=compact record format */ trx_t* trx) /* in: transaction, or NULL if requests by all transactions are taken into account */ { @@ -1412,7 +1545,7 @@ lock_rec_other_has_expl_req( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1433,12 +1566,13 @@ lock_rec_other_has_conflicting( trx_t* trx) /* in: our transaction */ { lock_t* lock; - + ibool 
comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock) { if (lock_rec_has_to_wait(trx, mode, lock, @@ -1447,7 +1581,7 @@ lock_rec_other_has_conflicting( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } return(NULL); @@ -1473,8 +1607,7 @@ lock_rec_find_similar_on_page( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); - + heap_no = rec_get_heap_no(rec, page_is_comp(buf_frame_align(rec))); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1501,7 +1634,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* out: transaction which has the x-lock, or NULL */ rec_t* rec, /* in: user record */ - dict_index_t* index) /* in: secondary index */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { page_t* page; @@ -1510,6 +1644,7 @@ lock_sec_rec_some_has_impl_off_kernel( #endif /* UNIV_SYNC_DEBUG */ ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); page = buf_frame_align(rec); @@ -1529,8 +1664,8 @@ lock_sec_rec_some_has_impl_off_kernel( /* Ok, in this case it is possible that some transaction has an implicit x-lock. We have to look in the clustered index. */ - if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), rec, index, - TRUE)) { + if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), + rec, index, offsets, TRUE)) { buf_page_print(page); /* The page is corrupt: try to avoid a crash by returning @@ -1538,7 +1673,7 @@ lock_sec_rec_some_has_impl_off_kernel( return(NULL); } - return(row_vers_impl_x_locked_off_kernel(rec, index)); + return(row_vers_impl_x_locked_off_kernel(rec, index, offsets)); } /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ @@ -1572,7 +1707,7 @@ lock_rec_create( page = buf_frame_align(rec); space = buf_frame_get_space_id(page); page_no = buf_frame_get_page_no(page); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(page)); /* If rec is the supremum record, then we reset the gap and LOCK_REC_NOT_GAP bits, as all locks on the supremum are @@ -1585,8 +1720,7 @@ lock_rec_create( } /* Make lock bitmap bigger by a safety margin */ - n_bits = page_header_get_field(page, PAGE_N_HEAP) - + LOCK_PAGE_BITMAP_MARGIN; + n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN; n_bytes = 1 + n_bits / 8; lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes); @@ -1617,6 +1751,9 @@ lock_rec_create( HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); + /* Note that we have created a new lock */ + trx->trx_create_lock = TRUE; + if (type_mode & LOCK_WAIT) { lock_set_lock_and_trx_wait(lock, trx); @@ -1688,7 +1825,8 @@ lock_rec_enqueue_waiting( if (lock_deadlock_occurs(lock, trx)) { lock_reset_lock_and_trx_wait(lock); - lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec)); + lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec, + page_is_comp(buf_frame_align(rec)))); return(DB_DEADLOCK); } @@ -1738,7 +1876,7 @@ lock_rec_add_to_queue( lock_t* lock; lock_t* similar_lock = NULL; ulint heap_no; - page_t* page; + page_t* page = buf_frame_align(rec); ibool somebody_waits = FALSE; #ifdef UNIV_SYNC_DEBUG @@ -1746,15 +1884,15 @@ lock_rec_add_to_queue( #endif /* UNIV_SYNC_DEBUG */ ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & 
LOCK_MODE_MASK) != LOCK_S) - || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx); + || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, + rec, page_is_comp(page), trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) - || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + rec, page_is_comp(page), trx)); type_mode = type_mode | LOCK_REC; - page = buf_frame_align(rec); - /* If rec is the supremum record, then we can reset the gap bit, as all locks on the supremum are automatically of the gap type, and we try to avoid unnecessary memory consumption of a new record lock @@ -1771,7 +1909,7 @@ lock_rec_add_to_queue( /* Look for a waiting lock request on the same record or on a gap */ - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(page)); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { @@ -1791,6 +1929,15 @@ lock_rec_add_to_queue( if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) { + /* If the nth bit of a record lock is already set then we + do not set a new lock bit, otherwise we set it */ + + if (lock_rec_get_nth_bit(similar_lock, heap_no)) { + trx->trx_create_lock = FALSE; + } else { + trx->trx_create_lock = TRUE; + } + lock_rec_set_nth_bit(similar_lock, heap_no); return(similar_lock); @@ -1822,6 +1969,7 @@ lock_rec_lock_fast( { lock_t* lock; ulint heap_no; + trx_t* trx; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -1836,13 +1984,16 @@ lock_rec_lock_fast( || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - heap_no = rec_get_heap_no(rec); + heap_no = rec_get_heap_no(rec, page_is_comp(buf_frame_align(rec))); lock = lock_rec_get_first_on_page(rec); + trx = thr_get_trx(thr); + trx->trx_create_lock = FALSE; + if (lock == NULL) { if (!impl) { - lock_rec_create(mode, rec, index, thr_get_trx(thr)); + lock_rec_create(mode, rec, index, trx); } return(TRUE); @@ -1853,13 +2004,23 @@ lock_rec_lock_fast( return(FALSE); } - if (lock->trx != thr_get_trx(thr) + if (lock->trx != trx || lock->type_mode != (mode | LOCK_REC) || lock_rec_get_n_bits(lock) <= heap_no) { return(FALSE); } if (!impl) { + + /* If the nth bit of a record lock is already set then we + do not set a new lock bit, otherwise we set it */ + + if (lock_rec_get_nth_bit(lock, heap_no)) { + trx->trx_create_lock = FALSE; + } else { + trx->trx_create_lock = TRUE; + } + lock_rec_set_nth_bit(lock, heap_no); } @@ -1904,7 +2065,8 @@ lock_rec_lock_slow( trx = thr_get_trx(thr); - if (lock_rec_has_expl(mode, rec, trx)) { + if (lock_rec_has_expl(mode, rec, + page_is_comp(buf_frame_align(rec)), trx)) { /* The trx already has a strong enough lock on rec: do nothing */ @@ -2220,12 +2382,14 @@ lock_rec_reset_and_release_wait( { lock_t* lock; ulint heap_no; - + ibool comp; + #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec); + comp = page_is_comp(buf_frame_align(rec)); + heap_no = rec_get_heap_no(rec, comp); lock = lock_rec_get_first(rec); @@ -2236,7 +2400,7 @@ lock_rec_reset_and_release_wait( lock_rec_reset_nth_bit(lock, heap_no); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2254,12 +2418,13 @@ lock_rec_inherit_to_gap( the locks on this record */ { lock_t* lock; - + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = 
page_is_comp(buf_frame_align(rec)); while (lock != NULL) { if (!lock_rec_get_insert_intention(lock)) { @@ -2269,7 +2434,7 @@ lock_rec_inherit_to_gap( heir, lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2286,12 +2451,13 @@ lock_rec_inherit_to_gap_if_gap_lock( the locks on this record */ { lock_t* lock; - + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ lock = lock_rec_get_first(rec); + comp = page_is_comp(buf_frame_align(rec)); while (lock != NULL) { if (!lock_rec_get_insert_intention(lock) @@ -2303,7 +2469,7 @@ lock_rec_inherit_to_gap_if_gap_lock( heir, lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } } @@ -2316,7 +2482,8 @@ lock_rec_move( /*==========*/ rec_t* receiver, /* in: record which gets locks; this record must have no lock requests on it! */ - rec_t* donator) /* in: record which gives locks */ + rec_t* donator, /* in: record which gives locks */ + ibool comp) /* in: TRUE=compact page format */ { lock_t* lock; ulint heap_no; @@ -2326,7 +2493,7 @@ lock_rec_move( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(donator); + heap_no = rec_get_heap_no(donator, comp); lock = lock_rec_get_first(donator); @@ -2346,7 +2513,7 @@ lock_rec_move( lock_rec_add_to_queue(type_mode, receiver, lock->index, lock->trx); - lock = lock_rec_get_next(donator, lock); + lock = lock_rec_get_next(donator, comp, lock); } ut_ad(lock_rec_get_first(donator) == NULL); @@ -2372,6 +2539,7 @@ lock_move_reorganize_page( UT_LIST_BASE_NODE_T(lock_t) old_locks; mem_heap_t* heap = NULL; rec_t* sup; + ibool comp; lock_mutex_enter_kernel(); @@ -2412,6 +2580,9 @@ lock_move_reorganize_page( lock = UT_LIST_GET_FIRST(old_locks); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(old_page)); + while (lock) { /* NOTE: we copy also the locks set on the infimum and supremum of the page; the infimum may carry locks if an @@ -2423,12 +2594,12 @@ lock_move_reorganize_page( /* Set locks according to old locks */ for (;;) { - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2)); + old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2), + comp); if (lock_rec_get_nth_bit(lock, old_heap_no)) { @@ -2487,6 +2658,7 @@ lock_move_rec_list_end( ulint heap_no; rec_t* sup; ulint type_mode; + ibool comp; lock_mutex_enter_kernel(); @@ -2500,6 +2672,8 @@ lock_move_rec_list_end( lock = lock_rec_get_first_on_page(page); + comp = page_is_comp(page); + while (lock != NULL) { page_cur_position(rec, &cur1); @@ -2515,13 +2689,12 @@ lock_move_rec_list_end( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != sup) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2571,12 +2744,15 @@ lock_move_rec_list_start( page_cur_t cur2; ulint heap_no; ulint type_mode; + ibool comp; ut_a(new_page); lock_mutex_enter_kernel(); lock = lock_rec_get_first_on_page(page); + comp = 
page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); while (lock != NULL) { @@ -2590,13 +2766,12 @@ lock_move_rec_list_start( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != rec) { - - ut_ad(0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), - rec_get_data_size( + rec_get_data_size_old( page_cur_get_rec(&cur2)))); - - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1)); + heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), + comp); if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2636,13 +2811,16 @@ lock_update_split_right( page_t* right_page, /* in: right page */ page_t* left_page) /* in: left page */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + /* Move the locks on the supremum of the left page to the supremum of the right page */ lock_rec_move(page_get_supremum_rec(right_page), - page_get_supremum_rec(left_page)); + page_get_supremum_rec(left_page), comp); /* Inherit the locks to the supremum of left page from the successor of the infimum on right page */ @@ -2696,13 +2874,16 @@ lock_update_root_raise( page_t* new_page, /* in: index page to which copied */ page_t* root) /* in: root page */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(root); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the root to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(root)); + page_get_supremum_rec(root), comp); lock_mutex_exit_kernel(); } @@ -2716,13 +2897,16 @@ lock_update_copy_and_discard( page_t* new_page, /* in: index page to which copied */ page_t* page) /* in: index page; NOT the root! 
*/ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(new_page)); + /* Move the locks on the supremum of the old page to the supremum of new_page */ lock_rec_move(page_get_supremum_rec(new_page), - page_get_supremum_rec(page)); + page_get_supremum_rec(page), comp); lock_rec_free_all_from_discard_page(page); lock_mutex_exit_kernel(); @@ -2760,8 +2944,11 @@ lock_update_merge_left( page_t* right_page) /* in: merged index page which will be discarded */ { + ibool comp; lock_mutex_enter_kernel(); - + comp = page_is_comp(left_page); + ut_ad(comp == page_is_comp(right_page)); + if (page_rec_get_next(orig_pred) != page_get_supremum_rec(left_page)) { /* Inherit the locks on the supremum of the left page to the @@ -2781,7 +2968,7 @@ lock_update_merge_left( of the left page */ lock_rec_move(page_get_supremum_rec(left_page), - page_get_supremum_rec(right_page)); + page_get_supremum_rec(right_page), comp); lock_rec_free_all_from_discard_page(right_page); @@ -2908,12 +3095,14 @@ lock_rec_store_on_page_infimum( bits are reset on the record */ { page_t* page; + ibool comp; page = buf_frame_align(rec); + comp = page_is_comp(page); lock_mutex_enter_kernel(); - lock_rec_move(page_get_infimum_rec(page), rec); + lock_rec_move(page_get_infimum_rec(page), rec, comp); lock_mutex_exit_kernel(); } @@ -2930,9 +3119,12 @@ lock_rec_restore_from_page_infimum( whose infimum stored the lock state; lock bits are reset on the infimum */ { + ibool comp; lock_mutex_enter_kernel(); - - lock_rec_move(rec, page_get_infimum_rec(page)); + comp = page_is_comp(page); + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + + lock_rec_move(rec, page_get_infimum_rec(page), comp); lock_mutex_exit_kernel(); } @@ -3876,11 +4068,15 @@ lock_rec_print( FILE* file, /* in: file where to print */ lock_t* lock) /* in: record type lock */ { - page_t* page; - ulint space; - ulint page_no; - ulint i; - mtr_t mtr; + page_t* page; + ulint space; + ulint page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap; + ulint* offsets = NULL; + + heap = mem_heap_create(100); #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -3959,8 +4155,11 @@ lock_rec_print( fprintf(file, "Record lock, heap no %lu ", (ulong) i); if (page) { - rec_print(file, - page_find_rec_with_heap_no(page, i)); + rec_t* rec + = page_find_rec_with_heap_no(page, i); + offsets = rec_reget_offsets(rec, lock->index, + offsets, ULINT_UNDEFINED, heap); + rec_print(file, rec, offsets); } putc('\n', file); @@ -3968,6 +4167,7 @@ lock_rec_print( } mtr_commit(&mtr); + mem_heap_free(heap); } /************************************************************************* @@ -4051,6 +4251,9 @@ lock_print_info( (ulong) ut_dulint_get_low(purge_sys->purge_undo_no)); fprintf(file, + "History list length %lu\n", (ulong) trx_sys->rseg_history_len); + + fprintf(file, "Total number of lock structs in row lock hash table %lu\n", (ulong) lock_get_n_rec_locks()); @@ -4242,12 +4445,16 @@ lock_rec_queue_validate( /*====================*/ /* out: TRUE if ok */ rec_t* rec, /* in: record to look at */ - dict_index_t* index) /* in: index, or NULL if not known */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; lock_t* lock; - + ibool comp; + ut_a(rec); + ut_ad(rec_offs_validate(rec, index, offsets)); + comp = page_is_comp(buf_frame_align(rec)); lock_mutex_enter_kernel(); @@ -4270,7 +4477,7 @@ lock_rec_queue_validate( ut_a(lock->index == index); } - lock = lock_rec_get_next(rec, 
lock); + lock = lock_rec_get_next(rec, comp, lock); } lock_mutex_exit_kernel(); @@ -4280,13 +4487,13 @@ lock_rec_queue_validate( if (index && (index->type & DICT_CLUSTERED)) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, comp, impl_trx)) { ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + comp, impl_trx)); } } @@ -4296,13 +4503,14 @@ lock_rec_queue_validate( next function call: we have to release lock table mutex to obey the latching order */ - impl_trx = lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, comp, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + rec, comp, impl_trx)); } } @@ -4321,10 +4529,10 @@ lock_rec_queue_validate( if (lock_get_mode(lock) == LOCK_S) { ut_a(!lock_rec_other_has_expl_req(LOCK_X, - 0, 0, rec, lock->trx)); + 0, 0, rec, comp, lock->trx)); } else { ut_a(!lock_rec_other_has_expl_req(LOCK_S, - 0, 0, rec, lock->trx)); + 0, 0, rec, comp, lock->trx)); } } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { @@ -4332,7 +4540,7 @@ lock_rec_queue_validate( ut_a(lock_rec_has_to_wait_in_queue(lock)); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(rec, comp, lock); } lock_mutex_exit_kernel(); @@ -4358,6 +4566,8 @@ lock_rec_validate_page( ulint nth_bit = 0; ulint i; mtr_t mtr; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); @@ -4397,13 +4607,15 @@ loop: index = lock->index; rec = page_find_rec_with_heap_no(page, i); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); fprintf(stderr, "Validating %lu %lu\n", (ulong) space, (ulong) page_no); lock_mutex_exit_kernel(); - lock_rec_queue_validate(rec, index); + lock_rec_queue_validate(rec, index, offsets); lock_mutex_enter_kernel(); @@ -4423,6 +4635,7 @@ function_exit: mtr_commit(&mtr); + mem_heap_free(heap); return(TRUE); } @@ -4595,8 +4808,16 @@ lock_rec_insert_check_and_lock( page_update_max_trx_id(buf_frame_align(rec), thr_get_trx(thr)->id); } - - ut_ad(lock_rec_queue_validate(next_rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = mem_heap_create(100); + const ulint* offsets = rec_get_offsets(next_rec, index, + ULINT_UNDEFINED, heap); + ut_ad(lock_rec_queue_validate(next_rec, index, offsets)); + mem_heap_free(heap); + } +#endif /* UNIV_DEBUG */ return(err); } @@ -4610,7 +4831,8 @@ void lock_rec_convert_impl_to_expl( /*==========================*/ rec_t* rec, /* in: user record on page */ - dict_index_t* index) /* in: index of record */ + dict_index_t* index, /* in: index of record */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { trx_t* impl_trx; @@ -4618,11 +4840,14 @@ lock_rec_convert_impl_to_expl( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(page_is_comp(buf_frame_align(rec)) == index->table->comp); if (index->type & DICT_CLUSTERED) { - impl_trx = lock_clust_rec_some_has_impl(rec, index); + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); } else { - impl_trx = 
lock_sec_rec_some_has_impl_off_kernel(rec, index); + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); } if (impl_trx) { @@ -4630,7 +4855,7 @@ lock_rec_convert_impl_to_expl( record, set one for it */ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)) { + index->table->comp, impl_trx)) { lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, rec, index, @@ -4656,17 +4881,19 @@ lock_clust_rec_modify_check_and_lock( does nothing */ rec_t* rec, /* in: record which should be modified */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(index->type & DICT_CLUSTERED); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); } - ut_ad(index->type & DICT_CLUSTERED); - lock_mutex_enter_kernel(); ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); @@ -4674,13 +4901,13 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4724,8 +4951,16 @@ lock_sec_rec_modify_check_and_lock( err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); - - ut_ad(lock_rec_queue_validate(rec, index)); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = mem_heap_create(100); + const ulint* offsets = rec_get_offsets(rec, index, + ULINT_UNDEFINED, heap); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); + mem_heap_free(heap); + } +#endif /* UNIV_DEBUG */ if (err == DB_SUCCESS) { /* Update the page max trx id field */ @@ -4752,6 +4987,7 @@ lock_sec_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4763,6 +4999,7 @@ lock_sec_rec_read_check_and_lock( ut_ad(!(index->type & DICT_CLUSTERED)); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); if (flags & BTR_NO_LOCKING_FLAG) { @@ -4785,14 +5022,14 @@ lock_sec_rec_read_check_and_lock( || recv_recovery_is_on()) && !page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); + ut_ad(lock_rec_queue_validate(rec, index, offsets)); return(err); } @@ -4816,6 +5053,7 @@ lock_clust_rec_read_check_and_lock( which should be read or passed over by a read cursor */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ @@ -4829,6 +5067,9 @@ lock_clust_rec_read_check_and_lock( ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP || gap_mode == 
LOCK_REC_NOT_GAP); + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); @@ -4843,14 +5084,14 @@ lock_clust_rec_read_check_and_lock( if (!page_rec_is_supremum(rec)) { - lock_rec_convert_impl_to_expl(rec, index); + lock_rec_convert_impl_to_expl(rec, index, offsets); } err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); - ut_ad(lock_rec_queue_validate(rec, index)); - + ut_ad(lock_rec_queue_validate(rec, index, offsets)); + return(err); } diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index e08adb013b5..1ab91b71e8f 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -190,6 +190,8 @@ loop: log_buffer_flush_to_disk(); + srv_log_waits++; + ut_ad(++count < 50); goto loop; @@ -292,6 +294,8 @@ part_loop: if (str_len > 0) { goto part_loop; } + + srv_log_write_requests++; } /**************************************************************** @@ -1112,11 +1116,15 @@ log_group_file_header_flush( if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf, group); + + srv_os_log_pending_writes--; } } @@ -1181,6 +1189,8 @@ loop: log_group_file_header_flush(group, next_offset / group->file_size, start_lsn); + srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE; + srv_log_writes++; } if ((next_offset % group->file_size) + len > group->file_size) { @@ -1225,9 +1235,16 @@ loop: if (log_do_write) { log_sys->n_log_ios++; + srv_os_log_pending_writes++; + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE, write_len, buf, group); + + srv_os_log_pending_writes--; + + srv_os_log_written+= write_len; + srv_log_writes++; } if (write_len < len) { diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index 10f921bb1f0..f42f0eb8c72 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -756,81 +756,124 @@ recv_parse_or_apply_log_rec_body( mtr_t* mtr) /* in: mtr or NULL; should be non-NULL if and only if page is non-NULL */ { - byte* new_ptr; - - if (type <= MLOG_8BYTES) { - new_ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); - - } else if (type == MLOG_REC_INSERT) { - new_ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_CLUST_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_SEC_DELETE_MARK) { - new_ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, - page); - } else if (type == MLOG_REC_UPDATE_IN_PLACE) { - new_ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page); - - } else if ((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)) { - new_ptr = page_parse_delete_rec_list(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_LIST_END_COPY_CREATED) { - new_ptr = page_parse_copy_rec_list_to_created_page(ptr, - end_ptr, page, mtr); - } else if (type == MLOG_PAGE_REORGANIZE) { - new_ptr = btr_parse_page_reorganize(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_PAGE_CREATE) { - new_ptr = page_parse_create(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_INSERT) { - new_ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); - - } else if (type == MLOG_UNDO_ERASE_END) { - new_ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, - mtr); - } else if (type == 
MLOG_UNDO_INIT) { - new_ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_UNDO_HDR_DISCARD) { - new_ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, - mtr); - } else if ((type == MLOG_UNDO_HDR_CREATE) - || (type == MLOG_UNDO_HDR_REUSE)) { - new_ptr = trx_undo_parse_page_header(type, ptr, end_ptr, page, - mtr); - } else if (type == MLOG_REC_MIN_MARK) { - new_ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_REC_DELETE) { - new_ptr = page_cur_parse_delete_rec(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_IBUF_BITMAP_INIT) { - new_ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); - - } else if (type == MLOG_INIT_FILE_PAGE) { - new_ptr = fsp_parse_init_file_page(ptr, end_ptr, page); - - } else if (type == MLOG_WRITE_STRING) { - new_ptr = mlog_parse_string(ptr, end_ptr, page); - - } else if (type == MLOG_FILE_CREATE - || type == MLOG_FILE_RENAME - || type == MLOG_FILE_DELETE) { - new_ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, + dict_index_t* index = NULL; + + switch (type) { + case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: + ptr = mlog_parse_nbytes(type, ptr, end_ptr, page); + break; + case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_INSERT, &index))) { + ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_CLUST_DELETE_MARK, &index))) { + ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, + end_ptr, index, page); + } + break; + case MLOG_REC_SEC_DELETE_MARK: case MLOG_COMP_REC_SEC_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_SEC_DELETE_MARK, &index))) { + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, + index, page); + } + break; + case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_UPDATE_IN_PLACE, &index))) { + ptr = btr_cur_parse_update_in_place(ptr, end_ptr, + page, index); + } + break; + case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE: + case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE, &index))) { + ptr = page_parse_delete_rec_list(type, ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_LIST_END_COPY_CREATED, &index))) { + ptr = page_parse_copy_rec_list_to_created_page(ptr, + end_ptr, index, page, mtr); + } + break; + case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_PAGE_REORGANIZE, &index))) { + ptr = btr_parse_page_reorganize(ptr, end_ptr, index, + page, mtr); + } + break; + case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: + ptr = page_parse_create(ptr, end_ptr, + type == MLOG_COMP_PAGE_CREATE, page, mtr); + break; + case MLOG_UNDO_INSERT: + ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); + break; + case MLOG_UNDO_ERASE_END: + ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_INIT: + ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + 
break; + case MLOG_UNDO_HDR_DISCARD: + ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_CREATE: + case MLOG_UNDO_HDR_REUSE: + ptr = trx_undo_parse_page_header(type, ptr, end_ptr, + page, mtr); + break; + case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: + ptr = btr_parse_set_min_rec_mark(ptr, end_ptr, + type == MLOG_COMP_REC_MIN_MARK, page, mtr); + break; + case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE: + if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, + type == MLOG_COMP_REC_DELETE, &index))) { + ptr = page_cur_parse_delete_rec(ptr, end_ptr, + index, page, mtr); + } + break; + case MLOG_IBUF_BITMAP_INIT: + ptr = ibuf_parse_bitmap_init(ptr, end_ptr, page, mtr); + break; + case MLOG_INIT_FILE_PAGE: + ptr = fsp_parse_init_file_page(ptr, end_ptr, page); + break; + case MLOG_WRITE_STRING: + ptr = mlog_parse_string(ptr, end_ptr, page); + break; + case MLOG_FILE_CREATE: + case MLOG_FILE_RENAME: + case MLOG_FILE_DELETE: + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, ULINT_UNDEFINED); - } else { - new_ptr = NULL; - + break; + default: + ptr = NULL; recv_sys->found_corrupt_log = TRUE; } - ut_ad(!page || new_ptr); + ut_ad(!page || ptr); + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } - return(new_ptr); + return(ptr); } /************************************************************************* @@ -2851,11 +2894,13 @@ void recv_recovery_from_checkpoint_finish(void) /*======================================*/ { + int i; + os_thread_id_t recovery_thread_id; + /* Rollback the uncommitted transactions which have no user session */ - if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { - trx_rollback_or_clean_all_without_sess(); - } + fprintf(stderr, + "InnoDB: Starting to apply log records to the database...\n"); /* Apply the hashed log records to the respective file pages */ @@ -2888,9 +2933,15 @@ recv_recovery_from_checkpoint_finish(void) /* Free the resources of the recovery system */ recv_recovery_on = FALSE; + #ifndef UNIV_LOG_DEBUG recv_sys_free(); #endif + + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + os_thread_create(trx_rollback_or_clean_all_without_sess, + (void *)&i, &recovery_thread_id); + } } /********************************************************** diff --git a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c index 023369e8ec5..cb891a03092 100644 --- a/innobase/mem/mem0pool.c +++ b/innobase/mem/mem0pool.c @@ -199,7 +199,7 @@ mem_pool_create( but only when allocated at a higher level in mem0mem.c. This is to avoid masking useful Purify warnings. */ - pool->buf = ut_malloc_low(size, FALSE); + pool->buf = ut_malloc_low(size, FALSE, TRUE); pool->size = size; mutex_create(&(pool->mutex)); diff --git a/innobase/mtr/mtr0log.c b/innobase/mtr/mtr0log.c index 82baa8905ba..417093134c3 100644 --- a/innobase/mtr/mtr0log.c +++ b/innobase/mtr/mtr0log.c @@ -384,3 +384,160 @@ mlog_parse_string( return(ptr + len); } + +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. 
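mlog_open_and_write_index (below) prefixes a redo log record with a description of the index when the table uses the compact format: the number of fields, the number of fields that determine uniqueness in the tree, and one 2-byte descriptor per field, where the high bit marks NOT NULL, 0x7fff marks a variable-length field whose maximum length exceeds 255, and 1..0x7ffe is a fixed length. The sketch below packs and unpacks such a descriptor under exactly those assumptions; it is a stand-alone illustration, not the InnoDB functions themselves.

	#include <assert.h>
	#include <stdio.h>

	typedef unsigned long	ulint;

	/* Pack one field descriptor the way mlog_open_and_write_index does:
	fixed_len 0 with a long maximum length becomes 0x7fff, and the
	NOT NULL attribute sets the high bit. */
	static ulint
	pack_field_desc(ulint fixed_len, ulint max_len, int not_null)
	{
		ulint	len = fixed_len;

		assert(len < 0x7fff);

		if (len == 0 && max_len > 255) {
			len = 0x7fff;	/* variable length, max > 255 */
		}

		if (not_null) {
			len |= 0x8000;
		}

		return(len);
	}

	/* Unpack it the way mlog_parse_index does when rebuilding a dummy index. */
	static void
	unpack_field_desc(ulint desc, ulint* fixed_len, int* not_null, int* is_var)
	{
		*not_null  = (desc & 0x8000) != 0;
		*fixed_len = desc & 0x7fff;
		/* 0 or 0x7fff means variable length, 1..0x7ffe fixed length */
		*is_var    = ((*fixed_len + 1) & 0x7fff) <= 1;
	}

	int
	main(void)
	{
		ulint	fixed_len;
		int	not_null;
		int	is_var;
		ulint	desc = pack_field_desc(0, 1000, 1);

		unpack_field_desc(desc, &fixed_len, &not_null, &is_var);
		printf("desc=0x%lx not_null=%d variable=%d fixed_len=%lu\n",
		       desc, not_null, is_var, fixed_len);
		return(0);
	}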
*/ + +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size) /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ +{ + byte* log_ptr; + const byte* log_start; + const byte* log_end; + + if (!index->table->comp) { + log_start = log_ptr = mlog_open(mtr, 11 + size); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + log_end = log_ptr + 11 + size; + } else { + ulint i; + ulint n = dict_index_get_n_fields(index); + /* total size needed */ + ulint total = 11 + size + (n + 2) * 2; + ulint alloc = total; + /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + mach_write_to_2(log_ptr, n); + log_ptr += 2; + mach_write_to_2(log_ptr, + dict_index_get_n_unique_in_tree(index)); + log_ptr += 2; + for (i = 0; i < n; i++) { + dict_field_t* field; + dtype_t* type; + ulint len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + len = field->fixed_len; + ut_ad(len < 0x7fff); + if (len == 0 && dtype_get_len(type) > 255) { + /* variable-length field + with maximum length > 255 */ + len = 0x7fff; + } + if (dtype_get_prtype(type) & DATA_NOT_NULL) { + len |= 0x8000; + } + if (log_ptr + 2 > log_end) { + mlog_close(mtr, log_ptr); + ut_a(total > (ulint) (log_ptr - log_start)); + total -= log_ptr - log_start; + alloc = total; + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + } + mach_write_to_2(log_ptr, len); + log_ptr += 2; + } + } + if (size == 0) { + mlog_close(mtr, log_ptr); + log_ptr = NULL; + } else if (log_ptr + size > log_end) { + mlog_close(mtr, log_ptr); + log_ptr = mlog_open(mtr, size); + } + return(log_ptr); +} + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. 
*/ + +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index) /* out, own: dummy index */ +{ + ulint i, n, n_uniq; + dict_table_t* table; + dict_index_t* ind; + + if (comp) { + if (end_ptr < ptr + 4) { + return(NULL); + } + n = mach_read_from_2(ptr); + ptr += 2; + n_uniq = mach_read_from_2(ptr); + ut_ad(n_uniq <= n); + if (end_ptr < ptr + (n + 1) * 2) { + return(NULL); + } + } else { + n = n_uniq = 1; + } + table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, comp); + ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", + DICT_HDR_SPACE, 0, n); + ind->table = table; + ind->n_uniq = n_uniq; + if (n_uniq != n) { + ind->type = DICT_CLUSTERED; + } + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + ind->cached = TRUE; + if (comp) { + for (i = 0; i < n; i++) { + ulint len = mach_read_from_2(ptr += 2); + /* The high-order bit of len is the NOT NULL flag; + the rest is 0 or 0x7fff for variable-length fields, + and 1..0x7ffe for fixed-length fields. */ + dict_mem_table_add_col(table, "DUMMY", + ((len + 1) & 0x7fff) <= 1 + ? DATA_BINARY + : DATA_FIXBINARY, + len & 0x8000 ? DATA_NOT_NULL : 0, + len & 0x7fff, 0); + dict_index_add_col(ind, + dict_table_get_nth_col(table, i), 0, 0); + } + ptr += 2; + } + *index = ind; + return(ptr); +} diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 5c140e4b798..1e3eeb0de02 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -155,6 +155,10 @@ os_mutex_t os_file_count_mutex; ulint os_file_n_pending_preads = 0; ulint os_file_n_pending_pwrites = 0; +/* These are not protected by any mutex */ +ulint os_n_pending_writes = 0; +ulint os_n_pending_reads = 0; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. 
*/ @@ -711,13 +715,41 @@ http://www.mysql.com/doc/en/Windows_symbolic_links.html */ char* full_path; int ret; struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + _POSIX_PATH_MAX + + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + next_file: - ent = readdir(dir); + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent); + + if (ret != 0) { + fprintf(stderr, +"InnoDB: cannot read directory %s, error %lu\n", dirname, (ulong)ret); + + return(-1); + } if (ent == NULL) { + /* End of directory */ + return(1); } + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { @@ -1987,8 +2019,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2001,8 +2037,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2090,8 +2130,12 @@ try_again: goto error_handling; } + os_n_pending_reads++; + ret = ReadFile(file, buf, n, &len, NULL); + os_n_pending_reads--; + os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { @@ -2104,8 +2148,12 @@ try_again: os_bytes_read_since_printout += n; try_again: + os_n_pending_reads++; + ret = os_file_pread(file, buf, n, offset, offset_high); + os_n_pending_reads--; + if ((ulint)ret == n) { return(TRUE); @@ -2187,7 +2235,11 @@ retry: return(FALSE); } + os_n_pending_writes++; + ret = WriteFile(file, buf, n, &len, NULL); + + os_n_pending_writes--; /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. 
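The os0file.c hunk above switches directory scanning to readdir_r() when HAVE_READDIR_R is defined, passing a caller-supplied buffer sized for struct dirent plus the path-name limit. The sketch below shows that usage pattern in isolation; readdir_r is the POSIX reentrant API the patch relies on (deprecated in later glibc releases), NAME_MAX is used here instead of the patch's _POSIX_PATH_MAX, and the error handling is simplified.

	#include <dirent.h>
	#include <limits.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(int argc, char** argv)
	{
		const char*	dirname = argc > 1 ? argv[1] : ".";
		DIR*		dir;
		struct dirent*	ent;
		/* buffer large enough for the entry plus the file name,
		mirroring the dirent_buf sizing in the patch */
		char		buf[sizeof(struct dirent) + NAME_MAX + 100];

		dir = opendir(dirname);
		if (dir == NULL) {
			perror("opendir");
			return(1);
		}

		for (;;) {
			int	ret = readdir_r(dir, (struct dirent*) buf, &ent);

			if (ret != 0) {
				fprintf(stderr, "cannot read directory %s: %d\n",
					dirname, ret);
				break;
			}

			if (ent == NULL) {	/* end of directory */
				break;
			}

			if (strcmp(ent->d_name, ".") == 0
			    || strcmp(ent->d_name, "..") == 0) {
				continue;
			}

			printf("%s\n", ent->d_name);
		}

		closedir(dir);
		return(0);
	}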
*/ @@ -2248,8 +2300,12 @@ retry: #else ssize_t ret; + os_n_pending_writes++; + ret = os_file_pwrite(file, buf, n, offset, offset_high); + os_n_pending_writes--; + if ((ulint)ret == n) { return(TRUE); diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 459ab986610..8def8474d9a 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -30,6 +30,7 @@ ibool page_cur_try_search_shortcut( /*=========================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint* iup_matched_fields, /* in/out: already matched fields in upper @@ -55,9 +56,14 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_DEBUG page_cur_t cursor2; #endif + mem_heap_t* heap; + ulint* offsets; ut_ad(dtuple_check_typed(tuple)); rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, + dtuple_get_n_fields(tuple), heap); ut_ad(rec); ut_ad(page_rec_is_user_rec(rec)); @@ -69,26 +75,30 @@ page_cur_try_search_shortcut( up_match = low_match; up_bytes = low_bytes; - cmp = page_cmp_dtuple_rec_with_match(tuple, rec, &low_match, + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, offsets, &low_match, &low_bytes); if (cmp == -1) { + mem_heap_free(heap); return(FALSE); } next_rec = page_rec_get_next(rec); + offsets = rec_reget_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), heap); - cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, &up_match, - &up_bytes); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes); if (cmp != -1) { + mem_heap_free(heap); return(FALSE); } cursor->rec = rec; #ifdef UNIV_SEARCH_DEBUG - page_cur_search_with_match(page, tuple, PAGE_CUR_DBG, + page_cur_search_with_match(page, index, tuple, PAGE_CUR_DBG, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -117,6 +127,7 @@ page_cur_try_search_shortcut( #ifdef UNIV_SEARCH_PERF_STAT page_cur_short_succ++; #endif + mem_heap_free(heap); return(TRUE); } @@ -130,22 +141,24 @@ static ibool page_cur_rec_field_extends( /*=======================*/ - /* out: TRUE if rec field extends tuple - field */ - dtuple_t* tuple, /* in: data tuple */ - rec_t* rec, /* in: record */ - ulint n) /* in: compare nth field */ + /* out: TRUE if rec field + extends tuple field */ + dtuple_t* tuple, /* in: data tuple */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: compare nth field */ { dtype_t* type; dfield_t* dfield; byte* rec_f; ulint rec_f_len; + ut_ad(rec_offs_validate(rec, NULL, offsets)); dfield = dtuple_get_nth_field(tuple, n); type = dfield_get_type(dfield); - rec_f = rec_get_nth_field(rec, n, &rec_f_len); + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); if (type->mtype == DATA_VARCHAR || type->mtype == DATA_CHAR @@ -176,6 +189,7 @@ void page_cur_search_with_match( /*=======================*/ page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ dtuple_t* tuple, /* in: data tuple */ ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE */ @@ -212,6 +226,9 @@ page_cur_search_with_match( ulint dbg_matched_fields; ulint dbg_matched_bytes; #endif + mem_heap_t* heap; + ulint* offsets = NULL; + ut_ad(page && tuple && iup_matched_fields && iup_matched_bytes && ilow_matched_fields && ilow_matched_bytes && cursor); ut_ad(dtuple_validate(tuple)); @@ -229,7 +246,7 @@ page_cur_search_with_match( && (page_header_get_ptr(page, 
PAGE_LAST_INSERT)) && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { - if (page_cur_try_search_shortcut(page, tuple, + if (page_cur_try_search_shortcut(page, index, tuple, iup_matched_fields, iup_matched_bytes, ilow_matched_fields, @@ -245,6 +262,8 @@ page_cur_search_with_match( /*#endif */ #endif + heap = mem_heap_create(100); + /* The following flag does not work for non-latin1 char sets because cmp_full_field does not tell how many bytes matched */ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); @@ -279,7 +298,10 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_reget_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); if (cmp == 1) { @@ -288,10 +310,12 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { + offsets = rec_reget_offsets(mid_rec, index, + offsets, dtuple_get_n_fields_cmp(tuple), heap); if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { + offsets, cur_matched_fields)) { low = mid; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; @@ -329,7 +353,10 @@ page_cur_search_with_match( low_matched_fields, low_matched_bytes, up_matched_fields, up_matched_bytes); - cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, + offsets = rec_reget_offsets(mid_rec, index, + offsets, dtuple_get_n_fields_cmp(tuple), heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, &cur_matched_fields, &cur_matched_bytes); if (cmp == 1) { @@ -338,9 +365,12 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { + offsets = rec_reget_offsets(mid_rec, index, + offsets, dtuple_get_n_fields_cmp(tuple), heap); + if (mode == PAGE_CUR_LE_OR_EXTENDS && page_cur_rec_field_extends(tuple, mid_rec, - cur_matched_fields)) { + offsets, cur_matched_fields)) { low_rec = mid_rec; low_matched_fields = cur_matched_fields; low_matched_bytes = cur_matched_bytes; @@ -368,7 +398,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, + offsets = rec_reget_offsets(low_rec, index, + offsets, ULINT_UNDEFINED, heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -390,7 +422,9 @@ page_cur_search_with_match( dbg_matched_fields = 0; dbg_matched_bytes = 0; - dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, + offsets = rec_reget_offsets(up_rec, index, + offsets, ULINT_UNDEFINED, heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, &dbg_matched_fields, &dbg_matched_bytes); if (mode == PAGE_CUR_G) { @@ -419,6 +453,7 @@ page_cur_search_with_match( *iup_matched_bytes = up_matched_bytes; *ilow_matched_fields = low_matched_fields; *ilow_matched_bytes = low_matched_bytes; + mem_heap_free(heap); } /*************************************************************** @@ -463,10 +498,12 @@ static void page_cur_insert_rec_write_log( /*==========================*/ - rec_t* insert_rec, /* in: inserted physical record */ - ulint rec_size, /* in: insert_rec size */ - rec_t* cursor_rec, /* in: record the cursor is pointing to */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* insert_rec, /* in: inserted physical record */ + ulint 
rec_size, /* in: insert_rec size */ + rec_t* cursor_rec, /* in: record the + cursor is pointing to */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { ulint cur_rec_size; ulint extra_size; @@ -476,22 +513,29 @@ page_cur_insert_rec_write_log( byte* cur_ptr; ulint extra_info_yes; byte* log_ptr; + byte* log_end; ulint i; ut_a(rec_size < UNIV_PAGE_SIZE); - ut_ad(rec_size == rec_get_size(insert_rec)); - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + { + mem_heap_t* heap; + ulint* cur_offs; + ulint* ins_offs; - if (log_ptr == NULL) { + heap = mem_heap_create(100); + cur_offs = rec_get_offsets(cursor_rec, index, + ULINT_UNDEFINED, heap); + ins_offs = rec_get_offsets(insert_rec, index, + ULINT_UNDEFINED, heap); - return; - } - - extra_size = rec_get_extra_size(insert_rec); + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); - cur_extra_size = rec_get_extra_size(cursor_rec); - cur_rec_size = rec_get_size(cursor_rec); + mem_heap_free(heap); + } ins_ptr = insert_rec - extra_size; @@ -514,7 +558,9 @@ page_cur_insert_rec_write_log( ins_ptr++; cur_ptr++; } else if ((i < extra_size) - && (i >= extra_size - REC_N_EXTRA_BYTES)) { + && (i >= extra_size - (index->table->comp + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES))) { i = extra_size; ins_ptr = insert_rec; cur_ptr = cursor_rec; @@ -525,16 +571,35 @@ page_cur_insert_rec_write_log( } if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { - - log_ptr = mlog_write_initial_log_record_fast(insert_rec, - MLOG_REC_INSERT, log_ptr, mtr); + + log_ptr = mlog_open_and_write_index(mtr, insert_rec, index, + index->table->comp + ? MLOG_COMP_REC_INSERT : MLOG_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; /* Write the cursor rec offset as a 2-byte ulint */ mach_write_to_2(log_ptr, cursor_rec - buf_frame_align(cursor_rec)); log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; } - if ((rec_get_info_bits(insert_rec) != rec_get_info_bits(cursor_rec)) + if ((rec_get_info_bits(insert_rec, index->table->comp) != + rec_get_info_bits(cursor_rec, index->table->comp)) || (extra_size != cur_extra_size) || (rec_size != cur_rec_size)) { @@ -549,7 +614,8 @@ page_cur_insert_rec_write_log( + extra_info_yes); if (extra_info_yes) { /* Write the info bits */ - mach_write_to_1(log_ptr, rec_get_info_bits(insert_rec)); + mach_write_to_1(log_ptr, + rec_get_info_bits(insert_rec, index->table->comp)); log_ptr++; /* Write the record origin offset */ @@ -565,17 +631,15 @@ page_cur_insert_rec_write_log( /* Write to the log the inserted index record end segment which differs from the cursor record */ - if (rec_size - i < MLOG_BUF_MARGIN) { - ut_memcpy(log_ptr, ins_ptr, rec_size - i); - log_ptr += rec_size - i; - } - - mlog_close(mtr, log_ptr); - - ut_a(rec_size - i < UNIV_PAGE_SIZE); + rec_size -= i; - if (rec_size - i >= MLOG_BUF_MARGIN) { - mlog_catenate_string(mtr, ins_ptr, rec_size - i); + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + 
rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); } } @@ -585,12 +649,13 @@ Parses a log record of a record insert on a page. */ byte* page_cur_parse_insert_rec( /*======================*/ - /* out: end of log record or NULL */ - ibool is_short,/* in: TRUE if short inserts */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint extra_info_yes; ulint offset = 0; /* remove warning */ @@ -603,6 +668,8 @@ page_cur_parse_insert_rec( byte* ptr2 = ptr; ulint info_bits = 0; /* remove warning */ page_cur_t cursor; + mem_heap_t* heap; + ulint* offsets; if (!is_short) { /* Read the cursor rec offset as a 2-byte ulint */ @@ -689,11 +756,14 @@ page_cur_parse_insert_rec( cursor_rec = page + offset; } + heap = mem_heap_create(100); + offsets = rec_get_offsets(cursor_rec, index, ULINT_UNDEFINED, heap); + if (extra_info_yes == 0) { - info_bits = rec_get_info_bits(cursor_rec); - origin_offset = rec_get_extra_size(cursor_rec); - mismatch_index = rec_get_size(cursor_rec) - end_seg_len; - } + info_bits = rec_get_info_bits(cursor_rec, index->table->comp); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - end_seg_len; + } if (mismatch_index + end_seg_len < sizeof buf1) { buf = buf1; @@ -722,14 +792,24 @@ page_cur_parse_insert_rec( ut_error; } - ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index); + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); ut_memcpy(buf + mismatch_index, ptr, end_seg_len); - rec_set_info_bits(buf + origin_offset, info_bits); + rec_set_info_bits(buf + origin_offset, index->table->comp, info_bits); + + /* Set the status bits for new-style records. */ + if (index->table->comp) { + /* Leaf pages (level 0) contain ordinary records; + non-leaf pages contain node pointer records. */ + ulint level = page_header_get_field( + buf_frame_align(cursor_rec), PAGE_LEVEL); + rec_set_status(buf + origin_offset, + level ? 
REC_STATUS_NODE_PTR : REC_STATUS_ORDINARY); + } page_cur_position(cursor_rec, &cursor); - page_cur_rec_insert(&cursor, buf + origin_offset, mtr); + page_cur_rec_insert(&cursor, buf + origin_offset, index, mtr); if (buf != buf1) { @@ -751,68 +831,80 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ - dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ - ulint data_size,/* in: data size of tuple */ - rec_t* rec, /* in: pointer to a physical record or NULL */ + dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: pointer to a physical record or NULL */ mtr_t* mtr) /* in: mini-transaction handle */ { - byte* insert_buf = NULL; - ulint rec_size; - byte* page; /* the relevant page */ - rec_t* last_insert; /* cursor position at previous insert */ - rec_t* insert_rec; /* inserted record */ - ulint heap_no; /* heap number of the inserted record */ - rec_t* current_rec; /* current record after which the - new record is inserted */ - rec_t* next_rec; /* next record after current before - the insertion */ - ulint owner_slot; /* the slot which owns the inserted record */ - rec_t* owner_rec; - ulint n_owned; - + byte* insert_buf = NULL; + ulint rec_size; + byte* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous insert */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted record */ + rec_t* current_rec; /* current record after which the + new record is inserted */ + rec_t* next_rec; /* next record after current before + the insertion */ + ulint owner_slot; /* the slot which owns the + inserted record */ + rec_t* owner_rec; + ulint n_owned; + mem_heap_t* heap; + ulint* offsets; + ibool comp = index->table->comp; + ut_ad(cursor && mtr); ut_ad(tuple || rec); ut_ad(!(tuple && rec)); ut_ad(rec || dtuple_check_typed(tuple)); - ut_ad(rec || (dtuple_get_data_size(tuple) == data_size)); page = page_cur_get_page(cursor); + ut_ad(page_is_comp(page) == comp); + ut_ad(cursor->rec != page_get_supremum_rec(page)); + heap = mem_heap_create(100); + /* 1. Get the size of the physical record in the page */ if (tuple != NULL) { - rec_size = data_size + rec_get_converted_extra_size( - data_size, - dtuple_get_n_fields(tuple)); + offsets = NULL; + rec_size = rec_get_converted_size(index, tuple); } else { - rec_size = rec_get_size(rec); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + rec_size = rec_offs_size(offsets); } /* 2. Try to find suitable space from page memory management */ - insert_buf = page_mem_alloc(page, rec_size, &heap_no); + insert_buf = page_mem_alloc(page, rec_size, index, &heap_no); if (insert_buf == NULL) { - + mem_heap_free(heap); return(NULL); } /* 3. Create the record */ if (tuple != NULL) { - insert_rec = rec_convert_dtuple_to_rec_low(insert_buf, tuple, - data_size); + insert_rec = rec_convert_dtuple_to_rec(insert_buf, + index, tuple); } else { - insert_rec = rec_copy(insert_buf, rec); + insert_rec = rec_copy(insert_buf, rec, offsets); } ut_ad(insert_rec); - ut_ad(rec_size == rec_get_size(insert_rec)); + offsets = rec_reget_offsets(insert_rec, index, + offsets, ULINT_UNDEFINED, heap); + ut_ad(rec_size == rec_offs_size(offsets)); /* 4. 
Insert the record in the linked list of records */ - current_rec = cursor->rec; + ut_ad(!comp || rec_get_status(current_rec) <= REC_STATUS_INFIMUM); + ut_ad(!comp || rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + next_rec = page_rec_get_next(current_rec); + ut_ad(!comp || rec_get_status(next_rec) != REC_STATUS_INFIMUM); page_rec_set_next(insert_rec, next_rec); page_rec_set_next(current_rec, insert_rec); @@ -821,12 +913,15 @@ page_cur_insert_rec_low( /* 5. Set the n_owned field in the inserted record to zero, and set the heap_no field */ - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, heap_no); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, heap_no); /* 6. Update the last insertion info in page header */ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !comp + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); if (last_insert == NULL) { page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); @@ -855,8 +950,8 @@ page_cur_insert_rec_low( /* 7. It remains to update the owner record. */ owner_rec = page_rec_find_owner_rec(insert_rec); - n_owned = rec_get_n_owned(owner_rec); - rec_set_n_owned(owner_rec, n_owned + 1); + n_owned = rec_get_n_owned(owner_rec, comp); + rec_set_n_owned(owner_rec, comp, n_owned + 1); /* 8. Now we have incremented the n_owned field of the owner record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, @@ -868,8 +963,10 @@ page_cur_insert_rec_low( } /* 9. Write log record of the insert */ - page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, mtr); + page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, + index, mtr); + mem_heap_free(heap); return(insert_rec); } @@ -879,17 +976,19 @@ UNIV_INLINE byte* page_copy_rec_list_to_created_page_write_log( /*=========================================*/ - /* out: 4-byte field where to write the log data - length */ - page_t* page, /* in: index page */ - mtr_t* mtr) /* in: mtr */ + /* out: 4-byte field where to + write the log data length */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { byte* log_ptr; - - mlog_write_initial_log_record(page, MLOG_LIST_END_COPY_CREATED, mtr); - - log_ptr = mlog_open(mtr, 4); + log_ptr = mlog_open_and_write_index(mtr, page, index, + index->table->comp + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + ut_a(log_ptr); mlog_close(mtr, log_ptr + 4); return(log_ptr); @@ -901,11 +1000,12 @@ Parses a log record of copying a record list end to a new created page. */ byte* page_parse_copy_rec_list_to_created_page( /*=====================================*/ - /* out: end of log record or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { byte* rec_end; ulint log_data_len; @@ -931,7 +1031,8 @@ page_parse_copy_rec_list_to_created_page( } while (ptr < rec_end) { - ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, page, mtr); + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + index, page, mtr); } ut_a(ptr == rec_end); @@ -950,10 +1051,11 @@ including that record. Infimum and supremum records are not copied. 
*/ void page_copy_rec_list_end_to_created_page( /*===================================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: first record to copy */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot = 0; /* remove warning */ byte* heap_top; @@ -966,9 +1068,13 @@ page_copy_rec_list_end_to_created_page( ulint log_mode; byte* log_ptr; ulint log_data_len; + ibool comp = page_is_comp(page); + mem_heap_t* heap; + ulint* offsets = NULL; - ut_ad(page_header_get_field(new_page, PAGE_N_HEAP) == 2); + ut_ad(page_dir_get_n_heap(new_page) == 2); ut_ad(page != new_page); + ut_ad(comp == page_is_comp(new_page)); if (rec == page_get_infimum_rec(page)) { @@ -983,12 +1089,13 @@ page_copy_rec_list_end_to_created_page( #ifdef UNIV_DEBUG /* To pass the debug tests we have to set these dummy values in the debug version */ - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, UNIV_PAGE_SIZE / 2); + page_dir_set_n_slots(new_page, UNIV_PAGE_SIZE / 2); page_header_set_ptr(new_page, PAGE_HEAP_TOP, new_page + UNIV_PAGE_SIZE - 1); #endif - log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, mtr); + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); log_data_len = dyn_array_get_data_size(&(mtr->log)); @@ -997,22 +1104,29 @@ page_copy_rec_list_end_to_created_page( log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); prev_rec = page_get_infimum_rec(new_page); - heap_top = new_page + PAGE_SUPREMUM_END; + if (comp) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } count = 0; slot_index = 0; n_recs = 0; + heap = mem_heap_create(100); + /* should be do ... until, comment by Jani */ while (rec != page_get_supremum_rec(page)) { - - insert_rec = rec_copy(heap_top, rec); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + insert_rec = rec_copy(heap_top, rec, offsets); - rec_set_next_offs(prev_rec, insert_rec - new_page); + rec_set_next_offs(prev_rec, comp, insert_rec - new_page); - rec_set_n_owned(insert_rec, 0); - rec_set_heap_no(insert_rec, 2 + n_recs); + rec_set_n_owned(insert_rec, comp, 0); + rec_set_heap_no(insert_rec, comp, 2 + n_recs); - rec_size = rec_get_size(insert_rec); + rec_size = rec_offs_size(offsets); heap_top = heap_top + rec_size; @@ -1034,7 +1148,7 @@ page_copy_rec_list_end_to_created_page( } page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, - mtr); + index, mtr); prev_rec = insert_rec; rec = page_rec_get_next(rec); } @@ -1056,22 +1170,25 @@ page_copy_rec_list_end_to_created_page( slot_index--; } + mem_heap_free(heap); + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); mach_write_to_4(log_ptr, log_data_len); - rec_set_next_offs(insert_rec, PAGE_SUPREMUM); + rec_set_next_offs(insert_rec, comp, + comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM); slot = page_dir_get_nth_slot(new_page, 1 + slot_index); page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); page_dir_slot_set_n_owned(slot, count + 1); - page_header_set_field(new_page, PAGE_N_DIR_SLOTS, 2 + slot_index); + page_dir_set_n_slots(new_page, 2 + slot_index); page_header_set_ptr(new_page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(new_page, PAGE_N_HEAP, 2 + n_recs); + page_dir_set_n_heap(new_page, 2 + n_recs); page_header_set_field(new_page, PAGE_N_RECS, n_recs); page_header_set_ptr(new_page, PAGE_LAST_INSERT, NULL); @@ -1089,14 +1206,27 @@ UNIV_INLINE void page_cur_delete_rec_write_log( /*==========================*/ - rec_t* cursor_rec, /* in: record to be deleted */ - mtr_t* mtr) /* in: mini-transaction handle */ + rec_t* rec, /* in: record to be deleted */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ { - mlog_write_initial_log_record(cursor_rec, MLOG_REC_DELETE, mtr); + byte* log_ptr; + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + index->table->comp + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } /* Write the cursor rec offset as a 2-byte ulint */ - mlog_catenate_ulint(mtr, cursor_rec - buf_frame_align(cursor_rec), - MLOG_2BYTES); + mach_write_to_2(log_ptr, rec - buf_frame_align(rec)); + + mlog_close(mtr, log_ptr + 2); } /*************************************************************** @@ -1105,11 +1235,12 @@ Parses log record of a record delete on a page. */ byte* page_cur_parse_delete_rec( /*======================*/ - /* out: pointer to record end or NULL */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; page_cur_t cursor; @@ -1128,7 +1259,7 @@ page_cur_parse_delete_rec( if (page) { page_cur_position(page + offset, &cursor); - page_cur_delete_rec(&cursor, mtr); + page_cur_delete_rec(&cursor, index, mtr); } return(ptr); @@ -1142,6 +1273,7 @@ void page_cur_delete_rec( /*================*/ page_cur_t* cursor, /* in: a page cursor */ + dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; @@ -1169,7 +1301,7 @@ page_cur_delete_rec( cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); /* 0. Write the log record */ - page_cur_delete_rec_write_log(current_rec, mtr); + page_cur_delete_rec_write_log(current_rec, index, mtr); /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ @@ -1223,7 +1355,7 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - page_mem_free(page, current_rec); + page_mem_free(page, current_rec, index); /* 7. Now we have decremented the number of owned records of the slot. 
If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index 343f300fc77..38b1e503c8f 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -18,6 +18,8 @@ Created 2/2/1994 Heikki Tuuri #include "fut0lst.h" #include "btr0sea.h" #include "buf0buf.h" +#include "srv0srv.h" +#include "btr0btr.h" /* THE INDEX PAGE ============== @@ -75,10 +77,14 @@ page_dir_find_owner_slot( page_t* page; page_dir_slot_t* slot; rec_t* original_rec = rec; + ibool comp; ut_ad(page_rec_check(rec)); - while (rec_get_n_owned(rec) == 0) { + page = buf_frame_align(rec); + comp = page_is_comp(page); + + while (rec_get_n_owned(rec, comp) == 0) { steps++; rec = page_rec_get_next(rec); } @@ -96,14 +102,18 @@ page_dir_find_owner_slot( "InnoDB: Original record ", (ulong) buf_frame_get_page_no(page)); - rec_print(stderr, original_rec); + if (comp) { + fputs("(compact record)\n", stderr); + } else { + rec_print_old(stderr, original_rec); + } fprintf(stderr, "\n" "InnoDB: on that page. Steps %lu.\n", (ulong) steps); fputs( "InnoDB: Cannot find the dir slot for record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, NULL); fputs("\n" "InnoDB: on that page!\n", stderr); @@ -136,14 +146,15 @@ page_dir_slot_check( page = buf_frame_align(slot); - n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS); + n_slots = page_dir_get_n_slots(page); ut_a(slot <= page_dir_get_nth_slot(page, 0)); ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); - ut_a(page_rec_check(page + mach_read_from_2(slot))); + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); - n_owned = rec_get_n_owned(page + mach_read_from_2(slot)); + n_owned = rec_get_n_owned(page_dir_slot_get_rec(slot), + page_is_comp(page)); if (slot == page_dir_get_nth_slot(page, 0)) { ut_a(n_owned == 1); @@ -194,12 +205,14 @@ Allocates a block of memory from an index page. 
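page_mem_alloc (below) first tries to reuse the record on the page free list and only then carves space from the heap top; with the compact format it now needs the index descriptor to compute the size of the freed record via rec_get_offsets. The following is a simplified, stand-alone sketch of that two-path allocation policy, with plain integers standing in for the page header fields.

	#include <stdio.h>

	/* Toy model of the allocator: reuse the single free slot if it is
	big enough, otherwise carve the request from the top of the heap. */
	typedef struct {
		unsigned long	free_size;	/* size of the record on PAGE_FREE,
						0 if the free list is empty */
		unsigned long	heap_top;	/* offset of PAGE_HEAP_TOP */
		unsigned long	page_end;	/* end of usable space */
	} toy_page_t;

	static long
	toy_page_mem_alloc(toy_page_t* page, unsigned long need)
	{
		if (page->free_size >= need) {
			/* reuse the freed record; the real code also updates
			PAGE_GARBAGE and takes the heap number of the old record */
			page->free_size = 0;
			return(0);	/* offset of the reused record (toy value) */
		}

		if (page->heap_top + need <= page->page_end) {
			long	offset = (long) page->heap_top;

			page->heap_top += need;
			return(offset);
		}

		return(-1);		/* allocation fails */
	}

	int
	main(void)
	{
		toy_page_t	page = { 0, 128, 16384 };

		printf("first alloc at %ld\n", toy_page_mem_alloc(&page, 100));
		printf("second alloc at %ld\n", toy_page_mem_alloc(&page, 100));
		return(0);
	}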
*/ byte* page_mem_alloc( /*===========*/ - /* out: pointer to start of allocated - buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ - ulint need, /* in: number of bytes needed */ - ulint* heap_no)/* out: this contains the heap number - of the allocated record if allocation succeeds */ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in: index page */ + ulint need, /* in: number of bytes needed */ + dict_index_t* index, /* in: record descriptor */ + ulint* heap_no)/* out: this contains the heap number + of the allocated record + if allocation succeeds */ { rec_t* rec; byte* block; @@ -213,18 +226,30 @@ page_mem_alloc( rec = page_header_get_ptr(page, PAGE_FREE); - if (rec && (rec_get_size(rec) >= need)) { + if (rec) { + mem_heap_t* heap + = mem_heap_create(100); + const ulint* offsets + = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + + if (rec_offs_size(offsets) >= need) { + page_header_set_ptr(page, PAGE_FREE, + page_rec_get_next(rec)); - page_header_set_ptr(page, PAGE_FREE, page_rec_get_next(rec)); + garbage = page_header_get_field(page, PAGE_GARBAGE); + ut_ad(garbage >= need); - garbage = page_header_get_field(page, PAGE_GARBAGE); - ut_ad(garbage >= need); + page_header_set_field(page, PAGE_GARBAGE, + garbage - need); - page_header_set_field(page, PAGE_GARBAGE, garbage - need); + *heap_no = rec_get_heap_no(rec, page_is_comp(page)); - *heap_no = rec_get_heap_no(rec); + block = rec_get_start(rec, offsets); + mem_heap_free(heap); + return(block); + } - return(rec_get_start(rec)); + mem_heap_free(heap); } /* Could not find space from the free list, try top of heap */ @@ -235,9 +260,9 @@ page_mem_alloc( block = page_header_get_ptr(page, PAGE_HEAP_TOP); page_header_set_ptr(page, PAGE_HEAP_TOP, block + need); - *heap_no = page_header_get_field(page, PAGE_N_HEAP); + *heap_no = page_dir_get_n_heap(page); - page_header_set_field(page, PAGE_N_HEAP, 1 + *heap_no); + page_dir_set_n_heap(page, 1 + *heap_no); return(block); } @@ -253,9 +278,11 @@ page_create_write_log( /*==================*/ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ { - mlog_write_initial_log_record(frame, MLOG_PAGE_CREATE, mtr); + mlog_write_initial_log_record(frame, + comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE, mtr); } /*************************************************************** @@ -267,6 +294,7 @@ page_parse_create( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ibool comp, /* in: TRUE=compact page format */ page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { @@ -275,7 +303,7 @@ page_parse_create( /* The record is empty, except for the record initial part */ if (page) { - page_create(page, mtr); + page_create(page, mtr, comp); } return(ptr); @@ -290,7 +318,8 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ - mtr_t* mtr) /* in: mini-transaction handle */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ { page_dir_slot_t* slot; mem_heap_t* heap; @@ -300,6 +329,10 @@ page_create( rec_t* infimum_rec; rec_t* supremum_rec; page_t* page; + dict_index_t* index; + ulint* offsets; + + index = comp ? 
srv_sys->dummy_ind2 : srv_sys->dummy_ind1; ut_ad(frame && mtr); ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE @@ -311,7 +344,7 @@ page_create( buf_frame_modify_clock_inc(frame); /* 2. WRITE LOG INFORMATION */ - page_create_write_log(frame, mtr); + page_create_write_log(frame, mtr, comp); page = frame; @@ -323,43 +356,52 @@ page_create( /* Create first a data tuple for infimum record */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "infimum", sizeof "infimum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); - + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8, 0); /* Set the corresponding physical record to its place in the page record heap */ heap_top = page + PAGE_DATA; - infimum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); + + ut_a(infimum_rec == + page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + rec_set_n_owned(infimum_rec, comp, 1); + rec_set_heap_no(infimum_rec, comp, 0); + offsets = rec_get_offsets(infimum_rec, index, ULINT_UNDEFINED, heap); + + heap_top = rec_get_end(infimum_rec, offsets); - ut_a(infimum_rec == page + PAGE_INFIMUM); - - rec_set_n_owned(infimum_rec, 1); - rec_set_heap_no(infimum_rec, 0); - - heap_top = rec_get_end(infimum_rec); - /* Create then a tuple for supremum */ tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); field = dtuple_get_nth_field(tuple, 0); - dfield_set_data(field, "supremum", sizeof "supremum"); - dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 20, 0); + dfield_set_data(field, "supremum", 9 - comp); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 9 - comp, 0); - supremum_rec = rec_convert_dtuple_to_rec(heap_top, tuple); + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); - ut_a(supremum_rec == page + PAGE_SUPREMUM); + ut_a(supremum_rec == + page + (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)); - rec_set_n_owned(supremum_rec, 1); - rec_set_heap_no(supremum_rec, 1); - - heap_top = rec_get_end(supremum_rec); + rec_set_n_owned(supremum_rec, comp, 1); + rec_set_heap_no(supremum_rec, comp, 1); - ut_ad(heap_top == page + PAGE_SUPREMUM_END); + offsets = rec_reget_offsets(supremum_rec, index, + offsets, ULINT_UNDEFINED, heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == + page + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); mem_heap_free(heap); @@ -367,7 +409,7 @@ page_create( page_header_set_field(page, PAGE_N_DIR_SLOTS, 2); page_header_set_ptr(page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(page, PAGE_N_HEAP, 2); + page_header_set_field(page, PAGE_N_HEAP, comp ? 0x8002 : 2); page_header_set_ptr(page, PAGE_FREE, NULL); page_header_set_field(page, PAGE_GARBAGE, 0); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); @@ -388,8 +430,8 @@ page_create( /* Set the next pointers in infimum and supremum */ - rec_set_next_offs(infimum_rec, (ulint)(supremum_rec - page)); - rec_set_next_offs(supremum_rec, 0); + rec_set_next_offs(infimum_rec, comp, (ulint)(supremum_rec - page)); + rec_set_next_offs(supremum_rec, comp, 0); return(page); } @@ -401,10 +443,11 @@ touch the lock table and max trx id on page. 
*/ void page_copy_rec_list_end_no_locks( /*============================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; @@ -416,8 +459,11 @@ page_copy_rec_list_end_no_locks( page_cur_move_to_next(&cur1); } - - ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == PAGE_INFIMUM); + + ut_a(index->table->comp == page_is_comp(page)); + ut_a(index->table->comp == page_is_comp(new_page)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (index->table->comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); page_cur_set_before_first(new_page, &cur2); @@ -427,7 +473,7 @@ page_copy_rec_list_end_no_locks( while (sup != page_cur_get_rec(&cur1)) { if (!page_cur_rec_insert(&cur2, - page_cur_get_rec(&cur1), mtr)) { + page_cur_get_rec(&cur1), index, mtr)) { /* Track an assertion failure reported on the mailing list on June 18th, 2003 */ @@ -456,16 +502,18 @@ The records are copied to the start of the record list on new_page. */ void page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - if (page_header_get_field(new_page, PAGE_N_HEAP) == 2) { + if (page_dir_get_n_heap(new_page) == 2) { page_copy_rec_list_end_to_created_page(new_page, page, rec, - mtr); + index, mtr); } else { - page_copy_rec_list_end_no_locks(new_page, page, rec, mtr); + page_copy_rec_list_end_no_locks(new_page, page, rec, + index, mtr); } /* Update the lock table, MAX_TRX_ID, and possible hash index */ @@ -474,7 +522,7 @@ page_copy_rec_list_end( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); + btr_search_move_or_delete_hash_entries(new_page, page, index); } /***************************************************************** @@ -485,10 +533,11 @@ The records are copied to the end of the record list on new_page. 
*/ void page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page to copy to */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; @@ -510,8 +559,8 @@ page_copy_rec_list_start( /* Copy records from the original page to the new page */ while (page_cur_get_rec(&cur1) != rec) { - ut_a( - page_cur_rec_insert(&cur2, page_cur_get_rec(&cur1), mtr)); + ut_a(page_cur_rec_insert(&cur2, + page_cur_get_rec(&cur1), index, mtr)); page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); @@ -523,7 +572,7 @@ page_copy_rec_list_start( page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - btr_search_move_or_delete_hash_entries(new_page, page); + btr_search_move_or_delete_hash_entries(new_page, page, index); } /************************************************************** @@ -532,18 +581,25 @@ UNIV_INLINE void page_delete_rec_list_write_log( /*===========================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - byte type, /* in: operation type: MLOG_LIST_END_DELETE, ... */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /* in: mtr */ { - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); - - mlog_write_initial_log_record(page, type, mtr); - - /* Write the parameter as a 2-byte ulint */ - mlog_catenate_ulint(mtr, rec - page, MLOG_2BYTES); + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, page, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, rec - page); + mlog_close(mtr, log_ptr + 2); + } } /************************************************************** @@ -552,18 +608,23 @@ Parses a log record of a record list end or start deletion. 
*/ byte* page_parse_delete_rec_list( /*=======================*/ - /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or - MLOG_LIST_START_DELETE */ - byte* ptr, /* in: buffer */ - byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ - mtr_t* mtr) /* in: mtr or NULL */ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; - ut_ad((type == MLOG_LIST_END_DELETE) - || (type == MLOG_LIST_START_DELETE)); + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); /* Read the record offset as a 2-byte ulint */ @@ -580,11 +641,12 @@ page_parse_delete_rec_list( return(ptr); } - if (type == MLOG_LIST_END_DELETE) { - page_delete_rec_list_end(page, page + offset, ULINT_UNDEFINED, - ULINT_UNDEFINED, mtr); + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page, page + offset, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); } else { - page_delete_rec_list_start(page, page + offset, mtr); + page_delete_rec_list_start(page, page + offset, index, mtr); } return(ptr); @@ -597,14 +659,15 @@ The infimum and supremum records are not deleted. */ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED - if not known */ - ulint size, /* in: the sum of the sizes of the records in the end - of the chain to delete, or ULINT_UNDEFINED if not - known */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot; ulint slot_index; @@ -615,10 +678,12 @@ page_delete_rec_list_end( ulint count; ulint n_owned; rec_t* sup; + ibool comp; /* Reset the last insert info in the page header and increment the modify clock for the frame */ + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the @@ -632,7 +697,9 @@ page_delete_rec_list_end( rec = page_rec_get_next(rec); } - page_delete_rec_list_write_log(page, rec, MLOG_LIST_END_DELETE, mtr); + comp = page_is_comp(page); + page_delete_rec_list_write_log(page, rec, index, + comp ? 
MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); if (rec == sup) { @@ -644,19 +711,32 @@ page_delete_rec_list_end( last_rec = page_rec_get_prev(sup); if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; /* Calculate the sum of sizes and the number of records */ size = 0; n_recs = 0; rec2 = rec; while (rec2 != sup) { - size += rec_get_size(rec2); + ulint s; + offsets = rec_reget_offsets(rec2, index, + offsets, ULINT_UNDEFINED, heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; n_recs++; rec2 = page_rec_get_next(rec2); } + + mem_heap_free(heap); } + ut_ad(size < UNIV_PAGE_SIZE); + /* Update the page directory; there is no need to balance the number of the records owned by the supremum record, as it is allowed to be less than PAGE_DIR_SLOT_MIN_N_OWNED */ @@ -664,15 +744,15 @@ page_delete_rec_list_end( rec2 = rec; count = 0; - while (rec_get_n_owned(rec2) == 0) { + while (rec_get_n_owned(rec2, comp) == 0) { count++; rec2 = page_rec_get_next(rec2); } - ut_ad(rec_get_n_owned(rec2) - count > 0); + ut_ad(rec_get_n_owned(rec2, comp) - count > 0); - n_owned = rec_get_n_owned(rec2) - count; + n_owned = rec_get_n_owned(rec2, comp) - count; slot_index = page_dir_find_owner_slot(rec2); slot = page_dir_get_nth_slot(page, slot_index); @@ -680,7 +760,7 @@ page_delete_rec_list_end( page_dir_slot_set_rec(slot, sup); page_dir_slot_set_n_owned(slot, n_owned); - page_header_set_field(page, PAGE_N_DIR_SLOTS, slot_index + 1); + page_dir_set_n_slots(page, slot_index + 1); /* Remove the record chain segment from the record chain */ page_rec_set_next(prev_rec, page_get_supremum_rec(page)); @@ -706,14 +786,19 @@ that record. Infimum and supremum records are not deleted. */ void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: index page */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; ulint log_mode; - page_delete_rec_list_write_log(page, rec, MLOG_LIST_START_DELETE, mtr); + page_delete_rec_list_write_log(page, rec, index, + index->table->comp + ? MLOG_COMP_LIST_START_DELETE + : MLOG_LIST_START_DELETE, + mtr); page_cur_set_before_first(page, &cur1); @@ -730,7 +815,7 @@ page_delete_rec_list_start( while (page_cur_get_rec(&cur1) != rec) { - page_cur_delete_rec(&cur1, mtr); + page_cur_delete_rec(&cur1, index, mtr); } /* Restore log mode */ @@ -745,10 +830,11 @@ split_rec. 
*/ void page_move_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { ulint old_data_size; ulint new_data_size; @@ -758,15 +844,15 @@ page_move_rec_list_end( old_data_size = page_get_data_size(new_page); old_n_recs = page_get_n_recs(new_page); - page_copy_rec_list_end(new_page, page, split_rec, mtr); + page_copy_rec_list_end(new_page, page, split_rec, index, mtr); new_data_size = page_get_data_size(new_page); new_n_recs = page_get_n_recs(new_page); ut_ad(new_data_size >= old_data_size); - page_delete_rec_list_end(page, split_rec, new_n_recs - old_n_recs, - new_data_size - old_data_size, mtr); + page_delete_rec_list_end(page, split_rec, index, + new_n_recs - old_n_recs, new_data_size - old_data_size, mtr); } /***************************************************************** @@ -776,14 +862,15 @@ split_rec. */ void page_move_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - mtr_t* mtr) /* in: mtr */ + page_t* new_page, /* in: index page where to move */ + page_t* page, /* in: index page */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ { - page_copy_rec_list_start(new_page, page, split_rec, mtr); + page_copy_rec_list_start(new_page, page, split_rec, index, mtr); - page_delete_rec_list_start(page, split_rec, mtr); + page_delete_rec_list_start(page, split_rec, index, mtr); } /*************************************************************************** @@ -801,7 +888,7 @@ page_rec_write_index_page_no( byte* data; ulint len; - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); ut_ad(len == 4); @@ -885,7 +972,7 @@ page_dir_add_slots( ut_ad(start < n_slots - 1); /* Update the page header */ - page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots + n); + page_dir_set_n_slots(page, n_slots + n); /* Move slots up */ @@ -1006,8 +1093,8 @@ page_dir_balance_slot( old_rec = page_dir_slot_get_rec(slot); new_rec = page_rec_get_next(old_rec); - rec_set_n_owned(old_rec, 0); - rec_set_n_owned(new_rec, n_owned + 1); + rec_set_n_owned(old_rec, page_is_comp(page), 0); + rec_set_n_owned(new_rec, page_is_comp(page), n_owned + 1); page_dir_slot_set_rec(slot, new_rec); @@ -1080,13 +1167,15 @@ page_rec_get_n_recs_before( rec_t* slot_rec; page_t* page; ulint i; + ibool comp; lint n = 0; ut_ad(page_rec_check(rec)); page = buf_frame_align(rec); - - while (rec_get_n_owned(rec) == 0) { + comp = page_is_comp(page); + + while (rec_get_n_owned(rec, comp) == 0) { rec = page_rec_get_next(rec); n--; @@ -1096,7 +1185,7 @@ page_rec_get_n_recs_before( slot = page_dir_get_nth_slot(page, i); slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned(slot_rec); + n += rec_get_n_owned(slot_rec, comp); if (rec == slot_rec) { @@ -1118,17 +1207,21 @@ the index page context. 
*/ void page_rec_print( /*===========*/ - rec_t* rec) + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: record descriptor */ { - rec_print(stderr, rec); + ibool comp = page_is_comp(buf_frame_align(rec)); + + ut_a(comp == rec_offs_comp(offsets)); + rec_print(stderr, rec, offsets); fprintf(stderr, " n_owned: %lu; heap_no: %lu; next rec: %lu\n", - (ulong) rec_get_n_owned(rec), - (ulong) rec_get_heap_no(rec), - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_n_owned(rec, comp), + (ulong) rec_get_heap_no(rec, comp), + (ulong) rec_get_next_offs(rec, comp)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); } /******************************************************************* @@ -1176,12 +1269,18 @@ debugging purposes. */ void page_print_list( /*============*/ - page_t* page, /* in: index page */ - ulint pr_n) /* in: print n first and n last entries */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n) /* in: print n first and n last entries */ { page_cur_t cur; ulint count; ulint n_recs; + mem_heap_t* heap; + ulint* offsets = NULL; + + ut_a(page_is_comp(page) == index->table->comp); + heap = mem_heap_create(100); fprintf(stderr, "--------------------------------\n" @@ -1193,7 +1292,9 @@ page_print_list( page_cur_set_before_first(page, &cur); count = 0; for (;;) { - page_rec_print(cur.rec); + offsets = rec_reget_offsets(cur.rec, index, + offsets, ULINT_UNDEFINED, heap); + page_rec_print(cur.rec, offsets); if (count == pr_n) { break; @@ -1213,7 +1314,9 @@ page_print_list( page_cur_move_to_next(&cur); if (count + pr_n >= n_recs) { - page_rec_print(cur.rec); + offsets = rec_reget_offsets(cur.rec, index, + offsets, ULINT_UNDEFINED, heap); + page_rec_print(cur.rec, offsets); } count++; } @@ -1222,6 +1325,8 @@ page_print_list( "Total of %lu records \n" "--------------------------------\n", (ulong) (count + 1)); + + mem_heap_free(heap); } /******************************************************************* @@ -1235,14 +1340,15 @@ page_header_print( fprintf(stderr, "--------------------------------\n" "PAGE HEADER INFO\n" - "Page address %p, n records %lu\n" + "Page address %p, n records %lu (%s)\n" "n dir slots %lu, heap top %lu\n" "Page n heap %lu, free %lu, garbage %lu\n" "Page last insert %lu, direction %lu, n direction %lu\n", page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), (ulong) page_header_get_field(page, PAGE_HEAP_TOP), - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) page_header_get_field(page, PAGE_FREE), (ulong) page_header_get_field(page, PAGE_GARBAGE), (ulong) page_header_get_field(page, PAGE_LAST_INSERT), @@ -1257,13 +1363,16 @@ debugging purposes. */ void page_print( /*======*/ - page_t* page, /* in: index page */ - ulint dn, /* in: print dn first and last entries in directory */ - ulint rn) /* in: print rn first and last records on page */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn) /* in: print rn first and last records + in directory */ { page_header_print(page); page_dir_print(page, dn); - page_print_list(page, rn); + page_print_list(page, index, rn); } /******************************************************************* @@ -1274,20 +1383,24 @@ the heap_no field. 
*/ ibool page_rec_validate( /*==============*/ - /* out: TRUE if ok */ - rec_t* rec) /* in: record on the page */ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_owned; ulint heap_no; - page_t* page; + page_t* page; + ibool comp; page = buf_frame_align(rec); + comp = page_is_comp(page); + ut_a(comp == rec_offs_comp(offsets)); page_rec_check(rec); - rec_validate(rec); + rec_validate(rec, offsets); - n_owned = rec_get_n_owned(rec); - heap_no = rec_get_heap_no(rec); + n_owned = rec_get_n_owned(rec, comp); + heap_no = rec_get_heap_no(rec, comp); if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) { fprintf(stderr, @@ -1296,11 +1409,11 @@ page_rec_validate( return(FALSE); } - if (!(heap_no < page_header_get_field(page, PAGE_N_HEAP))) { + if (!(heap_no < page_dir_get_n_heap(page))) { fprintf(stderr, "InnoDB: Heap no of rec %lu too big %lu %lu\n", (ulong)(rec - page), (ulong) heap_no, - (ulong) page_header_get_field(page, PAGE_N_HEAP)); + (ulong) page_dir_get_n_heap(page)); return(FALSE); } @@ -1358,6 +1471,7 @@ page_simple_validate( ulint count; ulint own_count; ibool ret = FALSE; + ibool comp = page_is_comp(page); /* Check first that the record heap and the directory do not overlap. */ @@ -1404,13 +1518,13 @@ page_simple_validate( goto func_exit; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count, (ulong)(rec - page)); @@ -1438,11 +1552,11 @@ page_simple_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset nonsensical %lu for rec %lu\n", - (ulong) rec_get_next_offs(rec), + (ulong) rec_get_next_offs(rec, comp), (ulong)(rec - page)); goto func_exit; @@ -1461,7 +1575,7 @@ page_simple_validate( own_count++; } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); goto func_exit; @@ -1514,10 +1628,10 @@ page_simple_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) (count + 1)); goto func_exit; @@ -1549,12 +1663,19 @@ page_validate( ulint slot_no; ulint data_size; rec_t* rec; - rec_t* old_rec = NULL; + rec_t* old_rec = NULL; ulint offs; ulint n_slots; - ibool ret = FALSE; + ibool ret = FALSE; ulint i; - + ibool comp = page_is_comp(page); + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (comp != index->table->comp) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } if (!page_simple_validate(page)) { goto func_exit2; } @@ -1599,22 +1720,33 @@ page_validate( for (;;) { rec = cur.rec; + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); - if (!page_rec_validate(rec)) { + if (comp && page_rec_is_user_rec(rec) + && rec_get_node_ptr_flag(rec) + == !btr_page_get_level_low(page)) { + fputs("InnoDB: 
node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (!page_rec_validate(rec, offsets)) { goto func_exit; } /* Check that the records are in the ascending order */ if ((count >= 2) && (!page_cur_is_after_last(&cur))) { - if (!(1 == cmp_rec_rec(rec, old_rec, index))) { + if (!(1 == cmp_rec_rec(rec, old_rec, + offsets, old_offsets, + ULINT_UNDEFINED, index))) { fprintf(stderr, "InnoDB: Records in wrong order on page %lu", (ulong) buf_frame_get_page_no(page)); dict_index_name_print(stderr, NULL, index); fputs("\nInnoDB: previous record ", stderr); - rec_print(stderr, old_rec); + rec_print(stderr, old_rec, old_offsets); fputs("\nInnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); putc('\n', stderr); goto func_exit; @@ -1624,12 +1756,12 @@ page_validate( if ((rec != page_get_supremum_rec(page)) && (rec != page_get_infimum_rec(page))) { - data_size += rec_get_size(rec); + data_size += rec_offs_size(offsets); } - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (!buf[offs + i] == 0) { /* No other record may overlap this */ @@ -1641,12 +1773,12 @@ page_validate( buf[offs + i] = 1; } - if (rec_get_n_owned(rec) != 0) { + if (rec_get_n_owned(rec, comp) != 0) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec) != own_count) { + if (rec_get_n_owned(rec, comp) != own_count) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu\n", - (ulong) rec_get_n_owned(rec), + (ulong) rec_get_n_owned(rec, comp), (ulong) own_count); goto func_exit; } @@ -1671,11 +1803,11 @@ page_validate( break; } - if (rec_get_next_offs(rec) < FIL_PAGE_DATA - || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA + || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { fprintf(stderr, "InnoDB: Next record offset wrong %lu\n", - (ulong) rec_get_next_offs(rec)); + (ulong) rec_get_next_offs(rec, comp)); goto func_exit; } @@ -1683,9 +1815,15 @@ page_validate( page_cur_move_to_next(&cur); own_count++; old_rec = rec; + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } } - if (rec_get_n_owned(rec) == 0) { + if (rec_get_n_owned(rec, comp) == 0) { fputs("InnoDB: n owned is zero\n", stderr); goto func_exit; } @@ -1714,15 +1852,17 @@ page_validate( rec = page_header_get_ptr(page, PAGE_FREE); while (rec != NULL) { - if (!page_rec_validate(rec)) { + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + if (!page_rec_validate(rec, offsets)) { goto func_exit; } count++; - offs = rec_get_start(rec) - page; + offs = rec_get_start(rec, offsets) - page; - for (i = 0; i < rec_get_size(rec); i++) { + for (i = 0; i < rec_offs_size(offsets); i++) { if (buf[offs + i] != 0) { fputs( @@ -1736,9 +1876,9 @@ page_validate( rec = page_rec_get_next(rec); } - if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + if (page_dir_get_n_heap(page) != count + 1) { fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", - (ulong) page_header_get_field(page, PAGE_N_HEAP), + (ulong) page_dir_get_n_heap(page), (ulong) count + 1); goto func_exit; } @@ -1775,7 +1915,7 @@ page_find_rec_with_heap_no( page_cur_set_before_first(page, &cur); for (;;) { - if (rec_get_heap_no(cur.rec) == heap_no) { + if (rec_get_heap_no(cur.rec, page_is_comp(page)) == heap_no) { return(cur.rec); } diff --git a/innobase/pars/pars0pars.c 
b/innobase/pars/pars0pars.c index 846cb060a7e..c62184abd85 100644 --- a/innobase/pars/pars0pars.c +++ b/innobase/pars/pars0pars.c @@ -259,9 +259,13 @@ pars_resolve_func_data_type( dtype_set(que_node_get_data_type(node), DATA_VARCHAR, DATA_ENGLISH, 0, 0); } else if (func == PARS_TO_BINARY_TOKEN) { - ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT); - dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) { + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, DATA_ENGLISH, 0, 0); + } else { + dtype_set(que_node_get_data_type(node), DATA_BINARY, + 0, 0, 0); + } } else if (func == PARS_TO_NUMBER_TOKEN) { ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_VARCHAR); @@ -1510,8 +1514,11 @@ pars_create_table( n_cols = que_node_list_get_len(column_defs); - table = dict_mem_table_create(table_sym->name, 0, n_cols); - + /* As the InnoDB SQL parser is for internal use only, + for creating some system tables, this function will only + create tables in the old (not compact) record format. */ + table = dict_mem_table_create(table_sym->name, 0, n_cols, FALSE); + if (not_fit_in_memory != NULL) { table->does_not_fit_in_memory = TRUE; } diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index 041fb7914e2..974fc7a24d0 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -51,6 +51,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields);/* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -426,6 +427,7 @@ cmp_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields, /* in/out: number of already completely matched fields; when function returns, contains the value for current comparison */ @@ -455,12 +457,13 @@ cmp_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields && matched_bytes); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); cur_field = *matched_fields; cur_bytes = *matched_bytes; ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(cur_field <= rec_get_n_fields(rec)); + ut_ad(cur_field <= rec_offs_n_fields(offsets)); /* Match fields in a loop; stop if we run out of fields in dtuple or find an externally stored field */ @@ -472,7 +475,8 @@ cmp_dtuple_rec_with_match( dtuple_f_len = dfield_get_len(dtuple_field); - rec_b_ptr = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_b_ptr = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); /* If we have matched yet 0 bytes, it may be that one or both the fields are SQL null, or the record or dtuple may be @@ -482,7 +486,8 @@ cmp_dtuple_rec_with_match( if (cur_bytes == 0) { if (cur_field == 0) { - if (rec_get_info_bits(rec) + if (rec_get_info_bits(rec, + rec_offs_comp(offsets)) & REC_INFO_MIN_REC_FLAG) { if (dtuple_get_info_bits(dtuple) @@ -504,7 +509,7 @@ cmp_dtuple_rec_with_match( } } - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ @@ -635,7 +640,7 @@ cmp_dtuple_rec_with_match( up to the common fields */ order_resolved: ut_ad((ret >= - 1) && (ret <= 1)); - ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, 
rec, + ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, matched_fields)); ut_ad(*matched_fields == cur_field); /* In the debug version, the above cmp_debug_... sets @@ -656,13 +661,15 @@ cmp_dtuple_rec( less than rec, respectively; see the comments for cmp_dtuple_rec_with_match */ dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint matched_fields = 0; ulint matched_bytes = 0; - return(cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes)); } /****************************************************************** @@ -673,22 +680,24 @@ ibool cmp_dtuple_is_prefix_of_rec( /*========================*/ /* out: TRUE if prefix */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec) /* in: physical record */ + dtuple_t* dtuple, /* in: data tuple */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n_fields; ulint matched_fields = 0; ulint matched_bytes = 0; + ut_ad(rec_offs_validate(rec, NULL, offsets)); n_fields = dtuple_get_n_fields(dtuple); - if (n_fields > rec_get_n_fields(rec)) { + if (n_fields > rec_offs_n_fields(offsets)) { return(FALSE); } - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); + cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields == n_fields) { return(TRUE); @@ -703,42 +712,6 @@ cmp_dtuple_is_prefix_of_rec( return(FALSE); } -/****************************************************************** -Compares a prefix of a data tuple to a prefix of a physical record for -equality. If there are less fields in rec than parameter n_fields, FALSE -is returned. NOTE that n_fields_cmp of dtuple does not affect this -comparison. */ - -ibool -cmp_dtuple_rec_prefix_equal( -/*========================*/ - /* out: TRUE if equal */ - dtuple_t* dtuple, /* in: data tuple */ - rec_t* rec, /* in: physical record */ - ulint n_fields) /* in: number of fields which should be - compared; must not exceed the number of - fields in dtuple */ -{ - ulint matched_fields = 0; - ulint matched_bytes = 0; - - ut_ad(n_fields <= dtuple_get_n_fields(dtuple)); - - if (rec_get_n_fields(rec) < n_fields) { - - return(FALSE); - } - - cmp_dtuple_rec_with_match(dtuple, rec, &matched_fields, - &matched_bytes); - if (matched_fields >= n_fields) { - - return(TRUE); - } - - return(FALSE); -} - /***************************************************************** This function is used to compare two physical records. 
Only the common first fields are compared, and if an externally stored field is @@ -752,7 +725,13 @@ cmp_rec_rec_with_match( first fields are compared */ rec_t* rec1, /* in: physical record */ rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ dict_index_t* index, /* in: data dictionary index */ + ulint n, /* in: number of fields to compare, + or ULINT_UNDEFINED if both records + contain all fields, and all fields + should be compared */ ulint* matched_fields, /* in/out: number of already completely matched fields; when the function returns, contains the value the for current @@ -778,17 +757,27 @@ cmp_rec_rec_with_match( ulint cur_bytes; /* number of already matched bytes in current field */ int ret = 3333; /* return value */ + ibool comp; ut_ad(rec1 && rec2 && index); - - rec1_n_fields = rec_get_n_fields(rec1); - rec2_n_fields = rec_get_n_fields(rec2); + ut_ad(rec_offs_validate(rec1, index, offsets1)); + ut_ad(rec_offs_validate(rec2, index, offsets2)); + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); + + comp = rec_offs_comp(offsets1); + if (n == ULINT_UNDEFINED) { + rec1_n_fields = rec_offs_n_fields(offsets1); + rec2_n_fields = rec_offs_n_fields(offsets2); + } else { + ut_ad(n <= rec_offs_n_fields(offsets1)); + ut_ad(n <= rec_offs_n_fields(offsets2)); + rec1_n_fields = rec2_n_fields = n; + } cur_field = *matched_fields; cur_bytes = *matched_bytes; - /* Match fields in a loop; stop if we run out of fields in either - record */ + /* Match fields in a loop */ while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) { @@ -800,17 +789,19 @@ cmp_rec_rec_with_match( dict_index_get_nth_field(index, cur_field))); } - rec1_b_ptr = rec_get_nth_field(rec1, cur_field, &rec1_f_len); - rec2_b_ptr = rec_get_nth_field(rec2, cur_field, &rec2_f_len); - + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + if (cur_bytes == 0) { if (cur_field == 0) { /* Test if rec is the predefined minimum record */ - if (rec_get_info_bits(rec1) + if (rec_get_info_bits(rec1, comp) & REC_INFO_MIN_REC_FLAG) { - if (rec_get_info_bits(rec2) + if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 0; } else { @@ -819,7 +810,7 @@ cmp_rec_rec_with_match( goto order_resolved; - } else if (rec_get_info_bits(rec2) + } else if (rec_get_info_bits(rec2, comp) & REC_INFO_MIN_REC_FLAG) { ret = 1; @@ -828,8 +819,8 @@ cmp_rec_rec_with_match( } } - if (rec_get_nth_field_extern_bit(rec1, cur_field) - || rec_get_nth_field_extern_bit(rec2, cur_field)) { + if (rec_offs_nth_extern(offsets1, cur_field) + || rec_offs_nth_extern(offsets2, cur_field)) { /* We do not compare to an externally stored field */ @@ -984,6 +975,7 @@ cmp_debug_dtuple_rec_with_match( dtuple in some of the common fields, or which has an equal number or more fields than dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint* matched_fields) /* in/out: number of already completely matched fields; when function returns, contains the value for current @@ -1003,14 +995,16 @@ cmp_debug_dtuple_rec_with_match( ut_ad(dtuple && rec && matched_fields); ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple)); - ut_ad(*matched_fields <= rec_get_n_fields(rec)); + ut_ad(*matched_fields <= rec_offs_n_fields(offsets)); cur_field = 
*matched_fields; if (cur_field == 0) { - if (rec_get_info_bits(rec) & REC_INFO_MIN_REC_FLAG) { + if (rec_get_info_bits(rec, rec_offs_comp(offsets)) + & REC_INFO_MIN_REC_FLAG) { if (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG) { @@ -1040,9 +1034,10 @@ cmp_debug_dtuple_rec_with_match( dtuple_f_data = dfield_get_data(dtuple_field); dtuple_f_len = dfield_get_len(dtuple_field); - rec_f_data = rec_get_nth_field(rec, cur_field, &rec_f_len); + rec_f_data = rec_get_nth_field(rec, offsets, + cur_field, &rec_f_len); - if (rec_get_nth_field_extern_bit(rec, cur_field)) { + if (rec_offs_nth_extern(offsets, cur_field)) { /* We do not compare to an externally stored field */ ret = 0; diff --git a/innobase/rem/rem0rec.c b/innobase/rem/rem0rec.c index 1db89241dff..e4fa213480f 100644 --- a/innobase/rem/rem0rec.c +++ b/innobase/rem/rem0rec.c @@ -15,8 +15,8 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" -/* PHYSICAL RECORD - =============== +/* PHYSICAL RECORD (OLD STYLE) + =========================== The physical record, which is the data type of all the records found in index pages of the database, has the following format @@ -39,7 +39,7 @@ represented on a higher text line): | 10 bits giving the number of fields in this record | | 1 bit which is set to 1 if the offsets above are given in one byte format, 0 if in two byte format | -| two bytes giving the pointer to the next record in the page | +| two bytes giving an absolute pointer to the next record in the page | ORIGIN of the record | first field of data | ... @@ -55,9 +55,50 @@ The offsets of the data fields are given as one-byte (if there are less than 127 bytes of data in the record) or two-byte unsigned integers. The most significant bit is not part of the offset, instead it indicates the SQL-null -if the bit is set to 1. +if the bit is set to 1. */ -CANONICAL COORDINATES. A record can be seen as a single +/* PHYSICAL RECORD (NEW STYLE) + =========================== + +The physical record, which is the data type of all the records +found in index pages of the database, has the following format +(lower addresses and more significant bits inside a byte are below +represented on a higher text line): + +| length of the last non-null variable-length field of data: + if the maximum length is 255, one byte; otherwise, + 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes, + length=128..16383, extern storage flag) | +... +| length of first variable-length field of data | +| SQL-null flags (1 bit per nullable field), padded to full bytes | +| 4 bits used to delete mark a record, and mark a predefined + minimum record in alphabetical order | +| 4 bits giving the number of records owned by this record + (this term is explained in page0page.h) | +| 13 bits giving the order number of this record in the + heap of the index page | +| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree), + 010=infimum, 011=supremum, 1xx=reserved | +| two bytes giving a relative pointer to the next record in the page | +ORIGIN of the record +| first field of data | +... +| last field of data | + +The origin of the record is the start address of the first field +of data. The offsets are given relative to the origin. +The offsets of the data fields are stored in an inverted +order because then the offset of the first fields are near the +origin, giving maybe a better processor cache hit rate in searches. 
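(Aside, not part of the patch itself: the variable-length header bytes described above can be decoded by a few lines of standalone C. Only the bit layout, 0xxxxxxx for a one-byte length and 1exxxxxx xxxxxxxx for a two-byte length with "e" as the extern storage flag, is taken from this description and from rec_init_offsets() below; the names decode_len and hdr are invented for the example, and the two-byte form is assumed to apply only to columns longer than 255 bytes or BLOBs, as stated above.)

#include <stdio.h>

/* Reads one field length from the new-style header, scanning downwards
(field 0 first), the same way rec_init_offsets() does. */
static unsigned long
decode_len(const unsigned char** lens, int* is_extern)
{
	unsigned long	len = *(*lens)--;

	*is_extern = 0;

	if (len & 0x80) {	/* 1exxxxxx xxxxxxxx: two-byte length */
		*is_extern = (len & 0x40) != 0;
		len = ((len & 0x3f) << 8) | *(*lens)--;
	}

	return(len);
}

int
main(void)
{
	/* hdr[0] is padding; field 0 is 300 bytes long (0x81 0x2c, with
	the byte carrying the high bits at the higher address), field 1
	is 5 bytes long */
	unsigned char		hdr[4] = { 0x00, 0x05, 0x2c, 0x81 };
	const unsigned char*	lens = hdr + 3;
	int			ext;

	printf("field 0: %lu bytes\n", decode_len(&lens, &ext)); /* 300 */
	printf("field 1: %lu bytes\n", decode_len(&lens, &ext)); /* 5 */

	return(0);
}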
+ +The offsets of the data fields are given as one-byte +(if there are less than 127 bytes of data in the record) +or two-byte unsigned integers. The most significant bit +is not part of the offset, instead it indicates the SQL-null +if the bit is set to 1. */ + +/* CANONICAL COORDINATES. A record can be seen as a single string of 'characters' in the following way: catenate the bytes in each field, in the order of fields. An SQL-null field is taken to be an empty sequence of bytes. Then after @@ -86,13 +127,291 @@ the corresponding canonical strings have the same property. */ ulint rec_dummy; /* this is used to fool compiler in rec_validate */ +/******************************************************************* +Validates the consistency of an old-style physical record. */ +static +ibool +rec_validate_old( +/*=============*/ + /* out: TRUE if ok */ + rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function determines the offsets to each field +in the record. The offsets are written to an array of +ulint[n+2], with [0] being the number of fields (n), [1] being the +extra size (if REC_OFFS_COMPACT is set, the record is in the new +format), and [2]..[n+1] being the offsets past the end of +fields 0..n, or to the beginning of fields 1..n+1. When the +high-order bit of the offset at [n+1] is set (REC_OFFS_SQL_NULL), +the field n is NULL. When the second high-order bit of the offset +at [n+1] is set (REC_OFFS_EXTERNAL), the field n is being stored +externally. */ +static +void +rec_init_offsets( +/*=============*/ + /* out: the offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in:/out: ulint[n+2]; + n=rec_offs_n_fields(offsets) */ +{ + ulint n_fields = rec_offs_n_fields(offsets); + ulint i = 0; + ulint offs; + + rec_offs_make_valid(rec, index, offsets); + + if (index->table->comp) { + const byte* nulls; + const byte* lens; + dict_field_t* field; + dtype_t* type; + ulint null_mask; + ulint status = rec_get_status(rec); + ulint n_node_ptr_field = ULINT_UNDEFINED; + + switch (status) { + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* the field is 8 bytes long */ + rec_offs_base(offsets)[0] = + REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT; + rec_offs_base(offsets)[1] = 8; + return; + case REC_STATUS_NODE_PTR: + n_node_ptr_field = + dict_index_get_n_unique_in_tree(index); + break; + case REC_STATUS_ORDINARY: + break; + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + for (; i < n_fields; i++) { + ibool is_null = FALSE, is_external = FALSE; + ulint len; + if (i == n_node_ptr_field) { + len = 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + is_null = (*nulls & null_mask) != 0; + null_mask <<= 1; + if (null_mask == 0x100) { + nulls--; + null_mask = 1; + } + } + + if (is_null) { + /* No length is stored for NULL fields. 
*/ + len = 0; + } else if (!field->fixed_len) { + /* Variable-length field: read the length */ + len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + is_external = !!(len & 0x40); + len &= 0x3f; + len <<= 8; + len |= *lens--; + } + } + } else { + len = field->fixed_len; + } + resolved: + offs += len; + len = offs; + if (is_external) { + len |= REC_OFFS_EXTERNAL; + } + if (is_null) { + len |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[i + 1] = len; + } + + *rec_offs_base(offsets) = + (rec - (lens + 1)) | REC_OFFS_COMPACT; + } else { + /* Old-style record: determine extra size and end offsets */ + offs = REC_N_OLD_EXTRA_BYTES; + if (rec_get_1byte_offs_flag(rec)) { + offs += n_fields; + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + for (; i < n_fields; i++) { + offs = rec_1_get_field_end_info(rec, i); + if (offs & REC_1BYTE_SQL_NULL_MASK) { + offs &= ~REC_1BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[1 + i] = offs; + } + } else { + offs += 2 * n_fields; + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + for (; i < n_fields; i++) { + offs = rec_2_get_field_end_info(rec, i); + if (offs & REC_2BYTE_SQL_NULL_MASK) { + offs &= ~REC_2BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + if (offs & REC_2BYTE_EXTERN_MASK) { + offs &= ~REC_2BYTE_EXTERN_MASK; + offs |= REC_OFFS_EXTERNAL; + } + rec_offs_base(offsets)[1 + i] = offs; + } + } + } +} + +/********************************************************** +The following function determines the offsets to each field +in the record. The offsets are returned in an array of +ulint, with [0] being the number of fields (n), [1] being the +extra size (if REC_OFFS_COMPACT is set, the record is in the new +format), and [2]..[n+1] being the offsets past the end of +fields 0..n, or to the beginning of fields 1..n+1. When the +high-order bit of the offset at [n+1] is set (REC_OFFS_SQL_NULL), +the field n is NULL. When the second high-order bit of the offset +at [n+1] is set (REC_OFFS_EXTERNAL), the field n is being stored +externally. */ + +ulint* +rec_get_offsets( +/*============*/ + /* out: the offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint* offsets; + ulint n; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (index->table->comp) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (n_fields < n) { + n = n_fields; + } + + offsets = mem_heap_alloc(heap, + (n + (1 + REC_OFFS_HEADER_SIZE)) * sizeof(ulint)); + + offsets[0] = n; + + rec_init_offsets(rec, index, offsets); + return(offsets); +} + +/********************************************************** +The following function determines the offsets to each field +in the record. It differs from rec_get_offsets() by trying to +reuse a previously returned array. 
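(Illustrative usage sketch, not part of the patch: the intended calling pattern is to allocate the offsets array once from a short-lived heap and let rec_reget_offsets() recycle it for each record, as page_print_list() and page_delete_rec_list_end() do elsewhere in this change. The helper name walk_page_records is invented; every InnoDB call in it appears in, or is kept by, this patch.)

#include "dict0dict.h"
#include "mem0mem.h"
#include "page0cur.h"
#include "rem0rec.h"

/* Walks the user records of an index page, reusing one offsets array. */
static void
walk_page_records(page_t* page, dict_index_t* index)
{
	page_cur_t	cur;
	mem_heap_t*	heap	= mem_heap_create(100);
	ulint*		offsets	= NULL;

	page_cur_set_before_first(page, &cur);	/* position on infimum */
	page_cur_move_to_next(&cur);		/* first user record */

	while (!page_cur_is_after_last(&cur)) {
		offsets = rec_reget_offsets(cur.rec, index,
				offsets, ULINT_UNDEFINED, heap);
		/* rec_offs_size(offsets) is now the full size of cur.rec,
		rec_offs_n_fields(offsets) its number of fields */
		page_cur_move_to_next(&cur);
	}

	mem_heap_free(heap);
}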
*/ + +ulint* +rec_reget_offsets( +/*==============*/ + /* out: the new offsets */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in: array of offsets + from rec_get_offsets() + or rec_reget_offsets(), or NULL */ + ulint n_fields,/* in: maximum number of initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint n; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (index->table->comp) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (n_fields < n) { + n = n_fields; + } + + if (!offsets || rec_offs_n_fields(offsets) < n) { + offsets = mem_heap_alloc(heap, + (n + (1 + REC_OFFS_HEADER_SIZE)) * sizeof(ulint)); + } + + offsets[0] = n; + + rec_init_offsets(rec, index, offsets); + return(offsets); +} + /**************************************************************** -The following function is used to get a pointer to the nth data field in a -record. */ +The following function is used to get a pointer to the nth +data field in an old-style record. */ byte* -rec_get_nth_field( -/*==============*/ +rec_get_nth_field_old( +/*==================*/ /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ @@ -103,9 +422,9 @@ rec_get_nth_field( ulint next_os; ut_ad(rec && len); - ut_ad(n < rec_get_n_fields(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); - if (n > 1024) { + if (n > REC_MAX_N_FIELDS) { fprintf(stderr, "Error: trying to access field %lu in rec\n", (ulong) n); ut_error; @@ -150,8 +469,78 @@ rec_get_nth_field( return(rec + os); } +/************************************************************** +The following function returns the size of a data tuple when converted to +a new-style physical record. */ + +ulint +rec_get_converted_size_new( +/*=======================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + ulint size = REC_N_NEW_EXTRA_BYTES + + (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ut_ad(index && dtuple); + ut_ad(index->table->comp); + + switch (dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) { + case REC_STATUS_ORDINARY: + n_fields = dict_index_get_n_fields(index); + ut_ad(n_fields == dtuple_get_n_fields(dtuple)); + break; + case REC_STATUS_NODE_PTR: + n_fields = dict_index_get_n_unique_in_tree(index); + ut_ad(n_fields + 1 == dtuple_get_n_fields(dtuple)); + ut_ad(dtuple_get_nth_field(dtuple, n_fields)->len == 4); + size += 4; /* child page number */ + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record, 8 bytes */ + return(size + 8); /* no extra data needed */ + default: + ut_a(0); + return(ULINT_UNDEFINED); + } + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + ulint len = dtuple_get_nth_field(dtuple, i)->len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + ut_ad(len != UNIV_SQL_NULL || + !(dtype_get_prtype(type) & DATA_NOT_NULL)); + + if (len == UNIV_SQL_NULL) { + /* No length is stored for NULL fields. 
*/ + continue; + } + + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + ut_ad(!field->fixed_len || len == field->fixed_len); + + if (field->fixed_len) { + } else if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + size++; + } else { + size += 2; + } + size += len; + } + + return(size); +} + /*************************************************************** -Sets the value of the ith field SQL null bit. */ +Sets the value of the ith field SQL null bit of an old-style record. */ void rec_set_nth_field_null_bit( @@ -189,12 +578,12 @@ rec_set_nth_field_null_bit( } /*************************************************************** -Sets the value of the ith field extern storage bit. */ +Sets the value of the ith field extern storage bit of an old-style record. */ void -rec_set_nth_field_extern_bit( -/*=========================*/ - rec_t* rec, /* in: record */ +rec_set_nth_field_extern_bit_old( +/*=============================*/ + rec_t* rec, /* in: old-style record */ ulint i, /* in: ith field */ ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where @@ -204,7 +593,7 @@ rec_set_nth_field_extern_bit( ulint info; ut_a(!rec_get_1byte_offs_flag(rec)); - ut_a(i < rec_get_n_fields(rec)); + ut_a(i < rec_get_n_fields_old(rec)); info = rec_2_get_field_end_info(rec, i); @@ -215,36 +604,138 @@ rec_set_nth_field_extern_bit( } if (mtr) { - mlog_write_ulint(rec - REC_N_EXTRA_BYTES - 2 * (i + 1), info, - MLOG_2BYTES, mtr); + mlog_write_ulint(rec - REC_N_OLD_EXTRA_BYTES - 2 * (i + 1), + info, MLOG_2BYTES, mtr); } else { rec_2_set_field_end_info(rec, i, info); } } /*************************************************************** +Sets the value of the ith field extern storage bit of a new-style record. */ + +void +rec_set_nth_field_extern_bit_new( +/*=============================*/ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + ulint ith, /* in: ith field */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr holding an X-latch to the page + where rec is, or NULL; in the NULL case + we do not write to log about the change */ +{ + byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* lens = nulls - (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint n_fields; + ulint null_mask = 1; + ut_ad(rec && index); + ut_ad(index->table->comp); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + n_fields = dict_index_get_n_fields(index); + + ut_ad(ith < n_fields); + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + ibool is_null; + ulint len; + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + is_null = !(dtype_get_prtype(type) & DATA_NOT_NULL); + if (is_null) { + /* nullable field => read the null flag */ + is_null = !!(*nulls & null_mask); + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + } + if (is_null || field->fixed_len) { + /* No length (or extern bit) is stored for + fields that are NULL or fixed-length. 
*/ + ut_ad(i != ith); + continue; + } + len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { /* 1exxxxxx: 2-byte length */ + if (i == ith) { + if (!val == !(len & 0x20)) { + return; /* no change */ + } + /* toggle the extern bit */ + len ^= 0x40; + if (mtr) { + mlog_write_ulint(lens + 1, len, + MLOG_1BYTE, mtr); + } else { + lens[1] = len; + } + return; + } + lens--; + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } else { + /* short fields cannot be external */ + ut_ad(i != ith); + } + } +} + +/*************************************************************** Sets TRUE the extern storage bits of fields mentioned in an array. */ void rec_set_field_extern_bits( /*======================*/ - rec_t* rec, /* in: record */ - ulint* vec, /* in: array of field numbers */ - ulint n_fields, /* in: number of fields numbers */ - mtr_t* mtr) /* in: mtr holding an X-latch to the page - where rec is, or NULL; in the NULL case we - do not write to log about the change */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: record descriptor */ + const ulint* vec, /* in: array of field numbers */ + ulint n_fields,/* in: number of fields numbers */ + mtr_t* mtr) /* in: mtr holding an X-latch to the + page where rec is, or NULL; + in the NULL case we do not write + to log about the change */ { ulint i; for (i = 0; i < n_fields; i++) { - rec_set_nth_field_extern_bit(rec, vec[i], TRUE, mtr); + rec_set_nth_field_extern_bit(rec, index, vec[i], TRUE, mtr); } } +/************************************************************** +Returns the total size of a physical record. */ + +ulint +rec_get_size( +/*=========*/ + /* out: size */ + rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + mem_heap_t* heap + = mem_heap_create(100); + ulint* offsets + = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + ulint size + = rec_offs_size(offsets); + + mem_heap_free(heap); + return(size); +} + /*************************************************************** -Sets a record field to SQL null. The physical size of the field is not -changed. */ +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ void rec_set_nth_field_sql_null( @@ -262,20 +753,20 @@ rec_set_nth_field_sql_null( } /************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. */ - -rec_t* -rec_convert_dtuple_to_rec_low( +Builds an old-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
*/ +static +rec_t* +rec_convert_dtuple_to_rec_old( /*==========================*/ - /* out: pointer to the origin of physical - record */ - byte* destination, /* in: start address of the physical record */ - dtuple_t* dtuple, /* in: data tuple */ - ulint data_size) /* in: data size of dtuple */ + /* out: pointer to the origin of + physical record */ + byte* buf, /* in: start address of the physical record */ + dtuple_t* dtuple)/* in: data tuple */ { dfield_t* field; ulint n_fields; + ulint data_size; rec_t* rec; ulint end_offset; ulint ored_offset; @@ -283,24 +774,25 @@ rec_convert_dtuple_to_rec_low( ulint len; ulint i; - ut_ad(destination && dtuple); + ut_ad(buf && dtuple); ut_ad(dtuple_validate(dtuple)); ut_ad(dtuple_check_typed(dtuple)); - ut_ad(dtuple_get_data_size(dtuple) == data_size); n_fields = dtuple_get_n_fields(dtuple); + data_size = dtuple_get_data_size(dtuple); ut_ad(n_fields > 0); /* Calculate the offset of the origin in the physical record */ - rec = destination + rec_get_converted_extra_size(data_size, n_fields); + rec = buf + rec_get_converted_extra_size(data_size, n_fields); /* Store the number of fields */ - rec_set_n_fields(rec, n_fields); + rec_set_n_fields_old(rec, n_fields); /* Set the info bits of the record */ - rec_set_info_bits(rec, dtuple_get_info_bits(dtuple)); + rec_set_info_bits(rec, FALSE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); /* Store the data and the offsets */ @@ -361,11 +853,194 @@ rec_convert_dtuple_to_rec_low( } } - ut_ad(rec_validate(rec)); + return(rec); +} + +/************************************************************* +Builds a new-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +static +rec_t* +rec_convert_dtuple_to_rec_new( +/*==========================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + dfield_t* field; + dtype_t* type; + rec_t* rec = buf + REC_N_NEW_EXTRA_BYTES; + byte* end; + byte* nulls; + byte* lens; + ulint len; + ulint i; + ulint fixed_len; + ulint null_mask = 1; + const ulint n_fields = dtuple_get_n_fields(dtuple); + const ulint status = dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK; + ut_ad(index->table->comp); + + ut_ad(n_fields > 0); + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + goto init; + default: + ut_a(0); + return(0); + } + + /* Calculate the offset of the origin in the physical record. + We must loop over all fields to do this. 
*/ + rec += (index->n_nullable + 7) / 8; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + if (status == REC_STATUS_NODE_PTR && i == n_fields - 1) { + fixed_len = 4; + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + continue; + } + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + if (len == UNIV_SQL_NULL) + continue; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + rec++; + if (len >= 128 && (dtype_get_len(type) >= 256 + || dtype_get_mtype(type) == DATA_BLOB)) { + rec++; + } + } + } + +init: + end = rec; + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - (index->n_nullable + 7) / 8; + /* clear the SQL-null flags */ + memset (lens + 1, 0, nulls - lens); + + /* Set the info bits of the record */ + rec_set_status(rec, status); + + rec_set_info_bits(rec, TRUE, + dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); + + /* Store the data and the offsets */ + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(dtuple, i); + type = dfield_get_type(field); + len = dfield_get_len(field); + + if (status == REC_STATUS_NODE_PTR && i == n_fields - 1) { + fixed_len = 4; + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + goto copy; + } + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field */ + ut_ad(index->n_nullable > 0); + ut_ad(*nulls < null_mask); + /* set the null flag if necessary */ + if (len == UNIV_SQL_NULL) { + *nulls |= null_mask; + } + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + if (len == UNIV_SQL_NULL) + continue; + } + /* only nullable fields can be null */ + ut_ad(len != UNIV_SQL_NULL); + if (fixed_len) { + ut_ad(len == fixed_len); + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + if (len < 128 || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + *lens-- = len; + } + else { + /* the extern bits will be set later */ + ut_ad(len < 16384); + *lens-- = len >> 8 | 0x80; + *lens-- = len; + } + } + copy: + memcpy(end, dfield_get_data(field), len); + end += len; + } return(rec); } +/************************************************************* +Builds a physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ + +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + dict_index_t* index, /* in: record descriptor */ + dtuple_t* dtuple) /* in: data tuple */ +{ + rec_t* rec; + + ut_ad(buf && index && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + if (index->table->comp) { + rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple); + } else { + rec = rec_convert_dtuple_to_rec_old(buf, dtuple); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = mem_heap_create(100); + ut_ad(rec_validate(rec, + rec_get_offsets(rec, index, ULINT_UNDEFINED, heap))); + mem_heap_free(heap); + } +#endif /* UNIV_DEBUG */ + return(rec); +} + /****************************************************************** Copies the first n fields of a physical record to a data tuple. 
The fields are copied to the memory heap. */ @@ -375,6 +1050,7 @@ rec_copy_prefix_to_dtuple( /*======================*/ dtuple_t* tuple, /* in: data tuple */ rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ ulint n_fields, /* in: number of fields to copy */ mem_heap_t* heap) /* in: memory heap */ { @@ -383,16 +1059,20 @@ rec_copy_prefix_to_dtuple( ulint len; byte* buf = NULL; ulint i; - - ut_ad(rec_validate(rec)); + ulint* offsets; + + offsets = rec_get_offsets(rec, index, n_fields, heap); + + ut_ad(rec_validate(rec, offsets)); ut_ad(dtuple_check_typed(tuple)); - dtuple_set_info_bits(tuple, rec_get_info_bits(rec)); + dtuple_set_info_bits(tuple, + rec_get_info_bits(rec, index->table->comp)); for (i = 0; i < n_fields; i++) { field = dtuple_get_nth_field(tuple, i); - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); if (len != UNIV_SQL_NULL) { buf = mem_heap_alloc(heap, len); @@ -405,32 +1085,28 @@ rec_copy_prefix_to_dtuple( } /****************************************************************** -Copies the first n fields of a physical record to a new physical record in -a buffer. */ - +Copies the first n fields of an old-style physical record +to a new physical record in a buffer. */ +static rec_t* -rec_copy_prefix_to_buf( -/*===================*/ +rec_copy_prefix_to_buf_old( +/*=======================*/ /* out, own: copied record */ rec_t* rec, /* in: physical record */ ulint n_fields, /* in: number of fields to copy */ + ulint area_end, /* in: end of the prefix data */ byte** buf, /* in/out: memory buffer for the copied prefix, or NULL */ ulint* buf_size) /* in/out: buffer size */ { rec_t* copy_rec; ulint area_start; - ulint area_end; ulint prefix_len; - ut_ad(rec_validate(rec)); - - area_end = rec_get_field_start_offs(rec, n_fields); - if (rec_get_1byte_offs_flag(rec)) { - area_start = REC_N_EXTRA_BYTES + n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + n_fields; } else { - area_start = REC_N_EXTRA_BYTES + 2 * n_fields; + area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields; } prefix_len = area_start + area_end; @@ -448,17 +1124,114 @@ rec_copy_prefix_to_buf( copy_rec = *buf + area_start; - rec_set_n_fields(copy_rec, n_fields); + rec_set_n_fields_old(copy_rec, n_fields); return(copy_rec); } -/******************************************************************* -Validates the consistency of a physical record. */ +/****************************************************************** +Copies the first n fields of a physical record to a new physical record in +a buffer. 
*/ + +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + /* out, own: copied record */ + rec_t* rec, /* in: physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, or NULL */ + ulint* buf_size) /* in/out: buffer size */ +{ + byte* nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + byte* lens = nulls - (index->n_nullable + 7) / 8; + dict_field_t* field; + dtype_t* type; + ulint i; + ulint prefix_len = 0; + ibool is_null; + ulint null_mask = 1; + ulint status; + + if (!index->table->comp) { + ut_ad(rec_validate_old(rec)); + return(rec_copy_prefix_to_buf_old(rec, n_fields, + rec_get_field_start_offs(rec, n_fields), + buf, buf_size)); + } + + status = rec_get_status(rec); + + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record: no sense to copy anything */ + default: + ut_a(0); + return(NULL); + } + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + field = dict_index_get_nth_field(index, i); + type = dict_col_get_type(dict_field_get_col(field)); + is_null = !(dtype_get_prtype(type) & DATA_NOT_NULL); + if (is_null) { + /* nullable field => read the null flag */ + is_null = !!(*nulls & null_mask); + null_mask <<= 1; + if (null_mask == 0x100) + nulls--, null_mask = 1; + } + + if (is_null) { + } else if (field->fixed_len) { + prefix_len += field->fixed_len; + } else { + ulint len = *lens--; + if (dtype_get_len(type) > 255 + || dtype_get_mtype(type) == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + } + } + prefix_len += len; + } + } + + prefix_len += rec - (lens + 1); + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = mem_alloc(prefix_len); + *buf_size = prefix_len; + } + + memcpy(*buf, lens + 1, prefix_len); + + return(*buf + (rec - (lens + 1))); +} + +/******************************************************************* +Validates the consistency of an old-style physical record. */ +static ibool -rec_validate( -/*=========*/ +rec_validate_old( +/*=============*/ /* out: TRUE if ok */ rec_t* rec) /* in: physical record */ { @@ -470,7 +1243,7 @@ rec_validate( ulint i; ut_a(rec); - n_fields = rec_get_n_fields(rec); + n_fields = rec_get_n_fields_old(rec); if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { fprintf(stderr, "InnoDB: Error: record has %lu fields\n", @@ -479,7 +1252,7 @@ rec_validate( } for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field_old(rec, i, &len); if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { fprintf(stderr, @@ -499,45 +1272,165 @@ rec_validate( } } - if (len_sum != (ulint)(rec_get_end(rec) - rec)) { + if (len_sum != rec_get_data_size_old(rec)) { fprintf(stderr, "InnoDB: Error: record len should be %lu, len %lu\n", (ulong) len_sum, - (ulong) (rec_get_end(rec) - rec)); + rec_get_data_size_old(rec)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + return(TRUE); +} + +/******************************************************************* +Validates the consistency of a physical record. 
*/ + +ibool +rec_validate( +/*=========*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_offs_n_fields(offsets); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != (ulint)(rec_get_end(rec, offsets) - rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + (ulong) (rec_get_end(rec, offsets) - rec)); return(FALSE); } rec_dummy = sum; /* This is here only to fool the compiler */ + if (!rec_offs_comp(offsets)) { + ut_a(rec_validate_old(rec)); + } + return(TRUE); } /******************************************************************* +Prints an old-style physical record. */ + +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + rec_t* rec) /* in: physical record */ +{ + const byte* data; + ulint len; + ulint n; + ulint i; + + ut_ad(rec); + + n = rec_get_n_fields_old(rec); + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " %u-byte offsets; info bits %lu\n", + (ulong) n, + rec_get_1byte_offs_flag(rec) ? 1 : 2, + (ulong) rec_get_info_bits(rec, FALSE)); + + for (i = 0; i < n; i++) { + + data = rec_get_nth_field_old(rec, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fputs("...(truncated)", file); + } + } else { + fprintf(file, " SQL NULL, size %lu ", + rec_get_nth_field_size(rec, i)); + } + putc(';', file); + } + + putc('\n', file); + + rec_validate_old(rec); +} + +/******************************************************************* Prints a physical record. */ void rec_print( /*======*/ - FILE* file, /* in: file where to print */ - rec_t* rec) /* in: physical record */ + FILE* file, /* in: file where to print */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ { - byte* data; - ulint len; - ulint n; - ulint i; + const byte* data; + ulint len; + ulint i; + + if (!rec_offs_comp(offsets)) { + rec_print_old(file, rec); + return; + } ut_ad(rec); - - n = rec_get_n_fields(rec); fprintf(file, "PHYSICAL RECORD: n_fields %lu;" - " 1-byte offs %s; info bits %lu\n", - (ulong) n, rec_get_1byte_offs_flag(rec) ? 
"TRUE" : "FALSE", - (ulong) rec_get_info_bits(rec)); + " compact format; info bits %lu\n", + (ulong) rec_offs_n_fields(offsets), + (ulong) rec_get_info_bits(rec, TRUE)); - for (i = 0; i < n; i++) { + for (i = 0; i < rec_offs_n_fields(offsets); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); fprintf(file, " %lu:", (ulong) i); @@ -551,14 +1444,12 @@ rec_print( fputs("...(truncated)", file); } } else { - fprintf(file, " SQL NULL, size %lu ", - (ulong) rec_get_nth_field_size(rec, i)); - + fputs(" SQL NULL", file); } putc(';', file); } putc('\n', file); - rec_validate(rec); + rec_validate(rec, offsets); } diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index c45818ddd26..1c55005dcfa 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -251,7 +251,7 @@ row_ins_sec_index_entry_by_modify( rec = btr_cur_get_rec(cursor); ut_ad((cursor->index->type & DICT_CLUSTERED) == 0); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); /* We know that in the alphabetical ordering, entry and rec are identified. But in their binary form there may be differences if @@ -316,7 +316,7 @@ row_ins_clust_index_entry_by_modify( rec = btr_cur_get_rec(cursor); - ut_ad(rec_get_deleted_flag(rec)); + ut_ad(rec_get_deleted_flag(rec, cursor->index->table->comp)); heap = mem_heap_create(1024); @@ -473,6 +473,8 @@ row_ins_cascade_calc_update_vec( if (parent_ufield->field_no == parent_field_no) { + ulint fixed_size; + /* A field in the parent index record is updated. Let us make the update vector field for the child table. */ @@ -512,22 +514,22 @@ row_ins_cascade_calc_update_vec( need to pad with spaces the new value of the child column */ - if (dtype_is_fixed_size(type) + fixed_size = dtype_get_fixed_size(type); + + if (fixed_size && ufield->new_val.len != UNIV_SQL_NULL - && ufield->new_val.len - < dtype_get_fixed_size(type)) { + && ufield->new_val.len < fixed_size) { ufield->new_val.data = mem_heap_alloc(heap, - dtype_get_fixed_size(type)); - ufield->new_val.len = - dtype_get_fixed_size(type); + fixed_size); + ufield->new_val.len = fixed_size; ut_a(dtype_get_pad_char(type) != ULINT_UNDEFINED); memset(ufield->new_val.data, (byte)dtype_get_pad_char(type), - dtype_get_fixed_size(type)); + fixed_size); ut_memcpy(ufield->new_val.data, parent_ufield->new_val.data, parent_ufield->new_val.len); @@ -588,8 +590,16 @@ row_ins_foreign_report_err( fputs(", in index ", ef); ut_print_name(ef, trx, foreign->foreign_index->name); if (rec) { + mem_heap_t* heap; + ulint* offsets; + + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, foreign->foreign_index, + ULINT_UNDEFINED, heap); + fputs(", there is a record:\n", ef); - rec_print(ef, rec); + rec_print(ef, rec, offsets); + mem_heap_free(heap); } else { fputs(", the record is not available\n", ef); } @@ -644,7 +654,16 @@ row_ins_foreign_report_add_err( } if (rec) { - rec_print(ef, rec); + mem_heap_t* heap; + ulint* offsets; + + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, foreign->foreign_index, + ULINT_UNDEFINED, heap); + + rec_print(ef, rec, offsets); + + mem_heap_free(heap); } putc('\n', ef); @@ -706,7 +725,6 @@ row_ins_foreign_check_on_constraint( dict_index_t* index; dict_index_t* clust_index; dtuple_t* ref; - mem_heap_t* tmp_heap; mem_heap_t* upd_vec_heap = NULL; rec_t* rec; rec_t* clust_rec; @@ -715,8 +733,9 @@ row_ins_foreign_check_on_constraint( ulint err; ulint i; trx_t* trx; + mem_heap_t* tmp_heap = NULL; + ulint* offsets; - ut_a(thr 
&& foreign && pcur && mtr); trx = thr_get_trx(thr); @@ -816,7 +835,7 @@ row_ins_foreign_check_on_constraint( err = DB_ROW_IS_REFERENCED; row_ins_foreign_report_err( -(char*)"Trying a too deep cascaded delete or update\n", +"Trying a too deep cascaded delete or update\n", thr, foreign, btr_pcur_get_rec(pcur), entry); goto nonstandard_exit_func; @@ -848,8 +867,6 @@ row_ins_foreign_check_on_constraint( PAGE_CUR_LE, BTR_SEARCH_LEAF, cascade->pcur, 0, mtr); - mem_heap_free(tmp_heap); - clust_rec = btr_pcur_get_rec(cascade->pcur); if (!page_rec_is_user_rec(clust_rec) @@ -863,10 +880,14 @@ row_ins_foreign_check_on_constraint( fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + offsets = rec_get_offsets(rec, index, + ULINT_UNDEFINED, tmp_heap); + rec_print(stderr, rec, offsets); fputs("\n" "InnoDB: clustered record ", stderr); - rec_print(stderr, clust_rec); + offsets = rec_reget_offsets(clust_rec, clust_index, + offsets, ULINT_UNDEFINED, tmp_heap); + rec_print(stderr, clust_rec, offsets); fputs("\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); @@ -884,9 +905,14 @@ row_ins_foreign_check_on_constraint( /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; we already have a normal shared lock on the appropriate gap if the search criterion was not unique */ - + + if (!tmp_heap) { + tmp_heap = mem_heap_create(256); + } + offsets = rec_get_offsets(clust_rec, clust_index, + ULINT_UNDEFINED, tmp_heap); err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); + clust_index, offsets, LOCK_X, LOCK_REC_NOT_GAP, thr); } if (err != DB_SUCCESS) { @@ -894,7 +920,7 @@ row_ins_foreign_check_on_constraint( goto nonstandard_exit_func; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, table->comp)) { /* This can happen if there is a circular reference of rows such that cascading delete comes to delete a row already in the process of being delete marked */ @@ -1003,6 +1029,10 @@ row_ins_foreign_check_on_constraint( btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + if (upd_vec_heap) { mem_heap_free(upd_vec_heap); } @@ -1010,6 +1040,9 @@ row_ins_foreign_check_on_constraint( return(err); nonstandard_exit_func: + if (tmp_heap) { + mem_heap_free(tmp_heap); + } if (upd_vec_heap) { mem_heap_free(upd_vec_heap); @@ -1037,16 +1070,19 @@ row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_S, type, thr); } return(err); @@ -1064,16 +1100,19 @@ row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ que_thr_t* thr) /* in: query thread */ { ulint err; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = 
lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_X, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, LOCK_X, type, thr); } return(err); @@ -1114,6 +1153,10 @@ row_ins_check_foreign_constraint( ulint i; mtr_t mtr; trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap; + ulint* offsets = NULL; + + heap = mem_heap_create(100); run_again: #ifdef UNIV_SYNC_DEBUG @@ -1125,7 +1168,7 @@ run_again: if (trx->check_foreigns == FALSE) { /* The user has suppressed foreign key checks currently for this session */ - + mem_heap_free(heap); return(DB_SUCCESS); } @@ -1137,6 +1180,7 @@ run_again: if (UNIV_SQL_NULL == dfield_get_len( dtuple_get_nth_field(entry, i))) { + mem_heap_free(heap); return(DB_SUCCESS); } } @@ -1160,7 +1204,8 @@ run_again: with each foreign key constraint, one after another, and the user has problems predicting in which order they are performed. */ - + + mem_heap_free(heap); return(DB_SUCCESS); } } @@ -1174,6 +1219,8 @@ run_again: } if (check_table == NULL) { + mem_heap_free(heap); + if (check_ref) { FILE* ef = dict_foreign_err_file; mutex_enter(&dict_foreign_err_mutex); @@ -1244,10 +1291,13 @@ run_again: goto next_rec; } + offsets = rec_reget_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, heap); + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, - check_index, thr); + check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1256,29 +1306,30 @@ run_again: goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, - rec, check_index, thr); + LOCK_ORDINARY, rec, + check_index, offsets, thr); if (err != DB_SUCCESS) { break; } } else { /* Found a matching record */ + ulint lock_type; if (unique_search) { - err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP, - rec, check_index, thr); + lock_type = LOCK_REC_NOT_GAP; } else { - err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, - rec, check_index, thr); + lock_type = LOCK_ORDINARY; } + + err = row_ins_set_shared_rec_lock(lock_type, + rec, check_index, offsets, thr); if (err != DB_SUCCESS) { @@ -1315,7 +1366,7 @@ run_again: if (cmp < 0) { err = row_ins_set_shared_rec_lock(LOCK_GAP, - rec, check_index, thr); + rec, check_index, offsets, thr); if (err != DB_SUCCESS) { break; @@ -1373,6 +1424,7 @@ do_possible_lock_wait: err = trx->error_state; } + mem_heap_free(heap); return(err); } @@ -1444,19 +1496,23 @@ row_ins_dupl_error_with_rec( that the caller already has a record lock on the record! 
*/ dtuple_t* entry, /* in: entry to insert */ - dict_index_t* index) /* in: index */ + dict_index_t* index, /* in: index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { ulint matched_fields; ulint matched_bytes; ulint n_unique; ulint i; - + + ut_ad(rec_offs_validate(rec, index, offsets)); + n_unique = dict_index_get_n_unique(index); matched_fields = 0; matched_bytes = 0; - cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); if (matched_fields < n_unique) { @@ -1477,7 +1533,7 @@ row_ins_dupl_error_with_rec( } } - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { return(TRUE); } @@ -1509,8 +1565,9 @@ row_ins_scan_sec_index_for_duplicate( ibool moved; mtr_t mtr; trx_t* trx; - const char* ptr; - + mem_heap_t* heap; + ulint* offsets = NULL; + n_unique = dict_index_get_n_unique(index); /* If the secondary index is unique, but one of the fields in the @@ -1525,6 +1582,7 @@ row_ins_scan_sec_index_for_duplicate( } } + heap = mem_heap_create(100); mtr_start(&mtr); /* Store old value on n_fields_cmp */ @@ -1550,6 +1608,9 @@ row_ins_scan_sec_index_for_duplicate( trx = thr_get_trx(thr); ut_ad(trx); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + if (innobase_query_is_replace()) { /* The manual defines the REPLACE semantics that it @@ -1557,12 +1618,12 @@ row_ins_scan_sec_index_for_duplicate( + INSERT. Therefore, we should take X-lock for duplicates */ - err = row_ins_set_exclusive_rec_lock( - LOCK_ORDINARY,rec,index,thr); + err = row_ins_set_exclusive_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } else { - err = row_ins_set_shared_rec_lock( - LOCK_ORDINARY, rec, index,thr); + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, index, offsets, thr); } if (err != DB_SUCCESS) { @@ -1575,10 +1636,11 @@ row_ins_scan_sec_index_for_duplicate( goto next_rec; } - cmp = cmp_dtuple_rec(entry, rec); + cmp = cmp_dtuple_rec(entry, rec, offsets); if (cmp == 0) { - if (row_ins_dupl_error_with_rec(rec, entry, index)) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { err = DB_DUPLICATE_KEY; thr_get_trx(thr)->error_info = index; @@ -1600,6 +1662,7 @@ next_rec: } } + mem_heap_free(heap); mtr_commit(&mtr); /* Restore old value */ @@ -1630,7 +1693,6 @@ row_ins_duplicate_error_in_clust( page_t* page; ulint n_unique; trx_t* trx = thr_get_trx(thr); - const char* ptr; UT_NOT_USED(mtr); @@ -1658,6 +1720,12 @@ row_ins_duplicate_error_in_clust( page = buf_frame_align(rec); if (rec != page_get_infimum_rec(page)) { + mem_heap_t* heap; + ulint* offsets; + + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap); /* We set a lock on the possible duplicate: this is needed in logical logging of MySQL to make @@ -1673,24 +1741,26 @@ row_ins_duplicate_error_in_clust( err = row_ins_set_exclusive_rec_lock( LOCK_REC_NOT_GAP,rec,cursor->index, - thr); + offsets, thr); } else { err = row_ins_set_shared_rec_lock( LOCK_REC_NOT_GAP,rec, cursor->index, - thr); + offsets, thr); } if (err != DB_SUCCESS) { - + mem_heap_free(heap); return(err); } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; + mem_heap_free(heap); return(DB_DUPLICATE_KEY); } + mem_heap_free(heap); } } @@ -1700,7 +1770,12 @@ row_ins_duplicate_error_in_clust( page = buf_frame_align(rec); if (rec != page_get_supremum_rec(page)) { + 
mem_heap_t* heap; + ulint* offsets; + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, cursor->index, + ULINT_UNDEFINED, heap); /* The manual defines the REPLACE semantics that it is either an INSERT or DELETE(s) for duplicate key @@ -1710,25 +1785,27 @@ row_ins_duplicate_error_in_clust( if (innobase_query_is_replace()) { err = row_ins_set_exclusive_rec_lock( - LOCK_REC_NOT_GAP, - rec,cursor->index,thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } else { err = row_ins_set_shared_rec_lock( - LOCK_REC_NOT_GAP,rec, - cursor->index, thr); + LOCK_REC_NOT_GAP, rec, + cursor->index, offsets, thr); } if (err != DB_SUCCESS) { - + mem_heap_free(heap); return(err); } if (row_ins_dupl_error_with_rec(rec, entry, - cursor->index)) { + cursor->index, offsets)) { trx->error_info = cursor->index; + mem_heap_free(heap); return(DB_DUPLICATE_KEY); } + mem_heap_free(heap); } ut_a(!(cursor->index->type & DICT_CLUSTERED)); @@ -1817,6 +1894,8 @@ row_ins_index_entry_low( ulint n_unique; big_rec_t* big_rec = NULL; mtr_t mtr; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; log_free_check(); @@ -1849,8 +1928,9 @@ row_ins_index_entry_low( buf_frame_align(btr_cur_get_rec(&cursor)))); if (!page_rec_is_supremum(first_rec)) { - ut_a((rec_get_n_fields(first_rec)) - == dtuple_get_n_fields(entry)); + offsets = rec_get_offsets(first_rec, index, + ULINT_UNDEFINED, heap); + ut_a(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); } n_unique = dict_index_get_n_unique(index); @@ -1928,7 +2008,7 @@ row_ins_index_entry_low( if (err == DB_SUCCESS) { if (ext_vec) { - rec_set_field_extern_bits(insert_rec, + rec_set_field_extern_bits(insert_rec, index, ext_vec, n_ext_vec, &mtr); } } @@ -1938,14 +2018,18 @@ function_exit: mtr_commit(&mtr); if (big_rec) { + rec_t* rec; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + + err = btr_store_big_rec_extern_fields(index, rec, + offsets, big_rec, &mtr); - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(&cursor), - big_rec, &mtr); if (modify) { dtuple_big_rec_free(big_rec); } else { @@ -1955,6 +2039,7 @@ function_exit: mtr_commit(&mtr); } + mem_heap_free(heap); return(err); } diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 241ddc310e8..be243b44488 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -308,7 +308,8 @@ handle_new_error: return(TRUE); - } else if (err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT) { + } else if (err == DB_DEADLOCK || err == DB_LOCK_WAIT_TIMEOUT + || err == DB_LOCK_TABLE_FULL) { /* Roll back the whole transaction; this resolution was added to version 3.23.43 */ @@ -779,8 +780,13 @@ int row_lock_table_for_mysql( /*=====================*/ /* out: error code or DB_SUCCESS */ - row_prebuilt_t* prebuilt) /* in: prebuilt struct in the MySQL + row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL table handle */ + dict_table_t* table, /* in: table to lock, or NULL + if prebuilt->table should be + locked as LOCK_TABLE_EXP | + prebuilt->select_lock_type */ + ulint mode) /* in: lock mode of table */ { trx_t* trx = prebuilt->trx; que_thr_t* thr; @@ -813,8 +819,12 @@ run_again: trx_start_if_not_started(trx); - err = lock_table(LOCK_TABLE_EXP, prebuilt->table, - prebuilt->select_lock_type, thr); + if (table) { + err = lock_table(0, table, mode, thr); + } else { + err = 
lock_table(LOCK_TABLE_EXP, prebuilt->table, + prebuilt->select_lock_type, thr); + } trx->error_state = err; @@ -1186,6 +1196,57 @@ run_again: return((int) err); } +/************************************************************************* +Does an unlock of a row for MySQL. */ + +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + rec_t* rec; + btr_pcur_t* cur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "unlock_row"; + + if (srv_locks_unsafe_for_binlog) { + if (trx->trx_create_lock == TRUE) { + + mtr_start(&mtr); + + /* Restore a cursor position and find a record */ + btr_pcur_restore_position(BTR_SEARCH_LEAF, cur, &mtr); + rec = btr_pcur_get_rec(cur); + + if (rec) { + + lock_rec_reset_and_release_wait(rec); + } else { + fputs("InnoDB: Error: " + "Record for the lock not found\n", + stderr); + mem_analyze_corruption((byte*) trx); + ut_error; + } + + trx->trx_create_lock = FALSE; + mtr_commit(&mtr); + } + + } + + trx->op_info = ""; + + return(DB_SUCCESS); +} + /************************************************************************** Does a cascaded delete or set null in a foreign key operation. */ @@ -2314,7 +2375,8 @@ row_drop_table_for_mysql( "WHILE found = 1 LOOP\n" " SELECT ID INTO foreign_id\n" " FROM SYS_FOREIGN\n" - " WHERE FOR_NAME = table_name;\n" + " WHERE FOR_NAME = table_name\n" + " AND TO_BINARY(FOR_NAME) = TO_BINARY(table_name);\n" " IF (SQL % NOTFOUND) THEN\n" " found := 0;\n" " ELSE" @@ -2769,7 +2831,8 @@ row_rename_table_for_mysql( "WHILE found = 1 LOOP\n" " SELECT ID INTO foreign_id\n" " FROM SYS_FOREIGN\n" - " WHERE FOR_NAME = old_table_name;\n" + " WHERE FOR_NAME = old_table_name\n" + " AND TO_BINARY(FOR_NAME) = TO_BINARY(old_table_name);\n" " IF (SQL % NOTFOUND) THEN\n" " found := 0;\n" " ELSE\n" @@ -2802,7 +2865,8 @@ row_rename_table_for_mysql( " END IF;\n" "END LOOP;\n" "UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n" - "WHERE REF_NAME = old_table_name;\n"; + "WHERE REF_NAME = old_table_name\n" + " AND TO_BINARY(REF_NAME) = TO_BINARY(old_table_name);\n"; static const char str5[] = "END;\n"; @@ -3012,7 +3076,11 @@ row_rename_table_for_mysql( if (err != DB_SUCCESS) { if (err == DB_DUPLICATE_KEY) { ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); + fputs( + " InnoDB: Error; possible reasons:\n" + "InnoDB: 1) Table rename would cause two FOREIGN KEY constraints\n" + "InnoDB: to have the same internal name in case-insensitive comparison.\n" + "InnoDB: 2) table ", stderr); ut_print_name(stderr, trx, new_name); fputs(" exists in the InnoDB internal data\n" "InnoDB: dictionary though MySQL is trying rename table ", stderr); @@ -3158,7 +3226,8 @@ row_scan_and_check_index( int cmp; ibool contains_null; ulint i; - + ulint* offsets = NULL; + *n_rows = 0; buf = mem_alloc(UNIV_PAGE_SIZE); @@ -3198,8 +3267,10 @@ loop: if (prev_entry != NULL) { matched_fields = 0; matched_bytes = 0; - - cmp = cmp_dtuple_rec_with_match(prev_entry, rec, + + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, &matched_fields, &matched_bytes); contains_null = FALSE; @@ -3228,7 +3299,7 @@ loop: dtuple_print(stderr, prev_entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); putc('\n', stderr); is_ok = FALSE; } 
else if ((index->type & DICT_UNIQUE) @@ -3242,6 +3313,7 @@ loop: } mem_heap_empty(heap); + offsets = NULL; prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); @@ -3326,7 +3398,7 @@ row_check_table_for_mysql( /* We validate also the whole adaptive hash index for all tables at every CHECK TABLE */ - if (!btr_search_validate()) { + if (!btr_search_validate(index)) { ret = DB_ERROR; } diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index f7e01169b9d..109d0f3b976 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -99,6 +99,8 @@ row_purge_remove_clust_if_poss_low( ibool success; ulint err; mtr_t mtr; + rec_t* rec; + mem_heap_t* heap; index = dict_table_get_first_index(node->table); @@ -117,15 +119,21 @@ row_purge_remove_clust_if_poss_low( return(TRUE); } + rec = btr_pcur_get_rec(pcur); + heap = mem_heap_create(100); + if (0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(btr_pcur_get_rec(pcur), index))) { - + row_get_rec_roll_ptr(rec, index, rec_get_offsets( + rec, index, ULINT_UNDEFINED, heap)))) { + mem_heap_free(heap); /* Someone else has modified the record later: do not remove */ btr_pcur_commit_specify_mtr(pcur, &mtr); return(TRUE); } + mem_heap_free(heap); + if (mode == BTR_MODIFY_LEAF) { success = btr_cur_optimistic_delete(btr_cur, &mtr); } else { diff --git a/innobase/row/row0row.c b/innobase/row/row0row.c index 38714b0c49b..9cf285a519d 100644 --- a/innobase/row/row0row.c +++ b/innobase/row/row0row.c @@ -37,17 +37,18 @@ row_get_rec_sys_field( /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ - dict_index_t* index) /* in: clustered index */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { - ulint pos; - byte* field; - ulint len; + ulint pos; + byte* field; + ulint len; ut_ad(index->type & DICT_CLUSTERED); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_TRX_ID) { @@ -70,6 +71,7 @@ row_set_rec_sys_field( ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val) /* in: value to set */ { ulint pos; @@ -77,10 +79,11 @@ row_set_rec_sys_field( ulint len; ut_ad(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); pos = dict_index_get_sys_col_pos(index, type); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); if (type == DATA_TRX_ID) { @@ -182,6 +185,9 @@ row_build( the buffer page of this record must be at least s-latched and the latch held as long as the row dtuple is used! 
*/ + const ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL, in which case this function + will invoke rec_get_offsets() */ mem_heap_t* heap) /* in: memory heap from which the memory needed is allocated */ { @@ -196,14 +202,26 @@ row_build( ulint row_len; byte* buf; ulint i; - + mem_heap_t* tmp_heap; + ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); + if (!offsets) { + tmp_heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, + ULINT_UNDEFINED, tmp_heap); + } else { + tmp_heap = NULL; + ut_ad(rec_offs_validate(rec, index, offsets)); + } + if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, (ulint*) offsets); } table = index->table; @@ -211,11 +229,9 @@ row_build( row = dtuple_create(heap, row_len); - dtuple_set_info_bits(row, rec_get_info_bits(rec)); - - n_fields = dict_index_get_n_fields(index); + dtuple_set_info_bits(row, rec_get_info_bits(rec, table->comp)); - ut_ad(n_fields == rec_get_n_fields(rec)); + n_fields = rec_offs_n_fields(offsets); dict_table_copy_types(row, table); @@ -227,13 +243,13 @@ row_build( col = dict_field_get_col(ind_field); dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); if (type == ROW_COPY_ALSO_EXTERNALS - && rec_get_nth_field_extern_bit(rec, i)) { + && rec_offs_nth_extern(offsets, i)) { field = btr_rec_copy_externally_stored_field( - rec, i, &len, heap); + rec, offsets, i, &len, heap); } dfield_set_data(dfield, field, len); @@ -242,6 +258,10 @@ row_build( ut_ad(dtuple_check_typed(row)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(row); } @@ -276,16 +296,23 @@ row_rec_to_index_entry( ulint len; ulint rec_len; byte* buf; - + mem_heap_t* tmp_heap; + ulint* offsets; + ut_ad(rec && heap && index); + tmp_heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); - rec = rec_copy(buf, rec); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). 
*/ + rec_offs_make_valid(rec, index, offsets); } - rec_len = rec_get_n_fields(rec); + rec_len = rec_offs_n_fields(offsets); entry = dtuple_create(heap, rec_len); @@ -295,17 +322,19 @@ row_rec_to_index_entry( dict_index_copy_types(entry, index, rec_len); - dtuple_set_info_bits(entry, rec_get_info_bits(rec)); + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); for (i = 0; i < rec_len; i++) { dfield = dtuple_get_nth_field(entry, i); - field = rec_get_nth_field(rec, i, &len); + field = rec_get_nth_field(rec, offsets, i, &len); dfield_set_data(dfield, field, len); } ut_ad(dtuple_check_typed(entry)); + mem_heap_free(tmp_heap); return(entry); } @@ -345,15 +374,22 @@ row_build_row_ref( byte* buf; ulint clust_col_prefix_len; ulint i; + mem_heap_t* tmp_heap; + ulint* offsets; ut_ad(index && rec && heap); - + + tmp_heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, tmp_heap); + if (type == ROW_COPY_DATA) { /* Take a copy of rec to heap */ - buf = mem_heap_alloc(heap, rec_get_size(rec)); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); - rec = rec_copy(buf, rec); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); } table = index->table; @@ -373,7 +409,7 @@ row_build_row_ref( ut_a(pos != ULINT_UNDEFINED); - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -397,6 +433,7 @@ row_build_row_ref( } ut_ad(dtuple_check_typed(ref)); + mem_heap_free(tmp_heap); return(ref); } @@ -427,7 +464,9 @@ row_build_row_ref_in_tuple( ulint pos; ulint clust_col_prefix_len; ulint i; - + mem_heap_t* heap; + ulint* offsets; + ut_a(ref && index && rec); if (!index->table) { @@ -446,7 +485,10 @@ row_build_row_ref_in_tuple( fputs("InnoDB: clust index for table ", stderr); goto notfound; } - + + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + ref_len = dict_index_get_n_unique(clust_index); ut_ad(ref_len == dtuple_get_n_fields(ref)); @@ -459,8 +501,8 @@ row_build_row_ref_in_tuple( pos = dict_index_get_nth_field_pos(index, clust_index, i); ut_a(pos != ULINT_UNDEFINED); - - field = rec_get_nth_field(rec, pos, &len); + + field = rec_get_nth_field(rec, offsets, pos, &len); dfield_set_data(dfield, field, len); @@ -484,6 +526,7 @@ row_build_row_ref_in_tuple( } ut_ad(dtuple_check_typed(ref)); + mem_heap_free(heap); } /*********************************************************************** diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 71163bc35b6..2b40b62e5bc 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -31,6 +31,7 @@ Created 12/19/1997 Heikki Tuuri #include "pars0pars.h" #include "row0mysql.h" #include "read0read.h" +#include "buf0lru.h" /* Maximum number of rows to prefetch; MySQL interface has another parameter */ #define SEL_MAX_N_PREFETCH 16 @@ -77,8 +78,14 @@ row_sel_sec_rec_is_for_clust_rec( ulint n; ulint i; dtype_t* cur_type; + mem_heap_t* heap; + ulint* clust_offs; + ulint* sec_offs; - UT_NOT_USED(clust_index); + heap = mem_heap_create(100); + clust_offs = rec_get_offsets(clust_rec, clust_index, + ULINT_UNDEFINED, heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, ULINT_UNDEFINED, heap); n = dict_index_get_n_ordering_defined_by_user(sec_index); @@ -86,10 +93,10 @@ row_sel_sec_rec_is_for_clust_rec( ifield = dict_index_get_nth_field(sec_index, i); col = dict_field_get_col(ifield); - clust_field = 
rec_get_nth_field(clust_rec, + clust_field = rec_get_nth_field(clust_rec, clust_offs, dict_col_get_clust_pos(col), &clust_len); - sec_field = rec_get_nth_field(sec_rec, i, &sec_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); if (ifield->prefix_len > 0 && clust_len != UNIV_SQL_NULL) { @@ -106,10 +113,12 @@ row_sel_sec_rec_is_for_clust_rec( if (0 != cmp_data_data(dict_col_get_type(col), clust_field, clust_len, sec_field, sec_len)) { + mem_heap_free(heap); return(FALSE); } } + mem_heap_free(heap); return(TRUE); } @@ -265,6 +274,7 @@ row_sel_fetch_columns( dict_index_t* index, /* in: record index */ rec_t* rec, /* in: record in a clustered or non-clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -274,6 +284,8 @@ row_sel_fetch_columns( byte* data; ulint len; + ut_ad(rec_offs_validate(rec, index, offsets)); + if (index->type & DICT_CLUSTERED) { index_type = SYM_CLUST_FIELD_NO; } else { @@ -285,7 +297,7 @@ row_sel_fetch_columns( if (field_no != ULINT_UNDEFINED) { - data = rec_get_nth_field(rec, field_no, &len); + data = rec_get_nth_field(rec, offsets, field_no, &len); if (column->copy_val) { eval_node_copy_and_alloc_val(column, data, @@ -600,8 +612,15 @@ row_sel_get_clust_rec( rec_t* clust_rec; rec_t* old_vers; ulint err; + mem_heap_t* heap; + ulint* offsets; + + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + ULINT_UNDEFINED, heap); - row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec); + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); index = dict_table_get_first_index(plan->table); @@ -618,7 +637,7 @@ row_sel_get_clust_rec( || btr_pcur_get_low_match(&(plan->clust_pcur)) < dict_index_get_n_unique(index)) { - ut_a(rec_get_deleted_flag(rec)); + ut_a(rec_get_deleted_flag(rec, plan->table->comp)); ut_a(node->read_view); /* In a rare case it is possible that no clust rec is found @@ -635,27 +654,30 @@ row_sel_get_clust_rec( goto func_exit; } + offsets = rec_reget_offsets(clust_rec, index, + offsets, ULINT_UNDEFINED, heap); + if (!node->read_view) { /* Try to place a lock on the index record */ - /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ - if ( srv_locks_unsafe_for_binlog ) - { - err = lock_clust_rec_read_check_and_lock(0, clust_rec, - index,node->row_lock_mode, LOCK_REC_NOT_GAP, thr); - } - else - { - err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, - node->row_lock_mode, LOCK_ORDINARY, thr); - - } - - if (err != DB_SUCCESS) { + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. 
*/ + ulint lock_type; + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + err = lock_clust_rec_read_check_and_lock(0, + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + mem_heap_free(heap); return(err); } } else { @@ -664,22 +686,21 @@ row_sel_get_clust_rec( old_vers = NULL; - if (!lock_clust_rec_cons_read_sees(clust_rec, index, + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, plan, clust_rec, &old_vers, mtr); if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } clust_rec = old_vers; if (clust_rec == NULL) { - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } @@ -696,23 +717,22 @@ row_sel_get_clust_rec( visit through secondary index records that would not really exist in our snapshot. */ - if ((old_vers || rec_get_deleted_flag(rec)) + if ((old_vers || rec_get_deleted_flag(rec, plan->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, clust_rec, index)) { clust_rec = NULL; - *out_rec = clust_rec; - - return(DB_SUCCESS); + goto func_exit; } } /* Fetch the columns needed in test conditions */ - - row_sel_fetch_columns(index, clust_rec, + + row_sel_fetch_columns(index, clust_rec, offsets, UT_LIST_GET_FIRST(plan->columns)); func_exit: *out_rec = clust_rec; + mem_heap_free(heap); return(DB_SUCCESS); } @@ -725,18 +745,29 @@ sel_set_rec_lock( /* out: DB_SUCCESS or error code */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ ulint mode, /* in: lock mode */ ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { + trx_t* trx; ulint err; + trx = thr_get_trx(thr); + + if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) { + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + } + if (index->type & DICT_CLUSTERED) { - err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_clust_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } else { - err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, - type, thr); + err = lock_sec_rec_read_check_and_lock(0, + rec, index, offsets, mode, type, thr); } return(err); @@ -944,6 +975,8 @@ row_sel_try_search_shortcut( { dict_index_t* index; rec_t* rec; + mem_heap_t* heap; + ulint* offsets; index = plan->index; @@ -977,21 +1010,28 @@ row_sel_try_search_shortcut( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { + mem_heap_free(heap); return(SEL_RETRY); } } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { + mem_heap_free(heap); return(SEL_RETRY); } /* Test deleted flag. Fetch the columns needed in test conditions. 
*/ - - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); - if (rec_get_deleted_flag(rec)) { + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + mem_heap_free(heap); + + if (rec_get_deleted_flag(rec, plan->table->comp)) { return(SEL_EXHAUSTED); } @@ -1055,7 +1095,9 @@ row_sel( to the next non-clustered record */ ulint found_flag; ulint err; - + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; + ut_ad(thr->run_node == node); search_latch_locked = FALSE; @@ -1205,22 +1247,25 @@ rec_loop: if (!consistent_read) { - /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ - - if ( srv_locks_unsafe_for_binlog ) - { - err = sel_set_rec_lock(page_rec_get_next(rec), index, - node->row_lock_mode, LOCK_REC_NOT_GAP, thr); - } - else - { - err = sel_set_rec_lock(page_rec_get_next(rec), index, - node->row_lock_mode, LOCK_ORDINARY, thr); - } - if (err != DB_SUCCESS) { + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + offsets = rec_reget_offsets(next_rec, index, + offsets, ULINT_UNDEFINED, heap); + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(next_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { /* Note that in this case we will store in pcur the PREDECESSOR of the record we are waiting the lock for */ @@ -1245,21 +1290,22 @@ rec_loop: if (!consistent_read) { /* Try to place a lock on the index record */ - /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. - */ - - if ( srv_locks_unsafe_for_binlog ) - { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_REC_NOT_GAP, thr); - } - else - { - err = sel_set_rec_lock(rec, index, node->row_lock_mode, - LOCK_ORDINARY, thr); - } + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. 
*/ + + ulint lock_type; + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(rec, index, offsets, + node->row_lock_mode, lock_type, thr); if (err != DB_SUCCESS) { @@ -1323,6 +1369,8 @@ rec_loop: /* PHASE 3: Get previous version in a consistent read */ cons_read_requires_clust_rec = FALSE; + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); if (consistent_read) { /* This is a non-locking consistent read: if necessary, fetch @@ -1330,7 +1378,7 @@ rec_loop: if (index->type & DICT_CLUSTERED) { - if (!lock_clust_rec_cons_read_sees(rec, index, + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, node->read_view)) { err = row_sel_build_prev_vers(node->read_view, @@ -1343,6 +1391,7 @@ rec_loop: if (old_vers == NULL) { row_sel_fetch_columns(index, rec, + offsets, UT_LIST_GET_FIRST(plan->columns)); if (!row_sel_test_end_conds(plan)) { @@ -1354,6 +1403,8 @@ rec_loop: } rec = old_vers; + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); } } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { @@ -1365,7 +1416,8 @@ rec_loop: /* Fetch the columns needed in test conditions */ - row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); /* Test the selection end conditions: these can only contain columns which already are found in the index, even though the index might be @@ -1380,7 +1432,8 @@ rec_loop: goto table_exhausted; } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (rec_get_deleted_flag(rec, plan->table->comp) + && !cons_read_requires_clust_rec) { /* The record is delete marked: we can skip it if this is not a consistent read which might see an earlier version @@ -1423,7 +1476,7 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, plan->table->comp)) { /* The record is delete marked: we can skip it */ @@ -1581,7 +1634,8 @@ next_table_no_mtr: if (search_latch_locked) { rw_lock_s_unlock(&btr_search_latch); } - + + mem_heap_free(heap); return(DB_SUCCESS); } @@ -1615,6 +1669,7 @@ table_exhausted: table_exhausted_no_mtr: if (node->fetch_table == 0) { + mem_heap_free(heap); if (node->is_aggregate && !node->aggregate_already_fetched) { @@ -1663,7 +1718,7 @@ stop_for_a_while: mtr_commit(&mtr); ut_ad(sync_thread_levels_empty_gen(TRUE)); - + mem_heap_free(heap); return(DB_SUCCESS); commit_mtr_for_a_while: @@ -1699,6 +1754,7 @@ lock_wait_or_error: ut_ad(sync_thread_levels_empty_gen(TRUE)); + mem_heap_free(heap); return(err); } @@ -2122,11 +2178,16 @@ row_sel_store_row_id_to_prebuilt( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt */ rec_t* index_rec, /* in: record */ - dict_index_t* index) /* in: index of the record */ + dict_index_t* index, /* in: index of the record */ + const ulint* offsets) /* in: rec_get_offsets + (index_rec, index) */ { byte* data; ulint len; - data = rec_get_nth_field(index_rec, + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field(index_rec, offsets, dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); if (len != DATA_ROW_ID_LEN) { @@ -2136,7 +2197,7 @@ row_sel_store_row_id_to_prebuilt( fprintf(stderr, "\n" "InnoDB: Field number %lu, record:\n", (ulong) dict_index_get_sys_col_pos(index, DATA_ROW_ID)); - 
rec_print(stderr, index_rec); + rec_print(stderr, index_rec, offsets); putc('\n', stderr); ut_error; } @@ -2225,9 +2286,11 @@ row_sel_store_mysql_rec( case) */ byte* mysql_rec, /* out: row in the MySQL format */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: Innobase record in the index + rec_t* rec, /* in: Innobase record in the index which was described in prebuilt's template */ + const ulint* offsets) /* in: array returned by + rec_get_offsets() */ { mysql_row_templ_t* templ; mem_heap_t* extern_field_heap = NULL; @@ -2236,8 +2299,15 @@ row_sel_store_mysql_rec( byte* blob_buf; int pad_char; ulint i; + dict_index_t* index; ut_ad(prebuilt->mysql_template); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + index = prebuilt->index; + if (prebuilt->need_to_access_clustered) { + index = dict_table_get_first_index(index->table); + } if (prebuilt->blob_heap != NULL) { mem_heap_free(prebuilt->blob_heap); @@ -2253,9 +2323,10 @@ row_sel_store_mysql_rec( templ = prebuilt->mysql_template + i; - data = rec_get_nth_field(rec, templ->rec_field_no, &len); + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); - if (rec_get_nth_field_extern_bit(rec, templ->rec_field_no)) { + if (rec_offs_nth_extern(offsets, templ->rec_field_no)) { /* Copy an externally stored field to the temporary heap */ @@ -2269,7 +2340,7 @@ row_sel_store_mysql_rec( causes an assert */ data = btr_rec_copy_externally_stored_field(rec, - templ->rec_field_no, &len, + offsets, templ->rec_field_no, &len, extern_field_heap); ut_a(len != UNIV_SQL_NULL); @@ -2425,6 +2496,8 @@ row_sel_get_clust_rec_for_mysql( rec_t* old_vers; ulint err; trx_t* trx; + mem_heap_t* heap = mem_heap_create(100); + ulint* offsets = NULL; *out_rec = NULL; trx = thr_get_trx(thr); @@ -2455,9 +2528,8 @@ row_sel_get_clust_rec_for_mysql( clustered index record did not exist in the read view of trx. 
*/ - if (!rec_get_deleted_flag(rec) + if (!rec_get_deleted_flag(rec, sec_index->table->comp) || prebuilt->select_lock_type != LOCK_NONE) { - ut_print_timestamp(stderr); fputs(" InnoDB: error clustered record" " for sec rec not found\n" @@ -2465,10 +2537,14 @@ row_sel_get_clust_rec_for_mysql( dict_index_name_print(stderr, trx, sec_index); fputs("\n" "InnoDB: sec index record ", stderr); - rec_print(stderr, rec); + offsets = rec_get_offsets(rec, sec_index, + ULINT_UNDEFINED, heap); + rec_print(stderr, rec, offsets); fputs("\n" "InnoDB: clust index record ", stderr); - rec_print(stderr, clust_rec); + offsets = rec_reget_offsets(clust_rec, clust_index, + offsets, ULINT_UNDEFINED, heap); + rec_print(stderr, clust_rec, offsets); putc('\n', stderr); trx_print(stderr, trx); @@ -2481,17 +2557,21 @@ row_sel_get_clust_rec_for_mysql( goto func_exit; } + offsets = rec_get_offsets(clust_rec, clust_index, + ULINT_UNDEFINED, heap); + if (prebuilt->select_lock_type != LOCK_NONE) { /* Try to place a lock on the index record; we are searching the clust rec with a unique condition, hence we set a LOCK_REC_NOT_GAP type lock */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, + clust_index, offsets, prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr); if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } } else { @@ -2505,7 +2585,7 @@ row_sel_get_clust_rec_for_mysql( if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, - trx->read_view)) { + offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, @@ -2514,6 +2594,7 @@ row_sel_get_clust_rec_for_mysql( if (err != DB_SUCCESS) { + mem_heap_free(heap); return(err); } @@ -2533,7 +2614,8 @@ row_sel_get_clust_rec_for_mysql( visit through secondary index records that would not really exist in our snapshot. 
*/ - if (clust_rec && (old_vers || rec_get_deleted_flag(rec)) + if (clust_rec && (old_vers + || rec_get_deleted_flag(rec, sec_index->table->comp)) && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index, clust_rec, clust_index)) { clust_rec = NULL; @@ -2555,6 +2637,7 @@ func_exit: btr_pcur_store_position(prebuilt->clust_pcur, mtr); } + mem_heap_free(heap); return(DB_SUCCESS); } @@ -2676,12 +2759,14 @@ void row_sel_push_cache_row_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ - rec_t* rec) /* in: record to push */ + rec_t* rec, /* in: record to push */ + const ulint* offsets) /* in: rec_get_offsets() */ { byte* buf; ulint i; ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_a(!prebuilt->templ_contains_blob); if (prebuilt->fetch_cache[0] == NULL) { @@ -2707,7 +2792,7 @@ row_sel_push_cache_row_for_mysql( ut_a(row_sel_store_mysql_rec( prebuilt->fetch_cache[prebuilt->n_fetch_cached], - prebuilt, rec)); + prebuilt, rec, offsets)); prebuilt->n_fetch_cached++; } @@ -2724,6 +2809,8 @@ row_sel_try_search_shortcut_for_mysql( /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ rec_t** out_rec,/* out: record if found */ row_prebuilt_t* prebuilt,/* in: prebuilt struct */ + ulint** offsets,/* in/out: for rec_reget_offsets(*out_rec) */ + mem_heap_t* heap, /* in: heap for rec_reget_offsets() */ mtr_t* mtr) /* in: started mtr */ { dict_index_t* index = prebuilt->index; @@ -2761,13 +2848,17 @@ row_sel_try_search_shortcut_for_mysql( /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - - if (!lock_clust_rec_cons_read_sees(rec, index, trx->read_view)) { + + *offsets = rec_reget_offsets(rec, index, + *offsets, ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { return(SEL_RETRY); } - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, index->table->comp)) { return(SEL_EXHAUSTED); } @@ -2790,6 +2881,7 @@ row_search_for_mysql( /* out: DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, + DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */ byte* buf, /* in/out: buffer for the fetched row in the MySQL format */ @@ -2835,9 +2927,12 @@ row_search_for_mysql( level is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ ibool success; + ibool comp; ulint cnt = 0; ulint next_offs; mtr_t mtr; + mem_heap_t* heap; + ulint* offsets = NULL; ut_ad(index && pcur && search_tuple); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); @@ -2991,6 +3086,7 @@ row_search_for_mysql( } mtr_start(&mtr); + heap = mem_heap_create(100); /*-------------------------------------------------------------*/ /* PHASE 2: Try fast adaptive hash index search if possible */ @@ -3036,13 +3132,14 @@ row_search_for_mysql( } #endif shortcut = row_sel_try_search_shortcut_for_mysql(&rec, - prebuilt, &mtr); + prebuilt, &offsets, heap, &mtr); if (shortcut == SEL_FOUND) { #ifdef UNIV_SEARCH_DEBUG - ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); #endif if (!row_sel_store_mysql_rec(buf, prebuilt, - rec)) { + rec, offsets)) { err = DB_TOO_BIG_RECORD; /* We let the main loop to do the @@ -3070,7 +3167,7 @@ row_search_for_mysql( /* NOTE that we do NOT store the cursor position */ - + mem_heap_free(heap); return(DB_SUCCESS); } else if (shortcut == SEL_EXHAUSTED) { @@ -3094,6 +3191,7 @@ row_search_for_mysql( /* NOTE that we do NOT store the cursor position */ + mem_heap_free(heap); 
return(DB_RECORD_NOT_FOUND); } shortcut_fails_too_big_rec: @@ -3207,6 +3305,8 @@ rec_loop: /* PHASE 4: Look for matching records in a loop */ rec = btr_pcur_get_rec(pcur); + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); /* fputs("Using ", stderr); dict_index_name_print(stderr, index); @@ -3234,9 +3334,10 @@ rec_loop: we do not lock gaps. Supremum record is really a gap and therefore we do not set locks there. */ - if ( srv_locks_unsafe_for_binlog == FALSE ) - { - err = sel_set_rec_lock(rec, index, + if (srv_locks_unsafe_for_binlog == FALSE) { + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + err = sel_set_rec_lock(rec, index, offsets, prebuilt->select_lock_type, LOCK_ORDINARY, thr); } @@ -3256,9 +3357,11 @@ rec_loop: /* Do sanity checks in case our cursor has bumped into page corruption */ - next_offs = rec_get_next_offs(rec); + next_offs = rec_get_next_offs(rec, comp); - if (next_offs >= UNIV_PAGE_SIZE || next_offs < PAGE_SUPREMUM) { + if (next_offs >= UNIV_PAGE_SIZE + || next_offs < + (ulint) (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)) { if (srv_force_recovery == 0 || moves_up == FALSE) { ut_print_timestamp(stderr); @@ -3303,9 +3406,12 @@ rec_loop: } } + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + if (srv_force_recovery > 0) { - if (!rec_validate(rec) || !btr_index_rec_validate(rec, index, - FALSE)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { fprintf(stderr, "InnoDB: Index corruption: rec offs %lu next offs %lu, page no %lu,\n" "InnoDB: ", @@ -3333,15 +3439,23 @@ rec_loop: /* fputs("Comparing rec and search tuple\n", stderr); */ - if (0 != cmp_dtuple_rec(search_tuple, rec)) { + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { if (prebuilt->select_lock_type != LOCK_NONE && set_also_gap_locks) { - /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set */ + + if (srv_locks_unsafe_for_binlog == FALSE) { + + err = sel_set_rec_lock(rec, index, + offsets, prebuilt->select_lock_type, LOCK_GAP, thr); + } + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3359,15 +3473,23 @@ rec_loop: } else if (match_mode == ROW_SEL_EXACT_PREFIX) { - if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { if (prebuilt->select_lock_type != LOCK_NONE && set_also_gap_locks) { - /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set */ + + if (srv_locks_unsafe_for_binlog == FALSE) { + + err = sel_set_rec_lock(rec, index, + offsets, prebuilt->select_lock_type, LOCK_GAP, thr); + } + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3395,30 +3517,27 @@ rec_loop: is a non-delete marked record, then it is enough to lock its existence with LOCK_REC_NOT_GAP. */ + ulint lock_type; + if (!set_also_gap_locks - || (unique_search && !rec_get_deleted_flag(rec))) { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); + || (unique_search && !rec_get_deleted_flag(rec, comp))) { + lock_type = LOCK_REC_NOT_GAP; } else { - /* If innodb_locks_unsafe_for_binlog option is used, - we lock only the record, i.e. next-key locking is - not used. 
- */ - if ( srv_locks_unsafe_for_binlog ) - { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_REC_NOT_GAP, thr); - } - else - { - err = sel_set_rec_lock(rec, index, - prebuilt->select_lock_type, - LOCK_ORDINARY, thr); - } + /* If innodb_locks_unsafe_for_binlog option is used, + we lock only the record, i.e., next-key locking is + not used. */ + + if (srv_locks_unsafe_for_binlog) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } } - + + err = sel_set_rec_lock(rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -3441,7 +3560,7 @@ rec_loop: if (srv_force_recovery < 5 && !lock_clust_rec_cons_read_sees(rec, index, - trx->read_view)) { + offsets, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, @@ -3474,7 +3593,8 @@ rec_loop: } } - if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + if (rec_get_deleted_flag(rec, comp) + && !cons_read_requires_clust_rec) { /* The record is delete-marked: we can skip it if this is not a consistent read which might see an earlier version @@ -3510,7 +3630,7 @@ rec_loop: goto next_rec; } - if (rec_get_deleted_flag(clust_rec)) { + if (rec_get_deleted_flag(clust_rec, comp)) { /* The record is delete marked: we can skip it */ @@ -3522,6 +3642,15 @@ rec_loop: } } + if (prebuilt->need_to_access_clustered) { + ut_ad(rec == clust_rec || index == clust_index); + offsets = rec_reget_offsets(rec, clust_index, + offsets, ULINT_UNDEFINED, heap); + } else { + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); + } + /* We found a qualifying row */ if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD @@ -3541,7 +3670,7 @@ rec_loop: not cache rows because there the cursor is a scrollable cursor. 
*/ - row_sel_push_cache_row_for_mysql(prebuilt, rec); + row_sel_push_cache_row_for_mysql(prebuilt, rec, offsets); if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { @@ -3551,11 +3680,13 @@ rec_loop: goto next_rec; } else { if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { - ut_memcpy(buf + 4, rec - rec_get_extra_size(rec), - rec_get_size(rec)); - mach_write_to_4(buf, rec_get_extra_size(rec) + 4); + memcpy(buf + 4, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); } else { - if (!row_sel_store_mysql_rec(buf, prebuilt, rec)) { + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { err = DB_TOO_BIG_RECORD; goto lock_wait_or_error; @@ -3563,8 +3694,10 @@ rec_loop: } if (prebuilt->clust_index_was_generated) { + offsets = rec_reget_offsets(index_rec, index, offsets, + ULINT_UNDEFINED, heap); row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, - index); + index, offsets); } } got_row: @@ -3666,6 +3799,7 @@ lock_wait_or_error: fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ trx->op_info = ""; + mem_heap_free(heap); return(err); normal_return: @@ -3689,6 +3823,7 @@ normal_return: trx->op_info = ""; + mem_heap_free(heap); return(ret); } diff --git a/innobase/row/row0umod.c b/innobase/row/row0umod.c index e16d696314b..ee9066a0d6f 100644 --- a/innobase/row/row0umod.c +++ b/innobase/row/row0umod.c @@ -430,6 +430,7 @@ row_undo_mod_del_unmark_sec_and_undo_update( found = row_search_index_entry(index, entry, mode, &pcur, &mtr); if (!found) { + heap = mem_heap_create(100); fputs("InnoDB: error in sec index entry del undo in\n" "InnoDB: ", stderr); dict_index_name_print(stderr, trx, index); @@ -438,11 +439,14 @@ row_undo_mod_del_unmark_sec_and_undo_update( dtuple_print(stderr, entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, btr_pcur_get_rec(&pcur)); + rec_print(stderr, btr_pcur_get_rec(&pcur), + rec_get_offsets(btr_pcur_get_rec(&pcur), + index, ULINT_UNDEFINED, heap)); putc('\n', stderr); trx_print(stderr, trx); fputs("\n" "InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); + mem_heap_free(heap); } else { btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); diff --git a/innobase/row/row0undo.c b/innobase/row/row0undo.c index bc3cc8ea9f3..42f5ef94854 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -151,6 +151,8 @@ row_undo_search_clust_to_pcur( mtr_t mtr; ibool ret; rec_t* rec; + mem_heap_t* heap; + const ulint* offsets; mtr_start(&mtr); @@ -161,8 +163,11 @@ row_undo_search_clust_to_pcur( rec = btr_pcur_get_rec(&(node->pcur)); + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, clust_index, ULINT_UNDEFINED, heap); + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, - row_get_rec_roll_ptr(rec, clust_index))) { + row_get_rec_roll_ptr(rec, clust_index, offsets))) { /* We must remove the reservation on the undo log record BEFORE releasing the latch on the clustered index page: this @@ -175,7 +180,7 @@ row_undo_search_clust_to_pcur( ret = FALSE; } else { node->row = row_build(ROW_COPY_DATA, clust_index, rec, - node->heap); + offsets, node->heap); btr_pcur_store_position(&(node->pcur), &mtr); ret = TRUE; diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index a449b9f1736..e080d0ba577 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -301,19 +301,20 @@ recovery. 
*/ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ - ulint pos, /* in: TRX_ID position in rec */ - dulint trx_id, /* in: transaction id */ - dulint roll_ptr)/* in: roll ptr of the undo log record */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ { byte* field; ulint len; - field = rec_get_nth_field(rec, pos, &len); + field = rec_get_nth_field(rec, offsets, pos, &len); ut_ad(len == DATA_TRX_ID_LEN); trx_write_trx_id(field, trx_id); - field = rec_get_nth_field(rec, pos + 1, &len); + field = rec_get_nth_field(rec, offsets, pos + 1, &len); ut_ad(len == DATA_ROLL_PTR_LEN); trx_write_roll_ptr(field, roll_ptr); } @@ -361,8 +362,8 @@ row_upd_changes_field_size_or_external( /* out: TRUE if the update changes the size of some field in index or the field is external in rec or update */ - rec_t* rec, /* in: record in index */ dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ upd_t* update) /* in: update vector */ { upd_field_t* upd_field; @@ -372,6 +373,7 @@ row_upd_changes_field_size_or_external( ulint n_fields; ulint i; + ut_ad(rec_offs_validate(NULL, index, offsets)); n_fields = upd_get_n_fields(update); for (i = 0; i < n_fields; i++) { @@ -380,19 +382,19 @@ row_upd_changes_field_size_or_external( new_val = &(upd_field->new_val); new_len = new_val->len; - if (new_len == UNIV_SQL_NULL) { + if (new_len == UNIV_SQL_NULL && !rec_offs_comp(offsets)) { new_len = dtype_get_sql_null_size( dict_index_get_nth_type(index, i)); } - old_len = rec_get_nth_field_size(rec, upd_field->field_no); - + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + if (old_len != new_len) { return(TRUE); } - if (rec_get_nth_field_extern_bit(rec, upd_field->field_no)) { + if (rec_offs_nth_extern(offsets, upd_field->field_no)) { return(TRUE); } @@ -414,15 +416,18 @@ a clustered index */ void row_upd_rec_in_place( /*=================*/ - rec_t* rec, /* in/out: record where replaced */ - upd_t* update) /* in: update vector */ + rec_t* rec, /* in/out: record where replaced */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + upd_t* update) /* in: update vector */ { upd_field_t* upd_field; dfield_t* new_val; ulint n_fields; ulint i; - rec_set_info_bits(rec, update->info_bits); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits); n_fields = upd_get_n_fields(update); @@ -430,7 +435,7 @@ row_upd_rec_in_place( upd_field = upd_get_nth_field(update, i); new_val = &(upd_field->new_val); - rec_set_nth_field(rec, upd_field->field_no, + rec_set_nth_field(rec, offsets, upd_field->field_no, dfield_get_data(new_val), dfield_get_len(new_val)); } @@ -695,6 +700,7 @@ row_upd_build_sec_rec_difference_binary( upd_t* update; ulint n_diff; ulint i; + const ulint* offsets; /* This function is used only for a secondary index */ ut_a(0 == (index->type & DICT_CLUSTERED)); @@ -702,10 +708,11 @@ row_upd_build_sec_rec_difference_binary( update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -768,6 +775,7 
@@ row_upd_build_difference_binary( ulint trx_id_pos; ibool extern_bit; ulint i; + const ulint* offsets; /* This function is used only for a clustered index */ ut_a(index->type & DICT_CLUSTERED); @@ -779,9 +787,11 @@ row_upd_build_difference_binary( roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + for (i = 0; i < dtuple_get_n_fields(entry); i++) { - data = rec_get_nth_field(rec, i, &len); + data = rec_get_nth_field(rec, offsets, i, &len); dfield = dtuple_get_nth_field(entry, i); @@ -793,7 +803,7 @@ row_upd_build_difference_binary( goto skip_compare; } - extern_bit = rec_get_nth_field_extern_bit(rec, i); + extern_bit = rec_offs_nth_extern(offsets, i); if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i) || !dfield_data_is_binary_equal(dfield, len, data)) { @@ -1117,6 +1127,7 @@ void row_upd_copy_columns( /*=================*/ rec_t* rec, /* in: record in a clustered index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ sym_node_t* column) /* in: first column in a column list, or NULL */ { @@ -1124,7 +1135,7 @@ row_upd_copy_columns( ulint len; while (column) { - data = rec_get_nth_field(rec, + data = rec_get_nth_field(rec, offsets, column->field_nos[SYM_CLUST_FIELD_NO], &len); eval_node_copy_and_alloc_val(column, data, len); @@ -1171,7 +1182,9 @@ row_upd_store_row( dict_index_t* clust_index; upd_t* update; rec_t* rec; - + mem_heap_t* heap; + const ulint* offsets; + ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES); if (node->row != NULL) { @@ -1183,10 +1196,12 @@ row_upd_store_row( rec = btr_pcur_get_rec(node->pcur); - node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap); - + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, clust_index, ULINT_UNDEFINED, heap); + node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, + node->heap); node->ext_vec = mem_heap_alloc(node->heap, sizeof(ulint) - * rec_get_n_fields(rec)); + * rec_offs_n_fields(offsets)); if (node->is_delete) { update = NULL; } else { @@ -1194,7 +1209,8 @@ row_upd_store_row( } node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec, - rec, update); + offsets, update); + mem_heap_free(heap); } /*************************************************************** @@ -1247,7 +1263,8 @@ row_upd_sec_index_entry( dtuple_print(stderr, entry); fputs("\n" "InnoDB: record ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, + rec_get_offsets(rec, index, ULINT_UNDEFINED, heap)); putc('\n', stderr); trx_print(stderr, trx); @@ -1259,7 +1276,7 @@ row_upd_sec_index_entry( delete marked if we return after a lock wait in row_ins_index_entry below */ - if (!rec_get_deleted_flag(rec)) { + if (!rec_get_deleted_flag(rec, index->table->comp)) { err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, thr, &mtr); if (err == DB_SUCCESS && check_ref) { @@ -1362,6 +1379,7 @@ row_upd_clust_rec_by_insert( table = node->table; pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); + heap = mem_heap_create(500); if (node->state != UPD_NODE_INSERT_CLUSTERED) { @@ -1369,7 +1387,7 @@ row_upd_clust_rec_by_insert( btr_cur, TRUE, thr, mtr); if (err != DB_SUCCESS) { mtr_commit(mtr); - + mem_heap_free(heap); return(err); } @@ -1379,7 +1397,9 @@ row_upd_clust_rec_by_insert( record is removed from the index tree, or updated. 
*/ btr_cur_mark_extern_inherited_fields(btr_cur_get_rec(btr_cur), - node->update, mtr); + rec_get_offsets(btr_cur_get_rec(btr_cur), + dict_table_get_first_index(table), + ULINT_UNDEFINED, heap), node->update, mtr); if (check_ref) { /* NOTE that the following call loses the position of pcur ! */ @@ -1399,8 +1419,6 @@ row_upd_clust_rec_by_insert( node->state = UPD_NODE_INSERT_CLUSTERED; - heap = mem_heap_create(500); - entry = row_build_index_entry(node->row, index, heap); row_upd_index_replace_new_col_vals(entry, index, node->update, NULL); @@ -1452,7 +1470,8 @@ row_upd_clust_rec( pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the @@ -1488,7 +1507,8 @@ row_upd_clust_rec( ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + index->table->comp)); err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, &big_rec, node->update, @@ -1496,12 +1516,17 @@ row_upd_clust_rec( mtr_commit(mtr); if (err == DB_SUCCESS && big_rec) { + mem_heap_t* heap; + rec_t* rec; mtr_start(mtr); + + heap = mem_heap_create(100); + rec = btr_cur_get_rec(btr_cur); ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); - - err = btr_store_big_rec_extern_fields(index, - btr_cur_get_rec(btr_cur), - big_rec, mtr); + err = btr_store_big_rec_extern_fields(index, rec, + rec_get_offsets(rec, index, ULINT_UNDEFINED, heap), + big_rec, mtr); + mem_heap_free(heap); mtr_commit(mtr); } @@ -1585,7 +1610,10 @@ row_upd_clust_step( ulint err; mtr_t* mtr; mtr_t mtr_buf; - + rec_t* rec; + mem_heap_t* heap; + const ulint* offsets; + index = dict_table_get_first_index(node->table); check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); @@ -1641,13 +1669,16 @@ row_upd_clust_step( } } + rec = btr_pcur_get_rec(pcur); + heap = mem_heap_create(100); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + if (!node->has_clust_rec_x_lock) { err = lock_clust_rec_modify_check_and_lock(0, - btr_pcur_get_rec(pcur), - index, thr); + rec, index, offsets, thr); if (err != DB_SUCCESS) { mtr_commit(mtr); - + mem_heap_free(heap); return(err); } } @@ -1655,6 +1686,7 @@ row_upd_clust_step( /* NOTE: the following function calls will also commit mtr */ if (node->is_delete) { + mem_heap_free(heap); err = row_upd_del_mark_clust_rec(node, index, thr, check_ref, mtr); if (err != DB_SUCCESS) { @@ -1674,12 +1706,13 @@ row_upd_clust_step( if (!node->in_mysql_interface) { /* Copy the necessary columns from clust_rec and calculate the new values to set */ - - row_upd_copy_columns(btr_pcur_get_rec(pcur), + row_upd_copy_columns(rec, offsets, UT_LIST_GET_FIRST(node->columns)); row_upd_eval_new_vals(node->update); } + mem_heap_free(heap); + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { err = row_upd_clust_rec(node, index, thr, mtr); @@ -1935,6 +1968,7 @@ row_upd_in_place_in_select( btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; + mem_heap_t* heap; ut_ad(sel_node->select_will_do_update); ut_ad(sel_node->latch_mode == BTR_MODIFY_LEAF); @@ -1950,11 +1984,15 @@ row_upd_in_place_in_select( /* Copy the necessary columns from clust_rec and calculate the new values to set */ - row_upd_copy_columns(btr_pcur_get_rec(pcur), - 
UT_LIST_GET_FIRST(node->columns)); + heap = mem_heap_create(100); + row_upd_copy_columns(btr_pcur_get_rec(pcur), rec_get_offsets( + btr_pcur_get_rec(pcur), btr_cur->index, ULINT_UNDEFINED, heap), + UT_LIST_GET_FIRST(node->columns)); + mem_heap_free(heap); row_upd_eval_new_vals(node->update); - ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + btr_cur->index->table->comp)); ut_ad(node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE); ut_ad(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); diff --git a/innobase/row/row0vers.c b/innobase/row/row0vers.c index bc17ede89e3..5281dbd67d7 100644 --- a/innobase/row/row0vers.c +++ b/innobase/row/row0vers.c @@ -41,10 +41,12 @@ row_vers_impl_x_locked_off_kernel( transaction; NOTE that the kernel mutex is temporarily released! */ rec_t* rec, /* in: record in a secondary index */ - dict_index_t* index) /* in: the secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ { dict_index_t* clust_index; rec_t* clust_rec; + ulint* clust_offsets; rec_t* version; rec_t* prev_version; dulint trx_id; @@ -59,6 +61,7 @@ row_vers_impl_x_locked_off_kernel( ibool rec_del; ulint err; mtr_t mtr; + ibool comp; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -96,7 +99,10 @@ row_vers_impl_x_locked_off_kernel( return(NULL); } - trx_id = row_get_rec_trx_id(clust_rec, clust_index); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(clust_rec, clust_index, + ULINT_UNDEFINED, heap); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); mtr_s_lock(&(purge_sys->latch), &mtr); @@ -106,19 +112,27 @@ row_vers_impl_x_locked_off_kernel( /* The transaction that modified or inserted clust_rec is no longer active: no implicit lock on rec */ + mem_heap_free(heap); mtr_commit(&mtr); return(NULL); } - if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, TRUE)) { + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, + clust_offsets, TRUE)) { /* Corruption noticed: try to avoid a crash by returning */ + mem_heap_free(heap); mtr_commit(&mtr); return(NULL); } + comp = index->table->comp; + ut_ad(index->table == clust_index->table); + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + ut_ad(comp == page_is_comp(buf_frame_align(clust_rec))); + /* We look up if some earlier version, which was modified by the trx_id transaction, of the clustered index record would require rec to be in a different state (delete marked or unmarked, or have different field @@ -128,11 +142,10 @@ row_vers_impl_x_locked_off_kernel( different state, then the trx_id transaction has not yet had time to modify rec, and does not necessarily have an implicit x-lock on rec. 
*/ - rec_del = rec_get_deleted_flag(rec); + rec_del = rec_get_deleted_flag(rec, comp); trx = NULL; version = clust_rec; - heap = NULL; for (;;) { mutex_exit(&kernel_mutex); @@ -146,18 +159,16 @@ row_vers_impl_x_locked_off_kernel( heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(clust_rec, &mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ if (prev_version) { + clust_offsets = rec_get_offsets(prev_version, + clust_index, ULINT_UNDEFINED, heap); row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); } @@ -189,11 +200,11 @@ row_vers_impl_x_locked_off_kernel( if prev_version would require rec to be in a different state. */ - vers_del = rec_get_deleted_flag(prev_version); + vers_del = rec_get_deleted_flag(prev_version, comp); /* We check if entry and rec are identified in the alphabetical ordering */ - if (0 == cmp_dtuple_rec(entry, rec)) { + if (0 == cmp_dtuple_rec(entry, rec, offsets)) { /* The delete marks of rec and prev_version should be equal for rec to be in the state required by prev_version */ @@ -211,7 +222,7 @@ row_vers_impl_x_locked_off_kernel( dtuple_set_types_binary(entry, dtuple_get_n_fields(entry)); - if (0 != cmp_dtuple_rec(entry, rec)) { + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { trx = trx_get_on_id(trx_id); @@ -226,7 +237,8 @@ row_vers_impl_x_locked_off_kernel( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, clust_index); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { /* The versions modified by the trx_id transaction end @@ -297,12 +309,14 @@ row_vers_old_has_index_entry( rec_t* version; rec_t* prev_version; dict_index_t* clust_index; + ulint* clust_offsets; mem_heap_t* heap; mem_heap_t* heap2; dtuple_t* row; dtuple_t* entry; ulint err; - + ibool comp; + ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_S_FIX)); @@ -313,10 +327,15 @@ row_vers_old_has_index_entry( clust_index = dict_table_get_first_index(index->table); - if (also_curr && !rec_get_deleted_flag(rec)) { + comp = index->table->comp; + ut_ad(comp == page_is_comp(buf_frame_align(rec))); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, + ULINT_UNDEFINED, heap); - heap = mem_heap_create(1024); - row = row_build(ROW_COPY_POINTERS, clust_index, rec, heap); + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -331,24 +350,17 @@ row_vers_old_has_index_entry( return(TRUE); } - - mem_heap_free(heap); } version = rec; - heap = NULL; for (;;) { heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, - clust_index, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + clust_index, clust_offsets, heap, + &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ if (err != DB_SUCCESS || !prev_version) { /* Versions end here */ @@ -358,9 +370,12 @@ 
row_vers_old_has_index_entry( return(FALSE); } - if (!rec_get_deleted_flag(prev_version)) { + clust_offsets = rec_get_offsets(prev_version, clust_index, + ULINT_UNDEFINED, heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, heap); + prev_version, clust_offsets, heap); entry = row_build_index_entry(row, index, heap); /* NOTE that we cannot do the comparison as binary @@ -412,6 +427,7 @@ row_vers_build_for_consistent_read( mem_heap_t* heap2; byte* buf; ulint err; + ulint* offsets; ut_ad(index->type & DICT_CLUSTERED); ut_ad(mtr_memo_contains(mtr, buf_block_align(rec), MTR_MEMO_PAGE_X_FIX) @@ -420,22 +436,23 @@ row_vers_build_for_consistent_read( #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(!read_view_sees_trx_id(view, row_get_rec_trx_id(rec, index))); + + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, ULINT_UNDEFINED, heap); + + ut_ad(!read_view_sees_trx_id(view, + row_get_rec_trx_id(rec, index, offsets))); rw_lock_s_lock(&(purge_sys->latch)); version = rec; - heap = NULL; for (;;) { heap2 = heap; heap = mem_heap_create(1024); err = trx_undo_prev_version_build(rec, mtr, version, index, - heap, &prev_version); - if (heap2) { - mem_heap_free(heap2); /* version was stored in heap2, - if heap2 != NULL */ - } + offsets, heap, &prev_version); + mem_heap_free(heap2); /* free version and offsets */ if (err != DB_SUCCESS) { break; @@ -449,16 +466,17 @@ row_vers_build_for_consistent_read( break; } - prev_trx_id = row_get_rec_trx_id(prev_version, index); + offsets = rec_get_offsets(prev_version, index, + ULINT_UNDEFINED, heap); + prev_trx_id = row_get_rec_trx_id(prev_version, index, offsets); if (read_view_sees_trx_id(view, prev_trx_id)) { /* The view already sees this version: we can copy it to in_heap and return */ - buf = mem_heap_alloc(in_heap, rec_get_size( - prev_version)); - *old_vers = rec_copy(buf, prev_version); + buf = mem_heap_alloc(in_heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, prev_version, offsets); err = DB_SUCCESS; break; diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index b8d03cfab5f..40befae424e 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -44,6 +44,7 @@ Created 10/8/1995 Heikki Tuuri #include "buf0flu.h" #include "btr0sea.h" #include "dict0load.h" +#include "dict0boot.h" #include "srv0start.h" #include "row0mysql.h" @@ -186,6 +187,61 @@ that during a time of heavy update/insert activity. 
*/ ulint srv_max_buf_pool_modified_pct = 90; +/* variable counts amount of data read in total (in bytes) */ +ulint srv_data_read = 0; + +/* here we count the amount of data written in total (in bytes) */ +ulint srv_data_written = 0; + +/* the number of the log write requests done */ +ulint srv_log_write_requests = 0; + +/* the number of physical writes to the log performed */ +ulint srv_log_writes = 0; + +/* amount of data written to the log files in bytes */ +ulint srv_os_log_written = 0; + +/* amount of writes being done to the log files */ +ulint srv_os_log_pending_writes = 0; + +/* we increase this counter, when there we don't have enough space in the +log buffer and have to flush it */ +ulint srv_log_waits = 0; + +/* this variable counts the amount of times, when the doublewrite buffer +was flushed */ +ulint srv_dblwr_writes = 0; + +/* here we store the number of pages that have been flushed to the +doublewrite buffer */ +ulint srv_dblwr_pages_written = 0; + +/* in this variable we store the number of write requests issued */ +ulint srv_buf_pool_write_requests = 0; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +ulint srv_buf_pool_wait_free = 0; + +/* variable to count the number of pages that were written from buffer +pool to the disk */ +ulint srv_buf_pool_flushed = 0; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +ulint srv_buf_pool_reads = 0; + +/* variable to count the number of sequential read-aheads */ +ulint srv_read_ahead_seq = 0; + +/* variable to count the number of random read-aheads */ +ulint srv_read_ahead_rnd = 0; + +/* structure to pass status variables to MySQL */ +export_struc export_vars; + /* If the following is != 0 we do not allow inserts etc. 
This protects the user from forgetting the innodb_force_recovery keyword to my.cnf */ @@ -790,6 +846,7 @@ srv_init(void) { srv_conc_slot_t* conc_slot; srv_slot_t* slot; + dict_table_t* table; ulint i; srv_sys = mem_alloc(sizeof(srv_sys_t)); @@ -839,6 +896,31 @@ srv_init(void) UT_LIST_INIT(srv_sys->tasks); + /* create dummy table and index for old-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY1", + DICT_HDR_SPACE, 1, FALSE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + + srv_sys->dummy_ind1 = dict_mem_index_create("SYS_DUMMY1", + "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind1, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind1->table = table; + /* create dummy table and index for new-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY2", + DICT_HDR_SPACE, 1, TRUE); + dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8, 0); + srv_sys->dummy_ind2 = dict_mem_index_create("SYS_DUMMY2", + "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind2, + dict_table_get_nth_col(table, 0), 0, 0); + srv_sys->dummy_ind2->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE; + /* Init the server concurrency restriction data structures */ os_fast_mutex_init(&srv_conc_mutex); @@ -1619,6 +1701,57 @@ srv_printf_innodb_monitor( fflush(file); } +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ + +void +srv_export_innodb_status(void) +{ + + mutex_enter(&srv_innodb_monitor_mutex); + export_vars.innodb_data_pending_reads= os_n_pending_reads; + export_vars.innodb_data_pending_writes= os_n_pending_writes; + export_vars.innodb_data_pending_fsyncs= + fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes; + export_vars.innodb_data_fsyncs= os_n_fsyncs; + export_vars.innodb_data_read= srv_data_read; + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; + export_vars.innodb_buffer_pool_pages_flushed= srv_buf_pool_flushed; + export_vars.innodb_buffer_pool_reads= srv_buf_pool_reads; + export_vars.innodb_buffer_pool_read_ahead_rnd= srv_read_ahead_rnd; + export_vars.innodb_buffer_pool_read_ahead_seq= srv_read_ahead_seq; + export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); + export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size - + UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); + export_vars.innodb_page_size= UNIV_PAGE_SIZE; + export_vars.innodb_log_waits= srv_log_waits; + export_vars.innodb_os_log_written= srv_os_log_written; + export_vars.innodb_os_log_fsyncs= fil_n_log_flushes; + export_vars.innodb_os_log_pending_fsyncs= fil_n_pending_log_flushes; + 
export_vars.innodb_os_log_pending_writes= srv_os_log_pending_writes; + export_vars.innodb_log_write_requests= srv_log_write_requests; + export_vars.innodb_log_writes= srv_log_writes; + export_vars.innodb_dblwr_pages_written= srv_dblwr_pages_written; + export_vars.innodb_dblwr_writes= srv_dblwr_writes; + export_vars.innodb_pages_created= buf_pool->n_pages_created; + export_vars.innodb_pages_read= buf_pool->n_pages_read; + export_vars.innodb_pages_written= buf_pool->n_pages_written; + export_vars.innodb_rows_read= srv_n_rows_read; + export_vars.innodb_rows_inserted= srv_n_rows_inserted; + export_vars.innodb_rows_updated= srv_n_rows_updated; + export_vars.innodb_rows_deleted= srv_n_rows_deleted; + mutex_exit(&srv_innodb_monitor_mutex); +} + /************************************************************************* A thread which wakes up threads whose lock wait may have lasted too long. This also prints the info output by various InnoDB monitors. */ @@ -1677,11 +1810,13 @@ loop: srv_printf_innodb_monitor(stderr); } - mutex_enter(&srv_monitor_file_mutex); - rewind(srv_monitor_file); - srv_printf_innodb_monitor(srv_monitor_file); - os_file_set_eof(srv_monitor_file); - mutex_exit(&srv_monitor_file_mutex); + if (srv_innodb_status) { + mutex_enter(&srv_monitor_file_mutex); + rewind(srv_monitor_file); + srv_printf_innodb_monitor(srv_monitor_file); + os_file_set_eof(srv_monitor_file); + mutex_exit(&srv_monitor_file_mutex); + } if (srv_print_innodb_tablespace_monitor && difftime(current_time, last_table_monitor_time) > 60) { diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index 9709f5235de..69341a1d7d1 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -1172,6 +1172,9 @@ NetWare. */ } if (ret == NULL) { + fprintf(stderr, +"InnoDB: Fatal error: cannot allocate the memory for the buffer pool\n"); + return(DB_ERROR); } diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index 6726d7ca609..3df34111281 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -289,7 +289,7 @@ trx_purge_add_update_undo_to_history( flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr)); mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, - hist_size + undo->size, MLOG_4BYTES, mtr); + hist_size + undo->size, MLOG_4BYTES, mtr); } /* Add the log as the first in the history list */ @@ -646,6 +646,27 @@ trx_purge_rseg_get_next_history_log( mutex_exit(&(rseg->mutex)); mtr_commit(&mtr); + mutex_enter(&kernel_mutex); + + /* Add debug code to track history list corruption reported + on the MySQL mailing list on Nov 9, 2004. The fut0lst.c + file-based list was corrupt. The prev node pointer was + FIL_NULL, even though the list length was over 8 million nodes! + We assume that purge truncates the history list in moderate + size pieces, and if we here reach the head of the list, the + list cannot be longer than 20 000 undo logs now. */ + + if (trx_sys->rseg_history_len > 20000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: purge reached the head of the history list,\n" +"InnoDB: but its length is still reported as %lu! Make a detailed bug\n" +"InnoDB: report, and post it to bugs.mysql.com\n", + (ulong)trx_sys->rseg_history_len); + } + + mutex_exit(&kernel_mutex); + return; } @@ -1069,30 +1090,6 @@ trx_purge(void) } } - /* Determine how much data manipulation language (DML) statements - need to be delayed in order to reduce the lagging of the purge - thread. 
*/ - srv_dml_needed_delay = 0; /* in microseconds; default: no delay */ - - /* If we cannot advance the 'purge view' because of an old - 'consistent read view', then the DML statements cannot be delayed. - Also, srv_max_purge_lag <= 0 means 'infinity'. */ - if (srv_max_purge_lag > 0 - && !UT_LIST_GET_LAST(trx_sys->view_list)) { - float ratio = (float) trx_sys->rseg_history_len - / srv_max_purge_lag; - if (ratio > ULINT_MAX / 10000) { - /* Avoid overflow: maximum delay is 4295 seconds */ - srv_dml_needed_delay = ULINT_MAX; - } else if (ratio > 1) { - /* If the history list length exceeds the - innodb_max_purge_lag, the - data manipulation statements are delayed - by at least 5000 microseconds. */ - srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000); - } - } - purge_sys->view = read_view_oldest_copy_or_open_new(NULL, purge_sys->heap); mutex_exit(&kernel_mutex); diff --git a/innobase/trx/trx0rec.c b/innobase/trx/trx0rec.c index fe429d1cc62..484d4f62744 100644 --- a/innobase/trx/trx0rec.c +++ b/innobase/trx/trx0rec.c @@ -38,16 +38,18 @@ trx_undof_page_add_undo_rec_log( ulint new_free, /* in: end offset of the entry */ mtr_t* mtr) /* in: mtr */ { - byte* log_ptr; - ulint len; + byte* log_ptr; + const byte* log_end; + ulint len; - log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN); + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); if (log_ptr == NULL) { return; } + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; log_ptr = mlog_write_initial_log_record_fast(undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); len = new_free - old_free - 4; @@ -55,14 +57,11 @@ trx_undof_page_add_undo_rec_log( mach_write_to_2(log_ptr, len); log_ptr += 2; - if (len < 256) { - ut_memcpy(log_ptr, undo_page + old_free + 2, len); - log_ptr += len; - } - - mlog_close(mtr, log_ptr); - - if (len >= MLOG_BUF_MARGIN) { + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); mlog_catenate_string(mtr, undo_page + old_free + 2, len); } } @@ -404,6 +403,7 @@ trx_undo_page_report_modify( delete marking is done */ rec_t* rec, /* in: clustered index record which has NOT yet been modified */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector which tells the columns to be updated; in the case of a delete, this should be set to NULL */ @@ -430,6 +430,7 @@ trx_undo_page_report_modify( ulint i; ut_a(index->type & DICT_CLUSTERED); + ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; @@ -454,7 +455,7 @@ trx_undo_page_report_modify( /* Store first some general parameters to the undo log */ if (update) { - if (rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec, table->comp)) { type_cmpl = TRX_UNDO_UPD_DEL_REC; } else { type_cmpl = TRX_UNDO_UPD_EXIST_REC; @@ -479,14 +480,15 @@ trx_undo_page_report_modify( /*----------------------------------------*/ /* Store the state of the info bits */ - bits = rec_get_info_bits(rec); + bits = rec_get_info_bits(rec, table->comp); mach_write_to_1(ptr, bits); ptr += 1; /* Store the values of the system columns */ - trx_id = dict_index_rec_get_sys_col(index, DATA_TRX_ID, rec); - - roll_ptr = dict_index_rec_get_sys_col(index, DATA_ROLL_PTR, rec); + trx_id = dict_index_rec_get_sys_col(index, offsets, + DATA_TRX_ID, rec); + roll_ptr = dict_index_rec_get_sys_col(index, offsets, + DATA_ROLL_PTR, rec); len = mach_dulint_write_compressed(ptr, trx_id); ptr 
+= len; @@ -499,7 +501,7 @@ trx_undo_page_report_modify( for (i = 0; i < dict_index_get_n_unique(index); i++) { - field = rec_get_nth_field(rec, i, &flen); + field = rec_get_nth_field(rec, offsets, i, &flen); if (trx_undo_left(undo_page, ptr) < 4) { @@ -547,14 +549,14 @@ trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } - if (rec_get_nth_field_extern_bit(rec, pos)) { + if (rec_offs_nth_extern(offsets, pos)) { /* If a field has external storage, we add to flen the flag */ @@ -631,7 +633,7 @@ trx_undo_page_report_modify( ptr += len; /* Save the old value of field */ - field = rec_get_nth_field(rec, pos, &flen); + field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { @@ -1008,7 +1010,9 @@ trx_undo_report_row_operation( ibool is_insert; trx_rseg_t* rseg; mtr_t mtr; - + mem_heap_t* heap; + ulint* offsets = NULL; + ut_a(index->type & DICT_CLUSTERED); if (flags & BTR_NO_UNDO_LOG_FLAG) { @@ -1019,7 +1023,6 @@ trx_undo_report_row_operation( } ut_ad(thr); - ut_a(index->type & DICT_CLUSTERED); ut_ad((op_type != TRX_UNDO_INSERT_OP) || (clust_entry && !update && !rec)); @@ -1063,6 +1066,8 @@ trx_undo_report_row_operation( mtr_start(&mtr); + heap = mem_heap_create(100); + for (;;) { undo_page = buf_page_get_gen(undo->space, page_no, RW_X_LATCH, undo->guess_page, @@ -1079,9 +1084,10 @@ trx_undo_report_row_operation( index, clust_entry, &mtr); } else { + offsets = rec_reget_offsets(rec, index, + offsets, ULINT_UNDEFINED, heap); offset = trx_undo_page_report_modify(undo_page, trx, - index, rec, update, - cmpl_info, &mtr); + index, rec, offsets, update, cmpl_info, &mtr); } if (offset == 0) { @@ -1123,7 +1129,7 @@ trx_undo_report_row_operation( mutex_exit(&(trx->undo_mutex)); mtr_commit(&mtr); - + mem_heap_free(heap); return(DB_OUT_OF_FILE_SPACE); } } @@ -1140,6 +1146,7 @@ trx_undo_report_row_operation( *roll_ptr = trx_undo_build_roll_ptr(is_insert, rseg->id, page_no, offset); + mem_heap_free(heap); return(DB_SUCCESS); } @@ -1236,6 +1243,7 @@ trx_undo_prev_version_build( index_rec page and purge_view */ rec_t* rec, /* in: version of a clustered index record */ dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ mem_heap_t* heap, /* in: memory heap from which the memory needed is allocated */ rec_t** old_vers)/* out, own: previous version, or NULL if @@ -1258,7 +1266,7 @@ trx_undo_prev_version_build( ibool dummy_extern; byte* buf; ulint err; - + ulint* index_offsets = NULL; #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); #endif /* UNIV_SYNC_DEBUG */ @@ -1266,21 +1274,25 @@ trx_undo_prev_version_build( MTR_MEMO_PAGE_S_FIX) || mtr_memo_contains(index_mtr, buf_block_align(index_rec), MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + if (!(index->type & DICT_CLUSTERED)) { fprintf(stderr, "InnoDB: Error: trying to access" " update undo rec for non-clustered index %s\n" "InnoDB: Submit a detailed bug report to" " http://bugs.mysql.com\n" "InnoDB: index record ", index->name); - rec_print(stderr, index_rec); + index_offsets = rec_get_offsets(index_rec, index, + ULINT_UNDEFINED, heap); + rec_print(stderr, index_rec, index_offsets); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); putc('\n', stderr); return(DB_ERROR); } - roll_ptr = 
row_get_rec_roll_ptr(rec, index); + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); old_roll_ptr = roll_ptr; *old_vers = NULL; @@ -1292,7 +1304,7 @@ trx_undo_prev_version_build( return(DB_SUCCESS); } - rec_trx_id = row_get_rec_trx_id(rec, index); + rec_trx_id = row_get_rec_trx_id(rec, index, offsets); err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); @@ -1341,10 +1353,12 @@ trx_undo_prev_version_build( ut_print_buf(stderr, undo_rec, 150); fputs("\n" "InnoDB: index record ", stderr); - rec_print(stderr, index_rec); + index_offsets = rec_get_offsets(index_rec, index, + ULINT_UNDEFINED, heap); + rec_print(stderr, index_rec, index_offsets); fputs("\n" "InnoDB: record version ", stderr); - rec_print(stderr, rec); + rec_print(stderr, rec, offsets); fprintf(stderr, "\n" "InnoDB: Record trx id %lu %lu, update rec trx id %lu %lu\n" "InnoDB: Roll ptr in rec %lu %lu, in update rec %lu %lu\n", @@ -1358,11 +1372,10 @@ trx_undo_prev_version_build( (ulong) ut_dulint_get_low(roll_ptr)); trx_purge_sys_print(); - return(DB_ERROR); } - if (row_upd_changes_field_size_or_external(rec, index, update)) { + if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; ulint n_ext_vect; @@ -1372,27 +1385,28 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info to ext_vect: */ - ext_vect = mem_alloc(sizeof(ulint) * rec_get_n_fields(rec)); - n_ext_vect = btr_push_update_extern_fields(ext_vect, rec, + ext_vect = mem_alloc(sizeof(ulint) + * rec_offs_n_fields(offsets)); + n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update); entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap); row_upd_index_replace_new_col_vals(entry, index, update, heap); - buf = mem_heap_alloc(heap, rec_get_converted_size(entry)); + buf = mem_heap_alloc(heap, + rec_get_converted_size(index, entry)); - *old_vers = rec_convert_dtuple_to_rec(buf, entry); + *old_vers = rec_convert_dtuple_to_rec(buf, index, entry); /* Now set the extern bits in the old version of the record */ - rec_set_field_extern_bits(*old_vers, ext_vect, n_ext_vect, - NULL); + rec_set_field_extern_bits(*old_vers, index, + ext_vect, n_ext_vect, NULL); mem_free(ext_vect); } else { - buf = mem_heap_alloc(heap, rec_get_size(rec)); - - *old_vers = rec_copy(buf, rec); - - row_upd_rec_in_place(*old_vers, update); + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, offsets, update); } return(DB_SUCCESS); diff --git a/innobase/trx/trx0roll.c b/innobase/trx/trx0roll.c index eb7c7f43f03..db5e16c7778 100644 --- a/innobase/trx/trx0roll.c +++ b/innobase/trx/trx0roll.c @@ -331,10 +331,11 @@ trx_savept_take( /*********************************************************************** Rollback or clean up transactions which have no user session. If the transaction already was committed, then we clean up a possible insert -undo log. If the transaction was not yet committed, then we roll it back. */ +undo log. If the transaction was not yet committed, then we roll it back. 
+Note: this is done in a background thread */ -void -trx_rollback_or_clean_all_without_sess(void) +void * +trx_rollback_or_clean_all_without_sess(void *i) /*========================================*/ { mem_heap_t* heap; @@ -362,7 +363,7 @@ trx_rollback_or_clean_all_without_sess(void) fprintf(stderr, "InnoDB: Starting rollback of uncommitted transactions\n"); } else { - return; + os_thread_exit(i); } loop: heap = mem_heap_create(512); @@ -371,9 +372,15 @@ loop: trx = UT_LIST_GET_FIRST(trx_sys->trx_list); - while (trx && (trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + while (trx) { - trx = UT_LIST_GET_NEXT(trx_list, trx); + if ((trx->sess || (trx->conc_state == TRX_NOT_STARTED))) { + trx = UT_LIST_GET_NEXT(trx_list, trx); + } else if (trx->conc_state == TRX_PREPARED) { + trx->sess = trx_dummy_sess; + } else { + break; + } } mutex_exit(&kernel_mutex); @@ -384,10 +391,11 @@ loop: mem_heap_free(heap); - return; + os_thread_exit(i); } trx->sess = trx_dummy_sess; + if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) { fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n", @@ -486,6 +494,8 @@ loop: mem_heap_free(heap); goto loop; + + os_thread_exit(i); /* not reached */ } /*********************************************************************** diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 54bd5be01a1..35e18064329 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -887,8 +887,12 @@ trx_sys_init_at_db_start(void) trx = UT_LIST_GET_FIRST(trx_sys->trx_list); for (;;) { - rows_to_undo += + + if ( trx->conc_state != TRX_PREPARED) { + rows_to_undo += ut_conv_dulint_to_longlong(trx->undo_no); + } + trx = UT_LIST_GET_NEXT(trx_list, trx); if (!trx) { diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index f7497ac4090..ab8bd898dd6 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -24,6 +24,7 @@ Created 3/26/1996 Heikki Tuuri #include "thr0loc.h" #include "btr0sea.h" #include "os0proc.h" +#include "trx0xa.h" /* Copy of the prototype for innobase_mysql_print_thd: this copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! 
*/ @@ -156,6 +157,10 @@ trx_create( trx->read_view_heap = mem_heap_create(256); trx->read_view = NULL; + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid,0,sizeof(trx->xid)); + trx->xid.formatID = -1; + return(trx); } @@ -408,13 +413,22 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; - + trx->xid = undo->xid; trx->insert_undo = undo; trx->rseg = rseg; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = TRX_COMMITTED_IN_MEMORY; + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + trx->conc_state = TRX_PREPARED; + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } /* We give a dummy value for the trx no; this should have no relevance since purge @@ -457,10 +471,22 @@ trx_lists_init_at_db_start(void) trx = trx_create(NULL); trx->id = undo->trx_id; + trx->xid = undo->xid; if (undo->state != TRX_UNDO_ACTIVE) { - trx->conc_state = - TRX_COMMITTED_IN_MEMORY; + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + trx->conc_state = + TRX_PREPARED; + } else { + trx->conc_state = + TRX_COMMITTED_IN_MEMORY; + } + /* We give a dummy value for the trx number */ @@ -726,7 +752,8 @@ trx_commit_off_kernel( mutex_enter(&kernel_mutex); } - ut_ad(trx->conc_state == TRX_ACTIVE); + ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED); + #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ @@ -1667,3 +1694,239 @@ trx_print( innobase_mysql_print_thd(f, trx->mysql_thd); } } + +/******************************************************************** +Prepares a transaction. */ + +void +trx_prepare_off_kernel( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + dulint lsn; + trx_rseg_t* rseg; + trx_undo_t* undo; + ibool must_flush_log = FALSE; + mtr_t mtr; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + must_flush_log = TRUE; + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as prepared in the file + based world, at the serialization point of the log sequence + number lsn obtained below. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + undo = trx->update_undo; + + if (undo) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. 
*/ + + update_hdr_page = trx_undo_set_state_at_prepare(trx, undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +#endif /* UNIV_SYNC_DEBUG */ + + /*--------------------------------------*/ + trx->conc_state = TRX_PREPARED; + /*--------------------------------------*/ + + if (trx->read_view) { + read_view_close(trx->read_view); + + mem_heap_empty(trx->read_view_heap); + trx->read_view = NULL; + } + + if (must_flush_log) { + + mutex_exit(&kernel_mutex); + + /* Write the log to the log files AND flush them to disk */ + + /*-------------------------------------*/ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + + /*-------------------------------------*/ + + mutex_enter(&kernel_mutex); + } +} + +/************************************************************************** +Does the transaction prepare for MySQL. */ + +ulint +trx_prepare_for_mysql( +/*=================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. */ + +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions + stored in xid_list */ + XID* xid_list, /* in/out: prepared transactions */ + uint len) /* in: number of slots in xid_list */ +{ + trx_t* trx; + int num_of_transactions = 0; + + ut_ad(xid_list); + ut_ad(len); + + fprintf(stderr, + "InnoDB: Starting recovery for XA transactions...\n"); + + + /* We should set those transactions which are in + the prepared state to the xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->conc_state == TRX_PREPARED) { + xid_list[num_of_transactions] = trx->xid; + + fprintf(stderr, +"InnoDB: Transaction %lu %lu in prepared state after recovery\n", + (ulong) ut_dulint_get_high(trx->id), + (ulong) ut_dulint_get_low(trx->id)); + + fprintf(stderr, +"InnoDB: Transaction contains changes to %lu rows\n", + (ulong)ut_conv_dulint_to_longlong(trx->undo_no)); + + num_of_transactions++; + + if ((uint)num_of_transactions == len ) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + fprintf(stderr, + "InnoDB: %d transactions in prepare state after recovery\n", + num_of_transactions); + + return (num_of_transactions); +} + +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state */ + +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid) /* in: X/Open XA Transaction Idenfication */ +{ + trx_t* trx; + + if (xid == NULL) { + return (NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction id's: their + length should be the same and binary comparison + of gtrid_lenght+bqual_length 
bytes should be + the same */ + + if (xid->gtrid_length == trx->xid.gtrid_length && + xid->bqual_length == trx->xid.bqual_length && + memcmp(xid->data, trx->xid.data, + xid->gtrid_length + + xid->bqual_length) == 0) { + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (trx) { + if (trx->conc_state != TRX_PREPARED) { + return(NULL); + } + + return(trx); + } else { + return(NULL); + } +} + diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index c1edc223cbc..4bfa9c20a54 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -19,6 +19,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "trx0rec.h" #include "trx0purge.h" +#include "trx0xa.h" /* How should the old versions in the history list be managed? ---------------------------------------------------------- @@ -97,6 +98,7 @@ trx_undo_mem_create( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification*/ ulint page_no,/* in: undo log header page number */ ulint offset); /* in: undo log header byte offset on page */ /******************************************************************* @@ -109,6 +111,7 @@ trx_undo_insert_header_reuse( page_t* undo_page, /* in: insert undo log segment header page, x-latched */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA transaction identification*/ mtr_t* mtr); /* in: mtr */ /************************************************************************** If an update undo log can be discarded immediately, this function frees the @@ -484,6 +487,7 @@ trx_undo_header_create( TRX_UNDO_LOG_HDR_SIZE bytes free space on it */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA XID */ mtr_t* mtr) /* in: mtr */ { trx_upagef_t* page_hdr; @@ -530,11 +534,25 @@ trx_undo_header_create( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); - + /* If an X/Open XID exists, we store a flag + for it in the upper byte of the old dict operation field.
*/ + + if (xid != NULL && xid->formatID != -1) { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, TRUE); + } else { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + } + + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); - + + /* Write the X/Open XA transaction identification if it exists */ + + if (xid && xid->formatID != -1) { + trx_undo_write_xid(log_hdr, xid); + } + trx_undo_header_create_log(undo_page, trx_id, mtr); return(free); @@ -569,6 +587,11 @@ trx_undo_parse_page_header( mtr_t* mtr) /* in: mtr or NULL */ { dulint trx_id; + XID xid; + + /* Set X/Open XA transaction identification to NULL */ + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id); @@ -579,10 +602,10 @@ trx_undo_parse_page_header( if (page) { if (type == MLOG_UNDO_HDR_CREATE) { - trx_undo_header_create(page, trx_id, mtr); + trx_undo_header_create(page, trx_id, &xid, mtr); } else { ut_ad(type == MLOG_UNDO_HDR_REUSE); - trx_undo_insert_header_reuse(page, trx_id, mtr); + trx_undo_insert_header_reuse(page, trx_id, &xid, mtr); } } @@ -599,6 +622,7 @@ trx_undo_insert_header_reuse( page_t* undo_page, /* in: insert undo log segment header page, x-latched */ dulint trx_id, /* in: transaction id */ + XID* xid, /* in: X/Open XA transaction identification */ mtr_t* mtr) /* in: mtr */ { trx_upagef_t* page_hdr; @@ -636,8 +660,18 @@ trx_undo_insert_header_reuse( mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); - mach_write_to_2(log_hdr + TRX_UNDO_DICT_OPERATION, FALSE); + /* If an X/Open XID exists, we store it + in the log header. */ + + if (xid && xid->formatID != -1) { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, TRUE); + trx_undo_write_xid(log_hdr, xid); + } else { + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + } + + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); return(free); @@ -718,6 +752,52 @@ trx_undo_discard_latest_update_undo( } /************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ + +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid) /* in: X/Open XA Transaction Identification */ +{ + ulint i; + + mach_write_to_4(log_hdr + TRX_UNDO_XA_FORMAT, xid->formatID); + + mach_write_to_4(log_hdr + TRX_UNDO_XA_TRID_LEN, xid->gtrid_length); + + mach_write_to_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN, xid->bqual_length); + + for(i=0; i < XIDDATASIZE; i++) { + mach_write_to_1(log_hdr + TRX_UNDO_XA_XID + i, + (ulint)(xid->data[i])); + } +} + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ + +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid) /* out: X/Open XA Transaction Identification */ +{ + ulint i; + + xid->formatID = mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + + xid->bqual_length = mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + for(i=0; i < XIDDATASIZE; i++) { + xid->data[i] = (char)mach_read_from_1(log_hdr + + TRX_UNDO_XA_XID +i); + } +} + +/************************************************************************ Tries to add a page to
the undo log segment where the undo log is placed. */ ulint @@ -800,7 +880,6 @@ trx_undo_free_page( list */ ulint space, /* in: space */ ulint hdr_page_no, /* in: header page number */ - ulint hdr_offset, /* in: header offset */ ulint page_no, /* in: page number to free: must not be the header page */ mtr_t* mtr) /* in: mtr which does not have a latch to any @@ -813,7 +892,6 @@ trx_undo_free_page( trx_rsegf_t* rseg_header; ulint hist_size; - UT_NOT_USED(hdr_offset); ut_a(hdr_page_no != page_no); #ifdef UNIV_SYNC_DEBUG ut_ad(!mutex_own(&kernel_mutex)); @@ -870,8 +948,7 @@ trx_undo_free_page_in_rollback( #endif /* UNIV_SYNC_DEBUG */ last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, - undo->hdr_page_no, undo->hdr_offset, - page_no, mtr); + undo->hdr_page_no, page_no, mtr); undo->last_page_no = last_page_no; undo->size--; @@ -1039,7 +1116,7 @@ loop: trx_undo_empty_header_page(space, hdr_page_no, hdr_offset, &mtr); } else { - trx_undo_free_page(rseg, TRUE, space, hdr_page_no, hdr_offset, + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, page_no, &mtr); } @@ -1123,7 +1200,9 @@ trx_undo_mem_create_at_db_start( fil_addr_t last_addr; page_t* last_page; trx_undo_rec_t* rec; - + XID xid; + ibool xid_exists = FALSE; + if (id >= TRX_RSEG_N_SLOTS) { fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", (ulong) id); @@ -1145,15 +1224,31 @@ trx_undo_mem_create_at_db_start( undo_header = undo_page + offset; trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if exists or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + mutex_enter(&(rseg->mutex)); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); mutex_exit(&(rseg->mutex)); - undo->dict_operation = mtr_read_ulint( - undo_header + TRX_UNDO_DICT_OPERATION, - MLOG_2BYTES, mtr); + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, + MLOG_1BYTE, mtr); + undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr); undo->state = state; undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); @@ -1241,7 +1336,7 @@ trx_undo_lists_init( if (page_no != FIL_NULL && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { - + undo = trx_undo_mem_create_at_db_start(rseg, i, page_no, &mtr); size += undo->size; @@ -1272,7 +1367,8 @@ trx_undo_mem_create( ulint type, /* in: type of the log: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log - is created */ + is created */ + XID* xid, /* in: X/Open transaction identification */ ulint page_no,/* in: undo log header page number */ ulint offset) /* in: undo log header byte offset on page */ { @@ -1295,6 +1391,7 @@ trx_undo_mem_create( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1322,6 +1419,7 @@ trx_undo_mem_init_for_reuse( trx_undo_t* undo, /* in: undo log to init */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open XA transaction identification*/ ulint offset) /* in: undo log header byte offset on page */ { #ifdef UNIV_SYNC_DEBUG @@ -1339,6 +1437,7 @@ trx_undo_mem_init_for_reuse( undo->state = TRX_UNDO_ACTIVE; undo->del_marks = FALSE; undo->trx_id = 
trx_id; + undo->xid = *xid; undo->dict_operation = FALSE; @@ -1376,6 +1475,7 @@ trx_undo_create( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is created */ + XID* xid, /* in: X/Open transaction identification*/ mtr_t* mtr) /* in: mtr */ { trx_rsegf_t* rseg_header; @@ -1410,9 +1510,10 @@ trx_undo_create( page_no = buf_frame_get_page_no(undo_page); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + offset = trx_undo_header_create(undo_page, trx_id, xid, mtr); - undo = trx_undo_mem_create(rseg, id, type, trx_id, page_no, offset); + undo = trx_undo_mem_create(rseg, id, type, trx_id, xid , + page_no, offset); return(undo); } @@ -1432,6 +1533,7 @@ trx_undo_reuse_cached( TRX_UNDO_UPDATE */ dulint trx_id, /* in: id of the trx for which the undo log is used */ + XID* xid, /* in: X/Open XA transaction identification*/ mtr_t* mtr) /* in: mtr */ { trx_undo_t* undo; @@ -1475,16 +1577,17 @@ trx_undo_reuse_cached( undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); if (type == TRX_UNDO_INSERT) { - offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + offset = trx_undo_insert_header_reuse(undo_page, trx_id, + xid, mtr); } else { ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); - offset = trx_undo_header_create(undo_page, trx_id, mtr); + offset = trx_undo_header_create(undo_page, trx_id, xid, mtr); } - trx_undo_mem_init_for_reuse(undo, trx_id, offset); + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); return(undo); } @@ -1506,9 +1609,10 @@ trx_undo_mark_as_dict_operation( hdr_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - mlog_write_ulint(hdr_page + undo->hdr_offset + TRX_UNDO_DICT_OPERATION, - trx->dict_operation, MLOG_2BYTES, mtr); - + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + trx->dict_operation, MLOG_1BYTE, mtr); + mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, trx->table_id, mtr); @@ -1548,10 +1652,10 @@ trx_undo_assign_undo( #endif /* UNIV_SYNC_DEBUG */ mutex_enter(&(rseg->mutex)); - undo = trx_undo_reuse_cached(rseg, type, trx->id, &mtr); + undo = trx_undo_reuse_cached(rseg, type, trx->id, &trx->xid, &mtr); if (undo == NULL) { - undo = trx_undo_create(rseg, type, trx->id, &mtr); + undo = trx_undo_create(rseg, type, trx->id, &trx->xid, &mtr); if (undo == NULL) { /* Did not succeed */ @@ -1632,6 +1736,56 @@ trx_undo_set_state_at_finish( return(undo_page); } +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. 
*/ + +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption((byte*)undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid); + return(undo_page); +} + /************************************************************************** Adds the update undo log header as the first in the history list, and frees the memory object, or puts it to the list of cached update undo log diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index a6002d7fd83..6ed61b0b5de 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -61,8 +61,10 @@ ut_malloc_low( /*==========*/ /* out, own: allocated memory */ ulint n, /* in: number of bytes to allocate */ - ibool set_to_zero) /* in: TRUE if allocated memory should be set + ibool set_to_zero, /* in: TRUE if allocated memory should be set to zero if UNIV_SET_MEM_TO_ZERO is defined */ + ibool assert_on_error) /* in: if TRUE, we crash mysqld if the memory + cannot be allocated */ { void* ret; @@ -86,9 +88,7 @@ ut_malloc_low( "InnoDB: Check if you should increase the swap file or\n" "InnoDB: ulimits of your operating system.\n" "InnoDB: On FreeBSD check you have compiled the OS with\n" - "InnoDB: a big enough maximum process size.\n" - "InnoDB: We now intentionally generate a seg fault so that\n" - "InnoDB: on Linux we get a stack trace.\n", + "InnoDB: a big enough maximum process size.\n", (ulong) n, (ulong) ut_total_allocated_memory, #ifdef __WIN__ (ulong) GetLastError() @@ -110,7 +110,15 @@ ut_malloc_low( /* Intentional segfault on NetWare causes an abend. Avoid this by graceful exit handling in ut_a(). 
*/ #if (!defined __NETWARE__) - if (*ut_mem_null_ptr) ut_mem_null_ptr = 0; + if (assert_on_error) { + fprintf(stderr, + "InnoDB: We now intentionally generate a seg fault so that\n" + "InnoDB: on Linux we get a stack trace.\n"); + + if (*ut_mem_null_ptr) ut_mem_null_ptr = 0; + } else { + return(NULL); + } #else ut_a(0); #endif @@ -144,7 +152,7 @@ ut_malloc( /* out, own: allocated memory */ ulint n) /* in: number of bytes to allocate */ { - return(ut_malloc_low(n, TRUE)); + return(ut_malloc_low(n, TRUE, TRUE)); } /************************************************************************** diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index b67be77b29e..732380bcb1f 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -235,13 +235,18 @@ ut_get_year_month_day( *month = (ulint)cal_tm.wMonth; *day = (ulint)cal_tm.wDay; #else + struct tm cal_tm; struct tm* cal_tm_ptr; time_t tm; time(&tm); +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else cal_tm_ptr = localtime(&tm); - +#endif *year = (ulint)cal_tm_ptr->tm_year + 1900; *month = (ulint)cal_tm_ptr->tm_mon + 1; *day = (ulint)cal_tm_ptr->tm_mday;
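The XA recovery functions added to trx0trx.c above identify a prepared transaction by its X/Open XID: trx_recover_for_mysql() reports the XIDs of all transactions left in the TRX_PREPARED state, and trx_get_trx_by_xid() looks one of them up again when MySQL later delivers the commit or rollback decision. Below is a minimal, self-contained sketch of that lookup test; the xa_id_t type and XA_DATA_SIZE are stand-ins for the real XID and XIDDATASIZE definitions in trx0xa.h, and the sketch compares only the gtrid and bqual bytes held in data[].

#include <stdio.h>
#include <string.h>

#define XA_DATA_SIZE	128	/* stand-in for XIDDATASIZE */

/* Stand-in for the X/Open XID struct declared in trx0xa.h. */
typedef struct {
	long	formatID;		/* -1 means no XID is present */
	long	gtrid_length;		/* length of the global transaction id */
	long	bqual_length;		/* length of the branch qualifier */
	char	data[XA_DATA_SIZE];	/* gtrid bytes followed by bqual bytes */
} xa_id_t;

/* Returns 1 if a and b denote the same distributed transaction. */
static int
xa_id_equal(const xa_id_t* a, const xa_id_t* b)
{
	if (a->formatID == -1 || b->formatID == -1) {
		return(0);	/* a NULL XID matches nothing */
	}

	return(a->gtrid_length == b->gtrid_length
	       && a->bqual_length == b->bqual_length
	       && 0 == memcmp(a->data, b->data,
			      (size_t) (a->gtrid_length + a->bqual_length)));
}

int
main(void)
{
	xa_id_t	x = {1, 3, 2, "abcXY"};
	xa_id_t	y = {1, 3, 2, "abcXY"};
	xa_id_t	z = {1, 3, 2, "abcZZ"};

	printf("x==y: %d  x==z: %d\n", xa_id_equal(&x, &y), xa_id_equal(&x, &z));
	return(0);
}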
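trx_undo_write_xid() and trx_undo_read_xid() persist the XID in the undo log header as the format id, the two length fields and the raw data bytes, using the mach_write_to_N()/mach_read_from_N() byte-order helpers. The sketch below plays the same round trip against a plain byte buffer; the field offsets and the 4-byte big-endian helpers are hypothetical stand-ins for TRX_UNDO_XA_FORMAT and friends and for the mach_* routines, not the real header layout.

#include <stdio.h>
#include <string.h>

#define XA_DATA_SIZE	128

typedef struct {
	long	formatID;
	long	gtrid_length;
	long	bqual_length;
	char	data[XA_DATA_SIZE];
} xa_id_t;

/* Hypothetical header layout: three 4-byte fields, then the data bytes. */
enum { XA_FORMAT = 0, XA_TRID_LEN = 4, XA_BQUAL_LEN = 8, XA_XID = 12,
       XA_HDR_SIZE = 12 + XA_DATA_SIZE };

static void write_4(unsigned char* p, unsigned long v)
{
	p[0] = (unsigned char)(v >> 24); p[1] = (unsigned char)(v >> 16);
	p[2] = (unsigned char)(v >> 8);  p[3] = (unsigned char) v;
}

static unsigned long read_4(const unsigned char* p)
{
	return((unsigned long)p[0] << 24 | (unsigned long)p[1] << 16
	       | (unsigned long)p[2] << 8 | (unsigned long)p[3]);
}

static void xa_id_write(unsigned char* hdr, const xa_id_t* xid)
{
	write_4(hdr + XA_FORMAT, (unsigned long) xid->formatID);
	write_4(hdr + XA_TRID_LEN, (unsigned long) xid->gtrid_length);
	write_4(hdr + XA_BQUAL_LEN, (unsigned long) xid->bqual_length);
	memcpy(hdr + XA_XID, xid->data, XA_DATA_SIZE);
}

static void xa_id_read(const unsigned char* hdr, xa_id_t* xid)
{
	xid->formatID = (long) read_4(hdr + XA_FORMAT);
	xid->gtrid_length = (long) read_4(hdr + XA_TRID_LEN);
	xid->bqual_length = (long) read_4(hdr + XA_BQUAL_LEN);
	memcpy(xid->data, hdr + XA_XID, XA_DATA_SIZE);
}

int main(void)
{
	unsigned char	hdr[XA_HDR_SIZE];
	xa_id_t		in = {1, 3, 2, "abcXY"};
	xa_id_t		out;

	xa_id_write(hdr, &in);
	xa_id_read(hdr, &out);
	printf("formatID=%ld gtrid=%ld bqual=%ld data=%.5s\n",
	       out.formatID, out.gtrid_length, out.bqual_length, out.data);
	return(0);
}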
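The last hunk, in ut0ut.c, makes ut_get_year_month_day() use localtime_r() when HAVE_LOCALTIME_R is defined, because localtime() returns a pointer to shared static storage and is not safe when several threads ask for the date at the same time. A small stand-alone sketch of the same pattern follows; the HAVE_LOCALTIME_R macro is assumed to come from the build system, as in the patch.

#include <stdio.h>
#include <time.h>

/* Breaks the current time into year/month/day, preferring the
reentrant localtime_r() when the platform provides it. */
static void
get_year_month_day(unsigned* year, unsigned* month, unsigned* day)
{
	struct tm	cal_tm;
	struct tm*	cal_tm_ptr;
	time_t		tm;

	time(&tm);

#ifdef HAVE_LOCALTIME_R
	localtime_r(&tm, &cal_tm);	/* fills caller-owned storage */
	cal_tm_ptr = &cal_tm;
#else
	cal_tm_ptr = localtime(&tm);	/* points into a shared static buffer */
	(void) cal_tm;
#endif
	*year = (unsigned) cal_tm_ptr->tm_year + 1900;
	*month = (unsigned) cal_tm_ptr->tm_mon + 1;
	*day = (unsigned) cal_tm_ptr->tm_mday;
}

int
main(void)
{
	unsigned	y, m, d;

	get_year_month_day(&y, &m, &d);
	printf("%u-%02u-%02u\n", y, m, d);
	return(0);
}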