diff options
author | unknown <knielsen@knielsen-hq.org> | 2009-06-09 13:16:11 +0200 |
---|---|---|
committer | unknown <knielsen@knielsen-hq.org> | 2009-06-09 13:16:11 +0200 |
commit | a6b7f71329ceb7d0188572f494b5d1a1f0461fc5 (patch) | |
tree | d7e62c1af5118cd3ec9346de436569e907fcc51d /storage/xtradb/page | |
parent | b125770aaadd09e839ad9211047e88095984308b (diff) | |
parent | 107072563d771422c9bbb9aeeedce8ae19c5b838 (diff) | |
download | mariadb-git-a6b7f71329ceb7d0188572f494b5d1a1f0461fc5.tar.gz |
Import Percona XtraDB into the MariaDB source tree.
Diffstat (limited to 'storage/xtradb/page')
-rw-r--r-- | storage/xtradb/page/page0cur.c | 1922 | ||||
-rw-r--r-- | storage/xtradb/page/page0page.c | 2568 | ||||
-rw-r--r-- | storage/xtradb/page/page0zip.c | 4575 |
3 files changed, 9065 insertions, 0 deletions
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c new file mode 100644 index 00000000000..e810756c1e4 --- /dev/null +++ b/storage/xtradb/page/page0cur.c @@ -0,0 +1,1922 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0cur.h" +#ifdef UNIV_NONINL +#include "page0cur.ic" +#endif + +#include "page0zip.h" +#include "mtr0log.h" +#include "log0recv.h" +#include "rem0cmp.h" + +static ulint page_rnd = 976722341; + +#ifdef PAGE_CUR_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT +static ulint page_cur_short_succ = 0; +# endif /* UNIV_SEARCH_PERF_STAT */ + +/******************************************************************** +Tries a search shortcut based on the last insert. 
*/ +UNIV_INLINE +ibool +page_cur_try_search_shortcut( +/*=========================*/ + /* out: TRUE on success */ + const buf_block_t* block, /* in: index page */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint* iup_matched_fields, + /* in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /* in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /* out: page cursor */ +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint low_bytes; + ulint up_match; + ulint up_bytes; +#ifdef UNIV_SEARCH_DEBUG + page_cur_t cursor2; +#endif + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + + ut_pair_min(&low_match, &low_bytes, + *ilow_matched_fields, *ilow_matched_bytes, + *iup_matched_fields, *iup_matched_bytes); + + up_match = low_match; + up_bytes = low_bytes; + + if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets, + &low_match, &low_bytes) < 0) { + goto exit_func; + } + + next_rec = page_rec_get_next_const(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes) >= 0) { + goto exit_func; + } + + page_cur_position(rec, block, cursor); + +#ifdef UNIV_SEARCH_DEBUG + page_cur_search_with_match(block, index, tuple, 
PAGE_CUR_DBG, + iup_matched_fields, + iup_matched_bytes, + ilow_matched_fields, + ilow_matched_bytes, + &cursor2); + ut_a(cursor2.rec == cursor->rec); + + if (!page_rec_is_supremum(next_rec)) { + + ut_a(*iup_matched_fields == up_match); + ut_a(*iup_matched_bytes == up_bytes); + } + + ut_a(*ilow_matched_fields == low_match); + ut_a(*ilow_matched_bytes == low_bytes); +#endif + if (!page_rec_is_supremum(next_rec)) { + + *iup_matched_fields = up_match; + *iup_matched_bytes = up_bytes; + } + + *ilow_matched_fields = low_match; + *ilow_matched_bytes = low_bytes; + +#ifdef UNIV_SEARCH_PERF_STAT + page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +#endif + +#ifdef PAGE_CUR_LE_OR_EXTENDS +/******************************************************************** +Checks if the nth field in a record is a character type field which extends +the nth field in tuple, i.e., the field is longer or equal in length and has +common first characters. 
*/ +static +ibool +page_cur_rec_field_extends( +/*=======================*/ + /* out: TRUE if rec field + extends tuple field */ + const dtuple_t* tuple, /* in: data tuple */ + const rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: compare nth field */ +{ + const dtype_t* type; + const dfield_t* dfield; + const byte* rec_f; + ulint rec_f_len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + dfield = dtuple_get_nth_field(tuple, n); + + type = dfield_get_type(dfield); + + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); + + if (type->mtype == DATA_VARCHAR + || type->mtype == DATA_CHAR + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_BINARY + || type->mtype == DATA_BLOB + || type->mtype == DATA_VARMYSQL + || type->mtype == DATA_MYSQL) { + + if (dfield_get_len(dfield) != UNIV_SQL_NULL + && rec_f_len != UNIV_SQL_NULL + && rec_f_len >= dfield_get_len(dfield) + && !cmp_data_data_slow(type->mtype, type->prtype, + dfield_get_data(dfield), + dfield_get_len(dfield), + rec_f, dfield_get_len(dfield))) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + +/******************************************************************** +Searches the right position for a page cursor. 
*/ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /* in: buffer block */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /* in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /* in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /* out: page cursor */ +{ + ulint up; + ulint low; + ulint mid; + const page_t* page; + const page_dir_slot_t* slot; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint up_matched_bytes; + ulint low_matched_fields; + ulint low_matched_bytes; + ulint cur_matched_fields; + ulint cur_matched_bytes; + int cmp; +#ifdef UNIV_SEARCH_DEBUG + int dbg_cmp; + ulint dbg_matched_fields; + ulint dbg_matched_bytes; +#endif +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes + && ilow_matched_fields && ilow_matched_bytes && cursor); + ut_ad(dtuple_validate(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + 
ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + page_check_dir(page); + +#ifdef PAGE_CUR_ADAPT + if (page_is_leaf(page) + && (mode == PAGE_CUR_LE) + && (page_header_get_field(page, PAGE_N_DIRECTION) > 3) + && (page_header_get_ptr(page, PAGE_LAST_INSERT)) + && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { + + if (page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return; + } + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + up_matched_bytes = *iup_matched_bytes; + low_matched_fields = *ilow_matched_fields; + low_matched_bytes = *ilow_matched_bytes; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. 
*/ + + low = 0; + up = page_dir_get_n_slots(page) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + + while (up - low > 1) { + mid = (low + up) / 2; + slot = page_dir_get_nth_slot(page, mid); + mid_rec = page_dir_slot_get_rec(slot); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + slot = page_dir_get_nth_slot(page, low); + low_rec = page_dir_slot_get_rec(slot); + slot = page_dir_get_nth_slot(page, up); + up_rec = page_dir_slot_get_rec(slot); + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
*/ + + while (page_rec_get_next_const(low_rec) != up_rec) { + + mid_rec = page_rec_get_next_const(low_rec); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + +#ifdef UNIV_SEARCH_DEBUG + + /* Check that the lower and upper limit records have the + right alphabetical order compared to tuple. 
*/ + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp >= 0); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp >= 0); + } + + if (!page_rec_is_infimum(low_rec)) { + + ut_a(low_matched_fields == dbg_matched_fields); + ut_a(low_matched_bytes == dbg_matched_bytes); + } + + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp == -1); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp == -1); + } + + if (!page_rec_is_supremum(up_rec)) { + + ut_a(up_matched_fields == dbg_matched_fields); + ut_a(up_matched_bytes == dbg_matched_bytes); + } +#endif + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *iup_matched_bytes = up_matched_bytes; + *ilow_matched_fields = low_matched_fields; + *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*************************************************************** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. 
*/ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /* in: page */ + page_cur_t* cursor) /* out: page cursor */ +{ + ulint rnd; + ulint n_recs = page_get_n_recs(buf_block_get_frame(block)); + + page_cur_set_before_first(block, cursor); + + if (UNIV_UNLIKELY(n_recs == 0)) { + + return; + } + + page_rnd += 87584577; + + rnd = page_rnd % n_recs; + + do { + page_cur_move_to_next(cursor); + } while (rnd--); +} + +/*************************************************************** +Writes the log record of a record insert on a page. */ +static +void +page_cur_insert_rec_write_log( +/*==========================*/ + rec_t* insert_rec, /* in: inserted physical record */ + ulint rec_size, /* in: insert_rec size */ + rec_t* cursor_rec, /* in: record the + cursor is pointing to */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint cur_rec_size; + ulint extra_size; + ulint cur_extra_size; + const byte* ins_ptr; + byte* log_ptr; + const byte* log_end; + ulint i; + + ut_a(rec_size < UNIV_PAGE_SIZE); + ut_ad(page_align(insert_rec) == page_align(cursor_rec)); + ut_ad(!page_rec_is_comp(insert_rec) + == !dict_table_is_comp(index->table)); + + { + mem_heap_t* heap = NULL; + ulint cur_offs_[REC_OFFS_NORMAL_SIZE]; + ulint ins_offs_[REC_OFFS_NORMAL_SIZE]; + + ulint* cur_offs; + ulint* ins_offs; + + rec_offs_init(cur_offs_); + rec_offs_init(ins_offs_); + + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); + + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ins_ptr = insert_rec - extra_size; + + i = 0; + + if (cur_extra_size == extra_size) { + ulint min_rec_size 
= ut_min(cur_rec_size, rec_size); + + const byte* cur_ptr = cursor_rec - cur_extra_size; + + /* Find out the first byte in insert_rec which differs from + cursor_rec; skip the bytes in the record info */ + + do { + if (*ins_ptr == *cur_ptr) { + i++; + ins_ptr++; + cur_ptr++; + } else if ((i < extra_size) + && (i >= extra_size + - page_rec_get_base_extra_size + (insert_rec))) { + i = extra_size; + ins_ptr = insert_rec; + cur_ptr = cursor_rec; + } else { + break; + } + } while (i < min_rec_size); + } + + if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { + + if (page_rec_is_comp(insert_rec)) { + log_ptr = mlog_open_and_write_index( + mtr, insert_rec, index, MLOG_COMP_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + } else { + log_ptr = mlog_open(mtr, 11 + + 2 + 5 + 1 + 5 + 5 + + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + insert_rec, MLOG_REC_INSERT, log_ptr, mtr); + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(cursor_rec)); + log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + } + + if (page_rec_is_comp(insert_rec)) { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, TRUE) + != rec_get_info_and_status_bits(cursor_rec, TRUE))) { + + goto need_extra_info; + } + } else { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, FALSE) + != rec_get_info_and_status_bits(cursor_rec, FALSE))) { + + 
goto need_extra_info; + } + } + + if (extra_size != cur_extra_size || rec_size != cur_rec_size) { +need_extra_info: + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, + 2 * (rec_size - i) + 1); + + /* Write the info bits */ + mach_write_to_1(log_ptr, + rec_get_info_and_status_bits( + insert_rec, + page_rec_is_comp(insert_rec))); + log_ptr++; + + /* Write the record origin offset */ + log_ptr += mach_write_compressed(log_ptr, extra_size); + + /* Write the mismatch index */ + log_ptr += mach_write_compressed(log_ptr, i); + + ut_a(i < UNIV_PAGE_SIZE); + ut_a(extra_size < UNIV_PAGE_SIZE); + } else { + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i)); + } + + /* Write to the log the inserted index record end segment which + differs from the cursor record */ + + rec_size -= i; + + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); + } +} + +/*************************************************************** +Parses a log record of a record insert on a page. 
*/ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint origin_offset; + ulint end_seg_len; + ulint mismatch_index; + page_t* page; + rec_t* cursor_rec; + byte buf1[1024]; + byte* buf; + byte* ptr2 = ptr; + ulint info_and_status_bits = 0; /* remove warning */ + page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = block ? buf_block_get_frame(block) : NULL; + + if (is_short) { + cursor_rec = page_rec_get_prev(page_get_supremum_rec(page)); + } else { + ulint offset; + + /* Read the cursor rec offset as a 2-byte ulint */ + + if (UNIV_UNLIKELY(end_ptr < ptr + 2)) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + cursor_rec = page + offset; + + if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) { + + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + } + + ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len); + + if (ptr == NULL) { + + return(NULL); + } + + if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (end_seg_len & 0x1UL) { + /* Read the info bits */ + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_and_status_bits = mach_read_from_1(ptr); + ptr++; + + ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(origin_offset < UNIV_PAGE_SIZE); + + ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(mismatch_index < UNIV_PAGE_SIZE); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) { + + return(NULL); + } + + if 
(!block) { + + return(ptr + (end_seg_len >> 1)); + } + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + /* Read from the log the inserted index record end segment which + differs from the cursor record */ + + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!(end_seg_len & 0x1UL)) { + info_and_status_bits = rec_get_info_and_status_bits( + cursor_rec, page_is_comp(page)); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1); + } + + end_seg_len >>= 1; + + if (mismatch_index + end_seg_len < sizeof buf1) { + buf = buf1; + } else { + buf = mem_alloc(mismatch_index + end_seg_len); + } + + /* Build the inserted record to buf */ + + if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "Is short %lu, info_and_status_bits %lu, offset %lu, " + "o_offset %lu\n" + "mismatch index %lu, end_seg_len %lu\n" + "parsed len %lu\n", + (ulong) is_short, (ulong) info_and_status_bits, + (ulong) page_offset(cursor_rec), + (ulong) origin_offset, + (ulong) mismatch_index, (ulong) end_seg_len, + (ulong) (ptr - ptr2)); + + fputs("Dump of 300 bytes of log:\n", stderr); + ut_print_buf(stderr, ptr2, 300); + putc('\n', stderr); + + buf_page_print(page, 0); + + ut_error; + } + + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); + ut_memcpy(buf + mismatch_index, ptr, end_seg_len); + + if (page_is_comp(page)) { + rec_set_info_and_status_bits(buf + origin_offset, + info_and_status_bits); + } else { + rec_set_info_bits_old(buf + origin_offset, + info_and_status_bits); + } + + page_cur_position(cursor_rec, block, &cursor); + + offsets = rec_get_offsets(buf + origin_offset, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, + buf + origin_offset, + index, offsets, mtr))) { + /* The redo log record should only have been written + after the 
write was successful. */ + ut_error; + } + + if (buf != buf1) { + + mem_free(buf); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ptr + end_seg_len); +} + +/*************************************************************** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t* current_rec,/* in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous + insert */ + rec_t* free_rec; /* a free record that was reused, + or NULL */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted + record */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(current_rec); + ut_ad(dict_table_is_comp(index->table) + == (ibool) !!page_is_comp(page)); + + ut_ad(!page_rec_is_supremum(current_rec)); + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. 
*/ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. Try to find suitable space from page memory management */ + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + } else { + heap_no = rec_get_heap_no_old(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, FALSE), + rec_size); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, NULL, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. Insert the record in the linked list of records */ + ut_ad(current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(current_rec); +#ifdef UNIV_DEBUG + if (page_is_comp(page)) { + ut_ad(rec_get_status(current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + } +#endif + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(current_rec, insert_rec); + } + + page_header_set_field(page, NULL, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. 
Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + if (page_is_comp(page)) { + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + } else { + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, heap_no); + } + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !page_is_comp(page) + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, NULL, n_owned + 1); + } else { + n_owned = rec_get_n_owned_old(owner_rec); + rec_set_n_owned_old(owner_rec, n_owned + 1); + } + + /* 8. 
Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, NULL, + page_dir_find_owner_slot(owner_rec)); + } + } + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + } + + return(insert_rec); +} + +/*************************************************************** +Compresses or reorganizes a page after an optimistic insert. */ +static +rec_t* +page_cur_insert_rec_zip_reorg( +/*==========================*/ + /* out: rec if succeed, NULL otherwise */ + rec_t** current_rec,/* in/out: pointer to current record after + which the new record is inserted */ + buf_block_t* block, /* in: buffer block */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: inserted record */ + page_t* page, /* in: uncompressed page */ + page_zip_des_t* page_zip,/* in: compressed page */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + ulint pos; + + /* Recompress or reorganize and recompress the page. */ + if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) { + return(rec); + } + + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + pos = page_rec_get_n_recs_before(rec); + + if (page_zip_reorganize(block, index, mtr)) { + /* The page was reorganized: Find rec by seeking to pos, + and update *current_rec. */ + rec = page + PAGE_NEW_INFIMUM; + + while (--pos) { + rec = page + rec_get_next_offs(rec, TRUE); + } + + *current_rec = rec; + rec = page + rec_get_next_offs(rec, TRUE); + + return(rec); + } + + /* Out of space: restore the page */ + if (!page_zip_decompress(page_zip, page)) { + ut_error; /* Memory corrupted? 
*/ + } + ut_ad(page_validate(page, index)); + return(NULL); +} + +/*************************************************************** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t** current_rec,/* in/out: pointer to current record after + which the new record is inserted */ + buf_block_t* block, /* in: buffer block of *current_rec */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous + insert */ + rec_t* free_rec; /* a free record that was reused, + or NULL */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted + record */ + page_zip_des_t* page_zip; + + page_zip = buf_block_get_page_zip(block); + ut_ad(page_zip); + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(*current_rec); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(page_is_comp(page)); + + ut_ad(!page_rec_is_supremum(*current_rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. 
*/ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. Try to find suitable space from page memory management */ + if (!page_zip_available(page_zip, dict_index_is_clust(index), + rec_size, 1)) { + + /* Try compressing the whole page afterwards. */ + insert_rec = page_cur_insert_rec_low(*current_rec, + index, rec, offsets, + NULL); + + if (UNIV_LIKELY(insert_rec != NULL)) { + insert_rec = page_cur_insert_rec_zip_reorg( + current_rec, block, index, insert_rec, + page, page_zip, mtr); + } + + return(insert_rec); + } + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + lint extra_size_diff; + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { +too_small: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + extra_size_diff + = rec_offs_extra_size(offsets) + - rec_offs_extra_size(foffsets); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) { + /* Add an offset to the extra_size. 
*/ + if (rec_offs_size(foffsets) + < rec_size - extra_size_diff) { + + goto too_small; + } + + insert_buf -= extra_size_diff; + } else if (UNIV_UNLIKELY(extra_size_diff)) { + /* Do not allow extra_size to grow */ + + goto too_small; + } + + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, page_zip, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + + if (!page_is_leaf(page)) { + /* Zero out the node pointer of free_rec, + in case it will not be overwritten by + insert_rec. */ + + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_extra_size(foffsets) + + rec_offs_data_size(foffsets) > rec_size) { + + memset(rec_get_end(free_rec, foffsets) + - REC_NODE_PTR_SIZE, 0, + REC_NODE_PTR_SIZE); + } + } else if (dict_index_is_clust(index)) { + /* Zero out the DB_TRX_ID and DB_ROLL_PTR + columns of free_rec, in case it will not be + overwritten by insert_rec. */ + + ulint trx_id_col; + ulint trx_id_offs; + ulint len; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + trx_id_offs = rec_get_nth_field_offs(foffsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) { + /* We will have to zero out the + DB_TRX_ID and DB_ROLL_PTR, because + they will not be fully overwritten by + insert_rec. */ + + memset(free_rec + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN + == rec_get_nth_field(free_rec, foffsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, page_zip, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + + page_zip_dir_add_slot(page_zip, dict_index_is_clust(index)); + } + + /* 3. 
Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. Insert the record in the linked list of records */ + ut_ad(*current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(*current_rec); + ut_ad(rec_get_status(*current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(*current_rec, insert_rec); + } + + page_header_set_field(page, page_zip, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec); + + /* 6. 
Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == *current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, page_zip, + page_dir_find_owner_slot(owner_rec)); + } + } + + page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + + /* 9. 
Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + *current_rec, index, mtr); + } + + return(insert_rec); +} + +/************************************************************** +Writes a log record of copying a record list end to a new created page. */ +UNIV_INLINE +byte* +page_copy_rec_list_to_created_page_write_log( +/*=========================================*/ + /* out: 4-byte field where to + write the log data length, + or NULL if logging is disabled */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, page, index, + page_is_comp(page) + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + if (UNIV_LIKELY(log_ptr != NULL)) { + mlog_close(mtr, log_ptr + 4); + } + + return(log_ptr); +} + +/************************************************************** +Parses a log record of copying a record list end to a new created page. 
*/
UNIV_INTERN
byte*
page_parse_copy_rec_list_to_created_page(
/*=====================================*/
				/* out: end of log record, or NULL if
				the record is not completely contained
				in the buffer */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr,/* in: buffer end */
	buf_block_t*	block,	/* in: page or NULL */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr or NULL */
{
	byte*		rec_end;
	ulint		log_data_len;
	page_t*		page;
	page_zip_des_t*	page_zip;

	/* The record starts with a 4-byte length of the data that
	follows. Compare the remaining buffer length instead of forming
	ptr + n first: a pointer beyond the end of the buffer is not
	guaranteed to compare correctly and may wrap around when the
	length read from the log is large. */

	if (end_ptr < ptr || (ulint) (end_ptr - ptr) < 4) {

		return(NULL);
	}

	log_data_len = mach_read_from_4(ptr);
	ptr += 4;

	if (log_data_len > (ulint) (end_ptr - ptr)) {
		/* The logged data is not completely in the buffer */

		return(NULL);
	}

	rec_end = ptr + log_data_len;

	if (!block) {
		/* Nothing to apply: only report where the record ends */

		return(rec_end);
	}

	/* Apply the logged inserts one by one until the logged
	length has been consumed */

	while (ptr < rec_end) {
		ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr,
						block, index, mtr);
	}

	ut_a(ptr == rec_end);

	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	/* Reset the last insert info in the page header */

	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
	page_header_set_field(page, page_zip, PAGE_DIRECTION,
			      PAGE_NO_DIRECTION);
	page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);

	return(rec_end);
}

+/***************************************************************** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. 
*/ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /* in/out: index page to copy to */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_dir_slot_t* slot = 0; /* remove warning */ + byte* heap_top; + rec_t* insert_rec = 0; /* remove warning */ + rec_t* prev_rec; + ulint count; + ulint n_recs; + ulint slot_index; + ulint rec_size; + ulint log_mode; + byte* log_ptr; + ulint log_data_len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + ut_ad(page_align(rec) != new_page); + ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); + + if (page_rec_is_infimum(rec)) { + + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + +#ifdef UNIV_DEBUG + /* To pass the debug tests we have to set these dummy values + in the debug version */ + page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, + new_page + UNIV_PAGE_SIZE - 1); +#endif + + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); + + log_data_len = dyn_array_get_data_size(&(mtr->log)); + + /* Individual inserts are logged in a shorter form */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); + + prev_rec = page_get_infimum_rec(new_page); + if (page_is_comp(new_page)) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } + count = 0; + slot_index = 0; + n_recs = 0; + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_new(insert_rec, NULL, 0); + 
rec_set_heap_no_new(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } else { + rec_set_next_offs_old(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } + + count++; + n_recs++; + + if (UNIV_UNLIKELY + (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) { + + slot_index++; + + slot = page_dir_get_nth_slot(new_page, slot_index); + + page_dir_slot_set_rec(slot, insert_rec); + page_dir_slot_set_n_owned(slot, NULL, count); + + count = 0; + } + + rec_size = rec_offs_size(offsets); + + ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + + heap_top += rec_size; + + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + prev_rec = insert_rec; + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + if ((slot_index > 0) && (count + 1 + + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 + <= PAGE_DIR_SLOT_MAX_N_OWNED)) { + /* We can merge the two last dir slots. This operation is + here to make this function imitate exactly the equivalent + task made using page_cur_insert_rec, which we use in database + recovery to reproduce the task performed by this function. + To be able to check the correctness of recovery, it is good + that it imitates exactly. 
*/ + + count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + page_dir_slot_set_n_owned(slot, NULL, 0); + + slot_index--; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; + + ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); + + if (UNIV_LIKELY(log_ptr != NULL)) { + mach_write_to_4(log_ptr, log_data_len); + } + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); + } + + slot = page_dir_get_nth_slot(new_page, 1 + slot_index); + + page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); + page_dir_slot_set_n_owned(slot, NULL, count + 1); + + page_dir_set_n_slots(new_page, NULL, 2 + slot_index); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top); + page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs); + page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); + + page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(new_page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); + + /* Restore the log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/*************************************************************** +Writes log record of a record delete on a page. */ +UNIV_INLINE +void +page_cur_delete_rec_write_log( +/*==========================*/ + rec_t* rec, /* in: record to be deleted */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? 
MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + + mlog_close(mtr, log_ptr + 2); +} + +/*************************************************************** +Parses log record of a record delete on a page. */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint offset; + page_cur_t cursor; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + /* Read the cursor rec offset as a 2-byte ulint */ + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (block) { + page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec = page + offset; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cursor); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + page_cur_delete_rec(&cursor, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(ptr); +} + +/*************************************************************** +Deletes a record at the page cursor. The cursor is moved to the next +record after the deleted one. 
*/ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /* in/out: a page cursor */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + page_dir_slot_t* cur_dir_slot; + page_dir_slot_t* prev_slot; + page_t* page; + page_zip_des_t* page_zip; + rec_t* current_rec; + rec_t* prev_rec = NULL; + rec_t* next_rec; + ulint cur_slot_no; + ulint cur_n_owned; + rec_t* rec; + + ut_ad(cursor && mtr); + + page = page_cur_get_page(cursor); + page_zip = page_cur_get_page_zip(cursor); + + /* page_zip_validate() will fail here when + btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark(). + Then, both "page_zip" and "page" would have the min-rec-mark + set on the smallest user record, but "page" would additionally + have it set on the smallest-but-one record. Because sloppy + page_zip_validate_low() only ignores min-rec-flag differences + in the smallest user record, it cannot be used here either. */ + + current_rec = cursor->rec; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + /* The record must not be the supremum or infimum record. */ + ut_ad(page_rec_is_user_rec(current_rec)); + + /* Save to local variables some data associated with current_rec */ + cur_slot_no = page_dir_find_owner_slot(current_rec); + cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); + cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); + + /* 0. Write the log record */ + page_cur_delete_rec_write_log(current_rec, index, mtr); + + /* 1. Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + + /* 2. 
Find the next and the previous record. Note that the cursor is + left at the next record. */ + + ut_ad(cur_slot_no > 0); + prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1); + + rec = (rec_t*) page_dir_slot_get_rec(prev_slot); + + /* rec now points to the record of the previous directory slot. Look + for the immediate predecessor of current_rec in a loop. */ + + while(current_rec != rec) { + prev_rec = rec; + rec = page_rec_get_next(rec); + } + + page_cur_move_to_next(cursor); + next_rec = cursor->rec; + + /* 3. Remove the record from the linked list of records */ + + page_rec_set_next(prev_rec, next_rec); + + /* 4. If the deleted record is pointed to by a dir slot, update the + record pointer in slot. In the following if-clause we assume that + prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED + >= 2. */ + +#if PAGE_DIR_SLOT_MIN_N_OWNED < 2 +# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2" +#endif + ut_ad(cur_n_owned > 1); + + if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { + page_dir_slot_set_rec(cur_dir_slot, prev_rec); + } + + /* 5. Update the number of owned records of the slot */ + + page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); + + /* 6. Free the memory occupied by the record */ + page_mem_free(page, page_zip, current_rec, index, offsets); + + /* 7. Now we have decremented the number of owned records of the slot. + If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the + slots. 
*/ + + if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) { + page_dir_balance_slot(page, page_zip, cur_slot_no); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ +} diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c new file mode 100644 index 00000000000..0ce532068ce --- /dev/null +++ b/storage/xtradb/page/page0page.c @@ -0,0 +1,2568 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#define THIS_MODULE +#include "page0page.h" +#ifdef UNIV_NONINL +#include "page0page.ic" +#endif +#undef THIS_MODULE + +#include "page0cur.h" +#include "page0zip.h" +#include "lock0lock.h" +#include "fut0lst.h" +#include "btr0sea.h" +#include "buf0buf.h" +#include "srv0srv.h" +#include "btr0btr.h" + +/* THE INDEX PAGE + ============== + +The index page consists of a page header which contains the page's +id and other information. 
On top of it are the index records +in a heap linked into a one way linear list according to alphabetic order. + +Just below page end is an array of pointers which we call page directory, +to about every sixth record in the list. The pointers are placed in +the directory in the alphabetical order of the records pointed to, +enabling us to make binary search using the array. Each slot n:o I +in the directory points to a record, where a 4-bit field contains a count +of those records which are in the linear list between pointer I and +the pointer I - 1 in the directory, including the record +pointed to by pointer I and not including the record pointed to by I - 1. +We say that the record pointed to by slot I, or that slot I, owns +these records. The count is always kept in the range 4 to 8, with +the exception that it is 1 for the first slot, and 1--8 for the last slot. + +An essentially binary search can be performed in the list of index +records, like we could do if we had a pointer to every record in the +page directory. The data structure is, however, more efficient when +we are doing inserts, because most inserts are just pushed on a heap. +Only every 8th insert requires block move in the directory pointer +table, which itself is quite small. A record is deleted from the page +by just taking it off the linear list and updating the number of owned +records-field of the record which owns it, and updating the page directory, +if necessary. A special case is the one when the record owns itself. +Because the overhead of inserts is so small, we may also increase the +page size from the projected default of 8 kB to 64 kB without too +much loss of efficiency in inserts. A bigger page becomes relevant +when the disk transfer rate compared to seek and latency time rises. +On the present system, the page size is set so that the page transfer +time (3 ms) is 20 % of the disk random access time (15 ms). 
+ +When the page is split, merged, or becomes full but contains deleted +records, we have to reorganize the page. + +Assuming a page size of 8 kB, a typical index page of a secondary +index contains 300 index entries, and the size of the page directory +is 50 x 4 bytes = 200 bytes. */ + +/******************************************************************* +Looks for the directory slot which owns the given record. */ +UNIV_INTERN +ulint +page_dir_find_owner_slot( +/*=====================*/ + /* out: the directory slot number */ + const rec_t* rec) /* in: the physical record */ +{ + const page_t* page; + register uint16 rec_offs_bytes; + register const page_dir_slot_t* slot; + register const page_dir_slot_t* first_slot; + register const rec_t* r = rec; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + first_slot = page_dir_get_nth_slot(page, 0); + slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1); + + if (page_is_comp(page)) { + while (rec_get_n_owned_new(r) == 0) { + r = rec_get_next_ptr_const(r, TRUE); + ut_ad(r >= page + PAGE_NEW_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } else { + while (rec_get_n_owned_old(r) == 0) { + r = rec_get_next_ptr_const(r, FALSE); + ut_ad(r >= page + PAGE_OLD_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } + + rec_offs_bytes = mach_encode_2(r - page); + + while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) { + + if (UNIV_UNLIKELY(slot == first_slot)) { + fprintf(stderr, + "InnoDB: Probable data corruption on" + " page %lu\n" + "InnoDB: Original record ", + (ulong) page_get_page_no(page)); + + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, rec); + } + + fputs("\n" + "InnoDB: on that page.\n" + "InnoDB: Cannot find the dir slot for record ", + stderr); + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, page + + mach_decode_2(rec_offs_bytes)); + } + fputs("\n" + 
"InnoDB: on that page!\n", stderr); + + buf_page_print(page, 0); + + ut_error; + } + + slot += PAGE_DIR_SLOT_SIZE; + } + + return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE); +} + +/****************************************************************** +Used to check the consistency of a directory slot. */ +static +ibool +page_dir_slot_check( +/*================*/ + /* out: TRUE if succeed */ + page_dir_slot_t* slot) /* in: slot */ +{ + page_t* page; + ulint n_slots; + ulint n_owned; + + ut_a(slot); + + page = page_align(slot); + + n_slots = page_dir_get_n_slots(page); + + ut_a(slot <= page_dir_get_nth_slot(page, 0)); + ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); + + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); + + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot)); + } else { + n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot)); + } + + if (slot == page_dir_get_nth_slot(page, 0)) { + ut_a(n_owned == 1); + } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { + ut_a(n_owned >= 1); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } else { + ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } + + return(TRUE); +} + +/***************************************************************** +Sets the max trx id field value. */ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dulint trx_id) /* in: transaction id */ +{ + const ibool is_hashed = block->is_hashed; + page_t* page = buf_block_get_frame(block); + + if (is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + /* It is not necessary to write this change to the redo log, as + during a database recovery we assume that the max trx id of every + page is the maximum trx id assigned before the crash. 
*/ + + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + 8, NULL); + } + + if (is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } +} + +/**************************************************************** +Allocates a block of memory from the heap of an index page. */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /* in: total number of bytes needed */ + ulint* heap_no)/* out: this contains the heap number + of the allocated record + if allocation succeeds */ +{ + byte* block; + ulint avl_space; + + ut_ad(page && heap_no); + + avl_space = page_get_max_insert_size(page, 1); + + if (avl_space >= need) { + block = page_header_get_ptr(page, PAGE_HEAP_TOP); + + page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, + block + need); + *heap_no = page_dir_get_n_heap(page); + + page_dir_set_n_heap(page, page_zip, 1 + *heap_no); + + return(block); + } + + return(NULL); +} + +/************************************************************** +Writes a log record of page creation. */ +UNIV_INLINE +void +page_create_write_log( +/*==================*/ + buf_frame_t* frame, /* in: a buffer frame where the page is + created */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ +{ + mlog_write_initial_log_record(frame, comp + ? MLOG_COMP_PAGE_CREATE + : MLOG_PAGE_CREATE, mtr); +} + +/*************************************************************** +Parses a redo log record of creating a page. 
*/ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + buf_block_t* block, /* in: block or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (block) { + page_create(block, mtr, comp); + } + + return(ptr); +} + +/************************************************************** +The index page creation function. */ +static +page_t* +page_create_low( +/*============*/ + /* out: pointer to the page */ + buf_block_t* block, /* in: a buffer block where the + page is created */ + ulint comp) /* in: nonzero=compact page format */ +{ + page_dir_slot_t* slot; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* field; + byte* heap_top; + rec_t* infimum_rec; + rec_t* supremum_rec; + page_t* page; + dict_index_t* index; + ulint* offsets; + + ut_ad(block); +#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA" +#endif +#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA" +#endif + + /* The infimum and supremum records use a dummy index. */ + if (UNIV_LIKELY(comp)) { + index = srv_sys->dummy_ind2; + } else { + index = srv_sys->dummy_ind1; + } + + /* 1. INCREMENT MODIFY CLOCK */ + buf_block_modify_clock_inc(block); + + page = buf_block_get_frame(block); + + fil_page_set_type(page, FIL_PAGE_INDEX); + + heap = mem_heap_create(200); + + /* 3. 
CREATE THE INFIMUM AND SUPREMUM RECORDS */ + + /* Create first a data tuple for infimum record */ + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8); + /* Set the corresponding physical record to its place in the page + record heap */ + + heap_top = page + PAGE_DATA; + + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); + + rec_set_n_owned_new(infimum_rec, NULL, 1); + rec_set_heap_no_new(infimum_rec, 0); + } else { + ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); + + rec_set_n_owned_old(infimum_rec, 1); + rec_set_heap_no_old(infimum_rec, 0); + } + + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); + + /* Create then a tuple for supremum */ + + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "supremum", comp ? 8 : 9); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9); + + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); + + rec_set_n_owned_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, 1); + } else { + ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); + + rec_set_n_owned_old(supremum_rec, 1); + rec_set_heap_no_old(supremum_rec, 1); + } + + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == page + + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); + + mem_heap_free(heap); + + /* 4. 
INITIALIZE THE PAGE */ + + page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2); + page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top); + page_header_set_field(page, NULL, PAGE_N_HEAP, comp + ? 0x8000 | PAGE_HEAP_NO_USER_LOW + : PAGE_HEAP_NO_USER_LOW); + page_header_set_ptr(page, NULL, PAGE_FREE, NULL); + page_header_set_field(page, NULL, PAGE_GARBAGE, 0); + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + page_header_set_field(page, NULL, PAGE_N_RECS, 0); + page_set_max_trx_id(block, NULL, ut_dulint_zero); + memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START + - page_offset(heap_top)); + + /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */ + + /* Set the slots to point to infimum and supremum. */ + + slot = page_dir_get_nth_slot(page, 0); + page_dir_slot_set_rec(slot, infimum_rec); + + slot = page_dir_get_nth_slot(page, 1); + page_dir_slot_set_rec(slot, supremum_rec); + + /* Set the next pointers in infimum and supremum */ + + if (UNIV_LIKELY(comp)) { + rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(supremum_rec, 0); + } else { + rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM); + rec_set_next_offs_old(supremum_rec, 0); + } + + return(page); +} + +/************************************************************** +Create an uncompressed B-tree index page. */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + /* out: pointer to the page */ + buf_block_t* block, /* in: a buffer block where the + page is created */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp) /* in: nonzero=compact page format */ +{ + page_create_write_log(buf_block_get_frame(block), mtr, comp); + return(page_create_low(block, comp)); +} + +/************************************************************** +Create a compressed B-tree index page. 
*/ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + /* out: pointer to the page */ + buf_block_t* block, /* in/out: a buffer frame where the + page is created */ + dict_index_t* index, /* in: the index of the page */ + ulint level, /* in: the B-tree level of the page */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + page_t* page; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + ut_ad(block); + ut_ad(page_zip); + ut_ad(index); + ut_ad(dict_table_is_comp(index->table)); + + page = page_create_low(block, TRUE); + mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); + + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + /* The compression of a newly created page + should always succeed. */ + ut_error; + } + + return(page); +} + +/***************************************************************** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /* in: index page to copy to */ + buf_block_t* block, /* in: index page of rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t cur1; + rec_t* cur2; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + + page_cur_move_to_next(&cur1); + } + + ut_a((ibool)!!page_is_comp(new_page) + == dict_table_is_comp(index->table)); + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (page_is_comp(new_page) ? 
PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur2 = page_get_infimum_rec(buf_block_get_frame(new_block)); + + /* Copy records from the original page to the new page */ + + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + if (UNIV_UNLIKELY(!ins_rec)) { + /* Track an assertion failure reported on the mailing + list on June 18th, 2003 */ + + buf_page_print(new_page, 0); + buf_page_print(page_align(rec), 0); + ut_print_timestamp(stderr); + + fprintf(stderr, + "InnoDB: rec offset %lu, cur1 offset %lu," + " cur2 offset %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(page_cur_get_rec(&cur1)), + (ulong) page_offset(cur2)); + ut_error; + } + + page_cur_move_to_next(&cur1); + cur2 = ins_rec; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************** +Copies records from page to new_page, from a given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. 
*/ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + /* out: pointer to the original + successor of the infimum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_t* page = page_align(rec); + rec_t* ret = page_rec_get_next( + page_get_infimum_rec(new_page)); + ulint log_mode = 0; /* remove warning */ + +#ifdef UNIV_ZIP_DEBUG + if (new_page_zip) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + ut_a(page_zip); + + /* Strict page_zip_validate() may fail here. + Furthermore, btr_compress() may set FIL_PAGE_PREV to + FIL_NULL on new_page while leaving it intact on + new_page_zip. So, we cannot validate new_page_zip. */ + ut_a(page_zip_validate_low(page_zip, page, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(buf_block_get_frame(block) == page); + ut_ad(page_is_leaf(page) == page_is_leaf(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); + /* Here, "ret" may be pointing to a user record or the + predefined supremum record. */ + + if (UNIV_LIKELY_NULL(new_page_zip)) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) { + page_copy_rec_list_end_to_created_page(new_page, rec, + index, mtr); + } else { + page_copy_rec_list_end_no_locks(new_block, block, rec, + index, mtr); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY + (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. 
*/ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the successor of + the predefined infimum record. It must still + have at least one predecessor (the predefined + infimum record, or a freshly copied record + that is smaller than "ret"). */ + ut_a(ret_pos > 0); + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. */ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update the lock table, MAX_TRX_ID, and possible hash index */ + + lock_move_rec_list_end(new_block, block, rec); + + page_update_max_trx_id(new_block, new_page_zip, + page_get_max_trx_id(page)); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/***************************************************************** +Copies records from page to new_page, up to the given record, +NOT including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. 
*/ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + /* out: pointer to the original + predecessor of the supremum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_cur_t cur1; + rec_t* cur2; + ulint log_mode = 0 /* remove warning */; + mem_heap_t* heap = NULL; + rec_t* ret + = page_rec_get_prev(page_get_supremum_rec(new_page)); + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + /* Here, "ret" may be pointing to a user record or the + predefined infimum record. */ + + if (page_rec_is_infimum(rec)) { + + return(ret); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + cur2 = ret; + + /* Copy records from the original page to the new page */ + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + cur2 = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + ut_a(cur2); + + page_cur_move_to_next(&cur1); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY + (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. 
*/ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the predecessor + of the predefined supremum record. If it was + the predefined infimum record, then it would + still be the infimum. Thus, the assertion + ut_a(ret_pos > 0) would fail here. */ + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. */ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update MAX_TRX_ID, the lock table, and possible hash index */ + + page_update_max_trx_id(new_block, new_page_zip, + page_get_max_trx_id(page_align(rec))); + + lock_move_rec_list_start(new_block, block, rec, ret); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/************************************************************** +Writes a log record of a record list end or start deletion. */ +UNIV_INLINE +void +page_delete_rec_list_write_log( +/*===========================*/ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + mlog_close(mtr, log_ptr + 2); + } +} + +/************************************************************** +Parses a log record of a record list end or start deletion. 
*/
UNIV_INTERN
byte*
page_parse_delete_rec_list(
/*=======================*/
				/* out: end of log record or NULL */
	byte		type,	/* in: MLOG_LIST_END_DELETE,
				MLOG_LIST_START_DELETE,
				MLOG_COMP_LIST_END_DELETE or
				MLOG_COMP_LIST_START_DELETE */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr,/* in: buffer end */
	buf_block_t*	block,	/* in/out: buffer block or NULL */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr or NULL */
{
	ulint	offset;
	page_t*	page;
	rec_t*	rec;

	ut_ad(type == MLOG_LIST_END_DELETE
	      || type == MLOG_LIST_START_DELETE
	      || type == MLOG_COMP_LIST_END_DELETE
	      || type == MLOG_COMP_LIST_START_DELETE);

	/* The body of the log record is the 2-byte record offset */

	if (end_ptr < ptr + 2) {
		/* The record is not completely in the buffer */

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;

	if (block == NULL) {
		/* Only parsing was requested: nothing to apply */

		return(ptr);
	}

	page = buf_block_get_frame(block);

	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

	rec = page + offset;

	switch (type) {
	case MLOG_LIST_END_DELETE:
	case MLOG_COMP_LIST_END_DELETE:
		page_delete_rec_list_end(rec, block, index,
					 ULINT_UNDEFINED, ULINT_UNDEFINED,
					 mtr);
		break;
	default:
		page_delete_rec_list_start(rec, block, index, mtr);
		break;
	}

	return(ptr);
}

/*****************************************************************
Deletes records from a page from a given record onward, including that record.
The infimum and supremum records are not deleted. 
*/ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /* in: pointer to record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ +{ + page_dir_slot_t*slot; + ulint slot_index; + rec_t* last_rec; + rec_t* prev_rec; + ulint n_owned; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = page_align(rec); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(!page_zip || page_rec_is_comp(rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + + /* Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(block); + + page_delete_rec_list_write_log(rec, index, page_is_comp(page) + ? 
MLOG_COMP_LIST_END_DELETE + : MLOG_LIST_END_DELETE, mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + ulint log_mode; + + ut_a(page_is_comp(page)); + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + do { + page_cur_t cur; + page_cur_position(rec, block, &cur); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + rec = rec_get_next_ptr(rec, TRUE); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&cur, index, offsets, mtr); + } while (page_offset(rec) != PAGE_NEW_SUPREMUM); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); + return; + } + + prev_rec = page_rec_get_prev(rec); + + last_rec = page_rec_get_prev(page_get_supremum_rec(page)); + + if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + rec_t* rec2 = rec; + /* Calculate the sum of sizes and the number of records */ + size = 0; + n_recs = 0; + + do { + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; + n_recs++; + + rec2 = page_rec_get_next(rec2); + } while (!page_rec_is_supremum(rec2)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ut_ad(size < UNIV_PAGE_SIZE); + + /* Update the page directory; there is no need to balance the number + of the records owned by the supremum record, as it is allowed to be + less than PAGE_DIR_SLOT_MIN_N_OWNED */ + + if (page_is_comp(page)) { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_new(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, TRUE); + } + + ut_ad(rec_get_n_owned_new(rec2) > count); + + n_owned = rec_get_n_owned_new(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + slot = 
page_dir_get_nth_slot(page, slot_index); + } else { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_old(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, FALSE); + } + + ut_ad(rec_get_n_owned_old(rec2) > count); + + n_owned = rec_get_n_owned_old(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + slot = page_dir_get_nth_slot(page, slot_index); + } + + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); + page_dir_slot_set_n_owned(slot, NULL, n_owned); + + page_dir_set_n_slots(page, NULL, slot_index + 1); + + /* Remove the record chain segment from the record chain */ + page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + + /* Catenate the deleted chain segment to the page free list */ + + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); + page_header_set_ptr(page, NULL, PAGE_FREE, rec); + + page_header_set_field(page, NULL, PAGE_GARBAGE, size + + page_header_get_field(page, PAGE_GARBAGE)); + + page_header_set_field(page, NULL, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - n_recs)); +} + +/***************************************************************** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
*/ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /* in: record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t cur1; + ulint log_mode; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + byte type; + + rec_offs_init(offsets_); + + ut_ad((ibool) !!page_rec_is_comp(rec) + == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + + /* page_zip_validate() would detect a min_rec_mark mismatch + in btr_page_split_and_insert() + between btr_attach_half_pages() and insert_page = ... + when btr_page_get_split_rec_to_left() holds + (direction == FSP_DOWN). */ + ut_a(!page_zip || page_zip_validate_low(page_zip, page, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + + return; + } + + if (page_rec_is_comp(rec)) { + type = MLOG_COMP_LIST_START_DELETE; + } else { + type = MLOG_LIST_START_DELETE; + } + + page_delete_rec_list_write_log(rec, index, type, mtr); + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/***************************************************************** +Moves record list end to another page. Moved records include +split_rec. 
*/ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + /* out: TRUE on success; FALSE on + compression failure + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in: index page from where to move */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec))); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); + + return(TRUE); +} + +/***************************************************************** +Moves record list start to another page. Moved records do not include +split_rec. 
*/ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + /* out: TRUE on success; FALSE on + compression failure */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in/out: page containing split_rec */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + page_delete_rec_list_start(split_rec, block, index, mtr); + + return(TRUE); +} + +/*************************************************************************** +This is a low-level operation which is used in a database index creation +to update the page number of a created B-tree to a data dictionary record. */ +UNIV_INTERN +void +page_rec_write_index_page_no( +/*=========================*/ + rec_t* rec, /* in: record to update */ + ulint i, /* in: index of the field to update */ + ulint page_no,/* in: value to write */ + mtr_t* mtr) /* in: mtr */ +{ + byte* data; + ulint len; + + data = rec_get_nth_field_old(rec, i, &len); + + ut_ad(len == 4); + + mlog_write_ulint(data, page_no, MLOG_4BYTES, mtr); +} + +/****************************************************************** +Used to delete n slots from the directory. This function updates +also n_owned fields in the records, so that the first slot after +the deleted ones inherits the records of the deleted slots. */ +UNIV_INLINE +void +page_dir_delete_slot( +/*=================*/ + page_t* page, /* in/out: the index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint slot_no)/* in: slot to be deleted */ +{ + page_dir_slot_t* slot; + ulint n_owned; + ulint i; + ulint n_slots; + + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + ut_ad(slot_no + 1 < page_dir_get_n_slots(page)); + + n_slots = page_dir_get_n_slots(page); + + /* 1. 
Reset the n_owned fields of the slots to be + deleted */ + slot = page_dir_get_nth_slot(page, slot_no); + n_owned = page_dir_slot_get_n_owned(slot); + page_dir_slot_set_n_owned(slot, page_zip, 0); + + /* 2. Update the n_owned value of the first non-deleted slot */ + + slot = page_dir_get_nth_slot(page, slot_no + 1); + page_dir_slot_set_n_owned(slot, page_zip, + n_owned + page_dir_slot_get_n_owned(slot)); + + /* 3. Destroy the slot by copying slots */ + for (i = slot_no + 1; i < n_slots; i++) { + rec_t* rec = (rec_t*) + page_dir_slot_get_rec(page_dir_get_nth_slot(page, i)); + page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec); + } + + /* 4. Zero out the last slot, which will be removed */ + mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0); + + /* 5. Update the page header */ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1); +} + +/****************************************************************** +Used to add n slots to the directory. Does not set the record pointers +in the added slots or update n_owned values: this is the responsibility +of the caller. */ +UNIV_INLINE +void +page_dir_add_slot( +/*==============*/ + page_t* page, /* in/out: the index page */ + page_zip_des_t* page_zip,/* in/out: comprssed page, or NULL */ + ulint start) /* in: the slot above which the new slots + are added */ +{ + page_dir_slot_t* slot; + ulint n_slots; + + n_slots = page_dir_get_n_slots(page); + + ut_ad(start < n_slots - 1); + + /* Update the page header */ + page_dir_set_n_slots(page, page_zip, n_slots + 1); + + /* Move slots up */ + slot = page_dir_get_nth_slot(page, n_slots); + memmove(slot, slot + PAGE_DIR_SLOT_SIZE, + (n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE); +} + +/******************************************************************** +Splits a directory slot which owns too many records. 
*/
UNIV_INTERN
void
page_dir_split_slot(
/*================*/
	page_t*		page,	/* in/out: index page */
	page_zip_des_t*	page_zip,/* in/out: compressed page whose
				uncompressed part will be written, or NULL */
	ulint		slot_no)/* in: the directory slot */
{
	page_dir_slot_t*	old_slot;
	page_dir_slot_t*	lower_slot;
	rec_t*			split_rec;
	ulint			n_owned;
	ulint			half;
	ulint			n;

	ut_ad(page);
	ut_ad(!page_zip || page_is_comp(page));
	ut_ad(slot_no > 0);

	old_slot = page_dir_get_nth_slot(page, slot_no);

	n_owned = page_dir_slot_get_n_owned(old_slot);
	ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);

	half = n_owned / 2;

	/* 1. Starting from the record of the previous slot, step
	forward "half" records: that record will own the lower half. */

	split_rec = (rec_t*) page_dir_slot_get_rec(
		page_dir_get_nth_slot(page, slot_no - 1));

	for (n = half; n > 0; n--) {
		split_rec = page_rec_get_next(split_rec);
	}

	ut_ad(half >= PAGE_DIR_SLOT_MIN_N_OWNED);

	/* 2. Add one directory slot immediately below the slot being
	split. */

	page_dir_add_slot(page, page_zip, slot_no - 1);

	/* The added slot is now number slot_no, and the old slot is
	now number slot_no + 1 */

	lower_slot = page_dir_get_nth_slot(page, slot_no);
	old_slot = page_dir_get_nth_slot(page, slot_no + 1);

	/* 3. The new slot takes the lower half of the records ... */

	page_dir_slot_set_rec(lower_slot, split_rec);
	page_dir_slot_set_n_owned(lower_slot, page_zip, half);

	/* 4. ... and the original slot keeps the remainder */

	page_dir_slot_set_n_owned(old_slot, page_zip, n_owned - half);
}

/*****************************************************************
Tries to balance the given directory slot with too few records with the upper
neighbor, so that there are at least the minimum number of records owned by
the slot; this may result in the merging of two slots. 
*/
UNIV_INTERN
void
page_dir_balance_slot(
/*==================*/
	page_t*		page,	/* in/out: index page */
	page_zip_des_t*	page_zip,/* in/out: compressed page, or NULL */
	ulint		slot_no)/* in: the directory slot */
{
	page_dir_slot_t*	slot;
	page_dir_slot_t*	upper_slot;
	ulint			n_owned;
	ulint			upper_n_owned;
	rec_t*			cur_owner;
	rec_t*			next_owner;

	ut_ad(page);
	ut_ad(!page_zip || page_is_comp(page));
	ut_ad(slot_no > 0);

	slot = page_dir_get_nth_slot(page, slot_no);

	/* The last directory slot has no upper neighbor to balance
	with. */

	if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) {

		return;
	}

	upper_slot = page_dir_get_nth_slot(page, slot_no + 1);

	n_owned = page_dir_slot_get_n_owned(slot);
	upper_n_owned = page_dir_slot_get_n_owned(upper_slot);

	ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1);

	/* If the upper slot has the minimum value of n_owned, we will merge
	the two slots, therefore we assert: */
	ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED);

	if (upper_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
		/* The upper slot cannot spare a record: merge the two
		slots by deleting this one */
		page_dir_delete_slot(page, page_zip, slot_no);

		return;
	}

	/* The upper slot has a record to spare: move ownership of the
	record following the current owner from the upper slot to this
	slot */
	cur_owner = (rec_t*) page_dir_slot_get_rec(slot);

	if (page_is_comp(page)) {
		next_owner = rec_get_next_ptr(cur_owner, TRUE);

		rec_set_n_owned_new(cur_owner, page_zip, 0);
		rec_set_n_owned_new(next_owner, page_zip, n_owned + 1);
	} else {
		next_owner = rec_get_next_ptr(cur_owner, FALSE);

		rec_set_n_owned_old(cur_owner, 0);
		rec_set_n_owned_old(next_owner, n_owned + 1);
	}

	page_dir_slot_set_rec(slot, next_owner);

	page_dir_slot_set_n_owned(upper_slot, page_zip, upper_n_owned -1);
}

/****************************************************************
Returns the middle record of the record list. 
If there are an even number
of records in the list, returns the first record of the upper half-list. */
UNIV_INTERN
rec_t*
page_get_middle_rec(
/*================*/
			/* out: middle record */
	page_t*	page)	/* in: page */
{
	ulint	middle;
	ulint	slot_no;
	ulint	n_owned;
	ulint	count;
	ulint	steps;
	rec_t*	rec;

	/* This many records we must leave behind */
	middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;

	/* Skip whole directory slots for as long as the records they
	own all lie before the middle position */
	slot_no = 0;
	count = 0;
	n_owned = page_dir_slot_get_n_owned(page_dir_get_nth_slot(page, 0));

	while (count + n_owned <= middle) {
		count += n_owned;
		slot_no++;
		n_owned = page_dir_slot_get_n_owned(
			page_dir_get_nth_slot(page, slot_no));
	}

	ut_ad(slot_no > 0);

	/* Start from the record after the one the previous slot
	points to */
	rec = (rec_t*) page_dir_slot_get_rec(
		page_dir_get_nth_slot(page, slot_no - 1));
	rec = page_rec_get_next(rec);

	/* There are now count records behind rec; step over the
	remaining ones */

	for (steps = middle - count; steps > 0; steps--) {
		rec = page_rec_get_next(rec);
	}

	return(rec);
}

/*******************************************************************
Returns the number of records before the given record in chain.
The number includes infimum and supremum records. 
*/
UNIV_INTERN
ulint
page_rec_get_n_recs_before(
/*=======================*/
				/* out: number of records */
	const rec_t*	rec)	/* in: the physical record */
{
	const rec_t*	owner;
	const page_t*	page;
	ulint		slot_no;
	lint		n	= 0;

	ut_ad(page_rec_check(rec));

	page = page_align(rec);

	/* Two passes, written once per row format:
	1) move forward to the next record with a nonzero n_owned,
	   subtracting one for every step taken;
	2) add up n_owned of the directory slots from slot 0 up to and
	   including the slot whose record is the one reached in 1). */
	if (page_is_comp(page)) {
		for (; rec_get_n_owned_new(rec) == 0; n--) {
			rec = rec_get_next_ptr_const(rec, TRUE);
		}

		slot_no = 0;

		do {
			owner = page_dir_slot_get_rec(
				page_dir_get_nth_slot(page, slot_no++));

			n += rec_get_n_owned_new(owner);
		} while (owner != rec);
	} else {
		for (; rec_get_n_owned_old(rec) == 0; n--) {
			rec = rec_get_next_ptr_const(rec, FALSE);
		}

		slot_no = 0;

		do {
			owner = page_dir_slot_get_rec(
				page_dir_get_nth_slot(page, slot_no++));

			n += rec_get_n_owned_old(owner);
		} while (owner != rec);
	}

	/* The sum counted the given record itself; take it out. */
	n--;

	ut_ad(n >= 0);

	return((ulint) n);
}

/****************************************************************
Prints record contents including the data relevant only in
the index page context. 
*/ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: record descriptor */ +{ + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); + if (page_rec_is_comp(rec)) { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) rec_get_heap_no_new(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } else { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) rec_get_heap_no_old(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } + + page_rec_check(rec); + rec_validate(rec, offsets); +} + +/******************************************************************* +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /* in: index page */ + ulint pr_n) /* in: print n first and n last entries */ +{ + ulint n; + ulint i; + page_dir_slot_t* slot; + + n = page_dir_get_n_slots(page); + + fprintf(stderr, "--------------------------------\n" + "PAGE DIRECTORY\n" + "Page address %p\n" + "Directory stack top at offs: %lu; number of slots: %lu\n", + page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)), + (ulong) n); + for (i = 0; i < n; i++) { + slot = page_dir_get_nth_slot(page, i); + if ((i == pr_n) && (i < n - pr_n)) { + fputs(" ... 
\n", stderr); + } + if ((i < pr_n) || (i >= n - pr_n)) { + fprintf(stderr, + "Contents of slot: %lu: n_owned: %lu," + " rec offs: %lu\n", + (ulong) i, + (ulong) page_dir_slot_get_n_owned(slot), + (ulong) + page_offset(page_dir_slot_get_rec(slot))); + } + } + fprintf(stderr, "Total of %lu records\n" + "--------------------------------\n", + (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page))); +} + +/******************************************************************* +This is used to print the contents of the page record list for +debugging purposes. */ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n) /* in: print n first and n last entries */ +{ + page_t* page = block->frame; + page_cur_t cur; + ulint count; + ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + + fprintf(stderr, + "--------------------------------\n" + "PAGE RECORD LIST\n" + "Page address %p\n", page); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cur); + count = 0; + for (;;) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + + if (count == pr_n) { + break; + } + if (page_cur_is_after_last(&cur)) { + break; + } + page_cur_move_to_next(&cur); + count++; + } + + if (n_recs > 2 * pr_n) { + fputs(" ... 
\n", stderr); + } + + while (!page_cur_is_after_last(&cur)) { + page_cur_move_to_next(&cur); + + if (count + pr_n >= n_recs) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + } + count++; + } + + fprintf(stderr, + "Total of %lu records \n" + "--------------------------------\n", + (ulong) (count + 1)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/******************************************************************* +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page) +{ + fprintf(stderr, + "--------------------------------\n" + "PAGE HEADER INFO\n" + "Page address %p, n records %lu (%s)\n" + "n dir slots %lu, heap top %lu\n" + "Page n heap %lu, free %lu, garbage %lu\n" + "Page last insert %lu, direction %lu, n direction %lu\n", + page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", + (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) page_dir_get_n_heap(page), + (ulong) page_header_get_field(page, PAGE_FREE), + (ulong) page_header_get_field(page, PAGE_GARBAGE), + (ulong) page_header_get_field(page, PAGE_LAST_INSERT), + (ulong) page_header_get_field(page, PAGE_DIRECTION), + (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); +} + +/******************************************************************* +This is used to print the contents of the page for +debugging purposes. 
*/ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn) /* in: print rn first and last records + in directory */ +{ + page_t* page = block->frame; + + page_header_print(page); + page_dir_print(page, dn); + page_print_list(block, index, rn); +} + +/******************************************************************* +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_owned; + ulint heap_no; + page_t* page; + + page = page_align(rec); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); + + page_rec_check(rec); + rec_validate(rec, offsets); + + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } + + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { + fprintf(stderr, + "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", + (ulong) page_offset(rec), (ulong) n_owned); + return(FALSE); + } + + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { + fprintf(stderr, + "InnoDB: Heap no of rec %lu too big %lu %lu\n", + (ulong) page_offset(rec), (ulong) heap_no, + (ulong) page_dir_get_n_heap(page)); + return(FALSE); + } + + return(TRUE); +} + +/******************************************************************* +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. 
*/ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page) /* in: index page */ +{ + ulint n_slots; + ulint infimum_offs; + ulint supremum_offs; + + n_slots = page_dir_get_n_slots(page); + infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0)); + supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page, + n_slots - 1)); + + if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " infimum not pointed to\n"); + buf_page_print(page, 0); + } + + if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " supremum not pointed to\n"); + buf_page_print(page, 0); + } +} + +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page) /* in: old-style index page */ +{ + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(!page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. 
*/ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu of page dir slots\n", + (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above" + " rec heap top %lu\n", + (ulong)(rec - page), + (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) own_count, + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset" + " nonsensical %lu for rec %lu\n", + 
(ulong) rec_get_next_offs(rec, FALSE), + (ulong) (rec - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) (rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) (rec - page), + (ulong) (rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + 
+/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page) /* in: new-style index page */ +{ + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu" + " of page dir slots\n", (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above rec" + " heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) own_count, + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset nonsensical %lu" + " for rec %lu\n", + (ulong) rec_get_next_offs(rec, TRUE), + (ulong) page_offset(rec)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero" + " in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, 
PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* +This function checks the consistency of an index page. 
*/ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + /* out: TRUE if ok */ + page_t* page, /* in: index page */ + dict_index_t* index) /* in: data dictionary index containing + the page record type definition */ +{ + page_dir_slot_t*slot; + mem_heap_t* heap; + byte* buf; + ulint count; + ulint own_count; + ulint rec_own_count; + ulint slot_no; + ulint data_size; + rec_t* rec; + rec_t* old_rec = NULL; + ulint offs; + ulint n_slots; + ibool ret = FALSE; + ulint i; + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != dict_table_is_comp(index->table))) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } + } + + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); + + /* The following buffer is used to check that the + records in the page record heap do not overlap */ + + buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) + <= page_dir_get_nth_slot(page, n_slots - 1)))) { + + fputs("InnoDB: Record heap and dir overlap on a page ", + stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, ", %p, %p\n", + page_header_get_ptr(page, PAGE_HEAP_TOP), + page_dir_get_nth_slot(page, n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that + it is consistent with the directory. 
*/ + count = 0; + data_size = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_comp(page) && page_rec_is_user_rec(rec) + && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) + == page_is_leaf(page))) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + goto func_exit; + } + + /* Check that the records are in the ascending order */ + if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW) + && !page_rec_is_supremum(rec)) { + if (UNIV_UNLIKELY + (1 != cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { + fprintf(stderr, + "InnoDB: Records in wrong order" + " on page %lu ", + (ulong) page_get_page_no(page)); + dict_index_name_print(stderr, NULL, index); + fputs("\nInnoDB: previous record ", stderr); + rec_print_new(stderr, old_rec, old_offsets); + fputs("\nInnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + + goto func_exit; + } + } + + if (page_rec_is_user_rec(rec)) { + + data_size += rec_offs_size(offsets); + } + + offs = page_offset(rec_get_start(rec, offsets)); + + for (i = rec_offs_size(offsets); i--; ) { + if (UNIV_UNLIKELY(buf[offs + i])) { + /* No other record may overlap this */ + + fputs("InnoDB: Record overlaps another\n", + stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + if (page_is_comp(page)) { + rec_own_count = rec_get_n_owned_new(rec); + } else { + rec_own_count = rec_get_n_owned_old(rec); + } + + if (UNIV_UNLIKELY(rec_own_count)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_own_count != own_count)) { + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu\n", + (ulong) rec_own_count, + (ulong) own_count); + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fputs("InnoDB: Dir slot does not" + " point to 
right rec\n", + stderr); + goto func_exit; + } + + page_dir_slot_check(slot); + + own_count = 0; + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + break; + } + + count++; + own_count++; + old_rec = rec; + rec = page_rec_get_next(rec); + + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: + fputs("InnoDB: n owned is zero\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { + fprintf(stderr, + "InnoDB: Summed data size %lu, returned by func %lu\n", + (ulong) data_size, (ulong) page_get_data_size(page)); + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + + goto func_exit; + } + + count++; + offs = page_offset(rec_get_start(rec, offsets)); + + for (i = rec_offs_size(offsets); i--; ) { + + if (UNIV_UNLIKELY(buf[offs + i])) { + fputs("InnoDB: Record overlaps another" + " in free list\n", stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + rec = page_rec_get_next(rec); + } + + if 
(UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) count + 1); + goto func_exit; + } + + ret = TRUE; + +func_exit: + mem_heap_free(heap); + + if (UNIV_UNLIKELY(ret == FALSE)) { +func_exit2: + fprintf(stderr, "InnoDB: Apparent corruption in page %lu in ", + (ulong) page_get_page_no(page)); + dict_index_name_print(stderr, NULL, index); + putc('\n', stderr); + buf_page_print(page, 0); + } + + return(ret); +} + +/******************************************************************* +Looks in the page record list for a record with the given heap number. */ +UNIV_INTERN +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + /* out: record, NULL if not found */ + const page_t* page, /* in: index page */ + ulint heap_no)/* in: heap number */ +{ + const rec_t* rec; + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + for(;;) { + ulint rec_heap_no = rec_get_heap_no_new(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, TRUE); + } + } else { + rec = page + PAGE_OLD_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_old(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, FALSE); + } + } +} diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c new file mode 100644 index 00000000000..56189ce3bad --- /dev/null +++ b/storage/xtradb/page/page0zip.c @@ -0,0 +1,4575 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#define THIS_MODULE +#include "page0zip.h" +#ifdef UNIV_NONINL +# include "page0zip.ic" +#endif +#undef THIS_MODULE +#include "page0page.h" +#include "mtr0log.h" +#include "ut0sort.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "btr0sea.h" +#include "btr0cur.h" +#include "page0types.h" +#include "lock0lock.h" +#include "log0recv.h" +#include "zlib.h" +#include "buf0lru.h" + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1]; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? 
*/ /* next=(first user rec, or supremum) */ +}; +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +static const byte supremum_extra_data[] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +Compare at most sizeof(field_ref_zero) bytes. */ +#define ASSERT_ZERO(b, s) \ + ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero))) +/** Assert that a BLOB pointer is filled with zero bytes. */ +#define ASSERT_ZERO_BLOB(b) \ + ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero)) + +/* Enable some extra debugging output. This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +__attribute__((format (printf, 1, 2))) +/************************************************************************** +Report a failure to decompress or compress. */ +static +int +page_zip_fail_func( +/*===============*/ + /* out: number of characters printed */ + const char* fmt, /* in: printf(3) format string */ + ...) /* in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +/************************************************************************** +Determine the guaranteed free space on an empty page. 
*/ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + /* out: minimum payload size on the page */ + ulint n_fields, /* in: number of columns in the index */ + ulint zip_size) /* in: compressed page size in bytes */ +{ + lint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(2 * (n_fields + 1)); + return(size > 0 ? (ulint) size : 0); +} + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + /* out: length of dense page + directory, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + return(size); +} + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + /* out: length of dense page + directory comprising existing + records, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page_zip->data); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/***************************************************************** +Find the slot of the given record in the dense page directory. 
*/ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + /* out: dense directory slot, + or NULL if record not found */ + byte* slot, /* in: start of records */ + byte* end, /* in: end of records */ + ulint offset) /* in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/***************************************************************** +Find the slot of the given non-free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/***************************************************************** +Find the slot of the given free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip), + end - page_zip_dir_user_size(page_zip), + offset)); +} + +/***************************************************************** +Read a given slot in the dense page directory. 
*/ +UNIV_INLINE +ulint +page_zip_dir_get( +/*=============*/ + /* out: record offset + on the uncompressed page, + possibly ORed with + PAGE_ZIP_DIR_SLOT_DEL or + PAGE_ZIP_DIR_SLOT_OWNED */ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint slot) /* in: slot + (0=first user record) */ +{ + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); +} + +/************************************************************************** +Write a log record of compressing an index page. */ +static +void +page_zip_compress_write_log( +/*========================*/ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction */ +{ + byte* log_ptr; + ulint trailer_size; + + log_ptr = mlog_open(mtr, 11 + 2 + 2); + + if (!log_ptr) { + + return; + } + + /* Read the number of user records. */ + trailer_size = page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW; + /* Multiply by uncompressed of size stored per record */ + if (!page_is_leaf(page)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + } else if (dict_index_is_clust(index)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE; + } + /* Add the space occupied by BLOB pointers. 
*/ + trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_a(page_zip->m_end > PAGE_DATA); +#if FIL_PAGE_DATA > PAGE_DATA +# error "FIL_PAGE_DATA > PAGE_DATA" +#endif + ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); + + log_ptr = mlog_write_initial_log_record_fast((page_t*) page, + MLOG_ZIP_PAGE_COMPRESS, + log_ptr, mtr); + mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE); + log_ptr += 2; + mach_write_to_2(log_ptr, trailer_size); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + /* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4); + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4); + /* Write most of the page header, the compressed stream and + the modification log. */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE, + page_zip->m_end - FIL_PAGE_TYPE); + /* Write the uncompressed trailer of the compressed page. */ + mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip) + - trailer_size, trailer_size); +} + +/********************************************************** +Determine how many externally stored columns are contained +in existing records with smaller heap_no than rec. 
*/ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/* in: dense page directory on + compressed page */ + const rec_t* rec, /* in: compact physical record + on a B-tree leaf page */ + dict_index_t* index) /* in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/************************************************************************** +Encode the length of a fixed-length column. */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + /* out: buf + length of encoded val */ + byte* buf, /* in: pointer to buffer where to write */ + ulint val) /* in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/************************************************************************** +Write the index information for the compressed page. 
*/
static
ulint
page_zip_fields_encode(
/*===================*/
				/* out: used size of buf */
	ulint		n,	/* in: number of fields to compress */
	dict_index_t*	index,	/* in: index comprising at least n fields */
	ulint		trx_id_pos,/* in: position of the trx_id column
				in the index, or ULINT_UNDEFINED if
				this is a non-leaf page */
	byte*		buf)	/* out: buffer of (n + 1) * 2 bytes */
{
	const byte*	buf_start	= buf;
	ulint		i;
	/* number of columns written to buf so far */
	ulint		col		= 0;
	ulint		trx_id_col	= 0;
	/* sum of lengths of preceding non-nullable fixed fields, or 0 */
	ulint		fixed_sum	= 0;
	/* the value that terminates the encoding */
	ulint		trailer;

	ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);

	for (i = 0; i < n; i++) {
		dict_field_t*		field
			= dict_index_get_nth_field(index, i);
		const dict_col_t*	column
			= dict_field_get_col(field);
		const ulint		fixed_len = field->fixed_len;
		/* 1 = the "not nullable" flag is set; 0 = nullable field */
		ulint			val
			= (column->prtype & DATA_NOT_NULL) ? 1 : 0;

		if (!fixed_len) {
			/* variable-length field */

			if (UNIV_UNLIKELY(column->len > 255)
			    || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) {
				val |= 0x7e; /* max > 255 bytes */
			}

			if (fixed_sum) {
				/* write out the length of any
				preceding non-nullable fields */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				fixed_sum = 0;
				col++;
			}

			*buf++ = (byte) val;
			col++;

			continue;
		}

		if (!val) {
			/* fixed-length nullable field */

			if (fixed_sum) {
				/* write out the length of any
				preceding non-nullable fields */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				fixed_sum = 0;
				col++;
			}

			buf = page_zip_fixed_field_encode(
				buf, fixed_len << 1);
			col++;

			continue;
		}

		/* fixed-length non-nullable field */

		if (fixed_sum && UNIV_UNLIKELY
		    (fixed_sum + fixed_len > DICT_MAX_INDEX_COL_LEN)) {
			/* Write out the length of the
			preceding non-nullable fields,
			to avoid exceeding the maximum
			length of a fixed-length column. */
			buf = page_zip_fixed_field_encode(
				buf, fixed_sum << 1 | 1);
			fixed_sum = 0;
			col++;
		}

		if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
			if (fixed_sum) {
				/* Write out the length of any
				preceding non-nullable fields,
				and start a new trx_id column. */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				col++;
			}

			trx_id_col = col;
			fixed_sum = fixed_len;
		} else {
			/* add to the sum */
			fixed_sum += fixed_len;
		}
	}

	if (fixed_sum) {
		/* Write out the lengths of last fixed-length columns. */
		buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
	}

	/* Terminate the encoding with the position of the trx_id column,
	or, when trx_id_pos is ULINT_UNDEFINED, with the number of
	nullable fields. */
	trailer = (trx_id_pos != ULINT_UNDEFINED)
		? trx_id_col
		: index->n_nullable;

	if (trailer < 128) {
		*buf++ = (byte) trailer;
	} else {
		*buf++ = (byte) (0x80 | trailer >> 8);
		*buf++ = (byte) trailer;
	}

	/* NOTE(review): this bound is looser than the documented
	buffer size of (n + 1) * 2 bytes -- confirm which one is meant. */
	ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
	return((ulint) (buf - buf_start));
}

/**************************************************************************
Populate the dense page directory from the sparse directory.
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /* in: compact page */ + byte* buf, /* in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /* in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY + (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM, TRUE; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); +#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) +# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" +#endif +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1 +# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1" +#endif + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. 
*/ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(rec_get_status(rec) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(rec_get_status(rec) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +/************************************************************************** +Allocate memory for zlib. */ +static +void* +page_zip_malloc( +/*============*/ + void* opaque, + uInt items, + uInt size) +{ + return(mem_heap_alloc(opaque, items * size)); +} + +/************************************************************************** +Deallocate memory for zlib. */ +static +void +page_zip_free( +/*==========*/ + void* opaque __attribute__((unused)), + void* address __attribute__((unused))) +{ +} + +/************************************************************************** +Configure the zlib allocator to use the given memory heap. 
*/ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /* in/out: zlib stream */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + z_stream* strm = stream; + + strm->zalloc = page_zip_malloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/* Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +UNIV_INTERN ibool page_zip_compress_dbg; +/* Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +UNIV_INTERN unsigned page_zip_compress_log; + +/************************************************************************** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. */ +static +ibool +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/* in: log file, or NULL */ + z_streamp strm, /* in/out: compressed stream for deflate() */ + int flush) /* in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + fwrite(strm->next_in, 1, strm->avail_in, logfile); + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +# define FILE_LOGFILE FILE* logfile, +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +# define FILE_LOGFILE +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/************************************************************************** +Compress the records of a node pointer page. 
*/
static
int
page_zip_compress_node_ptrs(
/*========================*/
					/* out: Z_OK, or a zlib error code */
	FILE_LOGFILE
	z_stream*	c_stream,	/* in/out: compressed page stream */
	const rec_t**	recs,		/* in: dense page directory
					sorted by address */
	ulint		n_dense,	/* in: size of recs[] */
	dict_index_t*	index,		/* in: the index of the page */
	byte*		storage,	/* in: end of dense page directory */
	mem_heap_t*	heap)		/* in: temporary memory heap */
{
	ulint*	offsets	= NULL;
	int	err;

	do {
		const rec_t*	rec	= *recs++;
		byte*		node_ptr_dst;

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Only leaf nodes may contain externally stored columns. */
		ut_ad(!rec_offs_any_extern(offsets));

		UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
		UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
				   rec_offs_extra_size(offsets));

		/* Compress the extra bytes: everything from the current
		input position up to the fixed-size record header. */
		c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
			- c_stream->next_in;

		if (c_stream->avail_in) {
			err = deflate(c_stream, Z_NO_FLUSH);

			if (UNIV_UNLIKELY(err != Z_OK)) {

				return(err);
			}
		}

		ut_ad(!c_stream->avail_in);

		/* Compress the data bytes, except node_ptr.
		The fixed-size record header is skipped. */
		c_stream->next_in = (byte*) rec;
		c_stream->avail_in = rec_offs_data_size(offsets)
			- REC_NODE_PTR_SIZE;
		ut_ad(c_stream->avail_in);

		err = deflate(c_stream, Z_NO_FLUSH);

		if (UNIV_UNLIKELY(err != Z_OK)) {

			return(err);
		}

		ut_ad(!c_stream->avail_in);

		/* Store the node pointer in uncompressed form,
		in the slot determined by the heap number. */
		node_ptr_dst = storage - REC_NODE_PTR_SIZE
			* (rec_get_heap_no_new(rec) - 1);
		memcpy(node_ptr_dst, c_stream->next_in, REC_NODE_PTR_SIZE);
		c_stream->next_in += REC_NODE_PTR_SIZE;
	} while (--n_dense);

	return(Z_OK);
}

/**************************************************************************
Compress the records of a leaf node of a secondary index.
*/
static
int
page_zip_compress_sec(
/*==================*/
					/* out: Z_OK, or a zlib error code */
	FILE_LOGFILE
	z_stream*	c_stream,	/* in/out: compressed page stream */
	const rec_t**	recs,		/* in: dense page directory
					sorted by address */
	ulint		n_dense)	/* in: size of recs[] */
{
	int	err;

	ut_ad(n_dense > 0);

	do {
		const rec_t*	rec		= *recs++;
		/* start of the fixed-size header of this record */
		const byte*	extra_start	= rec - REC_N_NEW_EXTRA_BYTES;

		/* Compress everything up to this record. */
		c_stream->avail_in = extra_start - c_stream->next_in;

		if (UNIV_LIKELY(c_stream->avail_in)) {
			UNIV_MEM_ASSERT_RW(c_stream->next_in,
					   c_stream->avail_in);
			err = deflate(c_stream, Z_NO_FLUSH);

			if (UNIV_UNLIKELY(err != Z_OK)) {

				return(err);
			}
		}

		ut_ad(!c_stream->avail_in);
		ut_ad(c_stream->next_in == extra_start);

		/* Skip the REC_N_NEW_EXTRA_BYTES.  The data bytes of
		this record are left for the next round, or for the
		caller after the last record. */
		c_stream->next_in = (byte*) rec;
	} while (--n_dense);

	return(Z_OK);
}

/**************************************************************************
Compress a record of a leaf node of a clustered index that contains
externally stored columns.
*/
static
int
page_zip_compress_clust_ext(
/*========================*/
					/* out: Z_OK, or a zlib error code */
	FILE_LOGFILE
	z_stream*	c_stream,	/* in/out: compressed page stream */
	const rec_t*	rec,		/* in: record */
	const ulint*	offsets,	/* in: rec_get_offsets(rec) */
	ulint		trx_id_col,	/* in: position of DB_TRX_ID */
	byte*		deleted,	/* in: dense directory entry pointing
					to the head of the free list */
	byte*		storage,	/* in: end of dense page directory */
	byte**		externs,	/* in/out: pointer to the next
					available BLOB pointer */
	ulint*		n_blobs)	/* in/out: number of
					externally stored columns */
{
	/* trx_id and roll_ptr are stored together */
	const ulint	sys_len	= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
	int		err;
	ulint		i;

	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
			   rec_offs_extra_size(offsets));

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		ulint		len;
		const byte*	src;

		if (UNIV_UNLIKELY(i == trx_id_col)) {
			ut_ad(!rec_offs_nth_extern(offsets, i));
			/* Store trx_id and roll_ptr
			in uncompressed form. */
			src = rec_get_nth_field(rec, offsets, i, &len);
			ut_ad(src + DATA_TRX_ID_LEN
			      == rec_get_nth_field(rec, offsets,
						   i + 1, &len));
			ut_ad(len == DATA_ROLL_PTR_LEN);

			/* Compress any preceding bytes. */
			c_stream->avail_in = src - c_stream->next_in;

			if (c_stream->avail_in) {
				err = deflate(c_stream, Z_NO_FLUSH);

				if (UNIV_UNLIKELY(err != Z_OK)) {

					return(err);
				}
			}

			ut_ad(!c_stream->avail_in);
			ut_ad(c_stream->next_in == src);

			/* The slot is determined by the heap number. */
			memcpy(storage
			       - sys_len * (rec_get_heap_no_new(rec) - 1),
			       c_stream->next_in, sys_len);

			c_stream->next_in += sys_len;

			/* Skip also roll_ptr */
			i++;

			continue;
		}

		if (!rec_offs_nth_extern(offsets, i)) {
			/* An ordinary column: its bytes are compressed
			along with the bytes that follow. */
			continue;
		}

		/* An externally stored column: the BLOB pointer
		consists of the last BTR_EXTERN_FIELD_REF_SIZE bytes. */
		src = rec_get_nth_field(rec, offsets, i, &len);
		ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
		src += len - BTR_EXTERN_FIELD_REF_SIZE;

		/* Compress everything up to the BLOB pointer. */
		c_stream->avail_in = src - c_stream->next_in;

		if (UNIV_LIKELY(c_stream->avail_in)) {
			err = deflate(c_stream, Z_NO_FLUSH);

			if (UNIV_UNLIKELY(err != Z_OK)) {

				return(err);
			}
		}

		ut_ad(!c_stream->avail_in);
		ut_ad(c_stream->next_in == src);

		/* Reserve space for the data at
		the end of the space reserved for
		the compressed data and the page
		modification log. */

		if (UNIV_UNLIKELY
		    (c_stream->avail_out <= BTR_EXTERN_FIELD_REF_SIZE)) {
			/* out of space */
			return(Z_BUF_ERROR);
		}

		ut_ad(*externs == c_stream->next_out
		      + c_stream->avail_out
		      + 1/* end of modif. log */);

		/* The BLOB pointer is left out of the compressed stream. */
		c_stream->next_in += BTR_EXTERN_FIELD_REF_SIZE;

		/* Skip deleted records. */
		if (UNIV_LIKELY_NULL
		    (page_zip_dir_find_low(storage, deleted,
					   page_offset(rec)))) {

			continue;
		}

		(*n_blobs)++;
		c_stream->avail_out -= BTR_EXTERN_FIELD_REF_SIZE;
		*externs -= BTR_EXTERN_FIELD_REF_SIZE;

		/* Copy the BLOB pointer */
		memcpy(*externs,
		       c_stream->next_in - BTR_EXTERN_FIELD_REF_SIZE,
		       BTR_EXTERN_FIELD_REF_SIZE);
	}

	return(Z_OK);
}

/**************************************************************************
Compress the records of a leaf node of a clustered index.
*/ +static +int +page_zip_compress_clust( +/*====================*/ + /* out: Z_OK, or a zlib error code */ + FILE_LOGFILE + z_stream* c_stream, /* in/out: compressed page stream */ + const rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* n_blobs, /* in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /* index of the trx_id column */ + byte* deleted, /* in: dense directory entry pointing + to the head of the free list */ + byte* storage, /* in: end of dense page directory */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. */ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. 
*/ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = src - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. */ + c_stream->avail_in = rec + rec_offs_data_size(offsets) + - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err); +} + +/************************************************************************** +Compress a page. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + /* out: TRUE on success, FALSE on failure; + page_zip will be left intact on failure. 
*/ + page_zip_des_t* page_zip,/* in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + z_stream c_stream; + int err; + ulint n_fields;/* number of index fields needed */ + byte* fields; /* index field information */ + byte* buf; /* compressed payload of the page */ + byte* buf_end;/* end of buf */ + ulint n_dense; + ulint slot_size;/* amount of uncompressed bytes per record */ + const rec_t** recs; /* dense page directory, sorted by address */ + mem_heap_t* heap; + ulint trx_id_col; + ulint* offsets = NULL; + ulint n_blobs = 0; + byte* storage;/* storage of uncompressed columns */ + ullint usec = ut_time_us(NULL); +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + + ut_a(page_is_comp(page)); + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + /* Check the data that will be omitted. */ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + if (page_is_leaf(page)) { + n_fields = dict_index_get_n_fields(index); + } else { + n_fields = dict_index_get_n_unique_in_tree(index); + } + + /* The dense directory excludes the infimum and supremum records. 
*/ + n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, "compress %p %p %lu %lu %lu\n", + (void*) page_zip, (void*) page, + page_is_leaf(page), + n_fields, n_dense); + } + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + ut_snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. */ + fwrite(page, 1, UNIV_PAGE_SIZE, logfile); + /* Record the compressed size as zero. + This will be overwritten at successful exit. */ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed++; + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof *offsets) + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + UNIV_PAGE_SIZE * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = mem_heap_zalloc(heap, n_dense * sizeof *recs); + + fields = mem_heap_alloc(heap, (n_fields + 1) * 2); + + buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA); + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + /* Subtract the space reserved for uncompressed data. 
*/ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = buf_end - buf - 1; + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= n_dense * slot_size; + c_stream.avail_in = page_zip_fields_encode(n_fields, index, + trx_id_col, fields); + c_stream.next_in = fields; + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. 
*/ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. */ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in + = page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page); + ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR); + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed_usec + += ut_time_us(NULL) - usec; + return(FALSE); + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ + UNIV_MEM_VALID(buf, c_stream.total_out); + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. 
*/ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = PAGE_DATA + c_stream.total_out; + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = n_blobs; + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2); + memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { + page_zip_compress_write_log(page_zip, page, index, mtr); + } + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. */ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET); + fwrite(sz, 1, sizeof sz, logfile); + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + { + page_zip_stat_t* zip_stat + = &page_zip_stat[page_zip->ssize - 1]; + zip_stat->compressed_ok++; + zip_stat->compressed_usec += ut_time_us(NULL) - usec; + } + + return(TRUE); +} + +/************************************************************************** +Compare two page directory entries. */ +UNIV_INLINE +ibool +page_zip_dir_cmp( +/*=============*/ + /* out: positive if rec1 > rec2 */ + const rec_t* rec1, /* in: rec1 */ + const rec_t* rec2) /* in: rec2 */ +{ + return(rec1 > rec2); +} + +/************************************************************************** +Sort the dense page directory by address (heap_no). 
*/ +static +void +page_zip_dir_sort( +/*==============*/ + rec_t** arr, /* in/out: dense page directory */ + rec_t** aux_arr,/* in/out: work area */ + ulint low, /* in: lower bound of the sorting area, inclusive */ + ulint high) /* in: upper bound of the sorting area, exclusive */ +{ + UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high, + page_zip_dir_cmp); +} + +/************************************************************************** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /* in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } +} + +/************************************************************************** +Read the index information for the compressed page. */ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + /* out,own: dummy index describing the page, + or NULL on error */ + const byte* buf, /* in: index information */ + const byte* end, /* in: end of buf */ + ulint* trx_id_col)/* in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. 
*/ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, + DICT_TF_COMPACT); + index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + index->n_uniq = n; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. */ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. */ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. 
*/ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->n_nullable = val; + } + } + + ut_ad(b == end); + + return(index); +} + +/************************************************************************** +Populate the sparse page directory from the dense directory. */ +static +ibool +page_zip_dir_decode( +/*================*/ + /* out: TRUE on success, + FALSE on failure */ + const page_zip_des_t* page_zip,/* in: dense page directory on + compressed page */ + page_t* page, /* in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /* out: dense page directory sorted by + ascending address (and heap_no) */ + rec_t** recs_aux,/* in/out: scratch area */ + ulint n_dense)/* in: number of user records, and + size of recs[] and recs_aux[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. 
*/ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + if (UNIV_LIKELY(n_dense > 1)) { + page_zip_dir_sort(recs, recs_aux, 0, n_dense); + } + return(TRUE); +} + +/************************************************************************** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. 
*/ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + /* out: TRUE on success, + FALSE on failure */ + const page_zip_des_t* page_zip,/* in: compressed page */ + page_t* page, /* in/out: uncompressed page */ + ulint info_bits)/* in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. 
*/ + for (;;) { + if (UNIV_UNLIKELY(!offs) + || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + + page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n", + (ulong) offs)); + return(FALSE); + } + + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + if (++i == n) { + break; + } + + offs = page_zip_dir_get(page_zip, i); + rec_set_next_offs_new(rec, offs); + } + + /* Terminate the free list. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + rec_set_next_offs_new(rec, 0); + + return(TRUE); +} + +/************************************************************************** +Apply the modification log to a record containing externally stored +columns. Do not copy the fields that are stored separately. */ +static +const byte* +page_zip_apply_log_ext( +/*===================*/ + /* out: pointer to modification log, + or NULL on failure */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec) */ + ulint trx_id_col, /* in: position of of DB_TRX_ID */ + const byte* data, /* in: modification log */ + const byte* end) /* in: end of modification log */ +{ + ulint i; + ulint len; + byte* next_out = rec; + + /* Check if there are any externally stored columns. + For each externally stored column, skip the + BTR_EXTERN_FIELD_REF. 
*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, + i, &len); + if (UNIV_UNLIKELY(dst - next_out >= end - data) + || UNIV_UNLIKELY + (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) + || rec_offs_nth_extern(offsets, i)) { + page_zip_fail(("page_zip_apply_log_ext:" + " trx_id len %lu," + " %p - %p >= %p - %p\n", + (ulong) len, + (const void*) dst, + (const void*) next_out, + (const void*) end, + (const void*) data)); + return(NULL); + } + + memcpy(next_out, data, dst - next_out); + data += dst - next_out; + next_out = dst + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + + len += dst - next_out + - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext: " + "ext %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(next_out, data, len); + data += len; + next_out += len + + BTR_EXTERN_FIELD_REF_SIZE; + } + } + + /* Copy the last bytes of the record. */ + len = rec_get_end(rec, offsets) - next_out; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext: " + "last %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(next_out, data, len); + data += len; + + return(data); +} + +/************************************************************************** +Apply the modification log to an uncompressed page. +Do not copy the fields that are stored separately. 
*/ +static +const byte* +page_zip_apply_log( +/*===============*/ + /* out: pointer to end of modification log, + or NULL on failure */ + const byte* data, /* in: modification log */ + ulint size, /* in: maximum length of the log, in bytes */ + rec_t** recs, /* in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/* in: size of recs[] */ + ulint trx_id_col,/* in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /* in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /* in: index of the page */ + ulint* offsets)/* in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. */ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. 
*/ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + ulint* offs; + offs = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + +#if REC_STATUS_NODE_PTR != TRUE +# error "REC_STATUS_NODE_PTR != TRUE" +#endif + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + rec_offs_make_valid(rec, index, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. */ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log: " + "%lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. 
*/ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log: " + "trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = rec_get_end(rec, offsets) - b; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/************************************************************************** +Decompress the records of a node pointer page. 
*/ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + /* out: TRUE on success, + FALSE on failure */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* offsets, /* in/out: temporary offsets */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= n_dense + * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + /* Prepare to decompress the data bytes. */ + d_stream->next_out = rec; + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. 
*/ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. */ + d_stream->avail_out = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. 
*/ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index), + NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip->data + page_zip_get_size(page_zip) + - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/************************************************************************** +Decompress the records of a leaf node of a secondary index. 
*/ +static +ibool +page_zip_decompress_sec( +/*====================*/ + /* out: TRUE on success, + FALSE on failure */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* offsets) /* in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. 
*/ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_sec:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE, NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, FALSE, NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + /* There are no uncompressed columns on leaf pages of + secondary indexes. 
*/ + + return(TRUE); +} + +/************************************************************************** +Decompress a record of a leaf node of a clustered index that contains +externally stored columns. */ +static +ibool +page_zip_decompress_clust_ext( +/*==========================*/ + /* out: TRUE on success */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec) */ + ulint trx_id_col) /* in: position of of DB_TRX_ID */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, i, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " len[%lu] = %lu\n", + (ulong) i, (ulong) len)); + return(FALSE); + } + + if (rec_offs_nth_extern(offsets, i)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " DB_TRX_ID at %lu is ext\n", + (ulong) i)); + return(FALSE); + } + + d_stream->avail_out = dst - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). 
*/ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + d_stream->avail_out = dst - d_stream->next_out; + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear the BLOB pointer in case + the record will be deleted and the + space will not be reused. Note that + the final initialization of the BLOB + pointers (copying from "externs" + or clearing) will have to take place + only after the page modification log + has been applied. Otherwise, we + could end up with an uninitialized + BLOB pointer when a record is deleted, + reallocated and deleted. */ + memset(d_stream->next_out, 0, + BTR_EXTERN_FIELD_REF_SIZE); + d_stream->next_out + += BTR_EXTERN_FIELD_REF_SIZE; + } + } + + return(TRUE); +} + +/************************************************************************** +Compress the records of a leaf node of a clustered index. 
*/ +static +ibool +page_zip_decompress_clust( +/*======================*/ + /* out: TRUE on success, + FALSE on failure; d_stream has been + passed to inflateEnd() in either case */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page; + must be a clustered index */ + ulint trx_id_col, /* in: position of the trx_id column + among the fields of a record */ + ulint* offsets, /* in/out: temporary offsets */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + int err; + ulint slot; + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + const byte* storage; + const byte* externs; + + ut_a(dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data: + per record, one dense directory slot plus the + trx_id and roll_ptr bytes. */ + d_stream->avail_in -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + err = inflate(d_stream, Z_SYNC_FLUSH); + switch (err) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. + Stop inflating; the modification log is + applied after zlib_done. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (UNIV_LIKELY(!d_stream->avail_out)) { + break; + } + /* fall through: the output space up to the + record was not filled completely */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + /* Prepare to decompress the data bytes. */ + d_stream->next_out = rec; + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + + /* Read the offsets. The status bits are needed here. 
*/ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* This is a leaf page in a clustered index. */ + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately. */ + + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr: inflate only the + bytes that precede them in the record */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = dst - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. */ + d_stream->avail_out = rec_get_end(rec, offsets) + - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. 
*/ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page, + from the current output position up to the address of + the last page directory slot. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. 
*/ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE, NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE, NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip->data + page_zip_get_size(page_zip) + - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. + Both storage and externs are walked downwards, one + entry at a time, as the records are visited. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + ibool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. 
*/ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer, after checking + that it does not lie below the end of + the modification log */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust: " + "%p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record (reported by + page_zip_dir_find_free()): + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/************************************************************************** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. 
*/ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + /* out: TRUE on success, FALSE on failure; + whatever this function allocated (the + temporary heap, the decoded index) is + freed before it returns */ + page_zip_des_t* page_zip,/* in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page) /* out: uncompressed page, may be trashed */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /* dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + ulint* offsets; + ullint usec = ut_time_us(NULL); + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); + recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)); + +#ifdef UNIV_ZIP_DEBUG + /* Overwrite the whole page with 0x55 bytes + (UNIV_ZIP_DEBUG builds only). */ + memset(page, 0x55, UNIV_PAGE_SIZE); +#endif /* UNIV_ZIP_DEBUG */ + UNIV_MEM_INVALID(page, UNIV_PAGE_SIZE); + /* Copy the page header. */ + memcpy(page, page_zip->data, PAGE_DATA); + + /* Copy the page directory. The allocation holds 2 * n_dense + entries; its second half (recs + n_dense) is handed to + page_zip_dir_decode() as well. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + recs + n_dense, n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. 
*/ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. */ + d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1); + + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + + /* Decode the zlib header and the index information. + inflate(Z_BLOCK) is called twice and must return Z_OK + both times. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. The output position is + rewound to PAGE_ZIP_START, the place from which the index + information was just decoded. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets = mem_heap_alloc(heap, n * sizeof(ulint)); + *offsets = n; + } + + /* Decompress the records in heap_no order. 
*/ + if (!page_is_leaf(page)) { + /* This is a node pointer page. + REC_INFO_MIN_REC_FLAG is passed to + page_zip_set_extra_bytes() only when + FIL_PAGE_PREV is FIL_NULL. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL + ? REC_INFO_MIN_REC_FLAG : 0; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + page_zip_fields_free(index); + mem_heap_free(heap); + { + page_zip_stat_t* zip_stat + = &page_zip_stat[page_zip->ssize - 1]; + zip_stat->decompressed++; + zip_stat->decompressed_usec += ut_time_us(NULL) - usec; + } + + /* Update the stat counter for LRU policy. */ + buf_LRU_stat_inc_unzip(); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/************************************************************************** +Dump a block of memory on the standard error stream. 
*/ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /* in: name of the data structure; + printed as a heading line */ + const void* buf, /* in: data */ + ulint size) /* in: length of the data, in bytes */ +{ + const byte* s = buf; + ulint addr; + const ulint width = 32; /* bytes per output line; each line + starts with the offset in hex */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/* Flag: make page_zip_validate() compare page headers only */ +UNIV_INTERN ibool page_zip_validate_header_only = FALSE; + +/************************************************************************** +Check that the compressed and decompressed pages match. Unless only the +headers are compared, the compressed page is decompressed into a scratch +buffer and the outcome is compared with the uncompressed page. */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + /* out: TRUE if valid, FALSE if not; + whenever FALSE is returned, hex dumps + have been written to stderr */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page, /* in: uncompressed page */ + ibool sloppy) /* in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + byte* temp_page_buf; + page_t* temp_page; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + UNIV_PAGE_SIZE aligned. 
Allocate twice the page size + so that an aligned frame fits inside the buffer. */ + temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE); + +#ifdef UNIV_DEBUG_VALGRIND + /* Get detailed information on the valid bits in case the + UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[], + page_zip->data[] or page_zip could be viewed at temp_page[] or + temp_page_zip in a debugger when running valgrind --db-attach. */ + VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip); + UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip); + VALGRIND_GET_VBITS(page_zip->data, temp_page, + page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +#endif /* UNIV_DEBUG_VALGRIND */ + + temp_page_zip = *page_zip; + valid = page_zip_decompress(&temp_page_zip, temp_page); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. 
Let us tolerate that difference when we + are performing a sloppy validation. */ + + if (sloppy) { + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5 /* REC_NEW_INFO_BITS */; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. + valid keeps the value left by + the checks above. */ + page_zip_fail(("page_zip_validate: " + "min_rec_flag " + "(ignored, " + "%lu,%lu,0x%02lx)\n", + page_get_space_id(page), + page_get_page_no(page), + (ulong) page[offset])); + goto func_exit; + } + } + } + page_zip_fail(("page_zip_validate: content\n")); + valid = FALSE; + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + page_zip_hexdump(temp_page, UNIV_PAGE_SIZE); + } + ut_free(temp_page_buf); + return(valid); +} + +/************************************************************************** +Check that the compressed and decompressed pages match. 
*/ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + /* out: TRUE if the pages match, + FALSE otherwise */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page) /* in: uncompressed page */ +{ + /* The validation is sloppy exactly when + recv_recovery_is_on() reports so. */ + const ibool sloppy = recv_recovery_is_on(); + + return(page_zip_validate_low(page_zip, page, sloppy)); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +/* Debug assertion helper: the header fields that exist on both the +compressed and the uncompressed page must be identical. */ +static +ibool +page_zip_header_cmp( +/*================*/ + /* out: TRUE (a mismatch trips + one of the assertions instead) */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const byte* page) /* in: uncompressed page */ +{ + const byte* zip_frame = page_zip->data; + + /* FIL_PAGE_PREV up to, but excluding, FIL_PAGE_LSN */ + ut_ad(!memcmp(zip_frame + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + /* the two bytes at FIL_PAGE_TYPE */ + ut_ad(!memcmp(zip_frame + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + /* FIL_PAGE_DATA up to, but excluding, PAGE_DATA */ + ut_ad(!memcmp(zip_frame + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. 
*/ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + /* out: end of modification log */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + const page_t* page, /* in: page containing rec */ + const byte* rec, /* in: record being written */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + ulint create, /* in: nonzero=insert, zero=update */ + ulint trx_id_col, /* in: position of DB_TRX_ID */ + ulint heap_no, /* in: heap number of rec */ + byte* storage, /* in: end of dense page directory */ + byte* data) /* in: end of modification log */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + + ut_ad(rec_offs_validate(rec, index, offsets)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Step over the trx_id,roll_ptr entries that lie below + the dense page directory; the BLOB pointers follow them. */ + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. 
*/ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * BTR_EXTERN_FIELD_REF_SIZE + < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs); + + { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + if (create) { + /* Insert: move the BLOB pointers that follow + blob_no downwards by n_ext entries to make + room for the pointers of rec. */ + page_zip->n_blobs += n_ext; + ASSERT_ZERO_BLOB(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE); + memmove(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE, + ext_end, + externs - ext_end); + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + i++; /* skip also roll_ptr */ + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + BTR_EXTERN_FIELD_REF_SIZE; + + /* Store the BLOB pointer. 
*/ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - (start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/************************************************************************** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. The record is appended to +the modification log; node pointers, trx_id,roll_ptr and BLOB pointers +are stored separately below the dense page directory. */ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record being written */ + dict_index_t* index, /* in: the index the record belongs to */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint create) /* in: nonzero=insert, zero=update */ +{ + const page_t* page; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + page = page_align(rec); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + /* Copy the delete mark. 
*/ + if (rec_get_deleted_flag(rec, TRUE)) { + *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. + Values of 64 and above take two bytes, the first of which + has the bit 0x80 set. */ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + if (page_is_leaf(page)) { + ulint len; + + if (dict_index_is_clust(index)) { + ulint trx_id_col; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + page_zip, page, + rec, index, offsets, create, + trx_id_col, heap_no, storage, data); + } else { + /* Locate trx_id and roll_ptr. 
*/ + const byte* src + = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, src - rec); + data += src - rec; + + /* Store trx_id and roll_ptr. */ + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - (src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + ulint len; + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. 
*/ + memcpy(storage - REC_NODE_PTR_SIZE + * (heap_no - 1), + rec + len, + REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec))); +#endif /* UNIV_ZIP_DEBUG */ +} + +/*************************************************************** +Parses a log record of writing a BLOB pointer of a record. +The record consists of a 2-byte offset on the uncompressed page, +a 2-byte offset on the compressed page and the +BTR_EXTERN_FIELD_REF_SIZE bytes of the pointer. Both offsets are +bounds-checked before anything is written, so that a corrupt log +record cannot make the copies run past either page frame. */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + /* out: end of log record, or NULL if + the record is incomplete or corrupt + (in the latter case + recv_sys->found_corrupt_log is set) */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page, + or NULL to only parse */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY + (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + /* The whole BLOB pointer, not just its first byte, + must fit on the uncompressed page. */ + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset + BTR_EXTERN_FIELD_REF_SIZE + > UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + /* The compressed frame can be smaller than + UNIV_PAGE_SIZE: check z_offset against its real size. + page_zip is tested first, so page_zip_get_size() is + never reached with a NULL descriptor. */ + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(!page_is_leaf(page)) + || UNIV_UNLIKELY(z_offset + BTR_EXTERN_FIELD_REF_SIZE + > page_zip_get_size(page_zip))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + memcpy(page_zip->data + z_offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE)); +} + +/************************************************************************** +Write a BLOB pointer of a record on the leaf page of a clustered index. 
+The information must already have been updated on the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record whose BLOB pointer is + being copied to the compressed page */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint n, /* in: column index */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ +{ + const byte* field; + byte* externs; + const page_t* page = page_align(rec); + ulint blob_no; + ulint len; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Position of this pointer among all BLOB pointers of the + page: those of the preceding records plus those of the + preceding columns of rec. */ + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* 
UNIV_ZIP_DEBUG */ + + if (mtr) { + byte* log_ptr = mlog_open( + mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + /* Log record body: offset on the uncompressed page, + offset on the compressed page, the BLOB pointer. */ + log_ptr = mlog_write_initial_log_record_fast( + (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, externs - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE); + log_ptr += BTR_EXTERN_FIELD_REF_SIZE; + mlog_close(mtr, log_ptr); + } +} + +/*************************************************************** +Parses a log record of writing the node pointer of a record. +The record consists of a 2-byte offset on the uncompressed page, +a 2-byte offset on the compressed page and the REC_NODE_PTR_SIZE +bytes of the pointer. */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + /* out: end of log record, or NULL if + the record is incomplete or corrupt + (in the latter case + recv_sys->found_corrupt_log is set) */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page, + or NULL to only parse */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + /* The whole node pointer, not just its first byte, + must fit on the uncompressed page. */ + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset + REC_NODE_PTR_SIZE > UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + byte* storage_end; + byte* field; + byte* storage; + ulint heap_no; + + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + field = page + offset; + storage = page_zip->data + z_offset; + + storage_end = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Derive the heap number from the distance between + z_offset and the start of the dense directory; the + checks below reject a z_offset that does not name the + node pointer entry of a user record. */ + heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; + + if 
(UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE) + || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW) + || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) { + + goto corrupt; + } + + memcpy(field, ptr + 4, REC_NODE_PTR_SIZE); + memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + REC_NODE_PTR_SIZE)); +} + +/************************************************************************** +Write the node pointer of a record on a non-leaf compressed page. +The pointer is written both to the record on the uncompressed page +and to its entry below the dense directory of the compressed page. */ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + ulint ptr, /* in: node pointer */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; + page_t* page = page_align(rec); + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, size); + + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + /* The node pointer occupies the last bytes of the record. */ + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if REC_NODE_PTR_SIZE != 4 +# error "REC_NODE_PTR_SIZE != 4" +#endif + mach_write_to_4(field, ptr); + memcpy(storage, field, REC_NODE_PTR_SIZE); + + if (mtr) { + byte* log_ptr = 
mlog_open(mtr, + 11 + 2 + 2 + REC_NODE_PTR_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, storage - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, field, REC_NODE_PTR_SIZE); + log_ptr += REC_NODE_PTR_SIZE; + mlog_close(mtr, log_ptr); + } +} + +/************************************************************************** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. +The values are written both to the record on the uncompressed page +and to its entry below the dense directory of the compressed page. */ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/* in: column number of TRX_ID in rec */ + dulint trx_id, /* in: transaction identifier */ + dulint roll_ptr)/* in: roll_ptr */ +{ + byte* field; + byte* storage; + page_t* page = page_align(rec); + ulint len; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + 
== rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(field, trx_id); +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +} + +#ifdef UNIV_ZIP_DEBUG +/* Set this variable in a debugger to disable page_zip_clear_rec(). +The only observable effect should be the compression ratio due to +deleted records not being zeroed out. In rare cases, there can be +page_zip_validate() failures on the node_ptr, trx_id and roll_ptr +columns if the space is reallocated for a smaller record. */ +UNIV_INTERN ibool page_zip_clear_rec_disable; +#endif /* UNIV_ZIP_DEBUG */ + +/************************************************************************** +Clear an area on the uncompressed and compressed page, if possible. */ +static +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to clear */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint heap_no; + page_t* page = page_align(rec); + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. 
*/ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if ( +#ifdef UNIV_ZIP_DEBUG + !page_zip_clear_rec_disable && +#endif /* UNIV_ZIP_DEBUG */ + page_zip->m_end + + 1 + ((heap_no - 1) >= 64)/* size of the log entry */ + + page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + < page_zip_get_size(page_zip)) { + byte* data; + + /* Clear only the data bytes, because the allocator and + the decompressor depend on the extra bytes. */ + memset(rec, 0, rec_offs_data_size(offsets)); + + if (!page_is_leaf(page)) { + /* Clear node_ptr on the compressed page. */ + byte* storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr on the compressed page. */ + byte* storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + /* Log that the data was zeroed out. 
*/ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1 | 1); + ut_ad(!*data); + ut_ad((ulint) (data - page_zip->data) + < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + } else if (page_is_leaf(page) && dict_index_is_clust(index)) { + /* Do not clear the record, because there is not enough space + to log the operation. */ + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + byte* field = rec_get_nth_field( + rec, offsets, i, &len); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec))); +#endif /* UNIV_ZIP_DEBUG */ +} + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. 
The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } +} + +/************************************************************************** +Insert a record to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* prev_rec,/* in: record after which to insert */ + const byte* free_rec,/* in: record from which rec was + allocated, or NULL */ + byte* rec) /* in: record to insert */ +{ + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(prev_rec != rec); + ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(prev_rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(prev_rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). 
*/ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + if (UNIV_LIKELY_NULL(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. */ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(rec >= free_rec); + slot_free = page_zip_dir_find(page_zip, page_offset(free_rec)); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + /* Shift the dense directory to allocate place for rec. */ + memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_rec - slot_free); + + /* Write the entry for the inserted record. + The "owned" and "deleted" flags must be zero. */ + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec)); +} + +/************************************************************************** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. 
*/ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to delete */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec) */ + const byte* free) /* in: previous start of the free list */ +{ + byte* slot_rec; + byte* slot_free; + ulint n_ext; + page_t* page = page_align(rec); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot_rec = page_zip_dir_find(page_zip, page_offset(rec)); + + ut_a(slot_rec); + + /* This could not be done before page_zip_dir_find(). */ + page_header_set_field(page, page_zip, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - 1)); + + if (UNIV_UNLIKELY(!free)) { + /* Make the last slot the start of the free list. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + } else { + slot_free = page_zip_dir_find_free(page_zip, + page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } + + if (UNIV_LIKELY(slot_rec > slot_free)) { + memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, + slot_rec - slot_free); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + + if (!page_is_leaf(page) || !dict_index_is_clust(index)) { + ut_ad(!rec_offs_any_extern(offsets)); + goto skip_blobs; + } + + n_ext = rec_offs_n_extern(offsets); + if (UNIV_UNLIKELY(n_ext)) { + /* Shift and zero fill the array of BLOB pointers. 
*/ + ulint blob_no; + byte* externs; + byte* ext_end; + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + page_zip->n_blobs -= n_ext; + /* Shift and zero fill the array. */ + memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, + (page_zip->n_blobs - blob_no) + * BTR_EXTERN_FIELD_REF_SIZE); + memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE); + } + +skip_blobs: + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + page_zip_clear_rec(page_zip, rec, index, offsets); +} + +/************************************************************************** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint is_clustered) /* in: nonzero for clustered index, + zero for others */ +{ + ulint n_dense; + byte* dir; + byte* stored; + + ut_ad(page_is_comp(page_zip->data)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Read the old n_dense (n_heap has already been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + dir = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + + if (!page_is_leaf(page_zip->data)) { + ut_ad(!page_zip->n_blobs); + stored = dir - n_dense * REC_NODE_PTR_SIZE; + } else if (UNIV_UNLIKELY(is_clustered)) { + /* Move the BLOB pointer array backwards to make space for the + roll_ptr and trx_id columns and the dense directory slot. 
*/ + byte* externs; + + stored = dir - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + externs = stored + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(externs + - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + externs, stored - externs); + } else { + stored = dir + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE, + PAGE_ZIP_DIR_SLOT_SIZE); + } + + /* Move the uncompressed area backwards to make space + for one directory slot. */ + memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored); +} + +/*************************************************************** +Parses a log record of writing to the header of a page. */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint len; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) { + + return(NULL); + } + + offset = (ulint) *ptr++; + len = (ulint) *ptr++; + + if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + len)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip)) { + + goto corrupt; + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, ptr, len); + memcpy(page_zip->data + offset, ptr, len); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + len); +} + 
+/************************************************************************** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data, /* in: data on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr) /* in: mini-transaction */ +{ + byte* log_ptr = mlog_open(mtr, 11 + 1 + 1); + ulint offset = page_offset(data); + + ut_ad(offset < PAGE_DATA); + ut_ad(offset + length < PAGE_DATA); +#if PAGE_DATA > 255 +# error "PAGE_DATA > 255" +#endif + ut_ad(length < 256); + + /* If no logging is requested, we may return now */ + if (UNIV_UNLIKELY(!log_ptr)) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr); + *log_ptr++ = (byte) offset; + *log_ptr++ = (byte) length; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, data, length); +} + +/************************************************************************** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + /* out: TRUE on success, FALSE on failure; + page and page_zip will be left intact + on failure. 
*/ + buf_block_t* block, /* in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction */ +{ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_is_comp(page)); + /* Note that page_zip_validate(page_zip, page) may fail here. */ + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Disable logging */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + temp_block = buf_block_alloc(0); + temp_page = temp_block->frame; + + btr_search_drop_page_hash_index(block); + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, TRUE); + block->check_index_page_at_flush = TRUE; + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + /* Copy max trx id to recreated page */ + page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page)); + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + + /* Restore the old page and exit. */ + buf_frame_copy(page, temp_page); + + buf_block_free(temp_block); + return(FALSE); + } + + lock_move_reorganize_page(block, temp_block); + + buf_block_free(temp_block); + return(TRUE); +} + +/************************************************************************** +Copy the records of a page byte for byte. 
Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /* out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /* out: copy of src */ + const page_zip_des_t* src_zip, /* in: compressed page */ + const page_t* src, /* in: page */ + dict_index_t* index, /* in: index of the B-tree */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. 
*/ +#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END +# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END" +#endif + memcpy(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page, PAGE_DATA + src, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + memcpy(page_zip, src_zip, sizeof *page_zip); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL) + && UNIV_LIKELY(mach_read_from_4(page + + FIL_PAGE_PREV) != FIL_NULL)) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG; + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + page_zip_compress_write_log(page_zip, page, index, mtr); +} + +/************************************************************************** +Parses a log record of compressing an index page. 
*/ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* out: uncompressed page */ + page_zip_des_t* page_zip)/* out: compressed page */ +{ + ulint size; + ulint trailer_size; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) { + + return(NULL); + } + + size = mach_read_from_2(ptr); + ptr += 2; + trailer_size = mach_read_from_2(ptr); + ptr += 2; + + if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4); + memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4); + memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size); + memset(page_zip->data + FIL_PAGE_TYPE + size, 0, + page_zip_get_size(page_zip) - trailer_size + - (FIL_PAGE_TYPE + size)); + memcpy(page_zip->data + page_zip_get_size(page_zip) + - trailer_size, ptr + 8 + size, trailer_size); + + if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page))) { + + goto corrupt; + } + } + + return(ptr + 8 + size + trailer_size); +} + +/************************************************************************** +Calculate the compressed page checksum. */ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + /* out: page checksum */ + const void* data, /* in: compressed page */ + ulint size) /* in: size of compressed page */ +{ + /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN, + and FIL_PAGE_FILE_FLUSH_LSN from the checksum. 
*/ + + const Bytef* s = data; + uLong adler; + + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + adler = adler32(0L, s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET); + adler = adler32(adler, s + FIL_PAGE_TYPE, 2); + adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) adler); +} |