author     Sergei Golubchik <vuvova@gmail.com>   2015-05-04 19:15:28 +0200
committer  Sergei Golubchik <vuvova@gmail.com>   2015-05-04 19:15:28 +0200
commit     14a142fca67b9e1fb3f0250fda093f5b967f0138 (patch)
tree       dd49e0666c863d80b5c50642e36a9c945ea12b8a /storage/xtradb/page
parent     dfb001edcd4b16bd4370b08b0176df78c4c5523f (diff)
download   mariadb-git-14a142fca67b9e1fb3f0250fda093f5b967f0138.tar.gz
move to storage/xtradb
Diffstat (limited to 'storage/xtradb/page')
-rw-r--r--  storage/xtradb/page/page0cur.cc    2145
-rw-r--r--  storage/xtradb/page/page0page.cc   2813
-rw-r--r--  storage/xtradb/page/page0zip.cc    4952
3 files changed, 9910 insertions, 0 deletions
diff --git a/storage/xtradb/page/page0cur.cc b/storage/xtradb/page/page0cur.cc new file mode 100644 index 00000000000..f5f7e1299ce --- /dev/null +++ b/storage/xtradb/page/page0cur.cc @@ -0,0 +1,2145 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file page/page0cur.cc +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0cur.h" +#ifdef UNIV_NONINL +#include "page0cur.ic" +#endif + +#include "page0zip.h" +#include "btr0btr.h" +#include "mtr0log.h" +#include "log0recv.h" +#include "ut0ut.h" +#ifndef UNIV_HOTBACKUP +#include "rem0cmp.h" + +#ifdef PAGE_CUR_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT +static ulint page_cur_short_succ = 0; +# endif /* UNIV_SEARCH_PERF_STAT */ + +/*******************************************************************//** +This is a linear congruential generator PRNG. Returns a pseudo random +number between 0 and 2^64-1 inclusive. The formula and the constants +being used are: +X[n+1] = (a * X[n] + c) mod m +where: +X[0] = ut_time_us(NULL) +a = 1103515245 (3^5 * 5 * 7 * 129749) +c = 12345 (3 * 5 * 823) +m = 18446744073709551616 (2^64) + +@return number between 0 and 2^64-1 */ +static +ib_uint64_t +page_cur_lcg_prng(void) +/*===================*/ +{ +#define LCG_a 1103515245 +#define LCG_c 12345 + static ib_uint64_t lcg_current = 0; + static ibool initialized = FALSE; + + if (!initialized) { + lcg_current = (ib_uint64_t) ut_time_us(NULL); + initialized = TRUE; + } + + /* no need to "% 2^64" explicitly because lcg_current is + 64 bit and this will be done anyway */ + lcg_current = LCG_a * lcg_current + LCG_c; + + return(lcg_current); +} + +/****************************************************************//** +Tries a search shortcut based on the last insert. 
+@return TRUE on success */ +UNIV_INLINE +ibool +page_cur_try_search_shortcut( +/*=========================*/ + const buf_block_t* block, /*!< in: index page */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint low_bytes; + ulint up_match; + ulint up_bytes; +#ifdef UNIV_SEARCH_DEBUG + page_cur_t cursor2; +#endif + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + + ut_pair_min(&low_match, &low_bytes, + *ilow_matched_fields, *ilow_matched_bytes, + *iup_matched_fields, *iup_matched_bytes); + + up_match = low_match; + up_bytes = low_bytes; + + if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets, + &low_match, &low_bytes) < 0) { + goto exit_func; + } + + next_rec = page_rec_get_next_const(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes) >= 0) { + goto exit_func; + } + + page_cur_position(rec, block, cursor); + +#ifdef UNIV_SEARCH_DEBUG + page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG, + iup_matched_fields, + iup_matched_bytes, + ilow_matched_fields, + ilow_matched_bytes, + &cursor2); + ut_a(cursor2.rec == cursor->rec); + + if (!page_rec_is_supremum(next_rec)) { + + ut_a(*iup_matched_fields == up_match); + ut_a(*iup_matched_bytes == up_bytes); + } + + ut_a(*ilow_matched_fields == low_match); + ut_a(*ilow_matched_bytes == low_bytes); +#endif + if (!page_rec_is_supremum(next_rec)) { + + *iup_matched_fields = up_match; + *iup_matched_bytes = up_bytes; + } + + *ilow_matched_fields = low_match; + *ilow_matched_bytes = low_bytes; + +#ifdef UNIV_SEARCH_PERF_STAT + page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +#endif + +#ifdef PAGE_CUR_LE_OR_EXTENDS +/****************************************************************//** +Checks if the nth field in a record is a character type field which extends +the nth field in tuple, i.e., the field is longer or equal in length and has +common first characters. 
+@return TRUE if rec field extends tuple field */ +static +ibool +page_cur_rec_field_extends( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + const rec_t* rec, /*!< in: record */ + const ulint* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: compare nth field */ +{ + const dtype_t* type; + const dfield_t* dfield; + const byte* rec_f; + ulint rec_f_len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + dfield = dtuple_get_nth_field(tuple, n); + + type = dfield_get_type(dfield); + + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); + + if (type->mtype == DATA_VARCHAR + || type->mtype == DATA_CHAR + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_BINARY + || type->mtype == DATA_BLOB + || type->mtype == DATA_VARMYSQL + || type->mtype == DATA_MYSQL) { + + if (dfield_get_len(dfield) != UNIV_SQL_NULL + && rec_f_len != UNIV_SQL_NULL + && rec_f_len >= dfield_get_len(dfield) + && !cmp_data_data_slow(type->mtype, type->prtype, + dfield_get_data(dfield), + dfield_get_len(dfield), + rec_f, dfield_get_len(dfield))) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + +/****************************************************************//** +Searches the right position for a page cursor. */ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /*!< in: buffer block */ + const dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + ulint mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /*!< in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /*!< in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /*!< in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + ulint up; + ulint low; + ulint mid; + const page_t* page; + const page_dir_slot_t* slot; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint up_matched_bytes; + ulint low_matched_fields; + ulint low_matched_bytes; + ulint cur_matched_fields; + ulint cur_matched_bytes; + int cmp; +#ifdef UNIV_SEARCH_DEBUG + int dbg_cmp; + ulint dbg_matched_fields; + ulint dbg_matched_bytes; +#endif +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes + && ilow_matched_fields && ilow_matched_bytes && cursor); + ut_ad(dtuple_validate(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_check_dir(page); + +#ifdef PAGE_CUR_ADAPT + if (page_is_leaf(page) + && (mode == PAGE_CUR_LE) + && (page_header_get_field(page, 
PAGE_N_DIRECTION) > 3) + && (page_header_get_ptr(page, PAGE_LAST_INSERT)) + && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { + + if (page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return; + } + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + up_matched_bytes = *iup_matched_bytes; + low_matched_fields = *ilow_matched_fields; + low_matched_bytes = *ilow_matched_bytes; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. */ + + low = 0; + up = page_dir_get_n_slots(page) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + + while (up - low > 1) { + mid = (low + up) / 2; + slot = page_dir_get_nth_slot(page, mid); + mid_rec = page_dir_slot_get_rec(slot); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + slot = page_dir_get_nth_slot(page, low); + low_rec = page_dir_slot_get_rec(slot); + slot = page_dir_get_nth_slot(page, up); + up_rec = page_dir_slot_get_rec(slot); + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
*/ + + while (page_rec_get_next_const(low_rec) != up_rec) { + + mid_rec = page_rec_get_next_const(low_rec); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + +#ifdef UNIV_SEARCH_DEBUG + + /* Check that the lower and upper limit records have the + right alphabetical order compared to tuple. */ + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp >= 0); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp >= 0); + } + + if (!page_rec_is_infimum(low_rec)) { + + ut_a(low_matched_fields == dbg_matched_fields); + ut_a(low_matched_bytes == dbg_matched_bytes); + } + + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp == -1); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp == -1); + } + + if (!page_rec_is_supremum(up_rec)) { + + ut_a(up_matched_fields == dbg_matched_fields); + ut_a(up_matched_bytes == dbg_matched_bytes); + } +#endif + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *iup_matched_bytes = up_matched_bytes; + *ilow_matched_fields = low_matched_fields; + *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***********************************************************//** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. 
*/ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /*!< in: page */ + page_cur_t* cursor) /*!< out: page cursor */ +{ + ulint rnd; + ulint n_recs = page_get_n_recs(buf_block_get_frame(block)); + + page_cur_set_before_first(block, cursor); + + if (UNIV_UNLIKELY(n_recs == 0)) { + + return; + } + + rnd = (ulint) (page_cur_lcg_prng() % n_recs); + + do { + page_cur_move_to_next(cursor); + } while (rnd--); +} + +/***********************************************************//** +Writes the log record of a record insert on a page. */ +static +void +page_cur_insert_rec_write_log( +/*==========================*/ + rec_t* insert_rec, /*!< in: inserted physical record */ + ulint rec_size, /*!< in: insert_rec size */ + rec_t* cursor_rec, /*!< in: record the + cursor is pointing to */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + ulint cur_rec_size; + ulint extra_size; + ulint cur_extra_size; + const byte* ins_ptr; + byte* log_ptr; + const byte* log_end; + ulint i; + + ut_a(rec_size < UNIV_PAGE_SIZE); + ut_ad(page_align(insert_rec) == page_align(cursor_rec)); + ut_ad(!page_rec_is_comp(insert_rec) + == !dict_table_is_comp(index->table)); + + { + mem_heap_t* heap = NULL; + ulint cur_offs_[REC_OFFS_NORMAL_SIZE]; + ulint ins_offs_[REC_OFFS_NORMAL_SIZE]; + + ulint* cur_offs; + ulint* ins_offs; + + rec_offs_init(cur_offs_); + rec_offs_init(ins_offs_); + + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); + + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ins_ptr = insert_rec - extra_size; + + i = 0; + + if (cur_extra_size == extra_size) { + ulint min_rec_size = ut_min(cur_rec_size, rec_size); + + const byte* cur_ptr = cursor_rec - cur_extra_size; + + /* Find out the first byte in insert_rec which differs from + cursor_rec; skip the bytes in the record info */ + + do { + if (*ins_ptr == *cur_ptr) { + i++; + ins_ptr++; + cur_ptr++; + } else if ((i < extra_size) + && (i >= extra_size + - page_rec_get_base_extra_size + (insert_rec))) { + i = extra_size; + ins_ptr = insert_rec; + cur_ptr = cursor_rec; + } else { + break; + } + } while (i < min_rec_size); + } + + if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { + + if (page_rec_is_comp(insert_rec)) { + log_ptr = mlog_open_and_write_index( + mtr, insert_rec, index, MLOG_COMP_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + } else { + log_ptr = mlog_open(mtr, 11 + + 2 + 5 + 1 + 5 + 5 + + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + insert_rec, MLOG_REC_INSERT, log_ptr, mtr); + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(cursor_rec)); + log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: 
in that case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + } + + if (page_rec_is_comp(insert_rec)) { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, TRUE) + != rec_get_info_and_status_bits(cursor_rec, TRUE))) { + + goto need_extra_info; + } + } else { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, FALSE) + != rec_get_info_and_status_bits(cursor_rec, FALSE))) { + + goto need_extra_info; + } + } + + if (extra_size != cur_extra_size || rec_size != cur_rec_size) { +need_extra_info: + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, + 2 * (rec_size - i) + 1); + + /* Write the info bits */ + mach_write_to_1(log_ptr, + rec_get_info_and_status_bits( + insert_rec, + page_rec_is_comp(insert_rec))); + log_ptr++; + + /* Write the record origin offset */ + log_ptr += mach_write_compressed(log_ptr, extra_size); + + /* Write the mismatch index */ + log_ptr += mach_write_compressed(log_ptr, i); + + ut_a(i < UNIV_PAGE_SIZE); + ut_a(extra_size < UNIV_PAGE_SIZE); + } else { + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i)); + } + + /* Write to the log the inserted index record end segment which + differs from the cursor record */ + + rec_size -= i; + + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); + } +} +#else /* !UNIV_HOTBACKUP */ +# define page_cur_insert_rec_write_log(ins_rec,size,cur,index,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a log record of a record insert on a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + ibool is_short,/*!< in: TRUE if short inserts */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint origin_offset; + ulint end_seg_len; + ulint mismatch_index; + page_t* page; + rec_t* cursor_rec; + byte buf1[1024]; + byte* buf; + byte* ptr2 = ptr; + ulint info_and_status_bits = 0; /* remove warning */ + page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = block ? 
buf_block_get_frame(block) : NULL; + + if (is_short) { + cursor_rec = page_rec_get_prev(page_get_supremum_rec(page)); + } else { + ulint offset; + + /* Read the cursor rec offset as a 2-byte ulint */ + + if (UNIV_UNLIKELY(end_ptr < ptr + 2)) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + cursor_rec = page + offset; + + if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) { + + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + } + + ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len); + + if (ptr == NULL) { + + return(NULL); + } + + if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (end_seg_len & 0x1UL) { + /* Read the info bits */ + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_and_status_bits = mach_read_from_1(ptr); + ptr++; + + ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(origin_offset < UNIV_PAGE_SIZE); + + ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(mismatch_index < UNIV_PAGE_SIZE); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) { + + return(NULL); + } + + if (!block) { + + return(ptr + (end_seg_len >> 1)); + } + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + /* Read from the log the inserted index record end segment which + differs from the cursor record */ + + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!(end_seg_len & 0x1UL)) { + info_and_status_bits = rec_get_info_and_status_bits( + cursor_rec, page_is_comp(page)); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1); + } + + end_seg_len >>= 1; + + if (mismatch_index + end_seg_len < sizeof buf1) { + buf = buf1; + } else { + buf = static_cast<byte*>( + mem_alloc(mismatch_index + end_seg_len)); + } + + /* Build the inserted record to buf */ + + if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "Is short %lu, info_and_status_bits %lu, offset %lu, " + "o_offset %lu\n" + "mismatch index %lu, end_seg_len %lu\n" + "parsed len %lu\n", + (ulong) is_short, (ulong) info_and_status_bits, + (ulong) page_offset(cursor_rec), + (ulong) origin_offset, + (ulong) mismatch_index, (ulong) end_seg_len, + (ulong) (ptr - ptr2)); + + fputs("Dump of 300 bytes of log:\n", stderr); + ut_print_buf(stderr, ptr2, 300); + putc('\n', stderr); + + buf_page_print(page, 0, 0); + + ut_error; + } + + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); + ut_memcpy(buf + mismatch_index, ptr, end_seg_len); + + if (page_is_comp(page)) { + rec_set_info_and_status_bits(buf + origin_offset, + info_and_status_bits); + } else { + rec_set_info_bits_old(buf + origin_offset, + info_and_status_bits); + } + + page_cur_position(cursor_rec, block, &cursor); + + offsets = rec_get_offsets(buf + origin_offset, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, + buf + origin_offset, + index, offsets, mtr))) { + /* The redo log record should only have been written + after the write was successful. 
*/ + ut_error; + } + + if (buf != buf1) { + + mem_free(buf); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ptr + end_seg_len); +} + +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + rec_t* current_rec,/*!< in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /*!< the relevant page */ + rec_t* last_insert; /*!< cursor position at previous + insert */ + rec_t* free_rec; /*!< a free record that was reused, + or NULL */ + rec_t* insert_rec; /*!< inserted record */ + ulint heap_no; /*!< heap number of the inserted + record */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(current_rec); + ut_ad(dict_table_is_comp(index->table) + == (ibool) !!page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + ut_ad(!page_rec_is_supremum(current_rec)); + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. Try to find suitable space from page memory management */ + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets( + free_rec, index, foffsets, ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + } else { + heap_no = rec_get_heap_no_old(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, FALSE), + rec_size); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, NULL, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(current_rec); +#ifdef UNIV_DEBUG + if (page_is_comp(page)) { + ut_ad(rec_get_status(current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + } +#endif + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(current_rec, insert_rec); + } + + page_header_set_field(page, NULL, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + if (page_is_comp(page)) { + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + } else { + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, heap_no); + } + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !page_is_comp(page) + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, NULL, n_owned + 1); + } else { + n_owned = rec_get_n_owned_old(owner_rec); + rec_set_n_owned_old(owner_rec, n_owned + 1); + } + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, NULL, + page_dir_find_owner_slot(owner_rec)); + } + } + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + } + + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert"); + + return(insert_rec); +} + +/***********************************************************//** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. 
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + page_cur_t* cursor, /*!< in/out: page cursor */ + dict_index_t* index, /*!< in: record descriptor */ + const rec_t* rec, /*!< in: pointer to a physical record */ + ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /*!< the relevant page */ + rec_t* last_insert; /*!< cursor position at previous + insert */ + rec_t* free_rec; /*!< a free record that was reused, + or NULL */ + rec_t* insert_rec; /*!< inserted record */ + ulint heap_no; /*!< heap number of the inserted + record */ + page_zip_des_t* page_zip; + + page_zip = page_cur_get_page_zip(cursor); + ut_ad(page_zip); + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_cur_get_page(cursor); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + ut_ad(!page_cur_is_after_last(cursor)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + const bool reorg_before_insert = page_has_garbage(page) + && rec_size > page_get_max_insert_size(page, 1) + && rec_size <= page_get_max_insert_size_after_reorganize( + page, 1); + + /* 2. Try to find suitable space from page memory management */ + if (!page_zip_available(page_zip, dict_index_is_clust(index), + rec_size, 1) + || reorg_before_insert) { + /* The values can change dynamically. */ + bool log_compressed = page_zip_log_pages; + ulint level = page_zip_level; +#ifdef UNIV_DEBUG + rec_t* cursor_rec = page_cur_get_rec(cursor); +#endif /* UNIV_DEBUG */ + + /* If we are not writing compressed page images, we + must reorganize the page before attempting the + insert. */ + if (recv_recovery_is_on()) { + /* Insert into the uncompressed page only. + The page reorganization or creation that we + would attempt outside crash recovery would + have been covered by a previous redo log record. */ + } else if (page_is_empty(page)) { + ut_ad(page_cur_is_before_first(cursor)); + + /* This is an empty page. Recreate it to + get rid of the modification log. */ + page_create_zip(page_cur_get_block(cursor), index, + page_header_get_field(page, PAGE_LEVEL), + 0, mtr); + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + goto use_heap; + } + + /* The cursor should remain on the page infimum. 
*/ + return(NULL); + } else if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + } else if (log_compressed && !reorg_before_insert) { + /* Insert into uncompressed page only, and + try page_zip_reorganize() afterwards. */ + } else if (btr_page_reorganize_low( + recv_recovery_is_on(), level, + cursor, index, mtr)) { + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + /* After reorganizing, there is space + available. */ + goto use_heap; + } + } else { + ut_ad(cursor->rec == cursor_rec); + return(NULL); + } + + /* Try compressing the whole page afterwards. */ + insert_rec = page_cur_insert_rec_low( + cursor->rec, index, rec, offsets, NULL); + + /* If recovery is on, this implies that the compression + of the page was successful during runtime. Had that not + been the case or had the redo logging of compressed + pages been enabled during runtime then we'd have seen + a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we + know that we don't need to reorganize the page. We, + however, do need to recompress the page. That will + happen when the next redo record is read which must + be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must + contain a valid compression level value. + This implies that during recovery from this point till + the next redo is applied the uncompressed and + compressed versions are not identical and + page_zip_validate will fail but that is OK because + we call page_zip_validate only after processing + all changes to a page under a single mtr during + recovery. */ + if (insert_rec == NULL) { + /* Out of space. + This should never occur during crash recovery, + because the MLOG_COMP_REC_INSERT should only + be logged after a successful operation. */ + ut_ad(!recv_recovery_is_on()); + } else if (recv_recovery_is_on()) { + /* This should be followed by + MLOG_ZIP_PAGE_COMPRESS_NO_DATA, + which should succeed. */ + rec_offs_make_valid(insert_rec, index, offsets); + } else { + ulint pos = page_rec_get_n_recs_before(insert_rec); + ut_ad(pos > 0); + + if (!log_compressed) { + if (page_zip_compress( + page_zip, page, index, + level, NULL)) { + page_cur_insert_rec_write_log( + insert_rec, rec_size, + cursor->rec, index, mtr); + page_zip_compress_write_log_no_data( + level, page, index, mtr); + + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + ut_ad(cursor->rec + == (pos > 1 + ? page_rec_get_nth( + page, pos - 1) + : page + PAGE_NEW_INFIMUM)); + } else { + /* We are writing entire page images + to the log. Reduce the redo log volume + by reorganizing the page at the same time. */ + if (page_zip_reorganize( + cursor->block, index, mtr)) { + /* The page was reorganized: + Seek to pos. */ + if (pos > 1) { + cursor->rec = page_rec_get_nth( + page, pos - 1); + } else { + cursor->rec = page + + PAGE_NEW_INFIMUM; + } + + insert_rec = page + rec_get_next_offs( + cursor->rec, TRUE); + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + /* Theoretically, we could try one + last resort of btr_page_reorganize_low() + followed by page_zip_available(), but + that would be very unlikely to + succeed. (If the full reorganized page + failed to compress, why would it + succeed to compress the page, plus log + the insert of this record? 
*/ + } + + /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); + if (!page_zip_decompress(page_zip, page, FALSE)) { + ut_error; /* Memory corrupted? */ + } + ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); + insert_rec = NULL; + } + + return(insert_rec); + } + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + lint extra_size_diff; + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { +too_small: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + extra_size_diff + = rec_offs_extra_size(offsets) + - rec_offs_extra_size(foffsets); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) { + /* Add an offset to the extra_size. */ + if (rec_offs_size(foffsets) + < rec_size - extra_size_diff) { + + goto too_small; + } + + insert_buf -= extra_size_diff; + } else if (UNIV_UNLIKELY(extra_size_diff)) { + /* Do not allow extra_size to grow */ + + goto too_small; + } + + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, page_zip, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + + if (!page_is_leaf(page)) { + /* Zero out the node pointer of free_rec, + in case it will not be overwritten by + insert_rec. */ + + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_extra_size(foffsets) + + rec_offs_data_size(foffsets) > rec_size) { + + memset(rec_get_end(free_rec, foffsets) + - REC_NODE_PTR_SIZE, 0, + REC_NODE_PTR_SIZE); + } + } else if (dict_index_is_clust(index)) { + /* Zero out the DB_TRX_ID and DB_ROLL_PTR + columns of free_rec, in case it will not be + overwritten by insert_rec. */ + + ulint trx_id_col; + ulint trx_id_offs; + ulint len; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + trx_id_offs = rec_get_nth_field_offs(foffsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) { + /* We will have to zero out the + DB_TRX_ID and DB_ROLL_PTR, because + they will not be fully overwritten by + insert_rec. */ + + memset(free_rec + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN + == rec_get_nth_field(free_rec, foffsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, page_zip, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + + page_zip_dir_add_slot(page_zip, dict_index_is_clust(index)); + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(cursor->rec != insert_rec); + + { + /* next record after current before the insertion */ + const rec_t* next_rec = page_rec_get_next_low( + cursor->rec, TRUE); + ut_ad(rec_get_status(cursor->rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(cursor->rec, insert_rec); + } + + page_header_set_field(page, page_zip, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec); + + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == cursor->rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, page_zip, + page_dir_find_owner_slot(owner_rec)); + } + } + + page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + + btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok"); + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + cursor->rec, index, mtr); + } + + return(insert_rec); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Writes a log record of copying a record list end to a new created page. 
+@return 4-byte field where to write the log data length, or NULL if +logging is disabled */ +UNIV_INLINE +byte* +page_copy_rec_list_to_created_page_write_log( +/*=========================================*/ + page_t* page, /*!< in: index page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, page, index, + page_is_comp(page) + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + if (UNIV_LIKELY(log_ptr != NULL)) { + mlog_close(mtr, log_ptr + 4); + } + + return(log_ptr); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Parses a log record of copying a record list end to a new created page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + byte* rec_end; + ulint log_data_len; + page_t* page; + page_zip_des_t* page_zip; + + if (ptr + 4 > end_ptr) { + + return(NULL); + } + + log_data_len = mach_read_from_4(ptr); + ptr += 4; + + rec_end = ptr + log_data_len; + + if (rec_end > end_ptr) { + + return(NULL); + } + + if (!block) { + + return(rec_end); + } + + while (ptr < rec_end) { + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + block, index, mtr); + } + + ut_a(ptr == rec_end); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + return(rec_end); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
*/ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /*!< in/out: index page to copy to */ + rec_t* rec, /*!< in: first record to copy */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_dir_slot_t* slot = 0; /* remove warning */ + byte* heap_top; + rec_t* insert_rec = 0; /* remove warning */ + rec_t* prev_rec; + ulint count; + ulint n_recs; + ulint slot_index; + ulint rec_size; + ulint log_mode; + byte* log_ptr; + ulint log_data_len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + ut_ad(page_align(rec) != new_page); + ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); + + if (page_rec_is_infimum(rec)) { + + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + +#ifdef UNIV_DEBUG + /* To pass the debug tests we have to set these dummy values + in the debug version */ + page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, + new_page + UNIV_PAGE_SIZE - 1); +#endif + + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); + + log_data_len = dyn_array_get_data_size(&(mtr->log)); + + /* Individual inserts are logged in a shorter form */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); + + prev_rec = page_get_infimum_rec(new_page); + if (page_is_comp(new_page)) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } + count = 0; + slot_index = 0; + n_recs = 0; + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } else { + rec_set_next_offs_old(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } + + count++; + n_recs++; + + if (UNIV_UNLIKELY + (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) { + + slot_index++; + + slot = page_dir_get_nth_slot(new_page, slot_index); + + page_dir_slot_set_rec(slot, insert_rec); + page_dir_slot_set_n_owned(slot, NULL, count); + + count = 0; + } + + rec_size = rec_offs_size(offsets); + + ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + + heap_top += rec_size; + + rec_offs_make_valid(insert_rec, index, offsets); + btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end"); + + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + prev_rec = insert_rec; + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + if ((slot_index > 0) && (count + 1 + + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 + <= PAGE_DIR_SLOT_MAX_N_OWNED)) { + /* We can merge the two last dir slots. This operation is + here to make this function imitate exactly the equivalent + task made using page_cur_insert_rec, which we use in database + recovery to reproduce the task performed by this function. + To be able to check the correctness of recovery, it is good + that it imitates exactly. 
*/ + + count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + page_dir_slot_set_n_owned(slot, NULL, 0); + + slot_index--; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; + + ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); + + if (UNIV_LIKELY(log_ptr != NULL)) { + mach_write_to_4(log_ptr, log_data_len); + } + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); + } + + slot = page_dir_get_nth_slot(new_page, 1 + slot_index); + + page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); + page_dir_slot_set_n_owned(slot, NULL, count + 1); + + page_dir_set_n_slots(new_page, NULL, 2 + slot_index); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top); + page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs); + page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); + + page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(new_page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); + + /* Restore the log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/***********************************************************//** +Writes log record of a record delete on a page. */ +UNIV_INLINE +void +page_cur_delete_rec_write_log( +/*==========================*/ + rec_t* rec, /*!< in: record to be deleted */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + + mlog_close(mtr, log_ptr + 2); +} +#else /* !UNIV_HOTBACKUP */ +# define page_cur_delete_rec_write_log(rec,index,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses log record of a record delete on a page. +@return pointer to record end or NULL */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in: page or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ulint offset; + page_cur_t cursor; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + /* Read the cursor rec offset as a 2-byte ulint */ + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (block) { + page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec = page + offset; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cursor); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + page_cur_delete_rec(&cursor, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(ptr); +} + +/***********************************************************//** +Deletes a record at the page cursor. 
The cursor is moved to the next +record after the deleted one. */ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle + or NULL */ +{ + page_dir_slot_t* cur_dir_slot; + page_dir_slot_t* prev_slot; + page_t* page; + page_zip_des_t* page_zip; + rec_t* current_rec; + rec_t* prev_rec = NULL; + rec_t* next_rec; + ulint cur_slot_no; + ulint cur_n_owned; + rec_t* rec; + + page = page_cur_get_page(cursor); + page_zip = page_cur_get_page_zip(cursor); + + /* page_zip_validate() will fail here when + btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark(). + Then, both "page_zip" and "page" would have the min-rec-mark + set on the smallest user record, but "page" would additionally + have it set on the smallest-but-one record. Because sloppy + page_zip_validate_low() only ignores min-rec-flag differences + in the smallest user record, it cannot be used here either. */ + + current_rec = cursor->rec; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() + || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index))); + + /* The record must not be the supremum or infimum record. */ + ut_ad(page_rec_is_user_rec(current_rec)); + + if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) { + /* Empty the page, unless we are applying the redo log + during crash recovery. During normal operation, the + page_create_empty() gets logged as one of MLOG_PAGE_CREATE, + MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */ + ut_ad(page_is_leaf(page)); + /* Usually, this should be the root page, + and the whole index tree should become empty. + However, this could also be a call in + btr_cur_pessimistic_update() to delete the only + record in the page and to insert another one. */ + page_cur_move_to_next(cursor); + ut_ad(page_cur_is_after_last(cursor)); + page_create_empty(page_cur_get_block(cursor), + const_cast<dict_index_t*>(index), mtr); + return; + } + + /* Save to local variables some data associated with current_rec */ + cur_slot_no = page_dir_find_owner_slot(current_rec); + ut_ad(cur_slot_no > 0); + cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); + cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); + + /* 0. Write the log record */ + if (mtr != 0) { + page_cur_delete_rec_write_log(current_rec, index, mtr); + } + + /* 1. Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock only if there is an mini-transaction covering + the change. During IMPORT we allocate local blocks that are not + part of the buffer pool. */ + + if (mtr != 0) { + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + } + + /* 2. Find the next and the previous record. Note that the cursor is + left at the next record. */ + + ut_ad(cur_slot_no > 0); + prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1); + + rec = (rec_t*) page_dir_slot_get_rec(prev_slot); + + /* rec now points to the record of the previous directory slot. 
Look + for the immediate predecessor of current_rec in a loop. */ + + while(current_rec != rec) { + prev_rec = rec; + rec = page_rec_get_next(rec); + } + + page_cur_move_to_next(cursor); + next_rec = cursor->rec; + + /* 3. Remove the record from the linked list of records */ + + page_rec_set_next(prev_rec, next_rec); + + /* 4. If the deleted record is pointed to by a dir slot, update the + record pointer in slot. In the following if-clause we assume that + prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED + >= 2. */ + +#if PAGE_DIR_SLOT_MIN_N_OWNED < 2 +# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2" +#endif + ut_ad(cur_n_owned > 1); + + if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { + page_dir_slot_set_rec(cur_dir_slot, prev_rec); + } + + /* 5. Update the number of owned records of the slot */ + + page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); + + /* 6. Free the memory occupied by the record */ + btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index), + offsets, "delete"); + page_mem_free(page, page_zip, current_rec, index, offsets); + + /* 7. Now we have decremented the number of owned records of the slot. + If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the + slots. */ + + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { + page_dir_balance_slot(page, page_zip, cur_slot_no); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +/*******************************************************************//** +Print the first n numbers, generated by page_cur_lcg_prng() to make sure +(visually) that it works properly. */ +void +test_page_cur_lcg_prng( +/*===================*/ + int n) /*!< in: print first n numbers */ +{ + int i; + unsigned long long rnd; + + for (i = 0; i < n; i++) { + rnd = page_cur_lcg_prng(); + printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n", + rnd, + rnd % 2, + rnd % 3, + rnd % 5, + rnd % 7, + rnd % 11); + } +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/page/page0page.cc b/storage/xtradb/page/page0page.cc new file mode 100644 index 00000000000..bd5fb36af8f --- /dev/null +++ b/storage/xtradb/page/page0page.cc @@ -0,0 +1,2813 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0page.cc +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#define THIS_MODULE +#include "page0page.h" +#ifdef UNIV_NONINL +#include "page0page.ic" +#endif +#undef THIS_MODULE + +#include "page0cur.h" +#include "page0zip.h" +#include "buf0buf.h" +#include "btr0btr.h" +#ifndef UNIV_HOTBACKUP +# include "srv0srv.h" +# include "lock0lock.h" +# include "fut0lst.h" +# include "btr0sea.h" +#endif /* !UNIV_HOTBACKUP */ + +/* THE INDEX PAGE + ============== + +The index page consists of a page header which contains the page's +id and other information. On top of it are the index records +in a heap linked into a one way linear list according to alphabetic order. + +Just below page end is an array of pointers which we call page directory, +to about every sixth record in the list. The pointers are placed in +the directory in the alphabetical order of the records pointed to, +enabling us to make binary search using the array. Each slot n:o I +in the directory points to a record, where a 4-bit field contains a count +of those records which are in the linear list between pointer I and +the pointer I - 1 in the directory, including the record +pointed to by pointer I and not including the record pointed to by I - 1. +We say that the record pointed to by slot I, or that slot I, owns +these records. The count is always kept in the range 4 to 8, with +the exception that it is 1 for the first slot, and 1--8 for the second slot. + +An essentially binary search can be performed in the list of index +records, like we could do if we had pointer to every record in the +page directory. The data structure is, however, more efficient when +we are doing inserts, because most inserts are just pushed on a heap. +Only every 8th insert requires block move in the directory pointer +table, which itself is quite small. A record is deleted from the page +by just taking it off the linear list and updating the number of owned +records-field of the record which owns it, and updating the page directory, +if necessary. A special case is the one when the record owns itself. +Because the overhead of inserts is so small, we may also increase the +page size from the projected default of 8 kB to 64 kB without too +much loss of efficiency in inserts. Bigger page becomes actual +when the disk transfer rate compared to seek and latency time rises. +On the present system, the page size is set so that the page transfer +time (3 ms) is 20 % of the disk random access time (15 ms). + +When the page is split, merged, or becomes full but contains deleted +records, we have to reorganize the page. + +Assuming a page size of 8 kB, a typical index page of a secondary +index contains 300 index entries, and the size of the page directory +is 50 x 4 bytes = 200 bytes. */ + +/***************************************************************//** +Looks for the directory slot which owns the given record. 
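+The record itself need not be pointed to from the directory: ownership +is resolved by following the next-record pointers to the next record +with a nonzero n_owned field, and then scanning the directory slots for +that record's offset.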
+@return the directory slot number */ +UNIV_INTERN +ulint +page_dir_find_owner_slot( +/*=====================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + const page_t* page; + register uint16 rec_offs_bytes; + register const page_dir_slot_t* slot; + register const page_dir_slot_t* first_slot; + register const rec_t* r = rec; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + first_slot = page_dir_get_nth_slot(page, 0); + slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1); + + if (page_is_comp(page)) { + while (rec_get_n_owned_new(r) == 0) { + r = rec_get_next_ptr_const(r, TRUE); + ut_ad(r >= page + PAGE_NEW_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } else { + while (rec_get_n_owned_old(r) == 0) { + r = rec_get_next_ptr_const(r, FALSE); + ut_ad(r >= page + PAGE_OLD_SUPREMUM); + ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); + } + } + + rec_offs_bytes = mach_encode_2(r - page); + + while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) { + + if (UNIV_UNLIKELY(slot == first_slot)) { + fprintf(stderr, + "InnoDB: Probable data corruption on" + " page %lu\n" + "InnoDB: Original record ", + (ulong) page_get_page_no(page)); + + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, rec); + } + + fputs("\n" + "InnoDB: on that page.\n" + "InnoDB: Cannot find the dir slot for record ", + stderr); + if (page_is_comp(page)) { + fputs("(compact record)", stderr); + } else { + rec_print_old(stderr, page + + mach_decode_2(rec_offs_bytes)); + } + fputs("\n" + "InnoDB: on that page!\n", stderr); + + buf_page_print(page, 0, 0); + + ut_error; + } + + slot += PAGE_DIR_SLOT_SIZE; + } + + return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE); +} + +/**************************************************************//** +Used to check the consistency of a directory slot. +@return TRUE if succeed */ +static +ibool +page_dir_slot_check( +/*================*/ + const page_dir_slot_t* slot) /*!< in: slot */ +{ + const page_t* page; + ulint n_slots; + ulint n_owned; + + ut_a(slot); + + page = page_align(slot); + + n_slots = page_dir_get_n_slots(page); + + ut_a(slot <= page_dir_get_nth_slot(page, 0)); + ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); + + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); + + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot)); + } else { + n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot)); + } + + if (slot == page_dir_get_nth_slot(page, 0)) { + ut_a(n_owned == 1); + } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { + ut_a(n_owned >= 1); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } else { + ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } + + return(TRUE); +} + +/*************************************************************//** +Sets the max trx id field value. 
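+On a compressed page the value is written both to the uncompressed +page header and to its compressed copy.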
*/ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */ +{ + page_t* page = buf_block_get_frame(block); +#ifndef UNIV_HOTBACKUP + ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); +#endif /* !UNIV_HOTBACKUP */ + + /* It is not necessary to write this change to the redo log, as + during a database recovery we assume that the max trx id of every + page is the maximum trx id assigned before the crash. */ + + if (page_zip) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + 8, mtr); +#ifndef UNIV_HOTBACKUP + } else if (mtr) { + mlog_write_ull(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + trx_id, mtr); +#endif /* !UNIV_HOTBACKUP */ + } else { + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + } +} + +/************************************************************//** +Allocates a block of memory from the heap of an index page. +@return pointer to start of allocated buffer, or NULL if allocation fails */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /*!< in: total number of bytes needed */ + ulint* heap_no)/*!< out: this contains the heap number + of the allocated record + if allocation succeeds */ +{ + byte* block; + ulint avl_space; + + ut_ad(page && heap_no); + + avl_space = page_get_max_insert_size(page, 1); + + if (avl_space >= need) { + block = page_header_get_ptr(page, PAGE_HEAP_TOP); + + page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, + block + need); + *heap_no = page_dir_get_n_heap(page); + + page_dir_set_n_heap(page, page_zip, 1 + *heap_no); + + return(block); + } + + return(NULL); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************//** +Writes a log record of page creation. */ +UNIV_INLINE +void +page_create_write_log( +/*==================*/ + buf_frame_t* frame, /*!< in: a buffer frame where the page is + created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ibool comp) /*!< in: TRUE=compact page format */ +{ + mlog_write_initial_log_record(frame, comp + ? MLOG_COMP_PAGE_CREATE + : MLOG_PAGE_CREATE, mtr); +} +#else /* !UNIV_HOTBACKUP */ +# define page_create_write_log(frame,mtr,comp) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************//** +Parses a redo log record of creating a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr __attribute__((unused)), /*!< in: buffer end */ + ulint comp, /*!< in: nonzero=compact page format */ + buf_block_t* block, /*!< in: block or NULL */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (block) { + page_create(block, mtr, comp); + } + + return(ptr); +} + +/**********************************************************//** +The index page creation function. 
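+It converts dummy data tuples into the physical infimum and supremum +records, initializes the page header fields, and points the two initial +directory slots at the infimum and the supremum.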
+@return pointer to the page */ +static +page_t* +page_create_low( +/*============*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + page_dir_slot_t* slot; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* field; + byte* heap_top; + rec_t* infimum_rec; + rec_t* supremum_rec; + page_t* page; + dict_index_t* index; + ulint* offsets; + + ut_ad(block); +#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA" +#endif +#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA" +#endif + + /* The infimum and supremum records use a dummy index. */ + if (UNIV_LIKELY(comp)) { + index = dict_ind_compact; + } else { + index = dict_ind_redundant; + } + + /* 1. INCREMENT MODIFY CLOCK */ + buf_block_modify_clock_inc(block); + + page = buf_block_get_frame(block); + + fil_page_set_type(page, FIL_PAGE_INDEX); + + heap = mem_heap_create(200); + + /* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */ + + /* Create first a data tuple for infimum record */ + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8); + /* Set the corresponding physical record to its place in the page + record heap */ + + heap_top = page + PAGE_DATA; + + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); + + rec_set_n_owned_new(infimum_rec, NULL, 1); + rec_set_heap_no_new(infimum_rec, 0); + } else { + ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); + + rec_set_n_owned_old(infimum_rec, 1); + rec_set_heap_no_old(infimum_rec, 0); + } + + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); + + /* Create then a tuple for supremum */ + + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "supremum", comp ? 8 : 9); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9); + + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); + + rec_set_n_owned_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, 1); + } else { + ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); + + rec_set_n_owned_old(supremum_rec, 1); + rec_set_heap_no_old(supremum_rec, 1); + } + + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == page + + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); + + mem_heap_free(heap); + + /* 4. INITIALIZE THE PAGE */ + + page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2); + page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top); + page_header_set_field(page, NULL, PAGE_N_HEAP, comp + ? 
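+ /* the most significant bit of PAGE_N_HEAP flags the compact format */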
0x8000 | PAGE_HEAP_NO_USER_LOW + : PAGE_HEAP_NO_USER_LOW); + page_header_set_ptr(page, NULL, PAGE_FREE, NULL); + page_header_set_field(page, NULL, PAGE_GARBAGE, 0); + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + page_header_set_field(page, NULL, PAGE_N_RECS, 0); + page_set_max_trx_id(block, NULL, 0, NULL); + memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START + - page_offset(heap_top)); + + /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */ + + /* Set the slots to point to infimum and supremum. */ + + slot = page_dir_get_nth_slot(page, 0); + page_dir_slot_set_rec(slot, infimum_rec); + + slot = page_dir_get_nth_slot(page, 1); + page_dir_slot_set_rec(slot, supremum_rec); + + /* Set the next pointers in infimum and supremum */ + + if (UNIV_LIKELY(comp)) { + rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(supremum_rec, 0); + } else { + rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM); + rec_set_next_offs_old(supremum_rec, 0); + } + + return(page); +} + +/**********************************************************//** +Create an uncompressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + buf_block_t* block, /*!< in: a buffer block where the + page is created */ + mtr_t* mtr, /*!< in: mini-transaction handle */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + page_create_write_log(buf_block_get_frame(block), mtr, comp); + return(page_create_low(block, comp)); +} + +/**********************************************************//** +Create a compressed B-tree index page. +@return pointer to the page */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame where the + page is created */ + dict_index_t* index, /*!< in: the index of the page */ + ulint level, /*!< in: the B-tree level of the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* page; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + ut_ad(block); + ut_ad(page_zip); + ut_ad(index); + ut_ad(dict_table_is_comp(index->table)); + + page = page_create_low(block, TRUE); + mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level); + mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id); + + if (!page_zip_compress(page_zip, page, index, + page_zip_level, mtr)) { + /* The compression of a newly created page + should always succeed. */ + ut_error; + } + + return(page); +} + +/**********************************************************//** +Empty a previously created B-tree index page. 
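+The page is re-created in its original format; on leaf pages of +secondary indexes and of the insert buffer, PAGE_MAX_TRX_ID is +preserved.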
*/ +UNIV_INTERN +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_id_t max_trx_id = 0; + const page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + max_trx_id = page_get_max_trx_id(page); + ut_ad(max_trx_id); + } + + if (page_zip) { + page_create_zip(block, index, + page_header_get_field(page, PAGE_LEVEL), + max_trx_id, mtr); + } else { + page_create(block, mtr, page_is_comp(page)); + + if (max_trx_id) { + page_update_max_trx_id( + block, page_zip, max_trx_id, mtr); + } + } +} + +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t cur1; + rec_t* cur2; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + + page_cur_move_to_next(&cur1); + } + + btr_assert_not_corrupted(new_block, index); + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur2 = page_get_infimum_rec(buf_block_get_frame(new_block)); + + /* Copy records from the original page to the new page */ + + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + if (UNIV_UNLIKELY(!ins_rec)) { + /* Track an assertion failure reported on the mailing + list on June 18th, 2003 */ + + buf_page_print(new_page, 0, + BUF_PAGE_PRINT_NO_CRASH); + buf_page_print(page_align(rec), 0, + BUF_PAGE_PRINT_NO_CRASH); + ut_print_timestamp(stderr); + + fprintf(stderr, + "InnoDB: rec offset %lu, cur1 offset %lu," + " cur2 offset %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(page_cur_get_rec(&cur1)), + (ulong) page_offset(cur2)); + ut_error; + } + + page_cur_move_to_next(&cur1); + cur2 = ins_rec; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Copies records from page to new_page, from a given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. 
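+The lock table and the adaptive hash index are updated for the moved +records, and PAGE_MAX_TRX_ID is carried over to new_page on leaf pages +of secondary indexes and of the insert buffer.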
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original successor of the infimum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_t* page = page_align(rec); + rec_t* ret = page_rec_get_next( + page_get_infimum_rec(new_page)); + ulint log_mode = 0; /* remove warning */ + +#ifdef UNIV_ZIP_DEBUG + if (new_page_zip) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + ut_a(page_zip); + + /* Strict page_zip_validate() may fail here. + Furthermore, btr_compress() may set FIL_PAGE_PREV to + FIL_NULL on new_page while leaving it intact on + new_page_zip. So, we cannot validate new_page_zip. */ + ut_a(page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(buf_block_get_frame(block) == page); + ut_ad(page_is_leaf(page) == page_is_leaf(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); + /* Here, "ret" may be pointing to a user record or the + predefined supremum record. */ + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) { + page_copy_rec_list_end_to_created_page(new_page, rec, + index, mtr); + } else { + page_copy_rec_list_end_no_locks(new_block, block, rec, + index, mtr); + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. */ + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page), mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(new_page_zip, new_page, + index, page_zip_level, mtr)) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the successor of + the predefined infimum record. It must still + have at least one predecessor (the predefined + infimum record, or a freshly copied record + that is smaller than "ret"). */ + ut_a(ret_pos > 0); + + if (!page_zip_reorganize(new_block, index, mtr)) { + + btr_blob_dbg_remove(new_page, index, + "copy_end_reorg_fail"); + if (!page_zip_decompress(new_page_zip, + new_page, FALSE)) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_end_reorg_fail"); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. 
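+ Reorganization rebuilt the record list, so + the old pointer is stale and the successor + chain must be walked again from the infimum.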
*/ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update the lock table and possible hash index */ + + lock_move_rec_list_end(new_block, block, rec); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/*************************************************************//** +Copies records from page to new_page, up to the given record, +NOT including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on +new_page, or NULL on zip overflow (new_block will be decompressed) */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_cur_t cur1; + rec_t* cur2; + ulint log_mode = 0 /* remove warning */; + mem_heap_t* heap = NULL; + rec_t* ret + = page_rec_get_prev(page_get_supremum_rec(new_page)); + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + /* Here, "ret" may be pointing to a user record or the + predefined infimum record. */ + + if (page_rec_is_infimum(rec)) { + + return(ret); + } + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + cur2 = ret; + + /* Copy records from the original page to the new page */ + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + cur2 = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + ut_a(cur2); + + page_cur_move_to_next(&cur1); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. */ + if (dict_index_is_sec_or_ibuf(index) + && page_is_leaf(page_align(rec))) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page_align(rec)), + mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail", + goto zip_reorganize;); + + if (!page_zip_compress(new_page_zip, new_page, index, + page_zip_level, mtr)) { + + ulint ret_pos; +#ifndef DBUG_OFF +zip_reorganize: +#endif /* DBUG_OFF */ + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ret_pos = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the predecessor + of the predefined supremum record. If it was + the predefined infimum record, then it would + still be the infimum, and we would have + ret_pos == 0. 
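+ page_rec_get_nth() maps position 0 back to + the infimum, so that case needs no special + handling.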
*/ + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + btr_blob_dbg_remove(new_page, index, + "copy_start_reorg_fail"); + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page, FALSE))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + btr_blob_dbg_add(new_page, index, + "copy_start_reorg_fail"); + return(NULL); + } + + /* The page was reorganized: Seek to ret_pos. */ + ret = page_rec_get_nth(new_page, ret_pos); + } + } + + /* Update the lock table and possible hash index */ + + lock_move_rec_list_start(new_block, block, rec, ret); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/**********************************************************//** +Writes a log record of a record list end or start deletion. */ +UNIV_INLINE +void +page_delete_rec_list_write_log( +/*===========================*/ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + byte type, /*!< in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /*!< in: mtr */ +{ + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + mlog_close(mtr, log_ptr + 2); + } +} +#else /* !UNIV_HOTBACKUP */ +# define page_delete_rec_list_write_log(rec,index,type,mtr) ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************//** +Parses a log record of a record list end or start deletion. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + byte type, /*!< in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + buf_block_t* block, /*!< in/out: buffer block or NULL */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr or NULL */ +{ + page_t* page; + ulint offset; + + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + /* Read the record offset as a 2-byte ulint */ + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + if (!block) { + + return(ptr); + } + + page = buf_block_get_frame(block); + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page + offset, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, + mtr); + } else { + page_delete_rec_list_start(page + offset, block, index, mtr); + } + + return(ptr); +} + +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. 
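+On an uncompressed page the tail of the record list is unlinked as a +whole and appended to the free list; on a compressed page the records +are deleted one by one.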
*/ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_dir_slot_t*slot; + ulint slot_index; + rec_t* last_rec; + rec_t* prev_rec; + ulint n_owned; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = page_align(rec); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(!page_zip || page_rec_is_comp(rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_supremum(rec)) { + ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED); + /* Nothing to do, there are no records bigger than the + page supremum. */ + return; + } + + if (recv_recovery_is_on()) { + /* If we are replaying a redo log record, we must + replay it exactly. Since MySQL 5.6.11, we should be + generating a redo log record for page creation if + the page would become empty. Thus, this branch should + only be executed when applying redo log that was + generated by an older version of MySQL. */ + } else if (page_rec_is_infimum(rec) + || n_recs == page_get_n_recs(page)) { +delete_all: + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return; + } else if (page_is_comp(page)) { + if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } + } else { + if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } + } + + /* Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(block); + + page_delete_rec_list_write_log(rec, index, page_is_comp(page) + ? 
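+ /* the compact format uses its own redo record types */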
MLOG_COMP_LIST_END_DELETE + : MLOG_LIST_END_DELETE, mtr); + + if (page_zip) { + ulint log_mode; + + ut_a(page_is_comp(page)); + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + do { + page_cur_t cur; + page_cur_position(rec, block, &cur); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + rec = rec_get_next_ptr(rec, TRUE); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&cur, index, offsets, mtr); + } while (page_offset(rec) != PAGE_NEW_SUPREMUM); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); + return; + } + + prev_rec = page_rec_get_prev(rec); + + last_rec = page_rec_get_prev(page_get_supremum_rec(page)); + + if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + rec_t* rec2 = rec; + /* Calculate the sum of sizes and the number of records */ + size = 0; + n_recs = 0; + + do { + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; + n_recs++; + + rec2 = page_rec_get_next(rec2); + } while (!page_rec_is_supremum(rec2)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ut_ad(size < UNIV_PAGE_SIZE); + + /* Update the page directory; there is no need to balance the number + of the records owned by the supremum record, as it is allowed to be + less than PAGE_DIR_SLOT_MIN_N_OWNED */ + + if (page_is_comp(page)) { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_new(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, TRUE); + } + + ut_ad(rec_get_n_owned_new(rec2) > count); + + n_owned = rec_get_n_owned_new(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); + slot = page_dir_get_nth_slot(page, slot_index); + } else { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_old(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, FALSE); + } + + ut_ad(rec_get_n_owned_old(rec2) > count); + + n_owned = rec_get_n_owned_old(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + ut_ad(slot_index > 0); + slot = page_dir_get_nth_slot(page, slot_index); + } + + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); + page_dir_slot_set_n_owned(slot, NULL, n_owned); + + page_dir_set_n_slots(page, NULL, slot_index + 1); + + /* Remove the record chain segment from the record chain */ + page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + + btr_blob_dbg_op(page, rec, index, "delete_end", + btr_blob_dbg_remove_rec); + + /* Catenate the deleted chain segment to the page free list */ + + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); + page_header_set_ptr(page, NULL, PAGE_FREE, rec); + + page_header_set_field(page, NULL, PAGE_GARBAGE, size + + page_header_get_field(page, PAGE_GARBAGE)); + + page_header_set_field(page, NULL, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - n_recs)); +} + +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
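+The records are removed one at a time, with redo logging of the +individual deletions disabled; the single list-delete log record +written below covers them all.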
*/ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t cur1; + ulint log_mode; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + byte type; + + rec_offs_init(offsets_); + + ut_ad((ibool) !!page_rec_is_comp(rec) + == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + + /* page_zip_validate() would detect a min_rec_mark mismatch + in btr_page_split_and_insert() + between btr_attach_half_pages() and insert_page = ... + when btr_page_get_split_rec_to_left() holds + (direction == FSP_DOWN). */ + ut_a(!page_zip + || page_zip_validate_low(page_zip, page, index, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + return; + } + + if (page_rec_is_supremum(rec)) { + /* We are deleting all records. */ + page_create_empty(block, index, mtr); + return; + } + + if (page_rec_is_comp(rec)) { + type = MLOG_COMP_LIST_START_DELETE; + } else { + type = MLOG_LIST_START_DELETE; + } + + page_delete_rec_list_write_log(rec, index, type, mtr); + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +#ifndef UNIV_HOTBACKUP +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
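+The number and total size of the records to delete from block are +derived from the change in the data size of new_page.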
+ +@return TRUE on success; FALSE on compression failure (new_block will +be decompressed) */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page, index)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec), + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); + + return(TRUE); +} + +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return TRUE on success; FALSE on compression failure */ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + page_delete_rec_list_start(split_rec, block, index, mtr); + + return(TRUE); +} +#endif /* !UNIV_HOTBACKUP */ + +/**************************************************************//** +Used to delete n slots from the directory. This function updates +also n_owned fields in the records, so that the first slot after +the deleted ones inherits the records of the deleted slots. */ +UNIV_INLINE +void +page_dir_delete_slot( +/*=================*/ + page_t* page, /*!< in/out: the index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: slot to be deleted */ +{ + page_dir_slot_t* slot; + ulint n_owned; + ulint i; + ulint n_slots; + + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + ut_ad(slot_no + 1 < page_dir_get_n_slots(page)); + + n_slots = page_dir_get_n_slots(page); + + /* 1. Reset the n_owned fields of the slots to be + deleted */ + slot = page_dir_get_nth_slot(page, slot_no); + n_owned = page_dir_slot_get_n_owned(slot); + page_dir_slot_set_n_owned(slot, page_zip, 0); + + /* 2. 
Update the n_owned value of the first non-deleted slot */ + + slot = page_dir_get_nth_slot(page, slot_no + 1); + page_dir_slot_set_n_owned(slot, page_zip, + n_owned + page_dir_slot_get_n_owned(slot)); + + /* 3. Destroy the slot by copying slots */ + for (i = slot_no + 1; i < n_slots; i++) { + rec_t* rec = (rec_t*) + page_dir_slot_get_rec(page_dir_get_nth_slot(page, i)); + page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec); + } + + /* 4. Zero out the last slot, which will be removed */ + mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0); + + /* 5. Update the page header */ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1); +} + +/**************************************************************//** +Used to add a slot to the directory. Does not set the record pointer +in the added slot or update n_owned values: this is the responsibility +of the caller. */ +UNIV_INLINE +void +page_dir_add_slot( +/*==============*/ + page_t* page, /*!< in/out: the index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint start) /*!< in: the slot above which the new slot + is added */ +{ + page_dir_slot_t* slot; + ulint n_slots; + + n_slots = page_dir_get_n_slots(page); + + ut_ad(start < n_slots - 1); + + /* Update the page header */ + page_dir_set_n_slots(page, page_zip, n_slots + 1); + + /* Move slots up */ + slot = page_dir_get_nth_slot(page, n_slots); + memmove(slot, slot + PAGE_DIR_SLOT_SIZE, + (n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE); +} + +/****************************************************************//** +Splits a directory slot which owns too many records. */ +UNIV_INTERN +void +page_dir_split_slot( +/*================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be written, or NULL */ + ulint slot_no)/*!< in: the directory slot */ +{ + rec_t* rec; + page_dir_slot_t* new_slot; + page_dir_slot_t* prev_slot; + page_dir_slot_t* slot; + ulint i; + ulint n_owned; + + ut_ad(page); + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + + slot = page_dir_get_nth_slot(page, slot_no); + + n_owned = page_dir_slot_get_n_owned(slot); + ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1); + + /* 1. We loop to find a record approximately in the middle of the + records owned by the slot. */ + + prev_slot = page_dir_get_nth_slot(page, slot_no - 1); + rec = (rec_t*) page_dir_slot_get_rec(prev_slot); + + for (i = 0; i < n_owned / 2; i++) { + rec = page_rec_get_next(rec); + } + + ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED); + + /* 2. We add one directory slot immediately below the slot to be + split. */ + + page_dir_add_slot(page, page_zip, slot_no - 1); + + /* The added slot is now number slot_no, and the old slot is + now number slot_no + 1 */ + + new_slot = page_dir_get_nth_slot(page, slot_no); + slot = page_dir_get_nth_slot(page, slot_no + 1); + + /* 3. We store the appropriate values to the new slot. */ + + page_dir_slot_set_rec(new_slot, rec); + page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2); + + /* 4. Finally, we update the number of records field of the + original slot */ + + page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2)); +} + +/*************************************************************//** +Tries to balance the given directory slot, which owns too few records, +with the upper neighbor, so that the slot comes to own at least the +minimum number of records; this may result in the merging of two slots.
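+If the upper neighbor owns more than the minimum number of records, one +record is transferred from it to the underfull slot; otherwise the two +slots are merged.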
*/ +UNIV_INTERN +void +page_dir_balance_slot( +/*==================*/ + page_t* page, /*!< in/out: index page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */ + ulint slot_no)/*!< in: the directory slot */ +{ + page_dir_slot_t* slot; + page_dir_slot_t* up_slot; + ulint n_owned; + ulint up_n_owned; + rec_t* old_rec; + rec_t* new_rec; + + ut_ad(page); + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + + slot = page_dir_get_nth_slot(page, slot_no); + + /* The last directory slot cannot be balanced with the upper + neighbor, as there is none. */ + + if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) { + + return; + } + + up_slot = page_dir_get_nth_slot(page, slot_no + 1); + + n_owned = page_dir_slot_get_n_owned(slot); + up_n_owned = page_dir_slot_get_n_owned(up_slot); + + ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1); + + /* If the upper slot has the minimum value of n_owned, we will merge + the two slots, therefore we assert: */ + ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED); + + if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) { + + /* In this case we can just transfer one record owned + by the upper slot to the property of the lower slot */ + old_rec = (rec_t*) page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + new_rec = rec_get_next_ptr(old_rec, TRUE); + + rec_set_n_owned_new(old_rec, page_zip, 0); + rec_set_n_owned_new(new_rec, page_zip, n_owned + 1); + } else { + new_rec = rec_get_next_ptr(old_rec, FALSE); + + rec_set_n_owned_old(old_rec, 0); + rec_set_n_owned_old(new_rec, n_owned + 1); + } + + page_dir_slot_set_rec(slot, new_rec); + + page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1); + } else { + /* In this case we may merge the two slots */ + page_dir_delete_slot(page, page_zip, slot_no); + } +} + +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + const page_dir_slot_t* slot; + ulint i; + ulint n_owned; + const rec_t* rec; + + if (nth == 0) { + return(page_get_infimum_rec(page)); + } + + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + + for (i = 0;; i++) { + + slot = page_dir_get_nth_slot(page, i); + n_owned = page_dir_slot_get_n_owned(slot); + + if (n_owned > nth) { + break; + } else { + nth -= n_owned; + } + } + + ut_ad(i > 0); + slot = page_dir_get_nth_slot(page, i - 1); + rec = page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + do { + rec = page_rec_get_next_low(rec, TRUE); + ut_ad(rec); + } while (nth--); + } else { + do { + rec = page_rec_get_next_low(rec, FALSE); + ut_ad(rec); + } while (nth--); + } + + return(rec); +} + +/***************************************************************//** +Returns the number of records before the given record in chain. +The number includes infimum and supremum records. 
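+This is the inverse function of page_rec_get_nth().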
+@return number of records */ +UNIV_INTERN +ulint +page_rec_get_n_recs_before( +/*=======================*/ + const rec_t* rec) /*!< in: the physical record */ +{ + const page_dir_slot_t* slot; + const rec_t* slot_rec; + const page_t* page; + ulint i; + lint n = 0; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + if (page_is_comp(page)) { + while (rec_get_n_owned_new(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, TRUE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_new(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, FALSE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_old(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } + + n--; + + ut_ad(n >= 0); + ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + + return((ulint) n); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: record descriptor */ +{ + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); + if (page_rec_is_comp(rec)) { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) rec_get_heap_no_new(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } else { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) rec_get_heap_no_old(rec), + (ulong) rec_get_next_offs(rec, FALSE)); + } + + page_rec_check(rec); + rec_validate(rec, offsets); +} + +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n) /*!< in: print n first and n last entries */ +{ + ulint n; + ulint i; + page_dir_slot_t* slot; + + n = page_dir_get_n_slots(page); + + fprintf(stderr, "--------------------------------\n" + "PAGE DIRECTORY\n" + "Page address %p\n" + "Directory stack top at offs: %lu; number of slots: %lu\n", + page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)), + (ulong) n); + for (i = 0; i < n; i++) { + slot = page_dir_get_nth_slot(page, i); + if ((i == pr_n) && (i < n - pr_n)) { + fputs(" ... \n", stderr); + } + if ((i < pr_n) || (i >= n - pr_n)) { + fprintf(stderr, + "Contents of slot: %lu: n_owned: %lu," + " rec offs: %lu\n", + (ulong) i, + (ulong) page_dir_slot_get_n_owned(slot), + (ulong) + page_offset(page_dir_slot_get_rec(slot))); + } + } + fprintf(stderr, "Total of %lu records\n" + "--------------------------------\n", + (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page))); +} + +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. 
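+Only the pr_n first and pr_n last records are printed in full.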
*/ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n) /*!< in: print n first and n last entries */ +{ + page_t* page = block->frame; + page_cur_t cur; + ulint count; + ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + + fprintf(stderr, + "--------------------------------\n" + "PAGE RECORD LIST\n" + "Page address %p\n", page); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cur); + count = 0; + for (;;) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + + if (count == pr_n) { + break; + } + if (page_cur_is_after_last(&cur)) { + break; + } + page_cur_move_to_next(&cur); + count++; + } + + if (n_recs > 2 * pr_n) { + fputs(" ... \n", stderr); + } + + while (!page_cur_is_after_last(&cur)) { + page_cur_move_to_next(&cur); + + if (count + pr_n >= n_recs) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + } + count++; + } + + fprintf(stderr, + "Total of %lu records \n" + "--------------------------------\n", + (ulong) (count + 1)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************//** +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page) +{ + fprintf(stderr, + "--------------------------------\n" + "PAGE HEADER INFO\n" + "Page address %p, n records %lu (%s)\n" + "n dir slots %lu, heap top %lu\n" + "Page n heap %lu, free %lu, garbage %lu\n" + "Page last insert %lu, direction %lu, n direction %lu\n", + page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", + (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) page_dir_get_n_heap(page), + (ulong) page_header_get_field(page, PAGE_FREE), + (ulong) page_header_get_field(page, PAGE_GARBAGE), + (ulong) page_header_get_field(page, PAGE_LAST_INSERT), + (ulong) page_header_get_field(page, PAGE_DIRECTION), + (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); +} + +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn) /*!< in: print rn first and last records + in directory */ +{ + page_t* page = block->frame; + + page_header_print(page); + page_dir_print(page, dn); + page_print_list(block, index, rn); +} +# endif /* UNIV_BTR_PRINT */ +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. 
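+Both fields can only be interpreted relative to the page that contains +the record, which is why rec_validate() cannot check them.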
+@return TRUE if ok */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const ulint* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + ulint n_owned; + ulint heap_no; + const page_t* page; + + page = page_align(rec); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); + + page_rec_check(rec); + rec_validate(rec, offsets); + + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } + + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { + fprintf(stderr, + "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", + (ulong) page_offset(rec), (ulong) n_owned); + return(FALSE); + } + + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { + fprintf(stderr, + "InnoDB: Heap no of rec %lu too big %lu %lu\n", + (ulong) page_offset(rec), (ulong) heap_no, + (ulong) page_dir_get_n_heap(page)); + return(FALSE); + } + + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page) /*!< in: index page */ +{ + ulint n_slots; + ulint infimum_offs; + ulint supremum_offs; + + n_slots = page_dir_get_n_slots(page); + infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0)); + supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page, + n_slots - 1)); + + if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " infimum not pointed to\n"); + buf_page_print(page, 0, 0); + } + + if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " supremum not pointed to\n"); + buf_page_print(page, 0, 0); + } +} +#endif /* !UNIV_HOTBACKUP */ + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(!page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu of page dir slots\n", + (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
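+ A record count capped at UNIV_PAGE_SIZE guards against circular + record lists.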
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above" + " rec heap top %lu\n", + (ulong)(rec - page), + (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) own_count, + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset" + " nonsensical %lu for rec %lu\n", + (ulong) rec_get_next_offs(rec, FALSE), + (ulong) (rec - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) (rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) (rec - page), + (ulong) (rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + const rec_t* rec; + const byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu" + " of page dir slots\n", (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above rec" + " heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) own_count, + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset nonsensical %lu" + " for rec %lu\n", + (ulong) rec_get_next_offs(rec, TRUE), + (ulong) page_offset(rec)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero" + " in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + 
FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/***************************************************************//** +This function checks the consistency of an index page. +@return TRUE if ok */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + const page_t* page, /*!< in: index page */ + dict_index_t* index) /*!< in: data dictionary index containing + the page record type definition */ +{ + const page_dir_slot_t* slot; + mem_heap_t* heap; + byte* buf; + ulint count; + ulint own_count; + ulint rec_own_count; + ulint slot_no; + ulint data_size; + const rec_t* rec; + const rec_t* old_rec = NULL; + ulint offs; + ulint n_slots; + ibool ret = FALSE; + ulint i; + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != dict_table_is_comp(index->table))) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } + } + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page) + && !page_is_empty(page)) { + trx_id_t max_trx_id = page_get_max_trx_id(page); + trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id(); + + if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { + ib_logf(IB_LOG_LEVEL_ERROR, + "PAGE_MAX_TRX_ID out of bounds: " + TRX_ID_FMT ", " TRX_ID_FMT, + max_trx_id, sys_max_trx_id); + goto func_exit2; + } + } + + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); + + /* The following buffer is used to check that the + records in the page record heap do not overlap */ + + buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) + <= page_dir_get_nth_slot(page, n_slots - 1)))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap" + " on space %lu page %lu index %s, %p, %p\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), index->name, + page_header_get_ptr(page, PAGE_HEAP_TOP), + page_dir_get_nth_slot(page, n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that + it is consistent with the directory. 
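+	Unlike page_simple_validate_old() and page_simple_validate_new(),
+	this loop also checks that the records are in ascending key
+	order (cmp_rec_rec) and marks each record's bytes in buf[] to
+	detect any two records overlapping in the heap.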
*/ + count = 0; + data_size = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_comp(page) && page_rec_is_user_rec(rec) + && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) + == page_is_leaf(page))) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + goto func_exit; + } + +#ifndef UNIV_HOTBACKUP + /* Check that the records are in the ascending order */ + if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW) + && !page_rec_is_supremum(rec)) { + if (UNIV_UNLIKELY + (1 != cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { + fprintf(stderr, + "InnoDB: Records in wrong order" + " on space %lu page %lu index %s\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), + index->name); + fputs("\nInnoDB: previous record ", stderr); + rec_print_new(stderr, old_rec, old_offsets); + fputs("\nInnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + + goto func_exit; + } + } +#endif /* !UNIV_HOTBACKUP */ + + if (page_rec_is_user_rec(rec)) { + + data_size += rec_offs_size(offsets); + } + + offs = page_offset(rec_get_start(rec, offsets)); + i = rec_offs_size(offsets); + if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) { + fputs("InnoDB: record offset out of bounds\n", stderr); + goto func_exit; + } + + while (i--) { + if (UNIV_UNLIKELY(buf[offs + i])) { + /* No other record may overlap this */ + + fputs("InnoDB: Record overlaps another\n", + stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + if (page_is_comp(page)) { + rec_own_count = rec_get_n_owned_new(rec); + } else { + rec_own_count = rec_get_n_owned_old(rec); + } + + if (UNIV_UNLIKELY(rec_own_count)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_own_count != own_count)) { + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu\n", + (ulong) rec_own_count, + (ulong) own_count); + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fputs("InnoDB: Dir slot does not" + " point to right rec\n", + stderr); + goto func_exit; + } + + page_dir_slot_check(slot); + + own_count = 0; + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + break; + } + + count++; + own_count++; + old_rec = rec; + rec = page_rec_get_next_const(rec); + + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: + fputs("InnoDB: n owned is zero\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { + fprintf(stderr, + "InnoDB: Summed data size %lu, returned by func %lu\n", + (ulong) data_size, 
(ulong) page_get_data_size(page)); + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + + goto func_exit; + } + + count++; + offs = page_offset(rec_get_start(rec, offsets)); + i = rec_offs_size(offsets); + if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) { + fputs("InnoDB: record offset out of bounds\n", stderr); + goto func_exit; + } + + while (i--) { + + if (UNIV_UNLIKELY(buf[offs + i])) { + fputs("InnoDB: Record overlaps another" + " in free list\n", stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + rec = page_rec_get_next_const(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) count + 1); + goto func_exit; + } + + ret = TRUE; + +func_exit: + mem_heap_free(heap); + + if (UNIV_UNLIKELY(ret == FALSE)) { +func_exit2: + fprintf(stderr, + "InnoDB: Apparent corruption" + " in space %lu page %lu index %s\n", + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page), + index->name); + buf_page_print(page, 0, 0); + } + + return(ret); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ +UNIV_INTERN +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no)/*!< in: heap number */ +{ + const rec_t* rec; + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + for(;;) { + ulint rec_heap_no = rec_get_heap_no_new(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, TRUE); + } + } else { + rec = page + PAGE_OLD_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_old(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, FALSE); + } + } +} +#endif /* !UNIV_HOTBACKUP */ + +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +The cursor is moved to the next record after the deleted one. 
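+A sketch of a typical call, assuming offsets were computed with
+rec_get_offsets() for the record under the cursor:
+
+	offsets = rec_get_offsets(page_cur_get_rec(pcur), index,
+				  offsets, ULINT_UNDEFINED, &heap);
+	if (!page_delete_rec(index, pcur, page_zip, offsets)) {
+		... record was not deleted: removing it would leave
+		the page too empty, so the caller must fall back to
+		a pessimistic delete ...
+	}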
+@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets)/*!< in: offsets for record */ +{ + bool no_compress_needed; + buf_block_t* block = pcur->block; + page_t* page = buf_block_get_frame(block); + + ut_ad(page_is_leaf(page)); + + if (!rec_offs_any_extern(offsets) + && ((page_get_data_size(page) - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT) + || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL) + || (page_get_n_recs(page) < 2))) { + + ulint root_page_no = dict_index_get_page(index); + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + no_compress_needed = page_get_page_no(page) == root_page_no; + } else { + no_compress_needed = true; + } + + if (no_compress_needed) { +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(pcur, index, offsets, 0); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(no_compress_needed); +} + +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ + +const rec_t* +page_find_rec_max_not_deleted( + const page_t* page) +{ + const rec_t* rec = page_get_infimum_rec(page); + const rec_t* prev_rec = NULL; // remove warning + + /* Because the page infimum is never delete-marked, + prev_rec will always be assigned to it first. */ + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); + if (page_is_comp(page)) { + do { + if (!rec_get_deleted_flag(rec, true)) { + prev_rec = rec; + } + rec = page_rec_get_next_low(rec, true); + } while (rec != page + PAGE_NEW_SUPREMUM); + } else { + do { + if (!rec_get_deleted_flag(rec, false)) { + prev_rec = rec; + } + rec = page_rec_get_next_low(rec, false); + } while (rec != page + PAGE_OLD_SUPREMUM); + } + return(prev_rec); +} diff --git a/storage/xtradb/page/page0zip.cc b/storage/xtradb/page/page0zip.cc new file mode 100644 index 00000000000..67dca183c6d --- /dev/null +++ b/storage/xtradb/page/page0zip.cc @@ -0,0 +1,4952 @@ +/***************************************************************************** + +Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file page/page0zip.cc +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +// First include (the generated) my_config.h, to get correct platform defines. +#include "my_config.h" + +#include <map> +using namespace std; + +#define THIS_MODULE +#include "page0zip.h" +#ifdef UNIV_NONINL +# include "page0zip.ic" +#endif +#undef THIS_MODULE +#include "page0page.h" +#include "mtr0log.h" +#include "ut0sort.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "page0types.h" +#include "log0recv.h" +#include "zlib.h" +#ifndef UNIV_HOTBACKUP +# include "buf0buf.h" +# include "buf0lru.h" +# include "btr0sea.h" +# include "dict0boot.h" +# include "lock0lock.h" +# include "srv0mon.h" +# include "srv0srv.h" +# include "ut0crc32.h" +#else /* !UNIV_HOTBACKUP */ +# include "buf0checksum.h" +# define lock_move_reorganize_page(block, temp_block) ((void) 0) +# define buf_LRU_stat_inc_unzip() ((void) 0) +#endif /* !UNIV_HOTBACKUP */ + +#ifndef UNIV_HOTBACKUP +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index; +/** Mutex protecting page_zip_stat_per_index */ +UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ +#endif /* !UNIV_HOTBACKUP */ + +/* Compression level to be used by zlib. Settable by user. */ +UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +UNIV_INTERN my_bool page_zip_log_pages = true; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +/** Extra bytes of an infimum record */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? */ /* next=(first user rec, or supremum) */ +}; +/** Data bytes of an infimum record */ +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +/** Extra bytes and data bytes of a supremum record */ +static const byte supremum_extra_data[] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +Compare at most sizeof(field_ref_zero) bytes. +@param b in: memory block +@param s in: size of the memory block, in bytes */ +#define ASSERT_ZERO(b, s) \ + ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero))) +/** Assert that a BLOB pointer is filled with zero bytes. 
+@param b in: BLOB pointer */ +#define ASSERT_ZERO_BLOB(b) \ + ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero)) + +/* Enable some extra debugging output. This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +__attribute__((format (printf, 1, 2))) +/**********************************************************************//** +Report a failure to decompress or compress. +@return number of characters printed */ +static +int +page_zip_fail_func( +/*===============*/ + const char* fmt, /*!< in: printf(3) format string */ + ...) /*!< in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +/** Wrapper for page_zip_fail_func() +@param fmt_args in: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +/** Dummy wrapper for page_zip_fail_func() +@param fmt_args ignored: printf(3) format string and arguments */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Determine the guaranteed free space on an empty page. +@return minimum payload size on the page */ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + ulint n_fields, /*!< in: number of columns in the index */ + ulint zip_size) /*!< in: compressed page size in bytes */ +{ + lint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(static_cast<uLong>(2 * (n_fields + 1))); + return(size > 0 ? (ulint) size : 0); +} +#endif /* !UNIV_HOTBACKUP */ + +/*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). +@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW); +} + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return length of dense page directory, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); +} + +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). 
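+The dense directory grows downwards from the end of the compressed
+page, so the offset is simply
+
+	page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE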
+@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); +} + +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + +/*************************************************************//** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). +@return length of dense page directory comprising existing records, in bytes */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page_zip->data); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/*************************************************************//** +Find the slot of the given record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + byte* slot, /*!< in: start of records */ + byte* end, /*!< in: end of records */ + ulint offset) /*!< in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/*************************************************************//** +Find the slot of the given non-free record in the dense page directory. +@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/*************************************************************//** +Find the slot of the given free record in the dense page directory. 
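+Free-list slots occupy the gap between the user-record slots and the
+end of the full directory: the search window is
+[end - page_zip_dir_size(), end - page_zip_dir_user_size()),
+where end is the end of the compressed page.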
+@return dense directory slot, or NULL if record not found */ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint offset) /*!< in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip), + end - page_zip_dir_user_size(page_zip), + offset)); +} + +/*************************************************************//** +Read a given slot in the dense page directory. +@return record offset on the uncompressed page, possibly ORed with +PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */ +UNIV_INLINE +ulint +page_zip_dir_get( +/*=============*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint slot) /*!< in: slot + (0=first user record) */ +{ + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Write a log record of compressing an index page. */ +static +void +page_zip_compress_write_log( +/*========================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + dict_index_t* index, /*!< in: index of the B-tree node */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + byte* log_ptr; + ulint trailer_size; + + ut_ad(!dict_index_is_ibuf(index)); + + log_ptr = mlog_open(mtr, 11 + 2 + 2); + + if (!log_ptr) { + + return; + } + + /* Read the number of user records. */ + trailer_size = page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW; + /* Multiply by uncompressed of size stored per record */ + if (!page_is_leaf(page)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + } else if (dict_index_is_clust(index)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE; + } + /* Add the space occupied by BLOB pointers. */ + trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_a(page_zip->m_end > PAGE_DATA); +#if FIL_PAGE_DATA > PAGE_DATA +# error "FIL_PAGE_DATA > PAGE_DATA" +#endif + ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); + + log_ptr = mlog_write_initial_log_record_fast((page_t*) page, + MLOG_ZIP_PAGE_COMPRESS, + log_ptr, mtr); + mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE); + log_ptr += 2; + mach_write_to_2(log_ptr, trailer_size); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + /* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4); + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4); + /* Write most of the page header, the compressed stream and + the modification log. */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE, + page_zip->m_end - FIL_PAGE_TYPE); + /* Write the uncompressed trailer of the compressed page. */ + mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip) + - trailer_size, trailer_size); +} +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************//** +Determine how many externally stored columns are contained +in existing records with smaller heap_no than rec. 
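+The BLOB pointers of the page are stored in heap_no order in the
+uncompressed trailer, so this count locates rec's own BLOB pointers
+within that storage.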
*/ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + const rec_t* rec, /*!< in: compact physical record + on a B-tree leaf page */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/**********************************************************************//** +Encode the length of a fixed-length column. +@return buf + length of encoded val */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + byte* buf, /*!< in: pointer to buffer where to write */ + ulint val) /*!< in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/**********************************************************************//** +Write the index information for the compressed page. 
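+Each field is encoded in one or two bytes: variable-length fields use
+the codes described in page_zip_fixed_field_encode() above, runs of
+adjacent NOT NULL fixed-length fields are combined and written as
+(total_length << 1) | 1, and nullable fixed-length fields are written
+as (fixed_len << 1). The buffer ends with the position of the trx_id
+column if trx_id_pos != ULINT_UNDEFINED, otherwise with the number of
+nullable fields.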
+@return used size of buf */ +static +ulint +page_zip_fields_encode( +/*===================*/ + ulint n, /*!< in: number of fields to compress */ + dict_index_t* index, /*!< in: index comprising at least n fields */ + ulint trx_id_pos,/*!< in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + const dict_col_t* column + = dict_field_get_col(field); + + if (UNIV_UNLIKELY(column->len > 255) + || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = (byte) val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + + if (fixed_sum && UNIV_UNLIKELY + (fixed_sum + field->fixed_len + > DICT_MAX_FIXED_COL_LEN)) { + /* Write out the length of the + preceding non-nullable fields, + to avoid exceeding the maximum + length of a fixed-length column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + col++; + } + + trx_id_col = col; + fixed_sum = field->fixed_len; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode( + buf, field->fixed_len << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + i = trx_id_col; + } else { + /* Write out the number of nullable fields */ + i = index->n_nullable; + } + + if (i < 128) { + *buf++ = (byte) i; + } else { + *buf++ = (byte) (0x80 | i >> 8); + *buf++ = (byte) i; + } + + ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2); + return((ulint) (buf - buf_start)); +} + +/**********************************************************************//** +Populate the dense page directory from the sparse directory. 
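+Each dense slot is 2 bytes (PAGE_ZIP_DIR_SLOT_SIZE) holding the
+record offset on the uncompressed page, ORed with
+PAGE_ZIP_DIR_SLOT_OWNED if the record owns a sparse directory slot
+(n_owned > 0) and with PAGE_ZIP_DIR_SLOT_DEL if the record is on the
+free list.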
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /*!< in: compact page */ + byte* buf, /*!< in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /*!< in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY + (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); +#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) +# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" +#endif +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1 +# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1" +#endif + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (info_bits & REC_INFO_DELETED_FLAG) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. */ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(rec_get_status(rec) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(rec_get_status(rec) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +extern "C" { + +/**********************************************************************//** +Allocate memory for zlib. 
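+zlib calls this through the zalloc hook installed by
+page_zip_set_alloc() below; memory comes from the mem_heap passed as
+the opaque pointer, and page_zip_free() is a no-op because the whole
+heap is freed at once by the caller.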
*/ +static +void* +page_zip_zalloc( +/*============*/ + void* opaque, /*!< in/out: memory heap */ + uInt items, /*!< in: number of items to allocate */ + uInt size) /*!< in: size of an item in bytes */ +{ + return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size)); +} + +/**********************************************************************//** +Deallocate memory for zlib. */ +static +void +page_zip_free( +/*==========*/ + void* opaque __attribute__((unused)), /*!< in: memory heap */ + void* address __attribute__((unused)))/*!< in: object to free */ +{ +} + +} /* extern "C" */ + +/**********************************************************************//** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /*!< in/out: zlib stream */ + mem_heap_t* heap) /*!< in: memory heap to use */ +{ + z_stream* strm = static_cast<z_stream*>(stream); + + strm->zalloc = page_zip_zalloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/** Symbol for enabling compression and decompression diagnostics */ +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/** Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +UNIV_INTERN ibool page_zip_compress_dbg; +/** Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +UNIV_INTERN unsigned page_zip_compress_log; + +/**********************************************************************//** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +static +int +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/*!< in: log file, or NULL */ + z_streamp strm, /*!< in/out: compressed stream for deflate() */ + int flush) /*!< in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + fwrite(strm->next_in, 1, strm->avail_in, logfile); + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +/** Debug wrapper for the zlib compression routine deflate(). +Log the operation if page_zip_compress_dbg is set. +@param strm in/out: compressed stream +@param flush in: flushing method +@return deflate() status: Z_OK, Z_BUF_ERROR, ... */ +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +/** Declaration of the logfile parameter */ +# define FILE_LOGFILE FILE* logfile, +/** The logfile parameter */ +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +/** Empty declaration of the logfile parameter */ +# define FILE_LOGFILE +/** Missing logfile parameter */ +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/**********************************************************************//** +Compress the records of a node pointer page. 
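+The node pointer (the last REC_NODE_PTR_SIZE bytes of each record) is
+not deflated; it is copied in heap_no order into the storage area
+just below the dense page directory.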
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_node_ptrs( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Only leaf nodes may contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + ut_ad(!c_stream->avail_in); + + /* Compress the data bytes, except node_ptr. */ + c_stream->next_in = (byte*) rec; + c_stream->avail_in = static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, REC_NODE_PTR_SIZE); + c_stream->next_in += REC_NODE_PTR_SIZE; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress the records of a leaf node of a secondary index. +@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_sec( +/*==================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense) /*!< in: size of recs[] */ +{ + int err = Z_OK; + + ut_ad(n_dense > 0); + + do { + const rec_t* rec = *recs++; + + /* Compress everything up to this record. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (UNIV_LIKELY(c_stream->avail_in)) { + UNIV_MEM_ASSERT_RW(c_stream->next_in, + c_stream->avail_in); + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + c_stream->next_in = (byte*) rec; + } while (--n_dense); + + return(err); +} + +/**********************************************************************//** +Compress a record of a leaf node of a clustered index that contains +externally stored columns. 
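+DB_TRX_ID and DB_ROLL_PTR are copied uncompressed into the storage
+area, and the BTR_EXTERN_FIELD_REF of each externally stored column
+of a non-deleted record is copied uncompressed to *externs, which
+grows downwards; the remaining bytes are deflated.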
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust_ext( +/*========================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t* rec, /*!< in: record */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + ulint trx_id_col, /*!< in: position of of DB_TRX_ID */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + byte** externs, /*!< in/out: pointer to the next + available BLOB pointer */ + ulint* n_blobs) /*!< in/out: number of + externally stored columns */ +{ + int err; + ulint i; + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, i)); + /* Store trx_id and roll_ptr + in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + i++; + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + if (UNIV_LIKELY(c_stream->avail_in)) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the page + modification log. */ + + if (UNIV_UNLIKELY + (c_stream->avail_out + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + return(Z_BUF_ERROR); + } + + ut_ad(*externs == c_stream->next_out + + c_stream->avail_out + + 1/* end of modif. log */); + + c_stream->next_in + += BTR_EXTERN_FIELD_REF_SIZE; + + /* Skip deleted records. */ + if (UNIV_LIKELY_NULL + (page_zip_dir_find_low( + storage, deleted, + page_offset(rec)))) { + continue; + } + + (*n_blobs)++; + c_stream->avail_out + -= BTR_EXTERN_FIELD_REF_SIZE; + *externs -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Copy the BLOB pointer */ + memcpy(*externs, c_stream->next_in + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + return(Z_OK); +} + +/**********************************************************************//** +Compress the records of a leaf node of a clustered index. 
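+Like page_zip_compress_sec(), except that DB_TRX_ID and DB_ROLL_PTR
+are stored uncompressed in the storage area, and records containing
+externally stored columns are handled by
+page_zip_compress_clust_ext().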
+@return Z_OK, or a zlib error code */ +static +int +page_zip_compress_clust( +/*====================*/ + FILE_LOGFILE + z_stream* c_stream, /*!< in/out: compressed page stream */ + const rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* n_blobs, /*!< in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /*!< index of the trx_id column */ + byte* deleted, /*!< in: dense directory entry pointing + to the head of the free list */ + byte* storage, /*!< in: end of dense page directory */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. */ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + if (rec_offs_any_extern(offsets)) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = static_cast<uInt>( + src - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. 
*/ + c_stream->avail_in = static_cast<uInt>( + rec + rec_offs_data_size(offsets) - c_stream->next_in); + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err); +} + +/**********************************************************************//** +Compress a page. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /*!< in: uncompressed page */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + z_stream c_stream; + int err; + ulint n_fields;/* number of index fields needed */ + byte* fields; /*!< index field information */ + byte* buf; /*!< compressed payload of the page */ + byte* buf_end;/* end of buf */ + ulint n_dense; + ulint slot_size;/* amount of uncompressed bytes per record */ + const rec_t** recs; /*!< dense page directory, sorted by address */ + mem_heap_t* heap; + ulint trx_id_col; + ulint n_blobs = 0; + byte* storage;/* storage of uncompressed columns */ +#ifndef UNIV_HOTBACKUP + ullint usec = ut_time_us(NULL); +#endif /* !UNIV_HOTBACKUP */ +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. */ + my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled; + + if (!page) { + return(FALSE); + } + + ut_a(page_is_comp(page)); + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(!dict_index_is_ibuf(index)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + /* Check the data that will be omitted. */ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (page_is_empty(page)) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + if (page_is_leaf(page)) { + n_fields = dict_index_get_n_fields(index); + } else { + n_fields = dict_index_get_n_unique_in_tree(index); + } + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, "compress %p %p %lu %lu %lu\n", + (void*) page_zip, (void*) page, + (ibool) page_is_leaf(page), + n_fields, n_dense); + } + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + ut_snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. 
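+			The page image is followed in the file by a
+			four-byte compressed size; see below.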
*/ + fwrite(page, 1, UNIV_PAGE_SIZE, logfile); + /* Record the compressed size as zero. + This will be overwritten at successful exit. */ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed++; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + MONITOR_INC(MONITOR_PAGE_COMPRESS); + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + UNIV_PAGE_SIZE * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = static_cast<const rec_t**>( + mem_heap_zalloc(heap, n_dense * sizeof *recs)); + + fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2)); + + buf = static_cast<byte*>( + mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA)); + + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, static_cast<int>(level), + Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + /* Subtract the space reserved for uncompressed data. */ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1); + + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= static_cast<uInt>(n_dense * slot_size); + c_stream.avail_in = static_cast<uInt>( + page_zip_fields_encode(n_fields, index, trx_id_col, fields)); + c_stream.next_in = fields; + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. 
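+
+	Secondary index leaves reserve the least uncompressed space per
+	record: only the dense directory slot. With the usual constant
+	sizes (PAGE_ZIP_DIR_SLOT_SIZE = 2, DATA_TRX_ID_LEN = 6,
+	DATA_ROLL_PTR_LEN = 7, REC_NODE_PTR_SIZE = 4; a worked example,
+	assuming those values), slot_size above comes out as:
+
+		node pointer page: 2 + 4     =  6 bytes per record
+		secondary leaf:    2         =  2 bytes per record
+		clustered leaf:    2 + 6 + 7 = 15 bytes per record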
*/ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. */ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in = static_cast<uInt>( + page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page)); + ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR); + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].compressed_usec + += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_usec + += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + return(FALSE); + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. */ + UNIV_MEM_VALID(buf, c_stream.total_out); + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. */ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = PAGE_DATA + c_stream.total_out; + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = n_blobs; + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2); + memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + page_zip_compress_write_log(page_zip, page, index, mtr); +#endif /* !UNIV_HOTBACKUP */ + } + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. 
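+
+	Together with the fwrite() near the top of this function, each
+	debug log file thus has a fixed layout: the uncompressed page at
+	offset 0, followed at offset UNIV_PAGE_SIZE by the compressed
+	size as 4 bytes in big-endian order (still zero if the attempt
+	failed). A tool could read it back along these lines (a sketch):
+
+		byte	sz[4];
+		fseek(f, UNIV_PAGE_SIZE, SEEK_SET);
+		fread(sz, 1, sizeof sz, f);
+		comp_size = mach_read_from_4(sz);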
*/ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET); + fwrite(sz, 1, sizeof sz, logfile); + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ +#ifndef UNIV_HOTBACKUP + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_ok++; + page_zip_stat_per_index[index->id].compressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); + } +#endif /* !UNIV_HOTBACKUP */ + + return(TRUE); +} + +/**********************************************************************//** +Compare two page directory entries. +@return positive if rec1 > rec2 */ +UNIV_INLINE +ibool +page_zip_dir_cmp( +/*=============*/ + const rec_t* rec1, /*!< in: rec1 */ + const rec_t* rec2) /*!< in: rec2 */ +{ + return(rec1 > rec2); +} + +/**********************************************************************//** +Sort the dense page directory by address (heap_no). */ +static +void +page_zip_dir_sort( +/*==============*/ + rec_t** arr, /*!< in/out: dense page directory */ + rec_t** aux_arr,/*!< in/out: work area */ + ulint low, /*!< in: lower bound of the sorting area, inclusive */ + ulint high) /*!< in: upper bound of the sorting area, exclusive */ +{ + UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high, + page_zip_dir_cmp); +} + +/**********************************************************************//** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /*!< in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + os_fast_mutex_free(&index->zip_pad.mutex); + mem_heap_free(index->heap); + + dict_mem_table_free(table); + } +} + +/**********************************************************************//** +Read the index information for the compressed page. +@return own: dummy index describing the page, or NULL on error */ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + const byte* buf, /*!< in: index information */ + const byte* end, /*!< in: end of buf */ + ulint* trx_id_col)/*!< in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, + DICT_TF_COMPACT, 0, true); + index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + index->n_uniq = n; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. 
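+
+	The loop below decodes a one- or two-byte code per column.
+	Inverting its rules, an encoder for one column would look
+	roughly as follows (a sketch derived from this decoder, not
+	the actual page_zip_fields_encode(); not_null is 1 or 0):
+
+		if (fixed_len > 62) {
+			*b++ = (byte) (0x80 | fixed_len >> 7);
+			*b++ = (byte) (fixed_len << 1 | not_null);
+		} else if (fixed_len) {
+			*b++ = (byte) (fixed_len << 1 | not_null);
+		} else {
+			*b++ = (byte) (max_len > 255
+				       ? 126 | not_null : not_null);
+		}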
*/ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. */ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. */ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->n_nullable = val; + } + } + + ut_ad(b == end); + + return(index); +} + +/**********************************************************************//** +Populate the sparse page directory from the dense directory. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_dir_decode( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: dense page directory on + compressed page */ + page_t* page, /*!< in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /*!< out: dense page directory sorted by + ascending address (and heap_no) */ + rec_t** recs_aux,/*!< in/out: scratch area */ + ulint n_dense)/*!< in: number of user records, and + size of recs[] and recs_aux[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. 
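+
+	Each dense directory entry is a 16-bit value holding the record
+	offset in the PAGE_ZIP_DIR_SLOT_MASK bits plus two flag bits.
+	Restated as a sketch, the loop below decodes an entry as:
+
+		offs  = page_zip_dir_get(page_zip, i);
+		owned = offs & PAGE_ZIP_DIR_SLOT_OWNED;
+		del   = offs & PAGE_ZIP_DIR_SLOT_DEL;
+		rec   = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+
+	Only "owned" entries get a slot in the sparse directory; the
+	"deleted" flag is consumed later, in page_zip_set_extra_bytes().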
*/ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + if (UNIV_LIKELY(n_dense > 1)) { + page_zip_dir_sort(recs, recs_aux, 0, n_dense); + } + return(TRUE); +} + +/**********************************************************************//** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + page_t* page, /*!< in/out: uncompressed page */ + ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_DEL) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. 
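+
+	The dense directory entries past the page's user records are
+	its deleted records. The loop below zeroes their info_bits and
+	n_owned bytes and chains them through their next-record
+	offsets, terminating the chain with offset 0, so that the
+	resulting list can be walked as (sketch):
+
+		while (offs) {
+			rec = page + offs;
+			offs = rec_get_next_offs(rec, TRUE);
+		}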
*/
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, dst - next_out);
+ data += dst - next_out;
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += dst - next_out
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = rec_get_end(rec, offsets) - next_out;
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
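+Each log entry begins with a one- or two-byte header: 0x00 terminates
+the log; otherwise (val >> 1) - 1 indexes the dense directory and the
+low bit, when set, means "this record was deleted: clear its data
+bytes". Parsing just that header (a sketch of the loop below):
+
+	val = *data++;
+	if (val & 0x80) {
+		val = (val & 0x7f) << 8 | *data++;
+	}
+	if (!val) {
+		return(data - 1);
+	}
+	rec = recs[(val >> 1) - 1];
+	deleted = val & 1;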
+@return pointer to end of modification log, or NULL on failure */ +static +const byte* +page_zip_apply_log( +/*===============*/ + const byte* data, /*!< in: modification log */ + ulint size, /*!< in: maximum length of the log, in bytes */ + rec_t** recs, /*!< in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/*!< in: size of recs[] */ + ulint trx_id_col,/*!< in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /*!< in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /*!< in: index of the page */ + ulint* offsets)/*!< in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. */ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. */ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + ulint* offs; + offs = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + +#if REC_STATUS_NODE_PTR != TRUE +# error "REC_STATUS_NODE_PTR != TRUE" +#endif + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + rec_offs_make_valid(rec, index, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. 
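+
+	(BLOB pointers can only occur in the leaf records of a
+	clustered index, so a node-pointer status here means that
+	the log is corrupt.)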
*/ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log: " + "%lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log: " + "trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = rec_get_end(rec, offsets) - b; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/**********************************************************************//** +Set the heap_no in a record, and skip the fixed-size record header +that is not included in the d_stream. +@return TRUE on success, FALSE if d_stream does not end at rec */ +static +ibool +page_zip_decompress_heap_no( +/*========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + ulint& heap_status) /*!< in/out: heap_no and status bits */ +{ + if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) { + /* n_dense has grown since the page was last compressed. */ + return(FALSE); + } + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a node pointer page. 
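+For each record, the stream is inflated in two steps, mirroring how
+page_zip_compress_node_ptrs() deflated it: first up to the record's
+fixed-size header, then the data bytes minus the node pointer, which
+is kept uncompressed in the trailer and restored afterwards. In
+outline (a sketch of the loop body):
+
+	avail_out = rec - REC_N_NEW_EXTRA_BYTES - next_out;
+	inflate(d_stream, Z_SYNC_FLUSH);
+	avail_out = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+	inflate(d_stream, Z_SYNC_FLUSH);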
+@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* offsets, /*!< in/out: temporary offsets */ + mem_heap_t* heap) /*!< in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uInt>( + n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE)); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = static_cast<uInt>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. */ + d_stream->avail_out =static_cast<uInt>( + rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. 
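+
+	The log is applied below from next_in with length avail_in + 1:
+	the one extra byte is the 0x00 end marker of the log, which
+	page_zip_decompress() excluded when it initialized avail_in.
+	page_zip_apply_log() stops at that marker and returns its
+	position, from which m_end and m_nonempty are derived.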
*/ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index)) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index)), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip_dir_start_low(page_zip, n_dense); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress the records of a leaf node of a secondary index. +@return TRUE on success, FALSE on failure */ +static +ibool +page_zip_decompress_sec( +/*====================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t** recs, /*!< in: dense page directory + sorted by address */ + ulint n_dense, /*!< in: size of recs[] */ + dict_index_t* index, /*!< in: the index of the page */ + ulint* offsets) /*!< in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= static_cast<uint>( + n_dense * PAGE_ZIP_DIR_SLOT_SIZE); + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = static_cast<uint>( + rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out); + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + page_zip_decompress_heap_no( + d_stream, rec, heap_status); + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. 
*/
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ ulint trx_id_col) /*!< in: position of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log().
*/
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< in: index of the trx_id column */
+ ulint* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(n_dense)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately.
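+
+	Only the final BTR_EXTERN_FIELD_REF_SIZE bytes of such a column
+	live outside the compressed stream; any locally stored prefix
+	of the column is inflated like ordinary data.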
*/ + + if (rec_offs_any_extern(offsets)) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = static_cast<uInt>( + dst - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. */ + d_stream->avail_out = static_cast<uInt>( + rec_get_end(rec, offsets) - d_stream->next_out); + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = static_cast<uInt>( + page_header_get_field(page_zip->data, PAGE_HEAP_TOP) + - page_offset(d_stream->next_out)); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. 
*/ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip_dir_start_low(page_zip, n_dense); + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + ibool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. */ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust: " + "%p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record: + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/**********************************************************************//** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. 
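+A caller should therefore treat a FALSE return as a corrupt page
+rather than as a programming error, e.g. (a sketch, recovery action
+elided):
+
+	if (!page_zip_decompress(page_zip, page, TRUE)) {
+		... treat the block as corrupt and fail the page read ...
+	}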
+@return TRUE on success, FALSE on failure */ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + page_zip_des_t* page_zip,/*!< in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page, /*!< out: uncompressed page, may be trashed */ + ibool all) /*!< in: TRUE=decompress the whole page; + FALSE=verify but do not copy some + page header fields that should not change + after page creation */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /*!< dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + ulint* offsets; +#ifndef UNIV_HOTBACKUP + ullint usec = ut_time_us(NULL); +#endif /* !UNIV_HOTBACKUP */ + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); + + recs = static_cast<rec_t**>( + mem_heap_alloc(heap, n_dense * (2 * sizeof *recs))); + + if (all) { + /* Copy the page header. */ + memcpy(page, page_zip->data, PAGE_DATA); + } else { + /* Check that the bytes that we skip are identical. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(FIL_PAGE_TYPE + page, + FIL_PAGE_TYPE + page_zip->data, + PAGE_HEADER - FIL_PAGE_TYPE)); + ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page, + PAGE_HEADER + PAGE_LEVEL + page_zip->data, + PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL))); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + + /* Copy the mutable parts of the page header. */ + memcpy(page, page_zip->data, FIL_PAGE_TYPE); + memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data, + PAGE_LEVEL - PAGE_N_DIR_SLOTS); + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + /* Check that the page headers match after copying. */ + ut_a(!memcmp(page, page_zip->data, PAGE_DATA)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + } + +#ifdef UNIV_ZIP_DEBUG + /* Clear the uncompressed page, except the header. */ + memset(PAGE_DATA + page, 0x55, UNIV_PAGE_SIZE - PAGE_DATA); +#endif /* UNIV_ZIP_DEBUG */ + UNIV_MEM_INVALID(PAGE_DATA + page, UNIV_PAGE_SIZE - PAGE_DATA); + + /* Copy the page directory. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + recs + n_dense, n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. */ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (page_is_empty(page)) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. 
*/ + d_stream.avail_in = static_cast<uInt>( + page_zip_get_size(page_zip) - (PAGE_DATA + 1)); + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + + /* Decode the zlib header and the index information. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + offsets = static_cast<ulint*>( + mem_heap_alloc(heap, n * sizeof(ulint))); + + *offsets = n; + } + + /* Decompress the records in heap_no order. */ + if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL + ? REC_INFO_MIN_REC_FLAG : 0; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + page_zip_fields_free(index); + mem_heap_free(heap); +#ifndef UNIV_HOTBACKUP + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } +#endif /* !UNIV_HOTBACKUP */ + + /* Update the stat counter for LRU policy. */ + buf_LRU_stat_inc_unzip(); + + MONITOR_INC(MONITOR_PAGE_DECOMPRESS); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/**********************************************************************//** +Dump a block of memory on the standard error stream. 
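+The output is the name followed by lines of up to 32 bytes each, as
+an offset and the bytes in hex. For example, dumping 8 bytes
+00 11 22 33 44 55 66 77 under the name "buf" prints (illustration):
+
+	buf:
+	0000 0011223344556677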
*/ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /*!< in: name of the data structure */ + const void* buf, /*!< in: data */ + ulint size) /*!< in: length of the data, in bytes */ +{ + const byte* s = static_cast<const byte*>(buf); + ulint addr; + const ulint width = 32; /* bytes per line */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +/** Dump a block of memory on the standard error stream. +@param buf in: data +@param size in: length of the data, in bytes */ +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/** Flag: make page_zip_validate() compare page headers only */ +UNIV_INTERN ibool page_zip_validate_header_only = FALSE; + +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index, /*!< in: index of the page, if known */ + ibool sloppy) /*!< in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + byte* temp_page_buf; + page_t* temp_page; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + UNIV_PAGE_SIZE aligned. 
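+
+	Allocating twice the page size below guarantees that the buffer
+	contains a UNIV_PAGE_SIZE-aligned address followed by at least
+	UNIV_PAGE_SIZE usable bytes; ut_align() rounds the buffer start
+	up to that address.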
*/ + temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + temp_page_zip = *page_zip; + valid = page_zip_decompress(&temp_page_zip, temp_page, TRUE); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. Let us tolerate that difference when we + are performing a sloppy validation. */ + + ulint* offsets; + mem_heap_t* heap; + const rec_t* rec; + const rec_t* trec; + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5/*REC_NEW_INFO_BITS*/; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. */ + page_zip_fail(("page_zip_validate: " + "min_rec_flag " + "(%s" + "%lu,%lu,0x%02lx)\n", + sloppy ? "ignored, " : "", + page_get_space_id(page), + page_get_page_no(page), + (ulong) page[offset])); + valid = sloppy; + goto func_exit; + } + } + + /* Compare the pointers in the PAGE_FREE list. */ + rec = page_header_get_ptr(page, PAGE_FREE); + trec = page_header_get_ptr(temp_page, PAGE_FREE); + + while (rec || trec) { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate: " + "PAGE_FREE list: %u!=%u\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + goto func_exit; + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } + + /* Compare the records. */ + heap = NULL; + offsets = NULL; + rec = page_rec_get_next_low( + page + PAGE_NEW_INFIMUM, TRUE); + trec = page_rec_get_next_low( + temp_page + PAGE_NEW_INFIMUM, TRUE); + + do { + if (page_offset(rec) != page_offset(trec)) { + page_zip_fail(("page_zip_validate: " + "record list: 0x%02x!=0x%02x\n", + (unsigned) page_offset(rec), + (unsigned) page_offset(trec))); + valid = FALSE; + break; + } + + if (index) { + /* Compare the data. 
*/ + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (memcmp(rec - rec_offs_extra_size(offsets), + trec - rec_offs_extra_size(offsets), + rec_offs_size(offsets))) { + page_zip_fail( + ("page_zip_validate: " + "record content: 0x%02x", + (unsigned) page_offset(rec))); + valid = FALSE; + break; + } + } + + rec = page_rec_get_next_low(rec, TRUE); + trec = page_rec_get_next_low(trec, TRUE); + } while (rec || trec); + + if (heap) { + mem_heap_free(heap); + } + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + page_zip_hexdump(temp_page, UNIV_PAGE_SIZE); + } + ut_free(temp_page_buf); + return(valid); +} + +/**********************************************************************//** +Check that the compressed and decompressed pages match. +@return TRUE if valid, FALSE if not */ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const page_t* page, /*!< in: uncompressed page */ + const dict_index_t* index) /*!< in: index of the page, if known */ +{ + return(page_zip_validate_low(page_zip, page, index, + recv_recovery_is_on())); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Assert that the compressed and decompressed page headers match. +@return TRUE */ +static +ibool +page_zip_header_cmp( +/*================*/ + const page_zip_des_t* page_zip,/*!< in: compressed page */ + const byte* page) /*!< in: uncompressed page */ +{ + ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. +@return end of modification log */ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + const page_t* page, /*!< in: page containing rec */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ + ulint create, /*!< in: nonzero=insert, zero=update */ + ulint trx_id_col, /*!< in: position of DB_TRX_ID */ + ulint heap_no, /*!< in: heap number of rec */ + byte* storage, /*!< in: end of dense page directory */ + byte* data) /*!< in: end of modification log */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + + ut_ad(rec_offs_validate(rec, index, offsets)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. 
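+ The modification log grows upwards from "data" while the BLOB
+ pointer array grows downwards from "externs"; the assertion
+ below is an overlap check, stating roughly that
+
+     data + max_log_entry_size
+         < externs - n_blobs * BTR_EXTERN_FIELD_REF_SIZE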
*/ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * BTR_EXTERN_FIELD_REF_SIZE + < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs); + + { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + if (create) { + page_zip->n_blobs += static_cast<unsigned>(n_ext); + ASSERT_ZERO_BLOB(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE); + memmove(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE, + ext_end, + externs - ext_end); + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + i++; /* skip also roll_ptr */ + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + BTR_EXTERN_FIELD_REF_SIZE; + + /* Store the BLOB pointer. */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - (start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/**********************************************************************//** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. 
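+
+A minimal usage sketch (hypothetical caller; "rec" has already been
+written to the uncompressed page and "offsets" came from
+rec_get_offsets()), passing create=1 for an insert:
+
+    if (page_zip) {
+        page_zip_write_rec(page_zip, rec, index, offsets, 1);
+    }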
*/ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record being written */ + dict_index_t* index, /*!< in: the index the record belongs to */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint create) /*!< in: nonzero=insert, zero=update */ +{ + const page_t* page; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + page = page_align(rec); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + /* Copy the delete mark. */ + if (rec_get_deleted_flag(rec, TRUE)) { + *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. */ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip_dir_start(page_zip); + + if (page_is_leaf(page)) { + ulint len; + + if (dict_index_is_clust(index)) { + ulint trx_id_col; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + page_zip, page, + rec, index, offsets, create, + trx_id_col, heap_no, storage, data); + } else { + /* Locate trx_id and roll_ptr. */ + const byte* src + = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, src - rec); + data += src - rec; + + /* Store trx_id and roll_ptr. 
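+ The pair is mirrored into the uncompressed "storage" area that
+ precedes the dense page directory.  The slot of a record with
+ heap number heap_no is DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ = 6 + 7 = 13 bytes wide and spans
+
+     [storage - 13 * (heap_no - 1), storage - 13 * (heap_no - 2));
+
+ e.g. the first user record (heap_no == 2) occupies
+ [storage - 13, storage).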
*/ + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - (src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + ulint len; + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + memcpy(storage - REC_NODE_PTR_SIZE + * (heap_no - 1), + rec + len, + REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/***********************************************************//** +Parses a log record of writing a BLOB pointer of a record. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY + (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(!page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + memcpy(page_zip->data + z_offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE)); +} + +/**********************************************************************//** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. 
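+
+When mtr is nonnull, an MLOG_ZIP_WRITE_BLOB_PTR record is written
+whose body, as consumed by page_zip_parse_write_blob_ptr() above, is:
+
+    2 bytes: page offset of the field reference
+    2 bytes: offset of its copy within page_zip->data
+    BTR_EXTERN_FIELD_REF_SIZE (20) bytes: the field reference itself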
*/ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in/out: record whose data is being + written */ + dict_index_t* index, /*!< in: index of the page */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint n, /*!< in: column index */ + mtr_t* mtr) /*!< in: mini-transaction handle, + or NULL if no logging is needed */ +{ + const byte* field; + byte* externs; + const page_t* page = page_align(rec); + ulint blob_no; + ulint len; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { +#ifndef UNIV_HOTBACKUP + byte* log_ptr = mlog_open( + mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, externs - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE); + log_ptr += BTR_EXTERN_FIELD_REF_SIZE; + mlog_close(mtr, log_ptr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/***********************************************************//** +Parses a log record of writing the node pointer of a record. 
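+The log record body is expected to be 2 + 2 + REC_NODE_PTR_SIZE
+bytes: the page offset of the node pointer, the offset of its copy
+within page_zip->data, and the 4-byte node pointer itself.  The heap
+number implied by the z_offset is cross-checked against the page
+header before anything is copied.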
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + byte* storage_end; + byte* field; + byte* storage; + ulint heap_no; + + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + field = page + offset; + storage = page_zip->data + z_offset; + + storage_end = page_zip_dir_start(page_zip); + + heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; + + if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE) + || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW) + || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) { + + goto corrupt; + } + + memcpy(field, ptr + 4, REC_NODE_PTR_SIZE); + memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + REC_NODE_PTR_SIZE)); +} + +/**********************************************************************//** +Write the node pointer of a record on a non-leaf compressed page. 
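+
+On a non-leaf page the node pointer of every record is mirrored in
+an uncompressed array that ends where the dense page directory
+begins; the slot of a record with heap number heap_no starts at
+
+    page_zip_dir_start(page_zip) - (heap_no - 1) * REC_NODE_PTR_SIZE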
*/ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + ulint size, /*!< in: data size of rec */ + ulint ptr, /*!< in: node pointer */ + mtr_t* mtr) /*!< in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; +#ifdef UNIV_DEBUG + page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, size); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if REC_NODE_PTR_SIZE != 4 +# error "REC_NODE_PTR_SIZE != 4" +#endif + mach_write_to_4(field, ptr); + memcpy(storage, field, REC_NODE_PTR_SIZE); + + if (mtr) { +#ifndef UNIV_HOTBACKUP + byte* log_ptr = mlog_open(mtr, + 11 + 2 + 2 + REC_NODE_PTR_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, storage - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, field, REC_NODE_PTR_SIZE); + log_ptr += REC_NODE_PTR_SIZE; + mlog_close(mtr, log_ptr); +#endif /* !UNIV_HOTBACKUP */ + } +} + +/**********************************************************************//** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. 
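+
+Both columns are kept uncompressed, so updating them does not
+require recompressing the page.  They are stored big-endian;
+mach_write_to_6() stores the 48 low-order bits of trx_id roughly as
+(sketch):
+
+    for (i = 0; i < 6; i++) {
+        b[i] = (byte) (trx_id >> (8 * (5 - i)));
+    }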
*/ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + byte* rec, /*!< in/out: record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/*!< in: column number of TRX_ID in rec */ + trx_id_t trx_id, /*!< in: transaction identifier */ + roll_ptr_t roll_ptr)/*!< in: roll_ptr */ +{ + byte* field; + byte* storage; +#ifdef UNIV_DEBUG + page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ + ulint len; + + ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + storage = page_zip_dir_start(page_zip) + - (rec_get_heap_no_new(rec) - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(field, trx_id); +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +} + +/**********************************************************************//** +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. */ +static +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ +{ + ulint heap_no; + page_t* page = page_align(rec); + byte* storage; + byte* field; + ulint len; + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. */ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if (!page_is_leaf(page)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. 
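+ From low addresses towards the end of the compressed page, the
+ tail of a non-leaf page is laid out as:
+
+     ...free space... | node_ptr array | dense directory |<- page end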
*/ + storage = page_zip_dir_start(page_zip); + ut_ad(dict_index_get_n_unique_in_tree(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); + + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. */ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + storage = page_zip_dir_start(page_zip); + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + field = rec_get_nth_field( + rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + } else { + ut_ad(!rec_offs_any_extern(offsets)); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec), NULL)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/**********************************************************************//** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* rec, /*!< in: record on the uncompressed page */ + ulint flag) /*!< in: the owned flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } +} + +/**********************************************************************//** +Insert a record to the dense page directory. 
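+
+Each dense directory slot is 2 bytes: the low 14 bits hold the
+record offset within the page, bit 15 (PAGE_ZIP_DIR_SLOT_DEL) marks
+a deleted record and bit 14 (PAGE_ZIP_DIR_SLOT_OWNED) a record that
+owns a sparse directory slot.  The slots grow downwards from the
+very end of page_zip->data, in the key order of the record list,
+followed by the slots of free-list records.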
*/ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + const byte* prev_rec,/*!< in: record after which to insert */ + const byte* free_rec,/*!< in: record from which rec was + allocated, or NULL */ + byte* rec) /*!< in: record to insert */ +{ + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(prev_rec != rec); + ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(prev_rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(prev_rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + if (UNIV_LIKELY_NULL(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. */ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(rec >= free_rec); + slot_free = page_zip_dir_find(page_zip, page_offset(free_rec)); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + /* Shift the dense directory to allocate place for rec. */ + memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_rec - slot_free); + + /* Write the entry for the inserted record. + The "owned" and "deleted" flags must be zero. */ + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec)); +} + +/**********************************************************************//** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ +{ + byte* slot_rec; + byte* slot_free; + ulint n_ext; + page_t* page = page_align(rec); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot_rec = page_zip_dir_find(page_zip, page_offset(rec)); + + ut_a(slot_rec); + + /* This could not be done before page_zip_dir_find(). 
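+ page_zip_dir_find() only scans the slots of records that are in
+ use, and that count is derived from PAGE_N_RECS; decrementing
+ PAGE_N_RECS first would shrink the search window, and the slot of
+ the record being deleted might no longer be found.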
*/ + page_header_set_field(page, page_zip, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - 1)); + + if (UNIV_UNLIKELY(!free)) { + /* Make the last slot the start of the free list. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + } else { + slot_free = page_zip_dir_find_free(page_zip, + page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } + + if (UNIV_LIKELY(slot_rec > slot_free)) { + memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, + slot_rec - slot_free); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + + if (!page_is_leaf(page) || !dict_index_is_clust(index)) { + ut_ad(!rec_offs_any_extern(offsets)); + goto skip_blobs; + } + + n_ext = rec_offs_n_extern(offsets); + if (UNIV_UNLIKELY(n_ext)) { + /* Shift and zero fill the array of BLOB pointers. */ + ulint blob_no; + byte* externs; + byte* ext_end; + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + page_zip->n_blobs -= static_cast<unsigned>(n_ext); + /* Shift and zero fill the array. */ + memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, + (page_zip->n_blobs - blob_no) + * BTR_EXTERN_FIELD_REF_SIZE); + memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE); + } + +skip_blobs: + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + page_zip_clear_rec(page_zip, rec, index, offsets); +} + +/**********************************************************************//** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + ulint is_clustered) /*!< in: nonzero for clustered index, + zero for others */ +{ + ulint n_dense; + byte* dir; + byte* stored; + + ut_ad(page_is_comp(page_zip->data)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Read the old n_dense (n_heap has already been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + dir = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + + if (!page_is_leaf(page_zip->data)) { + ut_ad(!page_zip->n_blobs); + stored = dir - n_dense * REC_NODE_PTR_SIZE; + } else if (is_clustered) { + /* Move the BLOB pointer array backwards to make space for the + roll_ptr and trx_id columns and the dense directory slot. 
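+ On a clustered-index leaf page every heap record consumes one
+ directory slot plus one trx_id/roll_ptr pair, so the BLOB pointer
+ array must move down by PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN = 2 + 6 + 7 = 15 bytes for the new entry.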
*/ + byte* externs; + + stored = dir - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + externs = stored + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(externs + - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + externs, stored - externs); + } else { + stored = dir + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE, + PAGE_ZIP_DIR_SLOT_SIZE); + } + + /* Move the uncompressed area backwards to make space + for one directory slot. */ + memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored); +} + +/***********************************************************//** +Parses a log record of writing to the header of a page. +@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + byte* ptr, /*!< in: redo log buffer */ + byte* end_ptr,/*!< in: redo log buffer end */ + page_t* page, /*!< in/out: uncompressed page */ + page_zip_des_t* page_zip)/*!< in/out: compressed page */ +{ + ulint offset; + ulint len; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) { + + return(NULL); + } + + offset = (ulint) *ptr++; + len = (ulint) *ptr++; + + if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + len)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip)) { + + goto corrupt; + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, ptr, len); + memcpy(page_zip->data + offset, ptr, len); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, NULL)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + len); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data, /*!< in: data on the uncompressed page */ + ulint length, /*!< in: length of the data */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + byte* log_ptr = mlog_open(mtr, 11 + 1 + 1); + ulint offset = page_offset(data); + + ut_ad(offset < PAGE_DATA); + ut_ad(offset + length < PAGE_DATA); +#if PAGE_DATA > 255 +# error "PAGE_DATA > 255" +#endif + ut_ad(length < 256); + + /* If no logging is requested, we may return now */ + if (UNIV_UNLIKELY(!log_ptr)) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr); + *log_ptr++ = (byte) offset; + *log_ptr++ = (byte) length; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, data, length); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. 
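+
+A typical caller attempts recompression first and falls back to
+reorganizing; roughly (sketch only, the real callers also deal with
+redo logging and cursor repositioning):
+
+    if (!page_zip_compress(page_zip, page, index,
+                           page_zip_level, mtr)
+        && !page_zip_reorganize(block, index, mtr)) {
+        ...give up, e.g. split the page...
+    }
+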
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. +@return TRUE on success, FALSE on failure; page_zip will be left +intact on failure, but page will be overwritten. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ +#ifndef UNIV_HOTBACKUP + buf_pool_t* buf_pool = buf_pool_from_block(block); +#endif /* !UNIV_HOTBACKUP */ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_is_comp(page)); + ut_ad(!dict_index_is_ibuf(index)); + /* Note that page_zip_validate(page_zip, page, index) may fail here. */ + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Disable logging */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + +#ifndef UNIV_HOTBACKUP + temp_block = buf_block_alloc(buf_pool); + btr_search_drop_page_hash_index(block); + block->check_index_page_at_flush = TRUE; +#else /* !UNIV_HOTBACKUP */ + ut_ad(block == back_block1); + temp_block = back_block2; +#endif /* !UNIV_HOTBACKUP */ + temp_page = temp_block->frame; + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + + btr_blob_dbg_remove(page, index, "zip_reorg"); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, TRUE); + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + + if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) { + /* Copy max trx id to recreated page */ + trx_id_t max_trx_id = page_get_max_trx_id(temp_page); + page_set_max_trx_id(block, NULL, max_trx_id, NULL); + ut_ad(max_trx_id != 0); + } + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) { + +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + return(FALSE); + } + + lock_move_reorganize_page(block, temp_block); + +#ifndef UNIV_HOTBACKUP + buf_block_free(temp_block); +#endif /* !UNIV_HOTBACKUP */ + return(TRUE); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. 
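+
+Only the private B-tree header fields (up to PAGE_HEADER_PRIV_END,
+which includes PAGE_MAX_TRX_ID) and the bytes from PAGE_DATA to the
+start of the page trailer are copied; FIL_PAGE_PREV, FIL_PAGE_NEXT
+and the page LSN of the destination are left untouched.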
*/ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /*!< out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /*!< out: copy of src */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_ibuf(index)); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, index, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + /* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary + indexes. It does not matter on other pages. */ + ut_a(dict_index_is_clust(index) || !page_is_leaf(src) + || page_get_max_trx_id(src)); + + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. */ +#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END +# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END" +#endif + memcpy(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page, PAGE_DATA + src, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + memcpy(page_zip, src_zip, sizeof *page_zip); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index)) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL) + && UNIV_LIKELY(mach_read_from_4(page + + FIL_PAGE_PREV) != FIL_NULL)) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG; + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_blob_dbg_add(page, index, "page_zip_copy_recs"); + + page_zip_compress_write_log(page_zip, page, index, mtr); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Parses a log record of compressing an index page. 
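+The log record body is expected to be laid out as:
+
+    2 bytes: size of the compressed data, not counting the copies
+             of FIL_PAGE_PREV and FIL_PAGE_NEXT
+    2 bytes: size of the compressed page trailer
+    4 + 4 bytes: FIL_PAGE_PREV and FIL_PAGE_NEXT
+    "size" bytes: compressed data, starting at FIL_PAGE_TYPE
+    "trailer_size" bytes: the page trailer
+
+The bytes between the compressed data and the trailer are zeroed out
+before the page is decompressed.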
+@return end of log record or NULL */ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + byte* ptr, /*!< in: buffer */ + byte* end_ptr,/*!< in: buffer end */ + page_t* page, /*!< out: uncompressed page */ + page_zip_des_t* page_zip)/*!< out: compressed page */ +{ + ulint size; + ulint trailer_size; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) { + + return(NULL); + } + + size = mach_read_from_2(ptr); + ptr += 2; + trailer_size = mach_read_from_2(ptr); + ptr += 2; + + if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4); + memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4); + memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size); + memset(page_zip->data + FIL_PAGE_TYPE + size, 0, + page_zip_get_size(page_zip) - trailer_size + - (FIL_PAGE_TYPE + size)); + memcpy(page_zip->data + page_zip_get_size(page_zip) + - trailer_size, ptr + 8 + size, trailer_size); + + if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page, + TRUE))) { + + goto corrupt; + } + } + + return(ptr + 8 + size + trailer_size); +} + +/**********************************************************************//** +Calculate the compressed page checksum. +@return page checksum */ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + const void* data, /*!< in: compressed page */ + ulint size, /*!< in: size of compressed page */ + srv_checksum_algorithm_t algo) /*!< in: algorithm to use */ +{ + uLong adler; + ib_uint32_t crc32; + const Bytef* s = static_cast<const byte*>(data); + + /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN, + and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */ + + switch (algo) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + crc32 = ut_crc32(s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET) + ^ ut_crc32(s + FIL_PAGE_TYPE, 2) + ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) crc32); + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + adler = adler32(0L, s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET); + adler = adler32(adler, s + FIL_PAGE_TYPE, 2); + adler = adler32( + adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + static_cast<uInt>(size) + - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) adler); + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return(BUF_NO_CHECKSUM_MAGIC); + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + ut_error; + return(0); +} + +/**********************************************************************//** +Verify a compressed page's checksum. 
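+An all-zero page is accepted as a valid empty page.  With the
+non-strict algorithm settings a stored checksum is also accepted if
+it matches the other algorithm or equals BUF_NO_CHECKSUM_MAGIC; the
+strict settings accept only the configured algorithm.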
+@return TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+{
+ ib_uint32_t stored;
+ ib_uint32_t calc;
+ ib_uint32_t crc32 = 0 /* silence bogus warning */;
+ ib_uint32_t innodb = 0 /* silence bogus warning */;
+
+ stored = static_cast<ib_uint32_t>(mach_read_from_4(
+ static_cast<const unsigned char*>(data) + FIL_PAGE_SPACE_OR_CHKSUM));
+
+#if FIL_PAGE_LSN % 8
+#error "FIL_PAGE_LSN must be 64 bit aligned"
+#endif
+
+ /* Check if page is empty */
+ if (stored == 0
+ && *reinterpret_cast<const ib_uint64_t*>(static_cast<const char*>(
+ data)
+ + FIL_PAGE_LSN) == 0) {
+ /* make sure that the page is really empty */
+ ulint i;
+ for (i = 0; i < size; i++) {
+ if (*((const char*) data + i) != 0) {
+ return(FALSE);
+ }
+ }
+ /* Empty page */
+ return(TRUE);
+ }
+
+ calc = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm)));
+
+ if (stored == calc) {
+ return(TRUE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(stored == calc);
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = calc;
+ innodb = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_INNODB));
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32));
+ innodb = calc;
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ return(TRUE);
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
+
+ return(stored == crc32 || stored == innodb);
+}
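+
+/* Usage sketch (hypothetical caller): validate a compressed page
+frame that was just read from disk before decompressing it; "frame"
+and "page_zip" are assumed to describe the same page:
+
+    if (!page_zip_verify_checksum(frame,
+                                  page_zip_get_size(page_zip))) {
+        ...treat the page as corrupt...
+    }
+*/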