| author | Sergei Golubchik <sergii@pisem.net> | 2013-12-22 17:06:50 +0100 |
|---|---|---|
| committer | Sergei Golubchik <sergii@pisem.net> | 2013-12-22 17:06:50 +0100 |
| commit | ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63 (patch) | |
| tree | 728585c36f22a5db3cea796430883d0ebc5c05eb /storage/xtradb/page | |
| parent | e27c34f9e4ca15c797fcd3191ee5679c2f237a09 (diff) | |
| parent | 52c26f7a1f675185d2ef1a28aca7f9bcc67c6414 (diff) | |
| download | mariadb-git-ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63.tar.gz | |

Percona-Server-5.6.14-rel62.0 merge
support ha_innodb.so as a dynamic plugin.
* remove obsolete *,innodb_plugin.rdiff files
* s/--plugin-load=/--plugin-load-add=/
* MYSQL_PLUGIN_IMPORT glob_hostname[]
* use my_error instead of push_warning_printf(ER_DEFAULT)
* don't use tdc_size and tc_size in a module
update test cases (XtraDB is 5.6.14, InnoDB is 5.6.10)
* copy new tests over
* disable some tests for (old) InnoDB
* delete XtraDB tests that no longer apply
small compatibility changes:
* s/HTON_EXTENDED_KEYS/HTON_SUPPORTS_EXTENDED_KEYS/
* revert unnecessary InnoDB changes to bring it a bit closer to upstream
fix XtraDB to compile on Windows (both as a static and a dynamic plugin)
disable XtraDB on Windows (deadlocks) and where no atomic ops are available (e.g. CentOS 5)
storage/innobase/handler/ha_innodb.cc:
revert a few unnecessary changes to bring it a bit closer to the original InnoDB
storage/innobase/include/univ.i:
correct the version to match what it was merged from
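
Two of the porting items above deserve a note. `--plugin-load-add=` appends to the plugin list instead of replacing it, so multiple test `.opt` files can each request a plugin without clobbering one another. And a dynamically loaded `ha_innodb.so` must import server globals explicitly and report failures through the error channel rather than the warning stack. The following is a minimal illustrative sketch, not code from this commit; `MYSQL_PLUGIN_IMPORT`, `glob_hostname`, `FN_REFLEN`, `my_error` and `ER_WRONG_ARGUMENTS` are existing server symbols, while `report_bad_argument()` is a hypothetical helper:

```cpp
/* Illustrative sketch only (not part of the commit). */
#include "my_global.h"
#include "my_sys.h"          /* my_error() */
#include "mysqld_error.h"    /* ER_WRONG_ARGUMENTS */

/* A server global referenced from a dynamic plugin must be declared
   with MYSQL_PLUGIN_IMPORT, or the Windows build of the plugin DLL
   fails to link: the symbol lives in the server binary, not in the
   plugin. */
extern MYSQL_PLUGIN_IMPORT char glob_hostname[FN_REFLEN];

/* Hypothetical helper: raise a proper error with my_error() instead
   of push_warning_printf(..., ER_DEFAULT(...)), as the message above
   prescribes; this works the same whether the engine is linked
   statically or loaded as ha_innodb.so. */
static int report_bad_argument(const char* arg)
{
	my_error(ER_WRONG_ARGUMENTS, MYF(0), arg);
	return 1;
}
```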
Diffstat (limited to 'storage/xtradb/page')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | storage/xtradb/page/page0cur.cc (renamed from storage/xtradb/page/page0cur.c) | 340 |
| -rw-r--r-- | storage/xtradb/page/page0page.cc (renamed from storage/xtradb/page/page0page.c) | 228 |
| -rw-r--r-- | storage/xtradb/page/page0zip.cc (renamed from storage/xtradb/page/page0zip.c) | 398 |
3 files changed, 711 insertions, 255 deletions
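
One pattern recurs throughout the diff below: all three files are renamed from `.c` to `.cc` and now compile as C++, which rejects the implicit `void*` conversions that C permits. That is why the hunks repeatedly wrap `mem_alloc()` and `mem_heap_zalloc()` results in `static_cast<>`. A self-contained illustration, with `malloc` standing in for InnoDB's allocators:

```cpp
#include <cstdlib>

typedef unsigned char byte;

int main()
{
	/* Valid C, but a compile error in C++:
	byte* buf = malloc(16); */

	/* The form this commit converts such calls to: */
	byte* buf = static_cast<byte*>(std::malloc(16));
	std::free(buf);
	return 0;
}
```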
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.cc index a722f5b188d..efce1f10cae 100644 --- a/storage/xtradb/page/page0cur.c +++ b/storage/xtradb/page/page0cur.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc., *****************************************************************************/ /********************************************************************//** -@file page/page0cur.c +@file page/page0cur.cc The page cursor Created 10/4/1994 Heikki Tuuri @@ -29,6 +30,7 @@ Created 10/4/1994 Heikki Tuuri #endif #include "page0zip.h" +#include "btr0btr.h" #include "mtr0log.h" #include "log0recv.h" #include "ut0ut.h" @@ -772,7 +774,7 @@ page_cur_parse_insert_rec( byte* buf; byte* ptr2 = ptr; ulint info_and_status_bits = 0; /* remove warning */ - page_cur_t cursor; + page_cur_t cursor; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; @@ -879,7 +881,8 @@ page_cur_parse_insert_rec( if (mismatch_index + end_seg_len < sizeof buf1) { buf = buf1; } else { - buf = mem_alloc(mismatch_index + end_seg_len); + buf = static_cast<byte*>( + mem_alloc(mismatch_index + end_seg_len)); } /* Build the inserted record to buf */ @@ -972,6 +975,9 @@ page_cur_insert_rec_low( page = page_align(current_rec); ut_ad(dict_table_is_comp(index->table) == (ibool) !!page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || recv_recovery_is_on() || mtr->inside_ibuf); ut_ad(!page_rec_is_supremum(current_rec)); @@ -1006,8 +1012,8 @@ page_cur_insert_rec_low( rec_offs_init(foffsets_); - foffsets = rec_get_offsets(free_rec, index, foffsets, - ULINT_UNDEFINED, &heap); + foffsets = rec_get_offsets( + free_rec, index, foffsets, ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); @@ -1155,71 +1161,22 @@ use_heap: } /***********************************************************//** -Compresses or reorganizes a page after an optimistic insert. -@return rec if succeed, NULL otherwise */ -static -rec_t* -page_cur_insert_rec_zip_reorg( -/*==========================*/ - rec_t** current_rec,/*!< in/out: pointer to current record after - which the new record is inserted */ - buf_block_t* block, /*!< in: buffer block */ - dict_index_t* index, /*!< in: record descriptor */ - rec_t* rec, /*!< in: inserted record */ - page_t* page, /*!< in: uncompressed page */ - page_zip_des_t* page_zip,/*!< in: compressed page */ - mtr_t* mtr) /*!< in: mini-transaction, or NULL */ -{ - ulint pos; - - /* Recompress or reorganize and recompress the page. */ - if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) { - return(rec); - } - - /* Before trying to reorganize the page, - store the number of preceding records on the page. */ - pos = page_rec_get_n_recs_before(rec); - ut_ad(pos > 0); - - if (page_zip_reorganize(block, index, mtr)) { - /* The page was reorganized: Find rec by seeking to pos, - and update *current_rec. 
*/ - if (pos > 1) { - rec = page_rec_get_nth(page, pos - 1); - } else { - rec = page + PAGE_NEW_INFIMUM; - } - - *current_rec = rec; - rec = page + rec_get_next_offs(rec, TRUE); - - return(rec); - } - - /* Out of space: restore the page */ - btr_blob_dbg_remove(page, index, "insert_zip_fail"); - if (!page_zip_decompress(page_zip, page, FALSE)) { - ut_error; /* Memory corrupted? */ - } - ut_ad(page_validate(page, index)); - btr_blob_dbg_add(page, index, "insert_zip_fail"); - return(NULL); -} - -/***********************************************************//** Inserts a record next to page cursor on a compressed and uncompressed page. Returns pointer to inserted record if succeed, i.e., enough space available, NULL otherwise. The cursor stays at the same position. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + @return pointer to record if succeed, NULL otherwise */ UNIV_INTERN rec_t* page_cur_insert_rec_zip( /*====================*/ - rec_t** current_rec,/*!< in/out: pointer to current record after - which the new record is inserted */ - buf_block_t* block, /*!< in: buffer block of *current_rec */ + page_cur_t* cursor, /*!< in/out: page cursor */ dict_index_t* index, /*!< in: record descriptor */ const rec_t* rec, /*!< in: pointer to a physical record */ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */ @@ -1237,16 +1194,19 @@ page_cur_insert_rec_zip( record */ page_zip_des_t* page_zip; - page_zip = buf_block_get_page_zip(block); + page_zip = page_cur_get_page_zip(cursor); ut_ad(page_zip); ut_ad(rec_offs_validate(rec, index, offsets)); - page = page_align(*current_rec); + page = page_cur_get_page(cursor); ut_ad(dict_table_is_comp(index->table)); ut_ad(page_is_comp(page)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || mtr->inside_ibuf || recv_recovery_is_on()); - ut_ad(!page_rec_is_supremum(*current_rec)); + ut_ad(!page_cur_is_after_last(cursor)); #ifdef UNIV_ZIP_DEBUG ut_a(page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ @@ -1271,25 +1231,168 @@ page_cur_insert_rec_zip( } #endif /* UNIV_DEBUG_VALGRIND */ + const bool reorg_before_insert = page_has_garbage(page) + && rec_size > page_get_max_insert_size(page, 1) + && rec_size <= page_get_max_insert_size_after_reorganize( + page, 1); + /* 2. Try to find suitable space from page memory management */ if (!page_zip_available(page_zip, dict_index_is_clust(index), - rec_size, 1)) { + rec_size, 1) + || reorg_before_insert) { + /* The values can change dynamically. */ + bool log_compressed = page_zip_log_pages; + ulint level = page_zip_level; +#ifdef UNIV_DEBUG + rec_t* cursor_rec = page_cur_get_rec(cursor); +#endif /* UNIV_DEBUG */ + + /* If we are not writing compressed page images, we + must reorganize the page before attempting the + insert. */ + if (recv_recovery_is_on()) { + /* Insert into the uncompressed page only. + The page reorganization or creation that we + would attempt outside crash recovery would + have been covered by a previous redo log record. */ + } else if (page_is_empty(page)) { + ut_ad(page_cur_is_before_first(cursor)); + + /* This is an empty page. Recreate it to + get rid of the modification log. 
*/ + page_create_zip(page_cur_get_block(cursor), index, + page_header_get_field(page, PAGE_LEVEL), + 0, mtr); + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + goto use_heap; + } + + /* The cursor should remain on the page infimum. */ + return(NULL); + } else if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + } else if (log_compressed && !reorg_before_insert) { + /* Insert into uncompressed page only, and + try page_zip_reorganize() afterwards. */ + } else if (btr_page_reorganize_low( + recv_recovery_is_on(), level, + cursor, index, mtr)) { + ut_ad(!page_header_get_ptr(page, PAGE_FREE)); + + if (page_zip_available( + page_zip, dict_index_is_clust(index), + rec_size, 1)) { + /* After reorganizing, there is space + available. */ + goto use_heap; + } + } else { + ut_ad(cursor->rec == cursor_rec); + return(NULL); + } /* Try compressing the whole page afterwards. */ - insert_rec = page_cur_insert_rec_low(*current_rec, - index, rec, offsets, - NULL); - - if (UNIV_LIKELY(insert_rec != NULL)) { - insert_rec = page_cur_insert_rec_zip_reorg( - current_rec, block, index, insert_rec, - page, page_zip, mtr); -#ifdef UNIV_DEBUG - if (insert_rec) { - rec_offs_make_valid( - insert_rec, index, offsets); + insert_rec = page_cur_insert_rec_low( + cursor->rec, index, rec, offsets, NULL); + + /* If recovery is on, this implies that the compression + of the page was successful during runtime. Had that not + been the case or had the redo logging of compressed + pages been enabled during runtime then we'd have seen + a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we + know that we don't need to reorganize the page. We, + however, do need to recompress the page. That will + happen when the next redo record is read which must + be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must + contain a valid compression level value. + This implies that during recovery from this point till + the next redo is applied the uncompressed and + compressed versions are not identical and + page_zip_validate will fail but that is OK because + we call page_zip_validate only after processing + all changes to a page under a single mtr during + recovery. */ + if (insert_rec == NULL) { + /* Out of space. + This should never occur during crash recovery, + because the MLOG_COMP_REC_INSERT should only + be logged after a successful operation. */ + ut_ad(!recv_recovery_is_on()); + } else if (recv_recovery_is_on()) { + /* This should be followed by + MLOG_ZIP_PAGE_COMPRESS_NO_DATA, + which should succeed. */ + rec_offs_make_valid(insert_rec, index, offsets); + } else { + ulint pos = page_rec_get_n_recs_before(insert_rec); + ut_ad(pos > 0); + + if (!log_compressed) { + if (page_zip_compress( + page_zip, page, index, + level, NULL)) { + page_cur_insert_rec_write_log( + insert_rec, rec_size, + cursor->rec, index, mtr); + page_zip_compress_write_log_no_data( + level, page, index, mtr); + + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + ut_ad(cursor->rec + == (pos > 1 + ? page_rec_get_nth( + page, pos - 1) + : page + PAGE_NEW_INFIMUM)); + } else { + /* We are writing entire page images + to the log. Reduce the redo log volume + by reorganizing the page at the same time. */ + if (page_zip_reorganize( + cursor->block, index, mtr)) { + /* The page was reorganized: + Seek to pos. 
*/ + if (pos > 1) { + cursor->rec = page_rec_get_nth( + page, pos - 1); + } else { + cursor->rec = page + + PAGE_NEW_INFIMUM; + } + + insert_rec = page + rec_get_next_offs( + cursor->rec, TRUE); + rec_offs_make_valid( + insert_rec, index, offsets); + return(insert_rec); + } + + /* Theoretically, we could try one + last resort of btr_page_reorganize_low() + followed by page_zip_available(), but + that would be very unlikely to + succeed. (If the full reorganized page + failed to compress, why would it + succeed to compress the page, plus log + the insert of this record? */ } -#endif /* UNIV_DEBUG */ + + /* Out of space: restore the page */ + btr_blob_dbg_remove(page, index, "insert_zip_fail"); + if (!page_zip_decompress(page_zip, page, FALSE)) { + ut_error; /* Memory corrupted? */ + } + ut_ad(page_validate(page, index)); + btr_blob_dbg_add(page, index, "insert_zip_fail"); + insert_rec = NULL; } return(insert_rec); @@ -1306,7 +1409,7 @@ page_cur_insert_rec_zip( rec_offs_init(foffsets_); foffsets = rec_get_offsets(free_rec, index, foffsets, - ULINT_UNDEFINED, &heap); + ULINT_UNDEFINED, &heap); if (rec_offs_size(foffsets) < rec_size) { too_small: if (UNIV_LIKELY_NULL(heap)) { @@ -1414,18 +1517,19 @@ use_heap: rec_offs_make_valid(insert_rec, index, offsets); /* 4. Insert the record in the linked list of records */ - ut_ad(*current_rec != insert_rec); + ut_ad(cursor->rec != insert_rec); { /* next record after current before the insertion */ - rec_t* next_rec = page_rec_get_next(*current_rec); - ut_ad(rec_get_status(*current_rec) + const rec_t* next_rec = page_rec_get_next_low( + cursor->rec, TRUE); + ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM); ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); page_rec_set_next(insert_rec, next_rec); - page_rec_set_next(*current_rec, insert_rec); + page_rec_set_next(cursor->rec, insert_rec); } page_header_set_field(page, page_zip, PAGE_N_RECS, @@ -1439,7 +1543,7 @@ use_heap: UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), rec_offs_size(offsets)); - page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec); + page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec); /* 6. Update the last insertion info in page header */ @@ -1453,7 +1557,7 @@ use_heap: PAGE_NO_DIRECTION); page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); - } else if ((last_insert == *current_rec) + } else if ((last_insert == cursor->rec) && (page_header_get_field(page, PAGE_DIRECTION) != PAGE_LEFT)) { @@ -1506,7 +1610,7 @@ use_heap: /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { page_cur_insert_rec_write_log(insert_rec, rec_size, - *current_rec, index, mtr); + cursor->rec, index, mtr); } return(insert_rec); @@ -1600,7 +1704,12 @@ page_parse_copy_rec_list_to_created_page( #ifndef UNIV_HOTBACKUP /*************************************************************//** Copies records from page to a newly created page, from a given record onward, -including that record. Infimum and supremum records are not copied. */ +including that record. Infimum and supremum records are not copied. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). 
*/ UNIV_INTERN void page_copy_rec_list_end_to_created_page( @@ -1780,9 +1889,9 @@ UNIV_INLINE void page_cur_delete_rec_write_log( /*==========================*/ - rec_t* rec, /*!< in: record to be deleted */ - dict_index_t* index, /*!< in: record descriptor */ - mtr_t* mtr) /*!< in: mini-transaction handle */ + rec_t* rec, /*!< in: record to be deleted */ + const dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mini-transaction handle */ { byte* log_ptr; @@ -1864,10 +1973,11 @@ UNIV_INTERN void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /*!< in/out: a page cursor */ - dict_index_t* index, /*!< in: record descriptor */ - const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */ - mtr_t* mtr) /*!< in: mini-transaction handle */ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dict_index_t* index, /*!< in: record descriptor */ + const ulint* offsets,/*!< in: rec_get_offsets( + cursor->rec, index) */ + mtr_t* mtr) /*!< in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; page_dir_slot_t* prev_slot; @@ -1880,8 +1990,6 @@ page_cur_delete_rec( ulint cur_n_owned; rec_t* rec; - ut_ad(cursor && mtr); - page = page_cur_get_page(cursor); page_zip = page_cur_get_page_zip(cursor); @@ -1896,10 +2004,31 @@ page_cur_delete_rec( current_rec = cursor->rec; ut_ad(rec_offs_validate(current_rec, index, offsets)); ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID) + == index->id || mtr->inside_ibuf || recv_recovery_is_on()); /* The record must not be the supremum or infimum record. */ ut_ad(page_rec_is_user_rec(current_rec)); + if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) { + /* Empty the page, unless we are applying the redo log + during crash recovery. During normal operation, the + page_create_empty() gets logged as one of MLOG_PAGE_CREATE, + MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */ + ut_ad(page_is_leaf(page)); + /* Usually, this should be the root page, + and the whole index tree should become empty. + However, this could also be a call in + btr_cur_pessimistic_update() to delete the only + record in the page and to insert another one. */ + page_cur_move_to_next(cursor); + ut_ad(page_cur_is_after_last(cursor)); + page_create_empty(page_cur_get_block(cursor), + const_cast<dict_index_t*>(index), mtr); + return; + } + /* Save to local variables some data associated with current_rec */ cur_slot_no = page_dir_find_owner_slot(current_rec); ut_ad(cur_slot_no > 0); @@ -1907,7 +2036,9 @@ page_cur_delete_rec( cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); /* 0. Write the log record */ - page_cur_delete_rec_write_log(current_rec, index, mtr); + if (mtr != 0) { + page_cur_delete_rec_write_log(current_rec, index, mtr); + } /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ @@ -1915,9 +2046,13 @@ page_cur_delete_rec( page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the - frame modify clock */ + frame modify clock only if there is an mini-transaction covering + the change. During IMPORT we allocate local blocks that are not + part of the buffer pool. */ - buf_block_modify_clock_inc(page_cur_get_block(cursor)); + if (mtr != 0) { + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + } /* 2. Find the next and the previous record. 
Note that the cursor is left at the next record. */ @@ -1961,14 +2096,15 @@ page_cur_delete_rec( page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete"); + btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index), + offsets, "delete"); page_mem_free(page, page_zip, current_rec, index, offsets); /* 7. Now we have decremented the number of owned records of the slot. If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the slots. */ - if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) { + if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { page_dir_balance_slot(page, page_zip, cur_slot_no); } diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.cc index f2ce6c9fe16..2faf804279c 100644 --- a/storage/xtradb/page/page0page.c +++ b/storage/xtradb/page/page0page.cc @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc., *****************************************************************************/ /**************************************************//** -@file page/page0page.c +@file page/page0page.cc Index page routines Created 2/2/1994 Heikki Tuuri @@ -222,7 +223,7 @@ page_set_max_trx_id( during a database recovery we assume that the max trx id of every page is the maximum trx id assigned before the crash. */ - if (UNIV_LIKELY_NULL(page_zip)) { + if (page_zip) { mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); page_zip_write_header(page_zip, page + (PAGE_HEADER + PAGE_MAX_TRX_ID), @@ -499,7 +500,8 @@ page_create_zip( page is created */ dict_index_t* index, /*!< in: the index of the page */ ulint level, /*!< in: the B-tree level of the page */ - mtr_t* mtr) /*!< in: mini-transaction handle */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { page_t* page; page_zip_des_t* page_zip = buf_block_get_page_zip(block); @@ -510,9 +512,11 @@ page_create_zip( ut_ad(dict_table_is_comp(index->table)); page = page_create_low(block, TRUE); - mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); + mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level); + mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id); - if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + if (!page_zip_compress(page_zip, page, index, + page_zip_level, mtr)) { /* The compression of a newly created page should always succeed. */ ut_error; @@ -521,9 +525,49 @@ page_create_zip( return(page); } +/**********************************************************//** +Empty a previously created B-tree index page. 
*/ +UNIV_INTERN +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + trx_id_t max_trx_id = 0; + const page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) { + max_trx_id = page_get_max_trx_id(page); + ut_ad(max_trx_id); + } + + if (page_zip) { + page_create_zip(block, index, + page_header_get_field(page, PAGE_LEVEL), + max_trx_id, mtr); + } else { + page_create(block, mtr, page_is_comp(page)); + + if (max_trx_id) { + page_update_max_trx_id( + block, page_zip, max_trx_id, mtr); + } + } +} + /*************************************************************//** Differs from page_copy_rec_list_end, because this function does not -touch the lock table and max trx id on page or compress the page. */ +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). */ UNIV_INTERN void page_copy_rec_list_end_no_locks( @@ -598,6 +642,12 @@ page_copy_rec_list_end_no_locks( Copies records from page to new_page, from a given record onward, including that record. Infimum and supremum records are not copied. The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + @return pointer to the original successor of the infimum record on new_page, or NULL on zip overflow (new_block will be decompressed) */ UNIV_INTERN @@ -635,7 +685,7 @@ page_copy_rec_list_end( /* Here, "ret" may be pointing to a user record or the predefined supremum record. */ - if (UNIV_LIKELY_NULL(new_page_zip)) { + if (new_page_zip) { log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); } @@ -655,11 +705,11 @@ page_copy_rec_list_end( page_get_max_trx_id(page), mtr); } - if (UNIV_LIKELY_NULL(new_page_zip)) { + if (new_page_zip) { mtr_set_log_mode(mtr, log_mode); - if (UNIV_UNLIKELY - (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + if (!page_zip_compress(new_page_zip, new_page, + index, page_zip_level, mtr)) { /* Before trying to reorganize the page, store the number of preceding records on the page. */ ulint ret_pos @@ -671,14 +721,12 @@ page_copy_rec_list_end( that is smaller than "ret"). */ ut_a(ret_pos > 0); - if (UNIV_UNLIKELY - (!page_zip_reorganize(new_block, index, mtr))) { + if (!page_zip_reorganize(new_block, index, mtr)) { btr_blob_dbg_remove(new_page, index, "copy_end_reorg_fail"); - if (UNIV_UNLIKELY - (!page_zip_decompress(new_page_zip, - new_page, FALSE))) { + if (!page_zip_decompress(new_page_zip, + new_page, FALSE)) { ut_error; } ut_ad(page_validate(new_page, index)); @@ -710,6 +758,12 @@ page_copy_rec_list_end( Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. The records are copied to the end of the record list on new_page. 
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + @return pointer to the original predecessor of the supremum record on new_page, or NULL on zip overflow (new_block will be decompressed) */ UNIV_INTERN @@ -742,7 +796,7 @@ page_copy_rec_list_start( return(ret); } - if (UNIV_LIKELY_NULL(new_page_zip)) { + if (new_page_zip) { log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); } @@ -778,14 +832,15 @@ page_copy_rec_list_start( mtr); } - if (UNIV_LIKELY_NULL(new_page_zip)) { + if (new_page_zip) { mtr_set_log_mode(mtr, log_mode); DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail", goto zip_reorganize;); - if (UNIV_UNLIKELY - (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + if (!page_zip_compress(new_page_zip, new_page, index, + page_zip_level, mtr)) { + ulint ret_pos; #ifndef DBUG_OFF zip_reorganize: @@ -949,13 +1004,38 @@ page_delete_rec_list_end( ut_a(!page_zip || page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - if (page_rec_is_infimum(rec)) { - rec = page_rec_get_next(rec); - } - if (page_rec_is_supremum(rec)) { + ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED); + /* Nothing to do, there are no records bigger than the + page supremum. */ + return; + } + if (recv_recovery_is_on()) { + /* If we are replaying a redo log record, we must + replay it exactly. Since MySQL 5.6.11, we should be + generating a redo log record for page creation if + the page would become empty. Thus, this branch should + only be executed when applying redo log that was + generated by an older version of MySQL. */ + } else if (page_rec_is_infimum(rec) + || n_recs == page_get_n_recs(page)) { +delete_all: + /* We are deleting all records. */ + page_create_empty(block, index, mtr); return; + } else if (page_is_comp(page)) { + if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } + } else { + if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) { + /* We are deleting everything from the first + user record onwards. */ + goto delete_all; + } } /* Reset the last insert info in the page header and increment @@ -972,7 +1052,7 @@ page_delete_rec_list_end( ? MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); - if (UNIV_LIKELY_NULL(page_zip)) { + if (page_zip) { ulint log_mode; ut_a(page_is_comp(page)); @@ -1134,7 +1214,12 @@ page_delete_rec_list_start( #endif /* UNIV_ZIP_DEBUG */ if (page_rec_is_infimum(rec)) { + return; + } + if (page_rec_is_supremum(rec)) { + /* We are deleting all records. */ + page_create_empty(block, index, mtr); return; } @@ -1172,6 +1257,12 @@ page_delete_rec_list_start( /*************************************************************//** Moves record list end to another page. Moved records include split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + @return TRUE on success; FALSE on compression failure (new_block will be decompressed) */ UNIV_INTERN @@ -1227,6 +1318,12 @@ page_move_rec_list_end( /*************************************************************//** Moves record list start to another page. Moved records do not include split_rec. 
+ +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + @return TRUE on success; FALSE on compression failure */ UNIV_INTERN ibool @@ -1572,7 +1669,7 @@ page_rec_get_n_recs_before( n--; ut_ad(n >= 0); - ut_ad((ulint)n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); + ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); return((ulint) n); } @@ -2322,12 +2419,26 @@ page_validate( } } + if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page) + && !page_is_empty(page)) { + trx_id_t max_trx_id = page_get_max_trx_id(page); + trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id(); + + if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) { + ib_logf(IB_LOG_LEVEL_ERROR, + "PAGE_MAX_TRX_ID out of bounds: " + TRX_ID_FMT ", " TRX_ID_FMT, + max_trx_id, sys_max_trx_id); + goto func_exit2; + } + } + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); /* The following buffer is used to check that the records in the page record heap do not overlap */ - buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE); + buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE)); /* Check first that the record heap and the directory do not overlap. */ @@ -2337,7 +2448,7 @@ page_validate( if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) <= page_dir_get_nth_slot(page, n_slots - 1)))) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Record heap and dir overlap" " on space %lu page %lu index %s, %p, %p\n", (ulong) page_get_space_id(page), @@ -2380,7 +2491,7 @@ page_validate( if (UNIV_UNLIKELY (1 != cmp_rec_rec(rec, old_rec, offsets, old_offsets, index))) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Records in wrong order" " on space %lu page %lu index %s\n", (ulong) page_get_space_id(page), @@ -2551,7 +2662,7 @@ func_exit: if (UNIV_UNLIKELY(ret == FALSE)) { func_exit2: - fprintf(stderr, + fprintf(stderr, "InnoDB: Apparent corruption" " in space %lu page %lu index %s\n", (ulong) page_get_space_id(page), @@ -2611,3 +2722,60 @@ page_find_rec_with_heap_no( } } #endif /* !UNIV_HOTBACKUP */ + +/*******************************************************//** +Removes the record from a leaf page. This function does not log +any changes. It is used by the IMPORT tablespace functions. +The cursor is moved to the next record after the deleted one. +@return true if success, i.e., the page did not become too empty */ +UNIV_INTERN +bool +page_delete_rec( +/*============*/ + const dict_index_t* index, /*!< in: The index that the record + belongs to */ + page_cur_t* pcur, /*!< in/out: page cursor on record + to delete */ + page_zip_des_t* page_zip,/*!< in: compressed page descriptor */ + const ulint* offsets)/*!< in: offsets for record */ +{ + bool no_compress_needed; + buf_block_t* block = pcur->block; + page_t* page = buf_block_get_frame(block); + + ut_ad(page_is_leaf(page)); + + if (!rec_offs_any_extern(offsets) + && ((page_get_data_size(page) - rec_offs_size(offsets) + < BTR_CUR_PAGE_COMPRESS_LIMIT) + || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL) + || (page_get_n_recs(page) < 2))) { + + ulint root_page_no = dict_index_get_page(index); + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. 
*/ + + no_compress_needed = page_get_page_no(page) == root_page_no; + } else { + no_compress_needed = true; + } + + if (no_compress_needed) { +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + page_cur_delete_rec(pcur, index, offsets, 0); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(no_compress_needed); +} + diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.cc index 40d794770ff..81c9e0ab45a 100644 --- a/storage/xtradb/page/page0zip.c +++ b/storage/xtradb/page/page0zip.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -11,18 +12,21 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** -@file page/page0zip.c +@file page/page0zip.cc Compressed page interface Created June 2005 by Marko Makela *******************************************************/ +#include <map> +using namespace std; + #define THIS_MODULE #include "page0zip.h" #ifdef UNIV_NONINL @@ -38,20 +42,39 @@ Created June 2005 by Marko Makela #include "log0recv.h" #include "zlib.h" #ifndef UNIV_HOTBACKUP +# include "buf0buf.h" # include "buf0lru.h" # include "btr0sea.h" # include "dict0boot.h" # include "lock0lock.h" +# include "srv0mon.h" +# include "srv0srv.h" +# include "ut0crc32.h" #else /* !UNIV_HOTBACKUP */ +# include "buf0checksum.h" # define lock_move_reorganize_page(block, temp_block) ((void) 0) # define buf_LRU_stat_inc_unzip() ((void) 0) #endif /* !UNIV_HOTBACKUP */ #ifndef UNIV_HOTBACKUP /** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ -UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1]; +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX]; +/** Statistics on compression, indexed by index->id */ +UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index; +/** Mutex protecting page_zip_stat_per_index */ +UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex; +#ifdef HAVE_PSI_INTERFACE +UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ #endif /* !UNIV_HOTBACKUP */ +/* Compression level to be used by zlib. Settable by user. */ +UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; + +/* Whether or not to log compressed page images to avoid possible +compression algorithm changes in zlib. */ +UNIV_INTERN my_bool page_zip_log_pages = true; + /* Please refer to ../include/page0zip.ic for a description of the compressed page format. 
*/ @@ -381,7 +404,7 @@ page_zip_get_n_prev_extern( compressed page */ const rec_t* rec, /*!< in: compact physical record on a B-tree leaf page */ - dict_index_t* index) /*!< in: record descriptor */ + const dict_index_t* index) /*!< in: record descriptor */ { const page_t* page = page_align(rec); ulint n_ext = 0; @@ -632,15 +655,15 @@ page_zip_dir_encode( #if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) # error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" #endif -#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1 -# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1" +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1 +# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1" #endif if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { offs |= PAGE_ZIP_DIR_SLOT_OWNED; } info_bits = rec_get_info_bits(rec, TRUE); - if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) { + if (info_bits & REC_INFO_DELETED_FLAG) { info_bits &= ~REC_INFO_DELETED_FLAG; offs |= PAGE_ZIP_DIR_SLOT_DEL; } @@ -691,6 +714,8 @@ page_zip_dir_encode( ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); } +extern "C" { + /**********************************************************************//** Allocate memory for zlib. */ static @@ -701,7 +726,7 @@ page_zip_zalloc( uInt items, /*!< in: number of items to allocate */ uInt size) /*!< in: size of an item in bytes */ { - return(mem_heap_zalloc(opaque, items * size)); + return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size)); } /**********************************************************************//** @@ -715,6 +740,8 @@ page_zip_free( { } +} /* extern "C" */ + /**********************************************************************//** Configure the zlib allocator to use the given memory heap. */ UNIV_INTERN @@ -724,7 +751,7 @@ page_zip_set_alloc( void* stream, /*!< in/out: zlib stream */ mem_heap_t* heap) /*!< in: memory heap to use */ { - z_stream* strm = stream; + z_stream* strm = static_cast<z_stream*>(stream); strm->zalloc = page_zip_zalloc; strm->zfree = page_zip_free; @@ -1089,7 +1116,7 @@ page_zip_compress_clust( /* Check if there are any externally stored columns. For each externally stored column, store the BTR_EXTERN_FIELD_REF separately. */ - if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + if (rec_offs_any_extern(offsets)) { ut_ad(dict_index_is_clust(index)); err = page_zip_compress_clust_ext( @@ -1173,6 +1200,7 @@ page_zip_compress( m_start, m_end, m_nonempty */ const page_t* page, /*!< in: uncompressed page */ dict_index_t* index, /*!< in: index of the B-tree node */ + ulint level, /*!< in: compression level */ mtr_t* mtr) /*!< in: mini-transaction, or NULL */ { z_stream c_stream; @@ -1186,7 +1214,6 @@ page_zip_compress( const rec_t** recs; /*!< dense page directory, sorted by address */ mem_heap_t* heap; ulint trx_id_col; - ulint* offsets = NULL; ulint n_blobs = 0; byte* storage;/* storage of uncompressed columns */ #ifndef UNIV_HOTBACKUP @@ -1195,6 +1222,10 @@ page_zip_compress( #ifdef PAGE_ZIP_COMPRESS_DBG FILE* logfile = NULL; #endif + /* A local copy of srv_cmp_per_index_enabled to avoid reading that + variable multiple times in this function since it can be changed at + anytime. 
*/ + my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled; if (!page) { return(FALSE); @@ -1220,7 +1251,7 @@ page_zip_compress( ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), supremum_extra_data, sizeof supremum_extra_data)); - if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + if (page_is_empty(page)) { ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) == PAGE_NEW_SUPREMUM); } @@ -1237,7 +1268,7 @@ page_zip_compress( if (UNIV_UNLIKELY(page_zip_compress_dbg)) { fprintf(stderr, "compress %p %p %lu %lu %lu\n", (void*) page_zip, (void*) page, - page_is_leaf(page), + (ibool) page_is_leaf(page), n_fields, n_dense); } if (UNIV_UNLIKELY(page_zip_compress_log)) { @@ -1261,6 +1292,11 @@ page_zip_compress( #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP page_zip_stat[page_zip->ssize - 1].compressed++; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed++; + mutex_exit(&page_zip_stat_per_index_mutex); + } #endif /* !UNIV_HOTBACKUP */ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE @@ -1269,24 +1305,30 @@ page_zip_compress( goto err_exit; } + MONITOR_INC(MONITOR_PAGE_COMPRESS); + heap = mem_heap_create(page_zip_get_size(page_zip) - + n_fields * (2 + sizeof *offsets) + + n_fields * (2 + sizeof(ulint)) + + REC_OFFS_HEADER_SIZE + n_dense * ((sizeof *recs) - PAGE_ZIP_DIR_SLOT_SIZE) + UNIV_PAGE_SIZE * 4 + (512 << MAX_MEM_LEVEL)); - recs = mem_heap_zalloc(heap, n_dense * sizeof *recs); + recs = static_cast<const rec_t**>( + mem_heap_zalloc(heap, n_dense * sizeof *recs)); - fields = mem_heap_alloc(heap, (n_fields + 1) * 2); + fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2)); + + buf = static_cast<byte*>( + mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA)); - buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA); buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; /* Compress the data payload. 
*/ page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + err = deflateInit2(&c_stream, level, Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); @@ -1399,8 +1441,19 @@ err_exit: } #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP + if (page_is_leaf(page)) { + dict_index_zip_failure(index); + } + + ullint time_diff = ut_time_us(NULL) - usec; page_zip_stat[page_zip->ssize - 1].compressed_usec - += ut_time_us(NULL) - usec; + += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_usec + += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } #endif /* !UNIV_HOTBACKUP */ return(FALSE); } @@ -1460,11 +1513,18 @@ err_exit: } #endif /* PAGE_ZIP_COMPRESS_DBG */ #ifndef UNIV_HOTBACKUP - { - page_zip_stat_t* zip_stat - = &page_zip_stat[page_zip->ssize - 1]; - zip_stat->compressed_ok++; - zip_stat->compressed_usec += ut_time_us(NULL) - usec; + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].compressed_ok++; + page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff; + if (cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index->id].compressed_ok++; + page_zip_stat_per_index[index->id].compressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); + } + + if (page_is_leaf(page)) { + dict_index_zip_success(index); } #endif /* !UNIV_HOTBACKUP */ @@ -1509,6 +1569,7 @@ page_zip_fields_free( { if (index) { dict_table_t* table = index->table; + os_fast_mutex_free(&index->zip_pad.mutex); mem_heap_free(index->heap); mutex_free(&(table->autoinc_mutex)); ut_free(table->name); @@ -1560,7 +1621,7 @@ page_zip_fields_decode( } table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, - DICT_TF_COMPACT); + DICT_TF_COMPACT, 0); index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", DICT_HDR_SPACE, 0, n); index->table = table; @@ -1752,7 +1813,7 @@ page_zip_set_extra_bytes( for (i = 0; i < n; i++) { offs = page_zip_dir_get(page_zip, i); - if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) { + if (offs & PAGE_ZIP_DIR_SLOT_DEL) { info_bits |= REC_INFO_DELETED_FLAG; } if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { @@ -2117,6 +2178,32 @@ page_zip_apply_log( } /**********************************************************************//** +Set the heap_no in a record, and skip the fixed-size record header +that is not included in the d_stream. +@return TRUE on success, FALSE if d_stream does not end at rec */ +static +ibool +page_zip_decompress_heap_no( +/*========================*/ + z_stream* d_stream, /*!< in/out: compressed page stream */ + rec_t* rec, /*!< in/out: record */ + ulint& heap_status) /*!< in/out: heap_no and status bits */ +{ + if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) { + /* n_dense has grown since the page was last compressed. */ + return(FALSE); + } + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + d_stream->next_out = rec; + + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + return(TRUE); +} + +/**********************************************************************//** Decompress the records of a node pointer page. 
@return TRUE on success, FALSE on failure */ static @@ -2152,19 +2239,8 @@ page_zip_decompress_node_ptrs( - PAGE_ZIP_START - PAGE_DIR); switch (inflate(d_stream, Z_SYNC_FLUSH)) { case Z_STREAM_END: - if (d_stream->next_out - != rec - REC_N_NEW_EXTRA_BYTES) { - /* n_dense has grown since the page - was last compressed. */ - } else { - /* Skip the REC_N_NEW_EXTRA_BYTES. */ - d_stream->next_out = rec; - - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, - heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; - } + page_zip_decompress_heap_no( + d_stream, rec, heap_status); goto zlib_done; case Z_OK: case Z_BUF_ERROR: @@ -2179,12 +2255,10 @@ page_zip_decompress_node_ptrs( goto zlib_error; } - ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); - /* Prepare to decompress the data bytes. */ - d_stream->next_out = rec; - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } /* Read the offsets. The status bits are needed here. */ offsets = rec_get_offsets(rec, index, offsets, @@ -2352,19 +2426,8 @@ page_zip_decompress_sec( if (UNIV_LIKELY(d_stream->avail_out)) { switch (inflate(d_stream, Z_SYNC_FLUSH)) { case Z_STREAM_END: - if (d_stream->next_out - != rec - REC_N_NEW_EXTRA_BYTES) { - /* n_dense has grown since the page - was last compressed. */ - } else { - /* Skip the REC_N_NEW_EXTRA_BYTES. */ - d_stream->next_out = rec; - - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, - heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; - } + page_zip_decompress_heap_no( + d_stream, rec, heap_status); goto zlib_done; case Z_OK: case Z_BUF_ERROR: @@ -2380,15 +2443,10 @@ page_zip_decompress_sec( } } - ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); - - /* Skip the REC_N_NEW_EXTRA_BYTES. */ - - d_stream->next_out = rec; - - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } } /* Decompress the data of the last record and any trailing garbage, @@ -2622,19 +2680,8 @@ page_zip_decompress_clust( err = inflate(d_stream, Z_SYNC_FLUSH); switch (err) { case Z_STREAM_END: - if (d_stream->next_out - != rec - REC_N_NEW_EXTRA_BYTES) { - /* n_dense has grown since the page - was last compressed. */ - } else { - /* Skip the REC_N_NEW_EXTRA_BYTES. */ - d_stream->next_out = rec; - - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, - heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; - } + page_zip_decompress_heap_no( + d_stream, rec, heap_status); goto zlib_done; case Z_OK: case Z_BUF_ERROR: @@ -2649,12 +2696,10 @@ page_zip_decompress_clust( goto zlib_error; } - ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); - /* Prepare to decompress the data bytes. */ - d_stream->next_out = rec; - /* Set heap_no and the status bits. */ - mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); - heap_status += 1 << REC_HEAP_NO_SHIFT; + if (!page_zip_decompress_heap_no( + d_stream, rec, heap_status)) { + ut_ad(0); + } /* Read the offsets. The status bits are needed here. */ offsets = rec_get_offsets(rec, index, offsets, @@ -2666,7 +2711,7 @@ page_zip_decompress_clust( For each externally stored column, restore the BTR_EXTERN_FIELD_REF separately. 
*/ - if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + if (rec_offs_any_extern(offsets)) { if (UNIV_UNLIKELY (!page_zip_decompress_clust_ext( d_stream, rec, offsets, trx_id_col))) { @@ -2931,7 +2976,9 @@ page_zip_decompress( } heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); - recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)); + + recs = static_cast<rec_t**>( + mem_heap_alloc(heap, n_dense * (2 * sizeof *recs))); if (all) { /* Copy the page header. */ @@ -2975,7 +3022,7 @@ zlib_error: /* Copy the infimum and supremum records. */ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), infimum_extra, sizeof infimum_extra); - if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + if (page_is_empty(page)) { rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, PAGE_NEW_SUPREMUM); } else { @@ -3033,7 +3080,10 @@ zlib_error: /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + dict_index_get_n_fields(index); - offsets = mem_heap_alloc(heap, n * sizeof(ulint)); + + offsets = static_cast<ulint*>( + mem_heap_alloc(heap, n * sizeof(ulint))); + *offsets = n; } @@ -3093,17 +3143,25 @@ err_exit: page_zip_fields_free(index); mem_heap_free(heap); #ifndef UNIV_HOTBACKUP - { - page_zip_stat_t* zip_stat - = &page_zip_stat[page_zip->ssize - 1]; - zip_stat->decompressed++; - zip_stat->decompressed_usec += ut_time_us(NULL) - usec; + ullint time_diff = ut_time_us(NULL) - usec; + page_zip_stat[page_zip->ssize - 1].decompressed++; + page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff; + + index_id_t index_id = btr_page_get_index_id(page); + + if (srv_cmp_per_index_enabled) { + mutex_enter(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index[index_id].decompressed++; + page_zip_stat_per_index[index_id].decompressed_usec += time_diff; + mutex_exit(&page_zip_stat_per_index_mutex); } #endif /* !UNIV_HOTBACKUP */ /* Update the stat counter for LRU policy. */ buf_LRU_stat_inc_unzip(); + MONITOR_INC(MONITOR_PAGE_DECOMPRESS); + return(TRUE); } @@ -3118,7 +3176,7 @@ page_zip_hexdump_func( const void* buf, /*!< in: data */ ulint size) /*!< in: length of the data, in bytes */ { - const byte* s = buf; + const byte* s = static_cast<const byte*>(buf); ulint addr; const ulint width = 32; /* bytes per line */ @@ -3185,15 +3243,15 @@ page_zip_validate_low( /* page_zip_decompress() expects the uncompressed page to be UNIV_PAGE_SIZE aligned. */ - temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE); - temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE); + temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE)); + temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE)); #ifdef UNIV_DEBUG_VALGRIND /* Get detailed information on the valid bits in case the UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[], page_zip->data[] or page_zip could be viewed at temp_page[] or temp_page_zip in a debugger when running valgrind --db-attach. */ - VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); + (void) VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); # if UNIV_WORD_SIZE == 4 VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip); @@ -3202,8 +3260,8 @@ page_zip_validate_low( pad bytes. 
*/ UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip); # endif - VALGRIND_GET_VBITS(page_zip->data, temp_page, - page_zip_get_size(page_zip)); + (void) VALGRIND_GET_VBITS(page_zip->data, temp_page, + page_zip_get_size(page_zip)); UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); #endif /* UNIV_DEBUG_VALGRIND */ @@ -4005,6 +4063,7 @@ page_zip_write_trx_id_and_roll_ptr( ulint len; ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); + ut_ad(page_simple_validate_new(page)); ut_ad(page_zip_simple_validate(page_zip)); ut_ad(page_zip_get_size(page_zip) @@ -4057,10 +4116,10 @@ static void page_zip_clear_rec( /*===============*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in: record to clear */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: record to clear */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */ { ulint heap_no; page_t* page = page_align(rec); @@ -4271,11 +4330,12 @@ UNIV_INTERN void page_zip_dir_delete( /*================*/ - page_zip_des_t* page_zip,/*!< in/out: compressed page */ - byte* rec, /*!< in: record to delete */ - dict_index_t* index, /*!< in: index of rec */ - const ulint* offsets,/*!< in: rec_get_offsets(rec) */ - const byte* free) /*!< in: previous start of the free list */ + page_zip_des_t* page_zip, /*!< in/out: compressed page */ + byte* rec, /*!< in: deleted record */ + const dict_index_t* index, /*!< in: index of rec */ + const ulint* offsets, /*!< in: rec_get_offsets(rec) */ + const byte* free) /*!< in: previous start of + the free list */ { byte* slot_rec; byte* slot_free; @@ -4389,7 +4449,7 @@ page_zip_dir_add_slot( if (!page_is_leaf(page_zip->data)) { ut_ad(!page_zip->n_blobs); stored = dir - n_dense * REC_NODE_PTR_SIZE; - } else if (UNIV_UNLIKELY(is_clustered)) { + } else if (is_clustered) { /* Move the BLOB pointer array backwards to make space for the roll_ptr and trx_id columns and the dense directory slot. */ byte* externs; @@ -4591,7 +4651,7 @@ page_zip_reorganize( /* Restore logging. */ mtr_set_log_mode(mtr, log_mode); - if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) { #ifndef UNIV_HOTBACKUP buf_block_free(temp_block); @@ -4771,21 +4831,113 @@ ulint page_zip_calc_checksum( /*===================*/ const void* data, /*!< in: compressed page */ - ulint size) /*!< in: size of compressed page */ + ulint size, /*!< in: size of compressed page */ + srv_checksum_algorithm_t algo) /*!< in: algorithm to use */ { + uLong adler; + ib_uint32_t crc32; + const Bytef* s = static_cast<const byte*>(data); + /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN, and FIL_PAGE_FILE_FLUSH_LSN from the checksum. 
*/ - const Bytef* s = data; - uLong adler; + switch (algo) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + crc32 = ut_crc32(s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET) + ^ ut_crc32(s + FIL_PAGE_TYPE, 2) + ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) crc32); + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + adler = adler32(0L, s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET); + adler = adler32(adler, s + FIL_PAGE_TYPE, 2); + adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) adler); + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return(BUF_NO_CHECKSUM_MAGIC); + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + + ut_error; + return(0); +} + +/**********************************************************************//** +Verify a compressed page's checksum. +@return TRUE if the stored checksum is valid according to the value of +innodb_checksum_algorithm */ +UNIV_INTERN +ibool +page_zip_verify_checksum( +/*=====================*/ + const void* data, /*!< in: compressed page */ + ulint size) /*!< in: size of compressed page */ +{ + ib_uint32_t stored; + ib_uint32_t calc; + ib_uint32_t crc32 = 0 /* silence bogus warning */; + ib_uint32_t innodb = 0 /* silence bogus warning */; - ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + stored = mach_read_from_4( + (const unsigned char*) data + FIL_PAGE_SPACE_OR_CHKSUM); - adler = adler32(0L, s + FIL_PAGE_OFFSET, - FIL_PAGE_LSN - FIL_PAGE_OFFSET); - adler = adler32(adler, s + FIL_PAGE_TYPE, 2); - adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + /* declare empty pages non-corrupted */ + if (stored == 0) { + /* make sure that the page is really empty */ + ut_d(ulint i; for (i = 0; i < size; i++) { + ut_a(*((const char*) data + i) == 0); }); + + return(TRUE); + } + + calc = page_zip_calc_checksum( + data, size, static_cast<srv_checksum_algorithm_t>( + srv_checksum_algorithm)); + + if (stored == calc) { + return(TRUE); + } + + switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + return(stored == calc); + case SRV_CHECKSUM_ALGORITHM_CRC32: + if (stored == BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + crc32 = calc; + innodb = page_zip_calc_checksum( + data, size, SRV_CHECKSUM_ALGORITHM_INNODB); + break; + case SRV_CHECKSUM_ALGORITHM_INNODB: + if (stored == BUF_NO_CHECKSUM_MAGIC) { + return(TRUE); + } + crc32 = page_zip_calc_checksum( + data, size, SRV_CHECKSUM_ALGORITHM_CRC32); + innodb = calc; + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + return(TRUE); + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } - return((ulint) adler); + return(stored == crc32 || stored == innodb); } |
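
The final hunks replace the unconditional Adler-32 in `page_zip_calc_checksum()` with an algorithm parameter and add `page_zip_verify_checksum()`. Reduced to its acceptance rule, the new verification behaves as in the sketch below; the enum and helper are placeholders and the magic value merely stands in for `BUF_NO_CHECKSUM_MAGIC`, but the branching mirrors the diff: strict settings accept only their own algorithm, non-strict settings also accept the other algorithm and the no-checksum magic, and (per the precheck in the diff) an all-zero page with a stored checksum of 0 is declared non-corrupted. This lets pages written under a different `innodb_checksum_algorithm` setting still validate.

```cpp
/* Placeholder restatement of the rule in page_zip_verify_checksum();
   names and the magic constant are stand-ins, only the logic follows
   the diff above. */
enum algo_t { CRC32, INNODB, NONE, STRICT_CRC32, STRICT_INNODB, STRICT_NONE };

static const unsigned long NO_CHECKSUM_MAGIC = 0xDEADBEEFUL; /* stand-in */

bool checksum_acceptable(unsigned long stored, unsigned long crc32,
                         unsigned long innodb, algo_t algo)
{
	switch (algo) {
	case STRICT_CRC32:	return stored == crc32;
	case STRICT_INNODB:	return stored == innodb;
	case STRICT_NONE:	return stored == NO_CHECKSUM_MAGIC;
	case NONE:		return true;	/* checksums ignored */
	case CRC32:
	case INNODB:
		/* Tolerant modes: a page written with the other
		   algorithm, or with none, is still considered valid. */
		return stored == NO_CHECKSUM_MAGIC
			|| stored == crc32
			|| stored == innodb;
	}
	return false;
}
```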