path: root/storage/xtradb/page
author    Sergei Golubchik <sergii@pisem.net>  2013-12-22 17:06:50 +0100
committer Sergei Golubchik <sergii@pisem.net>  2013-12-22 17:06:50 +0100
commit    ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63 (patch)
tree      728585c36f22a5db3cea796430883d0ebc5c05eb /storage/xtradb/page
parent    e27c34f9e4ca15c797fcd3191ee5679c2f237a09 (diff)
parent    52c26f7a1f675185d2ef1a28aca7f9bcc67c6414 (diff)
download  mariadb-git-ffa8c4cfcc41d4f160e3bdfca5cfd4b01a7d6e63.tar.gz
Percona-Server-5.6.14-rel62.0 merge
support ha_innodb.so as a dynamic plugin.
* remove obsolete *,innodb_plugin.rdiff files
* s/--plugin-load=/--plugin-load-add=/
* MYSQL_PLUGIN_IMPORT glob_hostname[]
* use my_error instead of push_warning_printf(ER_DEFAULT)
* don't use tdc_size and tc_size in a module

update test cases (XtraDB is 5.6.14, InnoDB is 5.6.10)
* copy new tests over
* disable some tests for (old) InnoDB
* delete XtraDB tests that no longer apply

small compatibility changes:
* s/HTON_EXTENDED_KEYS/HTON_SUPPORTS_EXTENDED_KEYS/
* revert unnecessary InnoDB changes to make it a bit closer to the upstream

fix XtraDB to compile on Windows (both as a static and a dynamic plugin)

disable XtraDB on Windows (deadlocks) and where no atomic ops are available (e.g. CentOS 5)

storage/innobase/handler/ha_innodb.cc:
  revert few unnecessary changes to make it a bit closer to the original InnoDB
storage/innobase/include/univ.i:
  correct the version to match what it was merged from
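For example, a server option line that previously loaded InnoDB as the only plugin now appends it instead (a hypothetical invocation; the substitution itself is the one listed above):

    # before: --plugin-load replaces the whole plugin list
    mysqld --plugin-load=ha_innodb.so
    # after: --plugin-load-add adds to the list without resetting it
    mysqld --plugin-load-add=ha_innodb.so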
Diffstat (limited to 'storage/xtradb/page')
-rw-r--r--  storage/xtradb/page/page0cur.cc   (renamed from storage/xtradb/page/page0cur.c)   340
-rw-r--r--  storage/xtradb/page/page0page.cc  (renamed from storage/xtradb/page/page0page.c)  228
-rw-r--r--  storage/xtradb/page/page0zip.cc   (renamed from storage/xtradb/page/page0zip.c)   398
3 files changed, 711 insertions, 255 deletions
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.cc
index a722f5b188d..efce1f10cae 100644
--- a/storage/xtradb/page/page0cur.c
+++ b/storage/xtradb/page/page0cur.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
*****************************************************************************/
/********************************************************************//**
-@file page/page0cur.c
+@file page/page0cur.cc
The page cursor
Created 10/4/1994 Heikki Tuuri
@@ -29,6 +30,7 @@ Created 10/4/1994 Heikki Tuuri
#endif
#include "page0zip.h"
+#include "btr0btr.h"
#include "mtr0log.h"
#include "log0recv.h"
#include "ut0ut.h"
@@ -772,7 +774,7 @@ page_cur_parse_insert_rec(
byte* buf;
byte* ptr2 = ptr;
ulint info_and_status_bits = 0; /* remove warning */
- page_cur_t cursor;
+ page_cur_t cursor;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
@@ -879,7 +881,8 @@ page_cur_parse_insert_rec(
if (mismatch_index + end_seg_len < sizeof buf1) {
buf = buf1;
} else {
- buf = mem_alloc(mismatch_index + end_seg_len);
+ buf = static_cast<byte*>(
+ mem_alloc(mismatch_index + end_seg_len));
}
/* Build the inserted record to buf */
@@ -972,6 +975,9 @@ page_cur_insert_rec_low(
page = page_align(current_rec);
ut_ad(dict_table_is_comp(index->table)
== (ibool) !!page_is_comp(page));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || recv_recovery_is_on() || mtr->inside_ibuf);
ut_ad(!page_rec_is_supremum(current_rec));
@@ -1006,8 +1012,8 @@ page_cur_insert_rec_low(
rec_offs_init(foffsets_);
- foffsets = rec_get_offsets(free_rec, index, foffsets,
- ULINT_UNDEFINED, &heap);
+ foffsets = rec_get_offsets(
+ free_rec, index, foffsets, ULINT_UNDEFINED, &heap);
if (rec_offs_size(foffsets) < rec_size) {
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
@@ -1155,71 +1161,22 @@ use_heap:
}
/***********************************************************//**
-Compresses or reorganizes a page after an optimistic insert.
-@return rec if succeed, NULL otherwise */
-static
-rec_t*
-page_cur_insert_rec_zip_reorg(
-/*==========================*/
- rec_t** current_rec,/*!< in/out: pointer to current record after
- which the new record is inserted */
- buf_block_t* block, /*!< in: buffer block */
- dict_index_t* index, /*!< in: record descriptor */
- rec_t* rec, /*!< in: inserted record */
- page_t* page, /*!< in: uncompressed page */
- page_zip_des_t* page_zip,/*!< in: compressed page */
- mtr_t* mtr) /*!< in: mini-transaction, or NULL */
-{
- ulint pos;
-
- /* Recompress or reorganize and recompress the page. */
- if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) {
- return(rec);
- }
-
- /* Before trying to reorganize the page,
- store the number of preceding records on the page. */
- pos = page_rec_get_n_recs_before(rec);
- ut_ad(pos > 0);
-
- if (page_zip_reorganize(block, index, mtr)) {
- /* The page was reorganized: Find rec by seeking to pos,
- and update *current_rec. */
- if (pos > 1) {
- rec = page_rec_get_nth(page, pos - 1);
- } else {
- rec = page + PAGE_NEW_INFIMUM;
- }
-
- *current_rec = rec;
- rec = page + rec_get_next_offs(rec, TRUE);
-
- return(rec);
- }
-
- /* Out of space: restore the page */
- btr_blob_dbg_remove(page, index, "insert_zip_fail");
- if (!page_zip_decompress(page_zip, page, FALSE)) {
- ut_error; /* Memory corrupted? */
- }
- ut_ad(page_validate(page, index));
- btr_blob_dbg_add(page, index, "insert_zip_fail");
- return(NULL);
-}
-
-/***********************************************************//**
Inserts a record next to page cursor on a compressed and uncompressed
page. Returns pointer to inserted record if succeed, i.e.,
enough space available, NULL otherwise.
The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
@return pointer to record if succeed, NULL otherwise */
UNIV_INTERN
rec_t*
page_cur_insert_rec_zip(
/*====================*/
- rec_t** current_rec,/*!< in/out: pointer to current record after
- which the new record is inserted */
- buf_block_t* block, /*!< in: buffer block of *current_rec */
+ page_cur_t* cursor, /*!< in/out: page cursor */
dict_index_t* index, /*!< in: record descriptor */
const rec_t* rec, /*!< in: pointer to a physical record */
ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
@@ -1237,16 +1194,19 @@ page_cur_insert_rec_zip(
record */
page_zip_des_t* page_zip;
- page_zip = buf_block_get_page_zip(block);
+ page_zip = page_cur_get_page_zip(cursor);
ut_ad(page_zip);
ut_ad(rec_offs_validate(rec, index, offsets));
- page = page_align(*current_rec);
+ page = page_cur_get_page(cursor);
ut_ad(dict_table_is_comp(index->table));
ut_ad(page_is_comp(page));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || mtr->inside_ibuf || recv_recovery_is_on());
- ut_ad(!page_rec_is_supremum(*current_rec));
+ ut_ad(!page_cur_is_after_last(cursor));
#ifdef UNIV_ZIP_DEBUG
ut_a(page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
@@ -1271,25 +1231,168 @@ page_cur_insert_rec_zip(
}
#endif /* UNIV_DEBUG_VALGRIND */
+ const bool reorg_before_insert = page_has_garbage(page)
+ && rec_size > page_get_max_insert_size(page, 1)
+ && rec_size <= page_get_max_insert_size_after_reorganize(
+ page, 1);
+
/* 2. Try to find suitable space from page memory management */
if (!page_zip_available(page_zip, dict_index_is_clust(index),
- rec_size, 1)) {
+ rec_size, 1)
+ || reorg_before_insert) {
+ /* The values can change dynamically. */
+ bool log_compressed = page_zip_log_pages;
+ ulint level = page_zip_level;
+#ifdef UNIV_DEBUG
+ rec_t* cursor_rec = page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
+
+ /* If we are not writing compressed page images, we
+ must reorganize the page before attempting the
+ insert. */
+ if (recv_recovery_is_on()) {
+ /* Insert into the uncompressed page only.
+ The page reorganization or creation that we
+ would attempt outside crash recovery would
+ have been covered by a previous redo log record. */
+ } else if (page_is_empty(page)) {
+ ut_ad(page_cur_is_before_first(cursor));
+
+ /* This is an empty page. Recreate it to
+ get rid of the modification log. */
+ page_create_zip(page_cur_get_block(cursor), index,
+ page_header_get_field(page, PAGE_LEVEL),
+ 0, mtr);
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+ if (page_zip_available(
+ page_zip, dict_index_is_clust(index),
+ rec_size, 1)) {
+ goto use_heap;
+ }
+
+ /* The cursor should remain on the page infimum. */
+ return(NULL);
+ } else if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+ /* The page has been freshly compressed, so
+ reorganizing it will not help. */
+ } else if (log_compressed && !reorg_before_insert) {
+ /* Insert into uncompressed page only, and
+ try page_zip_reorganize() afterwards. */
+ } else if (btr_page_reorganize_low(
+ recv_recovery_is_on(), level,
+ cursor, index, mtr)) {
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+ if (page_zip_available(
+ page_zip, dict_index_is_clust(index),
+ rec_size, 1)) {
+ /* After reorganizing, there is space
+ available. */
+ goto use_heap;
+ }
+ } else {
+ ut_ad(cursor->rec == cursor_rec);
+ return(NULL);
+ }
/* Try compressing the whole page afterwards. */
- insert_rec = page_cur_insert_rec_low(*current_rec,
- index, rec, offsets,
- NULL);
-
- if (UNIV_LIKELY(insert_rec != NULL)) {
- insert_rec = page_cur_insert_rec_zip_reorg(
- current_rec, block, index, insert_rec,
- page, page_zip, mtr);
-#ifdef UNIV_DEBUG
- if (insert_rec) {
- rec_offs_make_valid(
- insert_rec, index, offsets);
+ insert_rec = page_cur_insert_rec_low(
+ cursor->rec, index, rec, offsets, NULL);
+
+ /* If recovery is on, this implies that the compression
+ of the page was successful during runtime. Had that not
+ been the case or had the redo logging of compressed
+ pages been enabled during runtime then we'd have seen
+ a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we
+ know that we don't need to reorganize the page. We,
+ however, do need to recompress the page. That will
+ happen when the next redo record is read which must
+ be of type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and it must
+ contain a valid compression level value.
+ This implies that during recovery from this point till
+ the next redo is applied the uncompressed and
+ compressed versions are not identical and
+ page_zip_validate will fail but that is OK because
+ we call page_zip_validate only after processing
+ all changes to a page under a single mtr during
+ recovery. */
+ if (insert_rec == NULL) {
+ /* Out of space.
+ This should never occur during crash recovery,
+ because the MLOG_COMP_REC_INSERT should only
+ be logged after a successful operation. */
+ ut_ad(!recv_recovery_is_on());
+ } else if (recv_recovery_is_on()) {
+ /* This should be followed by
+ MLOG_ZIP_PAGE_COMPRESS_NO_DATA,
+ which should succeed. */
+ rec_offs_make_valid(insert_rec, index, offsets);
+ } else {
+ ulint pos = page_rec_get_n_recs_before(insert_rec);
+ ut_ad(pos > 0);
+
+ if (!log_compressed) {
+ if (page_zip_compress(
+ page_zip, page, index,
+ level, NULL)) {
+ page_cur_insert_rec_write_log(
+ insert_rec, rec_size,
+ cursor->rec, index, mtr);
+ page_zip_compress_write_log_no_data(
+ level, page, index, mtr);
+
+ rec_offs_make_valid(
+ insert_rec, index, offsets);
+ return(insert_rec);
+ }
+
+ ut_ad(cursor->rec
+ == (pos > 1
+ ? page_rec_get_nth(
+ page, pos - 1)
+ : page + PAGE_NEW_INFIMUM));
+ } else {
+ /* We are writing entire page images
+ to the log. Reduce the redo log volume
+ by reorganizing the page at the same time. */
+ if (page_zip_reorganize(
+ cursor->block, index, mtr)) {
+ /* The page was reorganized:
+ Seek to pos. */
+ if (pos > 1) {
+ cursor->rec = page_rec_get_nth(
+ page, pos - 1);
+ } else {
+ cursor->rec = page
+ + PAGE_NEW_INFIMUM;
+ }
+
+ insert_rec = page + rec_get_next_offs(
+ cursor->rec, TRUE);
+ rec_offs_make_valid(
+ insert_rec, index, offsets);
+ return(insert_rec);
+ }
+
+ /* Theoretically, we could try one
+ last resort of btr_page_reorganize_low()
+ followed by page_zip_available(), but
+ that would be very unlikely to
+ succeed. (If the full reorganized page
+ failed to compress, why would it
+ succeed to compress the page, plus log
+ the insert of this record?) */
}
-#endif /* UNIV_DEBUG */
+
+ /* Out of space: restore the page */
+ btr_blob_dbg_remove(page, index, "insert_zip_fail");
+ if (!page_zip_decompress(page_zip, page, FALSE)) {
+ ut_error; /* Memory corrupted? */
+ }
+ ut_ad(page_validate(page, index));
+ btr_blob_dbg_add(page, index, "insert_zip_fail");
+ insert_rec = NULL;
}
return(insert_rec);
@@ -1306,7 +1409,7 @@ page_cur_insert_rec_zip(
rec_offs_init(foffsets_);
foffsets = rec_get_offsets(free_rec, index, foffsets,
- ULINT_UNDEFINED, &heap);
+ ULINT_UNDEFINED, &heap);
if (rec_offs_size(foffsets) < rec_size) {
too_small:
if (UNIV_LIKELY_NULL(heap)) {
@@ -1414,18 +1517,19 @@ use_heap:
rec_offs_make_valid(insert_rec, index, offsets);
/* 4. Insert the record in the linked list of records */
- ut_ad(*current_rec != insert_rec);
+ ut_ad(cursor->rec != insert_rec);
{
/* next record after current before the insertion */
- rec_t* next_rec = page_rec_get_next(*current_rec);
- ut_ad(rec_get_status(*current_rec)
+ const rec_t* next_rec = page_rec_get_next_low(
+ cursor->rec, TRUE);
+ ut_ad(rec_get_status(cursor->rec)
<= REC_STATUS_INFIMUM);
ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
page_rec_set_next(insert_rec, next_rec);
- page_rec_set_next(*current_rec, insert_rec);
+ page_rec_set_next(cursor->rec, insert_rec);
}
page_header_set_field(page, page_zip, PAGE_N_RECS,
@@ -1439,7 +1543,7 @@ use_heap:
UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
rec_offs_size(offsets));
- page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec);
+ page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec);
/* 6. Update the last insertion info in page header */
@@ -1453,7 +1557,7 @@ use_heap:
PAGE_NO_DIRECTION);
page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
- } else if ((last_insert == *current_rec)
+ } else if ((last_insert == cursor->rec)
&& (page_header_get_field(page, PAGE_DIRECTION)
!= PAGE_LEFT)) {
@@ -1506,7 +1610,7 @@ use_heap:
/* 9. Write log record of the insert */
if (UNIV_LIKELY(mtr != NULL)) {
page_cur_insert_rec_write_log(insert_rec, rec_size,
- *current_rec, index, mtr);
+ cursor->rec, index, mtr);
}
return(insert_rec);
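The IMPORTANT note in the header comment above shifts work to the callers of page_cur_insert_rec_zip(). A minimal sketch of that contract, assuming a page cursor, index, record, offsets and mini-transaction are already set up (illustrative only, not code from this patch):

    rec_t*  ins = page_cur_insert_rec_zip(&cursor, index, rec, offsets, &mtr);

    if (ins && !dict_index_is_clust(index)
        && page_is_leaf(page_cur_get_page(&cursor))) {
            /* Compressed leaf page of a secondary index: the insert
            changed the amount of free space, so fix the insert buffer
            bitmap within this mtr, or right before committing it. */
            ibuf_reset_free_bits(page_cur_get_block(&cursor));
    }

    mtr_commit(&mtr);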
@@ -1600,7 +1704,12 @@ page_parse_copy_rec_list_to_created_page(
#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Copies records from page to a newly created page, from a given record onward,
-including that record. Infimum and supremum records are not copied. */
+including that record. Infimum and supremum records are not copied.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
UNIV_INTERN
void
page_copy_rec_list_end_to_created_page(
@@ -1780,9 +1889,9 @@ UNIV_INLINE
void
page_cur_delete_rec_write_log(
/*==========================*/
- rec_t* rec, /*!< in: record to be deleted */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mini-transaction handle */
+ rec_t* rec, /*!< in: record to be deleted */
+ const dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
{
byte* log_ptr;
@@ -1864,10 +1973,11 @@ UNIV_INTERN
void
page_cur_delete_rec(
/*================*/
- page_cur_t* cursor, /*!< in/out: a page cursor */
- dict_index_t* index, /*!< in: record descriptor */
- const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
- mtr_t* mtr) /*!< in: mini-transaction handle */
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
{
page_dir_slot_t* cur_dir_slot;
page_dir_slot_t* prev_slot;
@@ -1880,8 +1990,6 @@ page_cur_delete_rec(
ulint cur_n_owned;
rec_t* rec;
- ut_ad(cursor && mtr);
-
page = page_cur_get_page(cursor);
page_zip = page_cur_get_page_zip(cursor);
@@ -1896,10 +2004,31 @@ page_cur_delete_rec(
current_rec = cursor->rec;
ut_ad(rec_offs_validate(current_rec, index, offsets));
ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || mtr->inside_ibuf || recv_recovery_is_on());
/* The record must not be the supremum or infimum record. */
ut_ad(page_rec_is_user_rec(current_rec));
+ if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) {
+ /* Empty the page, unless we are applying the redo log
+ during crash recovery. During normal operation, the
+ page_create_empty() gets logged as one of MLOG_PAGE_CREATE,
+ MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */
+ ut_ad(page_is_leaf(page));
+ /* Usually, this should be the root page,
+ and the whole index tree should become empty.
+ However, this could also be a call in
+ btr_cur_pessimistic_update() to delete the only
+ record in the page and to insert another one. */
+ page_cur_move_to_next(cursor);
+ ut_ad(page_cur_is_after_last(cursor));
+ page_create_empty(page_cur_get_block(cursor),
+ const_cast<dict_index_t*>(index), mtr);
+ return;
+ }
+
/* Save to local variables some data associated with current_rec */
cur_slot_no = page_dir_find_owner_slot(current_rec);
ut_ad(cur_slot_no > 0);
@@ -1907,7 +2036,9 @@ page_cur_delete_rec(
cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
/* 0. Write the log record */
- page_cur_delete_rec_write_log(current_rec, index, mtr);
+ if (mtr != 0) {
+ page_cur_delete_rec_write_log(current_rec, index, mtr);
+ }
/* 1. Reset the last insert info in the page header and increment
the modify clock for the frame */
@@ -1915,9 +2046,13 @@ page_cur_delete_rec(
page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
/* The page gets invalid for optimistic searches: increment the
- frame modify clock */
+ frame modify clock only if there is a mini-transaction covering
+ the change. During IMPORT we allocate local blocks that are not
+ part of the buffer pool. */
- buf_block_modify_clock_inc(page_cur_get_block(cursor));
+ if (mtr != 0) {
+ buf_block_modify_clock_inc(page_cur_get_block(cursor));
+ }
/* 2. Find the next and the previous record. Note that the cursor is
left at the next record. */
@@ -1961,14 +2096,15 @@ page_cur_delete_rec(
page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
/* 6. Free the memory occupied by the record */
- btr_blob_dbg_remove_rec(current_rec, index, offsets, "delete");
+ btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index),
+ offsets, "delete");
page_mem_free(page, page_zip, current_rec, index, offsets);
/* 7. Now we have decremented the number of owned records of the slot.
If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
slots. */
- if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) {
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
page_dir_balance_slot(page, page_zip, cur_slot_no);
}
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.cc
index f2ce6c9fe16..2faf804279c 100644
--- a/storage/xtradb/page/page0page.c
+++ b/storage/xtradb/page/page0page.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
-Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -17,7 +18,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
*****************************************************************************/
/**************************************************//**
-@file page/page0page.c
+@file page/page0page.cc
Index page routines
Created 2/2/1994 Heikki Tuuri
@@ -222,7 +223,7 @@ page_set_max_trx_id(
during a database recovery we assume that the max trx id of every
page is the maximum trx id assigned before the crash. */
- if (UNIV_LIKELY_NULL(page_zip)) {
+ if (page_zip) {
mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
page_zip_write_header(page_zip,
page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
@@ -499,7 +500,8 @@ page_create_zip(
page is created */
dict_index_t* index, /*!< in: the index of the page */
ulint level, /*!< in: the B-tree level of the page */
- mtr_t* mtr) /*!< in: mini-transaction handle */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
{
page_t* page;
page_zip_des_t* page_zip = buf_block_get_page_zip(block);
@@ -510,9 +512,11 @@ page_create_zip(
ut_ad(dict_table_is_comp(index->table));
page = page_create_low(block, TRUE);
- mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level);
+ mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level);
+ mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id);
- if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+ if (!page_zip_compress(page_zip, page, index,
+ page_zip_level, mtr)) {
/* The compression of a newly created page
should always succeed. */
ut_error;
@@ -521,9 +525,49 @@ page_create_zip(
return(page);
}
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+UNIV_INTERN
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_id_t max_trx_id = 0;
+ const page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ max_trx_id = page_get_max_trx_id(page);
+ ut_ad(max_trx_id);
+ }
+
+ if (page_zip) {
+ page_create_zip(block, index,
+ page_header_get_field(page, PAGE_LEVEL),
+ max_trx_id, mtr);
+ } else {
+ page_create(block, mtr, page_is_comp(page));
+
+ if (max_trx_id) {
+ page_update_max_trx_id(
+ block, page_zip, max_trx_id, mtr);
+ }
+ }
+}
+
/*************************************************************//**
Differs from page_copy_rec_list_end, because this function does not
-touch the lock table and max trx id on page or compress the page. */
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
UNIV_INTERN
void
page_copy_rec_list_end_no_locks(
@@ -598,6 +642,12 @@ page_copy_rec_list_end_no_locks(
Copies records from page to new_page, from a given record onward,
including that record. Infimum and supremum records are not copied.
The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
@return pointer to the original successor of the infimum record on
new_page, or NULL on zip overflow (new_block will be decompressed) */
UNIV_INTERN
@@ -635,7 +685,7 @@ page_copy_rec_list_end(
/* Here, "ret" may be pointing to a user record or the
predefined supremum record. */
- if (UNIV_LIKELY_NULL(new_page_zip)) {
+ if (new_page_zip) {
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
}
@@ -655,11 +705,11 @@ page_copy_rec_list_end(
page_get_max_trx_id(page), mtr);
}
- if (UNIV_LIKELY_NULL(new_page_zip)) {
+ if (new_page_zip) {
mtr_set_log_mode(mtr, log_mode);
- if (UNIV_UNLIKELY
- (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+ if (!page_zip_compress(new_page_zip, new_page,
+ index, page_zip_level, mtr)) {
/* Before trying to reorganize the page,
store the number of preceding records on the page. */
ulint ret_pos
@@ -671,14 +721,12 @@ page_copy_rec_list_end(
that is smaller than "ret"). */
ut_a(ret_pos > 0);
- if (UNIV_UNLIKELY
- (!page_zip_reorganize(new_block, index, mtr))) {
+ if (!page_zip_reorganize(new_block, index, mtr)) {
btr_blob_dbg_remove(new_page, index,
"copy_end_reorg_fail");
- if (UNIV_UNLIKELY
- (!page_zip_decompress(new_page_zip,
- new_page, FALSE))) {
+ if (!page_zip_decompress(new_page_zip,
+ new_page, FALSE)) {
ut_error;
}
ut_ad(page_validate(new_page, index));
@@ -710,6 +758,12 @@ page_copy_rec_list_end(
Copies records from page to new_page, up to the given record,
NOT including that record. Infimum and supremum records are not copied.
The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
@return pointer to the original predecessor of the supremum record on
new_page, or NULL on zip overflow (new_block will be decompressed) */
UNIV_INTERN
@@ -742,7 +796,7 @@ page_copy_rec_list_start(
return(ret);
}
- if (UNIV_LIKELY_NULL(new_page_zip)) {
+ if (new_page_zip) {
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
}
@@ -778,14 +832,15 @@ page_copy_rec_list_start(
mtr);
}
- if (UNIV_LIKELY_NULL(new_page_zip)) {
+ if (new_page_zip) {
mtr_set_log_mode(mtr, log_mode);
DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
goto zip_reorganize;);
- if (UNIV_UNLIKELY
- (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+ if (!page_zip_compress(new_page_zip, new_page, index,
+ page_zip_level, mtr)) {
+
ulint ret_pos;
#ifndef DBUG_OFF
zip_reorganize:
@@ -949,13 +1004,38 @@ page_delete_rec_list_end(
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- if (page_rec_is_infimum(rec)) {
- rec = page_rec_get_next(rec);
- }
-
if (page_rec_is_supremum(rec)) {
+ ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+ /* Nothing to do, there are no records bigger than the
+ page supremum. */
+ return;
+ }
+ if (recv_recovery_is_on()) {
+ /* If we are replaying a redo log record, we must
+ replay it exactly. Since MySQL 5.6.11, we should be
+ generating a redo log record for page creation if
+ the page would become empty. Thus, this branch should
+ only be executed when applying redo log that was
+ generated by an older version of MySQL. */
+ } else if (page_rec_is_infimum(rec)
+ || n_recs == page_get_n_recs(page)) {
+delete_all:
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
return;
+ } else if (page_is_comp(page)) {
+ if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) {
+ /* We are deleting everything from the first
+ user record onwards. */
+ goto delete_all;
+ }
+ } else {
+ if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) {
+ /* We are deleting everything from the first
+ user record onwards. */
+ goto delete_all;
+ }
}
/* Reset the last insert info in the page header and increment
@@ -972,7 +1052,7 @@ page_delete_rec_list_end(
? MLOG_COMP_LIST_END_DELETE
: MLOG_LIST_END_DELETE, mtr);
- if (UNIV_LIKELY_NULL(page_zip)) {
+ if (page_zip) {
ulint log_mode;
ut_a(page_is_comp(page));
@@ -1134,7 +1214,12 @@ page_delete_rec_list_start(
#endif /* UNIV_ZIP_DEBUG */
if (page_rec_is_infimum(rec)) {
+ return;
+ }
+ if (page_rec_is_supremum(rec)) {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
return;
}
@@ -1172,6 +1257,12 @@ page_delete_rec_list_start(
/*************************************************************//**
Moves record list end to another page. Moved records include
split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
@return TRUE on success; FALSE on compression failure (new_block will
be decompressed) */
UNIV_INTERN
@@ -1227,6 +1318,12 @@ page_move_rec_list_end(
/*************************************************************//**
Moves record list start to another page. Moved records do not include
split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
@return TRUE on success; FALSE on compression failure */
UNIV_INTERN
ibool
@@ -1572,7 +1669,7 @@ page_rec_get_n_recs_before(
n--;
ut_ad(n >= 0);
- ut_ad((ulint)n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
+ ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
return((ulint) n);
}
@@ -2322,12 +2419,26 @@ page_validate(
}
}
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)
+ && !page_is_empty(page)) {
+ trx_id_t max_trx_id = page_get_max_trx_id(page);
+ trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id();
+
+ if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "PAGE_MAX_TRX_ID out of bounds: "
+ TRX_ID_FMT ", " TRX_ID_FMT,
+ max_trx_id, sys_max_trx_id);
+ goto func_exit2;
+ }
+ }
+
heap = mem_heap_create(UNIV_PAGE_SIZE + 200);
/* The following buffer is used to check that the
records in the page record heap do not overlap */
- buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE);
+ buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE));
/* Check first that the record heap and the directory do not
overlap. */
@@ -2337,7 +2448,7 @@ page_validate(
if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
<= page_dir_get_nth_slot(page, n_slots - 1)))) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Record heap and dir overlap"
" on space %lu page %lu index %s, %p, %p\n",
(ulong) page_get_space_id(page),
@@ -2380,7 +2491,7 @@ page_validate(
if (UNIV_UNLIKELY
(1 != cmp_rec_rec(rec, old_rec,
offsets, old_offsets, index))) {
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Records in wrong order"
" on space %lu page %lu index %s\n",
(ulong) page_get_space_id(page),
@@ -2551,7 +2662,7 @@ func_exit:
if (UNIV_UNLIKELY(ret == FALSE)) {
func_exit2:
- fprintf(stderr,
+ fprintf(stderr,
"InnoDB: Apparent corruption"
" in space %lu page %lu index %s\n",
(ulong) page_get_space_id(page),
@@ -2611,3 +2722,60 @@ page_find_rec_with_heap_no(
}
}
#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************//**
+Removes the record from a leaf page. This function does not log
+any changes. It is used by the IMPORT tablespace functions.
+The cursor is moved to the next record after the deleted one.
+@return true if success, i.e., the page did not become too empty */
+UNIV_INTERN
+bool
+page_delete_rec(
+/*============*/
+ const dict_index_t* index, /*!< in: The index that the record
+ belongs to */
+ page_cur_t* pcur, /*!< in/out: page cursor on record
+ to delete */
+ page_zip_des_t* page_zip,/*!< in: compressed page descriptor */
+ const ulint* offsets)/*!< in: offsets for record */
+{
+ bool no_compress_needed;
+ buf_block_t* block = pcur->block;
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(page_is_leaf(page));
+
+ if (!rec_offs_any_extern(offsets)
+ && ((page_get_data_size(page) - rec_offs_size(offsets)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL
+ && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)
+ || (page_get_n_recs(page) < 2))) {
+
+ ulint root_page_no = dict_index_get_page(index);
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ no_compress_needed = page_get_page_no(page) == root_page_no;
+ } else {
+ no_compress_needed = true;
+ }
+
+ if (no_compress_needed) {
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_cur_delete_rec(pcur, index, offsets, 0);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(no_compress_needed);
+}
+
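page_delete_rec() above deliberately leaves the record in place when deleting it would make the page too empty, and reports that through its return value. A hedged sketch of the intended IMPORT-time call pattern (the btr_cur_compress_if_useful() fallback is an assumption, not part of this patch):

    /* Purge one record from a leaf page of an imported tablespace. */
    if (!page_delete_rec(index, &page_cur, page_zip, offsets)) {
        /* The record was kept: removing it would drop the page below
        BTR_CUR_PAGE_COMPRESS_LIMIT, or empty a non-root page. A
        caller holding a proper B-tree cursor would merge the tree
        instead, e.g. via btr_cur_compress_if_useful(). */
    }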
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.cc
index 40d794770ff..81c9e0ab45a 100644
--- a/storage/xtradb/page/page0zip.c
+++ b/storage/xtradb/page/page0zip.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -11,18 +12,21 @@ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
-@file page/page0zip.c
+@file page/page0zip.cc
Compressed page interface
Created June 2005 by Marko Makela
*******************************************************/
+#include <map>
+using namespace std;
+
#define THIS_MODULE
#include "page0zip.h"
#ifdef UNIV_NONINL
@@ -38,20 +42,39 @@ Created June 2005 by Marko Makela
#include "log0recv.h"
#include "zlib.h"
#ifndef UNIV_HOTBACKUP
+# include "buf0buf.h"
# include "buf0lru.h"
# include "btr0sea.h"
# include "dict0boot.h"
# include "lock0lock.h"
+# include "srv0mon.h"
+# include "srv0srv.h"
+# include "ut0crc32.h"
#else /* !UNIV_HOTBACKUP */
+# include "buf0checksum.h"
# define lock_move_reorganize_page(block, temp_block) ((void) 0)
# define buf_LRU_stat_inc_unzip() ((void) 0)
#endif /* !UNIV_HOTBACKUP */
#ifndef UNIV_HOTBACKUP
/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
-UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index;
+/** Mutex protecting page_zip_stat_per_index */
+UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex;
+#ifdef HAVE_PSI_INTERFACE
+UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
#endif /* !UNIV_HOTBACKUP */
+/* Compression level to be used by zlib. Settable by user. */
+UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL;
+
+/* Whether or not to log compressed page images to avoid possible
+compression algorithm changes in zlib. */
+UNIV_INTERN my_bool page_zip_log_pages = true;
+
/* Please refer to ../include/page0zip.ic for a description of the
compressed page format. */
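The two globals above parameterize every page_zip_compress() call in this patch; in 5.6 they back the innodb_compression_level and innodb_log_compressed_pages settings (stated here as background, not shown in this hunk). A minimal sketch of how they flow in, assuming page_zip, page, index and mtr are at hand:

    /* Snapshot the knobs once; both can change at any time. */
    ulint   level = page_zip_level;         /* zlib level, 1..9 */
    bool    log_compressed = page_zip_log_pages;

    if (!page_zip_compress(page_zip, page, index, level, mtr)) {
        /* Compressed image overflowed; callers in this patch fall
        back to page_zip_reorganize() or report failure. */
    } else if (!log_compressed) {
        /* With image logging disabled, callers log only
        MLOG_ZIP_PAGE_COMPRESS_NO_DATA afterwards; see
        page_zip_compress_write_log_no_data() in the page0cur.cc
        hunk above. */
    }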
@@ -381,7 +404,7 @@ page_zip_get_n_prev_extern(
compressed page */
const rec_t* rec, /*!< in: compact physical record
on a B-tree leaf page */
- dict_index_t* index) /*!< in: record descriptor */
+ const dict_index_t* index) /*!< in: record descriptor */
{
const page_t* page = page_align(rec);
ulint n_ext = 0;
@@ -632,15 +655,15 @@ page_zip_dir_encode(
#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
#endif
-#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1
-# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1"
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1
+# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1"
#endif
if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
offs |= PAGE_ZIP_DIR_SLOT_OWNED;
}
info_bits = rec_get_info_bits(rec, TRUE);
- if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) {
+ if (info_bits & REC_INFO_DELETED_FLAG) {
info_bits &= ~REC_INFO_DELETED_FLAG;
offs |= PAGE_ZIP_DIR_SLOT_DEL;
}
@@ -691,6 +714,8 @@ page_zip_dir_encode(
ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
}
+extern "C" {
+
/**********************************************************************//**
Allocate memory for zlib. */
static
@@ -701,7 +726,7 @@ page_zip_zalloc(
uInt items, /*!< in: number of items to allocate */
uInt size) /*!< in: size of an item in bytes */
{
- return(mem_heap_zalloc(opaque, items * size));
+ return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
}
/**********************************************************************//**
@@ -715,6 +740,8 @@ page_zip_free(
{
}
+} /* extern "C" */
+
/**********************************************************************//**
Configure the zlib allocator to use the given memory heap. */
UNIV_INTERN
@@ -724,7 +751,7 @@ page_zip_set_alloc(
void* stream, /*!< in/out: zlib stream */
mem_heap_t* heap) /*!< in: memory heap to use */
{
- z_stream* strm = stream;
+ z_stream* strm = static_cast<z_stream*>(stream);
strm->zalloc = page_zip_zalloc;
strm->zfree = page_zip_free;
@@ -1089,7 +1116,7 @@ page_zip_compress_clust(
/* Check if there are any externally stored columns.
For each externally stored column, store the
BTR_EXTERN_FIELD_REF separately. */
- if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ if (rec_offs_any_extern(offsets)) {
ut_ad(dict_index_is_clust(index));
err = page_zip_compress_clust_ext(
@@ -1173,6 +1200,7 @@ page_zip_compress(
m_start, m_end, m_nonempty */
const page_t* page, /*!< in: uncompressed page */
dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: compression level */
mtr_t* mtr) /*!< in: mini-transaction, or NULL */
{
z_stream c_stream;
@@ -1186,7 +1214,6 @@ page_zip_compress(
const rec_t** recs; /*!< dense page directory, sorted by address */
mem_heap_t* heap;
ulint trx_id_col;
- ulint* offsets = NULL;
ulint n_blobs = 0;
byte* storage;/* storage of uncompressed columns */
#ifndef UNIV_HOTBACKUP
@@ -1195,6 +1222,10 @@ page_zip_compress(
#ifdef PAGE_ZIP_COMPRESS_DBG
FILE* logfile = NULL;
#endif
+ /* A local copy of srv_cmp_per_index_enabled to avoid reading that
+ variable multiple times in this function since it can be changed at
+ any time. */
+ my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled;
if (!page) {
return(FALSE);
@@ -1220,7 +1251,7 @@ page_zip_compress(
ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
supremum_extra_data, sizeof supremum_extra_data));
- if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+ if (page_is_empty(page)) {
ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
== PAGE_NEW_SUPREMUM);
}
@@ -1237,7 +1268,7 @@ page_zip_compress(
if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
fprintf(stderr, "compress %p %p %lu %lu %lu\n",
(void*) page_zip, (void*) page,
- page_is_leaf(page),
+ (ibool) page_is_leaf(page),
n_fields, n_dense);
}
if (UNIV_UNLIKELY(page_zip_compress_log)) {
@@ -1261,6 +1292,11 @@ page_zip_compress(
#endif /* PAGE_ZIP_COMPRESS_DBG */
#ifndef UNIV_HOTBACKUP
page_zip_stat[page_zip->ssize - 1].compressed++;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed++;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
#endif /* !UNIV_HOTBACKUP */
if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
@@ -1269,24 +1305,30 @@ page_zip_compress(
goto err_exit;
}
+ MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
heap = mem_heap_create(page_zip_get_size(page_zip)
- + n_fields * (2 + sizeof *offsets)
+ + n_fields * (2 + sizeof(ulint))
+ + REC_OFFS_HEADER_SIZE
+ n_dense * ((sizeof *recs)
- PAGE_ZIP_DIR_SLOT_SIZE)
+ UNIV_PAGE_SIZE * 4
+ (512 << MAX_MEM_LEVEL));
- recs = mem_heap_zalloc(heap, n_dense * sizeof *recs);
+ recs = static_cast<const rec_t**>(
+ mem_heap_zalloc(heap, n_dense * sizeof *recs));
- fields = mem_heap_alloc(heap, (n_fields + 1) * 2);
+ fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
- buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA);
buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
/* Compress the data payload. */
page_zip_set_alloc(&c_stream, heap);
- err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+ err = deflateInit2(&c_stream, level,
Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT,
MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
ut_a(err == Z_OK);
@@ -1399,8 +1441,19 @@ err_exit:
}
#endif /* PAGE_ZIP_COMPRESS_DBG */
#ifndef UNIV_HOTBACKUP
+ if (page_is_leaf(page)) {
+ dict_index_zip_failure(index);
+ }
+
+ ullint time_diff = ut_time_us(NULL) - usec;
page_zip_stat[page_zip->ssize - 1].compressed_usec
- += ut_time_us(NULL) - usec;
+ += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed_usec
+ += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
#endif /* !UNIV_HOTBACKUP */
return(FALSE);
}
@@ -1460,11 +1513,18 @@ err_exit:
}
#endif /* PAGE_ZIP_COMPRESS_DBG */
#ifndef UNIV_HOTBACKUP
- {
- page_zip_stat_t* zip_stat
- = &page_zip_stat[page_zip->ssize - 1];
- zip_stat->compressed_ok++;
- zip_stat->compressed_usec += ut_time_us(NULL) - usec;
+ ullint time_diff = ut_time_us(NULL) - usec;
+ page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed_ok++;
+ page_zip_stat_per_index[index->id].compressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (page_is_leaf(page)) {
+ dict_index_zip_success(index);
}
#endif /* !UNIV_HOTBACKUP */
@@ -1509,6 +1569,7 @@ page_zip_fields_free(
{
if (index) {
dict_table_t* table = index->table;
+ os_fast_mutex_free(&index->zip_pad.mutex);
mem_heap_free(index->heap);
mutex_free(&(table->autoinc_mutex));
ut_free(table->name);
@@ -1560,7 +1621,7 @@ page_zip_fields_decode(
}
table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
- DICT_TF_COMPACT);
+ DICT_TF_COMPACT, 0);
index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
DICT_HDR_SPACE, 0, n);
index->table = table;
@@ -1752,7 +1813,7 @@ page_zip_set_extra_bytes(
for (i = 0; i < n; i++) {
offs = page_zip_dir_get(page_zip, i);
- if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) {
+ if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
info_bits |= REC_INFO_DELETED_FLAG;
}
if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
@@ -2117,6 +2178,32 @@ page_zip_apply_log(
}
/**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream.
+@return TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ ulint& heap_status) /*!< in/out: heap_no and status bits */
+{
+ if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+ /* n_dense has grown since the page was last compressed. */
+ return(FALSE);
+ }
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ return(TRUE);
+}
+
+/**********************************************************************//**
Decompress the records of a node pointer page.
@return TRUE on success, FALSE on failure */
static
@@ -2152,19 +2239,8 @@ page_zip_decompress_node_ptrs(
- PAGE_ZIP_START - PAGE_DIR);
switch (inflate(d_stream, Z_SYNC_FLUSH)) {
case Z_STREAM_END:
- if (d_stream->next_out
- != rec - REC_N_NEW_EXTRA_BYTES) {
- /* n_dense has grown since the page
- was last compressed. */
- } else {
- /* Skip the REC_N_NEW_EXTRA_BYTES. */
- d_stream->next_out = rec;
-
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO,
- heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
- }
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
goto zlib_done;
case Z_OK:
case Z_BUF_ERROR:
@@ -2179,12 +2255,10 @@ page_zip_decompress_node_ptrs(
goto zlib_error;
}
- ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
- /* Prepare to decompress the data bytes. */
- d_stream->next_out = rec;
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
/* Read the offsets. The status bits are needed here. */
offsets = rec_get_offsets(rec, index, offsets,
@@ -2352,19 +2426,8 @@ page_zip_decompress_sec(
if (UNIV_LIKELY(d_stream->avail_out)) {
switch (inflate(d_stream, Z_SYNC_FLUSH)) {
case Z_STREAM_END:
- if (d_stream->next_out
- != rec - REC_N_NEW_EXTRA_BYTES) {
- /* n_dense has grown since the page
- was last compressed. */
- } else {
- /* Skip the REC_N_NEW_EXTRA_BYTES. */
- d_stream->next_out = rec;
-
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO,
- heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
- }
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
goto zlib_done;
case Z_OK:
case Z_BUF_ERROR:
@@ -2380,15 +2443,10 @@ page_zip_decompress_sec(
}
}
- ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
-
- /* Skip the REC_N_NEW_EXTRA_BYTES. */
-
- d_stream->next_out = rec;
-
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
}
/* Decompress the data of the last record and any trailing garbage,
@@ -2622,19 +2680,8 @@ page_zip_decompress_clust(
err = inflate(d_stream, Z_SYNC_FLUSH);
switch (err) {
case Z_STREAM_END:
- if (d_stream->next_out
- != rec - REC_N_NEW_EXTRA_BYTES) {
- /* n_dense has grown since the page
- was last compressed. */
- } else {
- /* Skip the REC_N_NEW_EXTRA_BYTES. */
- d_stream->next_out = rec;
-
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO,
- heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
- }
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
goto zlib_done;
case Z_OK:
case Z_BUF_ERROR:
@@ -2649,12 +2696,10 @@ page_zip_decompress_clust(
goto zlib_error;
}
- ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
- /* Prepare to decompress the data bytes. */
- d_stream->next_out = rec;
- /* Set heap_no and the status bits. */
- mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
- heap_status += 1 << REC_HEAP_NO_SHIFT;
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
/* Read the offsets. The status bits are needed here. */
offsets = rec_get_offsets(rec, index, offsets,
@@ -2666,7 +2711,7 @@ page_zip_decompress_clust(
For each externally stored column, restore the
BTR_EXTERN_FIELD_REF separately. */
- if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ if (rec_offs_any_extern(offsets)) {
if (UNIV_UNLIKELY
(!page_zip_decompress_clust_ext(
d_stream, rec, offsets, trx_id_col))) {
@@ -2931,7 +2976,9 @@ page_zip_decompress(
}
heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
- recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs));
+
+ recs = static_cast<rec_t**>(
+ mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)));
if (all) {
/* Copy the page header. */
@@ -2975,7 +3022,7 @@ zlib_error:
/* Copy the infimum and supremum records. */
memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
infimum_extra, sizeof infimum_extra);
- if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+ if (page_is_empty(page)) {
rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
PAGE_NEW_SUPREMUM);
} else {
@@ -3033,7 +3080,10 @@ zlib_error:
/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
- offsets = mem_heap_alloc(heap, n * sizeof(ulint));
+
+ offsets = static_cast<ulint*>(
+ mem_heap_alloc(heap, n * sizeof(ulint)));
+
*offsets = n;
}
@@ -3093,17 +3143,25 @@ err_exit:
page_zip_fields_free(index);
mem_heap_free(heap);
#ifndef UNIV_HOTBACKUP
- {
- page_zip_stat_t* zip_stat
- = &page_zip_stat[page_zip->ssize - 1];
- zip_stat->decompressed++;
- zip_stat->decompressed_usec += ut_time_us(NULL) - usec;
+ ullint time_diff = ut_time_us(NULL) - usec;
+ page_zip_stat[page_zip->ssize - 1].decompressed++;
+ page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (srv_cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index_id].decompressed++;
+ page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
}
#endif /* !UNIV_HOTBACKUP */
/* Update the stat counter for LRU policy. */
buf_LRU_stat_inc_unzip();
+ MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
return(TRUE);
}
@@ -3118,7 +3176,7 @@ page_zip_hexdump_func(
const void* buf, /*!< in: data */
ulint size) /*!< in: length of the data, in bytes */
{
- const byte* s = buf;
+ const byte* s = static_cast<const byte*>(buf);
ulint addr;
const ulint width = 32; /* bytes per line */
@@ -3185,15 +3243,15 @@ page_zip_validate_low(
/* page_zip_decompress() expects the uncompressed page to be
UNIV_PAGE_SIZE aligned. */
- temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
- temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE);
+ temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+ temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE));
#ifdef UNIV_DEBUG_VALGRIND
/* Get detailed information on the valid bits in case the
UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[],
page_zip->data[] or page_zip could be viewed at temp_page[] or
temp_page_zip in a debugger when running valgrind --db-attach. */
- VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
+ (void) VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
# if UNIV_WORD_SIZE == 4
VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip);
@@ -3202,8 +3260,8 @@ page_zip_validate_low(
pad bytes. */
UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip);
# endif
- VALGRIND_GET_VBITS(page_zip->data, temp_page,
- page_zip_get_size(page_zip));
+ (void) VALGRIND_GET_VBITS(page_zip->data, temp_page,
+ page_zip_get_size(page_zip));
UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
#endif /* UNIV_DEBUG_VALGRIND */
@@ -4005,6 +4063,7 @@ page_zip_write_trx_id_and_roll_ptr(
ulint len;
ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+
ut_ad(page_simple_validate_new(page));
ut_ad(page_zip_simple_validate(page_zip));
ut_ad(page_zip_get_size(page_zip)
@@ -4057,10 +4116,10 @@ static
void
page_zip_clear_rec(
/*===============*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page */
- byte* rec, /*!< in: record to clear */
- dict_index_t* index, /*!< in: index of rec */
- const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */
{
ulint heap_no;
page_t* page = page_align(rec);
@@ -4271,11 +4330,12 @@ UNIV_INTERN
void
page_zip_dir_delete(
/*================*/
- page_zip_des_t* page_zip,/*!< in/out: compressed page */
- byte* rec, /*!< in: record to delete */
- dict_index_t* index, /*!< in: index of rec */
- const ulint* offsets,/*!< in: rec_get_offsets(rec) */
- const byte* free) /*!< in: previous start of the free list */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of
+ the free list */
{
byte* slot_rec;
byte* slot_free;
@@ -4389,7 +4449,7 @@ page_zip_dir_add_slot(
if (!page_is_leaf(page_zip->data)) {
ut_ad(!page_zip->n_blobs);
stored = dir - n_dense * REC_NODE_PTR_SIZE;
- } else if (UNIV_UNLIKELY(is_clustered)) {
+ } else if (is_clustered) {
/* Move the BLOB pointer array backwards to make space for the
roll_ptr and trx_id columns and the dense directory slot. */
byte* externs;
@@ -4591,7 +4651,7 @@ page_zip_reorganize(
/* Restore logging. */
mtr_set_log_mode(mtr, log_mode);
- if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+ if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
#ifndef UNIV_HOTBACKUP
buf_block_free(temp_block);
@@ -4771,21 +4831,113 @@ ulint
page_zip_calc_checksum(
/*===================*/
const void* data, /*!< in: compressed page */
- ulint size) /*!< in: size of compressed page */
+ ulint size, /*!< in: size of compressed page */
+ srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
{
+ uLong adler;
+ ib_uint32_t crc32;
+ const Bytef* s = static_cast<const byte*>(data);
+
/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
- const Bytef* s = data;
- uLong adler;
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ crc32 = ut_crc32(s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+ ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return((ulint) crc32);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return((ulint) adler);
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(BUF_NO_CHECKSUM_MAGIC);
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
+
+ ut_error;
+ return(0);
+}
+
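Both branches above checksum the same three byte ranges, skipping FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN and FIL_PAGE_FILE_FLUSH_LSN. A self-contained sketch of the INNODB variant against plain zlib (the field offsets are the stock fil0fil.h values, restated here as an assumption; the CRC32 branch uses ut_crc32(), InnoDB's CRC-32C, which plain zlib crc32() would not reproduce):

    #include <zlib.h>
    #include <stddef.h>

    /* Stock InnoDB page header offsets (fil0fil.h). */
    enum {
        FIL_PAGE_OFFSET                  = 4,
        FIL_PAGE_LSN                     = 16,
        FIL_PAGE_TYPE                    = 24,
        FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID = 34
    };

    /* INNODB-style checksum of a size-byte compressed page image. */
    unsigned long
    zip_checksum_innodb(const unsigned char* s, size_t size)
    {
        uLong adler = adler32(0L, s + FIL_PAGE_OFFSET,
                              FIL_PAGE_LSN - FIL_PAGE_OFFSET);
        adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
        adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
                        (uInt) (size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
        return((unsigned long) adler);
    }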
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+{
+ ib_uint32_t stored;
+ ib_uint32_t calc;
+ ib_uint32_t crc32 = 0 /* silence bogus warning */;
+ ib_uint32_t innodb = 0 /* silence bogus warning */;
- ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ stored = mach_read_from_4(
+ (const unsigned char*) data + FIL_PAGE_SPACE_OR_CHKSUM);
- adler = adler32(0L, s + FIL_PAGE_OFFSET,
- FIL_PAGE_LSN - FIL_PAGE_OFFSET);
- adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
- adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
- size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ /* declare empty pages non-corrupted */
+ if (stored == 0) {
+ /* make sure that the page is really empty */
+ ut_d(ulint i; for (i = 0; i < size; i++) {
+ ut_a(*((const char*) data + i) == 0); });
+
+ return(TRUE);
+ }
+
+ calc = page_zip_calc_checksum(
+ data, size, static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm));
+
+ if (stored == calc) {
+ return(TRUE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(stored == calc);
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = calc;
+ innodb = page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
+ innodb = calc;
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ return(TRUE);
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
- return((ulint) adler);
+ return(stored == crc32 || stored == innodb);
}