summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
Diffstat (limited to 'storage')
-rw-r--r--storage/innobase/CMakeLists.txt4
-rw-r--r--storage/innobase/btr/btr0btr.cc78
-rw-r--r--storage/innobase/btr/btr0defragment.cc814
-rw-r--r--storage/innobase/dict/dict0dict.cc14
-rw-r--r--storage/innobase/dict/dict0stats.cc225
-rw-r--r--storage/innobase/dict/dict0stats_bg.cc203
-rw-r--r--storage/innobase/handler/ha_innodb.cc184
-rw-r--r--storage/innobase/handler/ha_innodb.h5
-rw-r--r--storage/innobase/include/btr0btr.h97
-rw-r--r--storage/innobase/include/btr0btr.ic3
-rw-r--r--storage/innobase/include/btr0defragment.h101
-rw-r--r--storage/innobase/include/dict0dict.h14
-rw-r--r--storage/innobase/include/dict0mem.h21
-rw-r--r--storage/innobase/include/dict0priv.h3
-rw-r--r--storage/innobase/include/dict0priv.ic5
-rw-r--r--storage/innobase/include/dict0stats.h33
-rw-r--r--storage/innobase/include/dict0stats_bg.h22
-rw-r--r--storage/innobase/include/lock0lock.h10
-rw-r--r--storage/innobase/include/srv0srv.h16
-rw-r--r--storage/innobase/include/sync0sync.h1
-rw-r--r--storage/innobase/include/ut0timer.h104
-rw-r--r--storage/innobase/include/ut0timer.ic113
-rw-r--r--storage/innobase/lock/lock0lock.cc41
-rw-r--r--storage/innobase/page/page0cur.cc15
-rw-r--r--storage/innobase/row/row0mysql.cc15
-rw-r--r--storage/innobase/srv/srv0srv.cc15
-rw-r--r--storage/innobase/srv/srv0start.cc9
-rw-r--r--storage/innobase/sync/sync0sync.cc1
-rw-r--r--storage/innobase/ut/ut0timer.cc92
-rw-r--r--storage/xtradb/CMakeLists.txt4
-rw-r--r--storage/xtradb/btr/btr0btr.cc84
-rw-r--r--storage/xtradb/btr/btr0defragment.cc815
-rw-r--r--storage/xtradb/dict/dict0dict.cc14
-rw-r--r--storage/xtradb/dict/dict0stats.cc225
-rw-r--r--storage/xtradb/dict/dict0stats_bg.cc193
-rw-r--r--storage/xtradb/handler/ha_innodb.cc185
-rw-r--r--storage/xtradb/handler/ha_innodb.h7
-rw-r--r--storage/xtradb/include/btr0btr.h97
-rw-r--r--storage/xtradb/include/btr0btr.ic9
-rw-r--r--storage/xtradb/include/btr0defragment.h100
-rw-r--r--storage/xtradb/include/dict0dict.h14
-rw-r--r--storage/xtradb/include/dict0mem.h21
-rw-r--r--storage/xtradb/include/dict0priv.h3
-rw-r--r--storage/xtradb/include/dict0priv.ic5
-rw-r--r--storage/xtradb/include/dict0stats.h33
-rw-r--r--storage/xtradb/include/dict0stats_bg.h22
-rw-r--r--storage/xtradb/include/lock0lock.h10
-rw-r--r--storage/xtradb/include/srv0srv.h12
-rw-r--r--storage/xtradb/include/sync0sync.h1
-rw-r--r--storage/xtradb/include/ut0timer.h104
-rw-r--r--storage/xtradb/include/ut0timer.ic113
-rw-r--r--storage/xtradb/lock/lock0lock.cc41
-rw-r--r--storage/xtradb/page/page0cur.cc15
-rw-r--r--storage/xtradb/row/row0mysql.cc3
-rw-r--r--storage/xtradb/srv/srv0srv.cc18
-rw-r--r--storage/xtradb/srv/srv0start.cc8
-rw-r--r--storage/xtradb/sync/sync0sync.cc1
-rw-r--r--storage/xtradb/ut/ut0timer.cc92
58 files changed, 4339 insertions, 168 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index 622fff87536..e783f3e6459 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -285,6 +285,7 @@ SET(INNOBASE_SOURCES
btr/btr0cur.cc
btr/btr0pcur.cc
btr/btr0sea.cc
+ btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@@ -395,7 +396,8 @@ SET(INNOBASE_SOURCES
ut/ut0rnd.cc
ut/ut0ut.cc
ut/ut0vec.cc
- ut/ut0wqueue.cc)
+ ut/ut0wqueue.cc
+ ut/ut0timer.cc)
IF(WITH_INNODB)
# Legacy option
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 104c2f00ef6..4f9ccbe061a 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
+#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
@@ -1193,6 +1194,32 @@ btr_get_size(
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
+ ulint used;
+ if (flag == BTR_N_LEAF_PAGES) {
+ btr_get_size_and_reserved(index, flag, &used, mtr);
+ return used;
+ } else if (flag == BTR_TOTAL_SIZE) {
+ return btr_get_size_and_reserved(index, flag, &used, mtr);
+ } else {
+ ut_error;
+ }
+ return (ULINT_UNDEFINED);
+}
+
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
fseg_header_t* seg_header;
page_t* root;
ulint n;
@@ -1201,6 +1228,8 @@ btr_get_size(
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
+ ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
+
if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
|| *index->name == TEMP_INDEX_PREFIX) {
return(ULINT_UNDEFINED);
@@ -1208,21 +1237,16 @@ btr_get_size(
root = btr_root_get(index, mtr);
- if (flag == BTR_N_LEAF_PAGES) {
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
- fseg_n_reserved_pages(seg_header, &n, mtr);
+ n = fseg_n_reserved_pages(seg_header, used, mtr);
- } else if (flag == BTR_TOTAL_SIZE) {
+ if (flag == BTR_TOTAL_SIZE) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
- n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
-
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
- } else {
- ut_error;
+ *used += dummy;
+
}
return(n);
@@ -1971,7 +1995,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
-static __attribute__((nonnull))
+UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
@@ -2923,6 +2947,12 @@ func_start:
new_page_zip = buf_block_get_page_zip(new_block);
btr_page_create(new_block, new_page_zip, cursor->index,
btr_page_get_level(page, mtr), mtr);
+ /* Only record the leaf level page splits. */
+ if (btr_page_get_level(page, mtr) == 0) {
+ cursor->index->stat_defrag_n_page_split ++;
+ cursor->index->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index);
+ }
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@@ -3181,31 +3211,9 @@ func_exit:
return(rec);
}
-#ifdef UNIV_SYNC_DEBUG
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,index,mtr)
-#else /* UNIV_SYNC_DEBUG */
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,mtr)
-#endif /* UNIV_SYNC_DEBUG */
-
/*************************************************************//**
Removes a page from the level list of pages. */
-static __attribute__((nonnull))
+UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
@@ -3377,7 +3385,7 @@ btr_node_ptr_delete(
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-static
+UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000000..e315a291359
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,814 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "ut0timer.h"
+
+#include <list>
+
+/**************************************************//**
+Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+// #pragma once
+namespace std
+{
+ // based on SC22/WG21/N2431 = J16/07-0301
+ struct nullptr_t
+ {
+ template<typename any> operator any * () const
+ {
+ return 0;
+ }
+ template<class any, typename T> operator T any:: * () const
+ {
+ return 0;
+ }
+
+#ifdef _MSC_VER
+ struct pad {};
+ pad __[sizeof(void*)/sizeof(pad)];
+#else
+ char __[sizeof(void*)];
+#endif
+private:
+ // nullptr_t();// {}
+ // nullptr_t(const nullptr_t&);
+ // void operator = (const nullptr_t&);
+ void operator &() const;
+ template<typename any> void operator +(any) const
+ {
+ /*I Love MSVC 2005!*/
+ }
+ template<typename any> void operator -(any) const
+ {
+ /*I Love MSVC 2005!*/
+ }
+ };
+static const nullptr_t __nullptr = {};
+}
+
+#ifndef nullptr
+#define nullptr std::__nullptr
+#endif
+/**************************************************//**
+End of Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when compression failure happens
+during defragmentation. 512 is chosen because it's a power of 2 and it is about
+3% of the page size. When there are compression failures in defragmentation,
+our goal is to get a decent defrag ratio with as few compression failure as
+possible. From experimentation it seems that reduce the target size by 512 every
+time will make sure the page is compressible within a couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+ib_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+ulint btr_defragment_compression_failures = 0;
+/* Number of btr_defragment_n_pages calls that altered page but didn't
+manage to release any page. */
+ulint btr_defragment_failures = 0;
+/* Total number of btr_defragment_n_pages calls that altered page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+ulint btr_defragment_count = 0;
+
+/******************************************************************//**
+Constructor for btr_defragment_item_t. */
+btr_defragment_item_t::btr_defragment_item_t(
+ btr_pcur_t* pcur,
+ os_event_t event)
+{
+ this->pcur = pcur;
+ this->event = event;
+ this->removed = false;
+ this->last_processed = 0;
+}
+
+/******************************************************************//**
+Destructor for btr_defragment_item_t. */
+btr_defragment_item_t::~btr_defragment_item_t() {
+ if (this->pcur) {
+ btr_pcur_free_for_mysql(this->pcur);
+ }
+ if (this->event) {
+ os_event_set(this->event);
+ }
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+ mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex,
+ SYNC_ANY_LATCH);
+ os_thread_create(btr_defragment_thread, NULL, NULL);
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ delete item;
+ }
+ mutex_exit(&btr_defragment_mutex);
+ mutex_free(&btr_defragment_mutex);
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index) /*!< Index to find. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ mutex_exit(&btr_defragment_mutex);
+ return true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+ return false;
+}
+
+/******************************************************************//**
+Query thread uses this function to add an index to btr_defragment_wq.
+Return a pointer to os_event for the query thread to wait on if this is a
+synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async) /*!< whether this is an async defragmentation */
+{
+ mtr_t mtr;
+ ulint space = dict_index_get_space(index);
+ ulint zip_size = dict_table_zip_size(index->table);
+ ulint page_no = dict_index_get_page(index);
+ mtr_start(&mtr);
+ // Load index root page.
+ page_t* page = btr_page_get(space, zip_size, page_no,
+ RW_NO_LATCH, index, &mtr);
+ if (btr_page_get_level(page, &mtr) == 0) {
+ // Index root is a leaf page, no need to defragment.
+ mtr_commit(&mtr);
+ return NULL;
+ }
+ btr_pcur_t* pcur = btr_pcur_create_for_mysql();
+ os_event_t event = NULL;
+ if (!async) {
+ event = os_event_create();
+ }
+ btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
+ true, 0, &mtr);
+ btr_pcur_move_to_next(pcur, &mtr);
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ dict_stats_empty_defrag_summary(index);
+ btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
+ mutex_enter(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(item);
+ mutex_exit(&btr_defragment_mutex);
+ return event;
+}
+
+/******************************************************************//**
+When table is dropped, this function is called to mark a table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is this will not NULL the event. */
+void
+btr_defragment_remove_table(
+ dict_table_t* table) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (table->id == idx->table->id) {
+ item->removed = true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Query thread uses this function to mark an index as removed in
+btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ item->removed = true;
+ item->event = NULL;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Functions used by defragmentation thread: btr_defragment_xxx_item.
+Defragmentation thread operates on the work *item*. It gets/removes
+item from the work queue. */
+/******************************************************************//**
+Defragment thread uses this to remove an item from btr_defragment_wq.
+When an item is removed from the work queue, all resources associated with it
+are freed as well. */
+void
+btr_defragment_remove_item(
+ btr_defragment_item_t* item) /*!< Item to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ if (item == *iter) {
+ btr_defragment_wq.erase(iter);
+ delete item;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Defragment thread uses this to get an item from btr_defragment_wq to work on.
+The item is not removed from the work queue so query threads can still access
+this item. We keep it this way so query threads can find and kill a
+defragmentation even if that index is being worked on. Be aware that while you
+work on this item you have no lock protection on it whatsoever. This is OK as
+long as the query threads and defragment thread won't modify the same fields
+without lock protection.
+*/
+btr_defragment_item_t*
+btr_defragment_get_item()
+{
+ if (btr_defragment_wq.empty()) {
+ return nullptr;
+ }
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ if (iter == btr_defragment_wq.end()) {
+ iter = btr_defragment_wq.begin();
+ }
+ btr_defragment_item_t* item = *iter;
+ iter++;
+ mutex_exit(&btr_defragment_mutex);
+ return item;
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently we save the stats to persistent storage every 100 updates. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index) /*!< in: index */
+{
+ if (srv_defragment_stats_accuracy != 0 // stats tracking disabled
+ && dict_index_get_space(index) != 0 // do not track system tables
+ && index->stat_defrag_modified_counter
+ >= srv_defragment_stats_accuracy) {
+ dict_stats_defrag_pool_add(index);
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
+Main defragment functionalities used by defragment thread.*/
+/*************************************************************//**
+Calculate number of records from beginning of block that can
+fit into size_limit
+@return number of records */
+UNIV_INTERN
+ulint
+btr_defragment_calc_n_recs_for_size(
+ buf_block_t* block, /*!< in: B-tree page */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint size_limit, /*!< in: size limit to fit records in */
+ ulint* n_recs_size) /*!< out: actual size of the records that fit
+ in size_limit. */
+{
+ page_t* page = buf_block_get_frame(block);
+ ulint n_recs = 0;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ ulint size = 0;
+ page_cur_t cur;
+
+ page_cur_set_before_first(block, &cur);
+ page_cur_move_to_next(&cur);
+ while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
+ rec_t* cur_rec = page_cur_get_rec(&cur);
+ offsets = rec_get_offsets(cur_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ulint rec_size = rec_offs_size(offsets);
+ size += rec_size;
+ if (size > size_limit) {
+ size = size - rec_size;
+ break;
+ }
+ n_recs ++;
+ page_cur_move_to_next(&cur);
+ }
+ *n_recs_size = size;
+ return n_recs;
+}
+
+/*************************************************************//**
+Merge as many records from the from_block to the to_block. Delete
+the from_block if all records are successfully merged to to_block.
+@return the to_block to target for next merge operation. */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: zip size of the block */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint space = dict_index_get_space(index);
+ ulint level = btr_page_get_level(from_page, mtr);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0;
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
+ ulint page_diff = UNIV_PAGE_SIZE - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ if (!btr_page_reorganize_block(false, page_zip_level,
+ to_block, index,
+ mtr)) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+ // If reorganization fails, that means page is
+ // not compressable. There's no point to try
+ // merging into this page. Continue to the
+ // next page.
+ return from_block;
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ ut_a(max_ins_size >= move_size);
+ }
+
+ // Move records to pack to_page more full.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ while (n_recs_to_move > 0) {
+ rec = page_rec_get_nth(from_page,
+ n_recs_to_move + 1);
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr);
+ if (orig_pred)
+ break;
+ // If we reach here, that means compression failed after packing
+ // n_recs_to_move number of records to to_page. We try to reduce
+ // the targeted data size on the to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ os_atomic_increment_ulint(
+ &btr_defragment_compression_failures, 1);
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+ // If less than target_n_recs are moved, it means there are
+ // compression failures during page_copy_rec_list_start. Adjust
+ // the max_data_size estimation to reduce compression failures
+ // in the following runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ UNIV_PAGE_SIZE,
+ ULINT_UNDEFINED);
+ }
+ }
+ if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(to_block, orig_pred,
+ from_block);
+ btr_search_drop_page_hash_index(from_block);
+ btr_level_list_remove(space, zip_size, from_page,
+ index, mtr);
+ btr_node_ptr_delete(index, from_block, mtr);
+ btr_blob_dbg_remove(from_page, index,
+ "btr_defragment_n_pages");
+ btr_page_free(index, from_block, mtr);
+ } else {
+ // There are still records left on the page, so
+ // increment n_defragmented. Node pointer will be changed
+ // so remove the old node pointer.
+ if (n_recs_to_move > 0) {
+ // Part of the page is merged to left, remove
+ // the merged records, update record locks and
+ // node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ btr_node_ptr_delete(index, from_block, mtr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page));
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level + 1);
+ btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr);
+ }
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge N consecutive pages, starting from the page pointed by the
+cursor. Skip space 0. Only consider leaf pages.
+This function first loads all N pages into memory, then for each of
+the pages other than the first page, it tries to move as many records
+as possible to the left sibling to keep the left sibling full. During
+the process, if any page becomes empty, that page will be removed from
+the level list. Record locks, hash, and node pointers are updated after
+page reorganization.
+@return pointer to the last block processed, or NULL if reaching end of index */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint space;
+ ulint zip_size;
+ /* We will need to load the n+1 block because if the last page is freed
+ we need to modify the prev_page_no of that block. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint level;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ space = dict_index_get_space(index);
+ if (space == 0) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ zip_size = dict_table_zip_size(index->table);
+ first_page = buf_block_get_frame(block);
+ level = btr_page_get_level(first_page, mtr);
+
+ if (level != 0) {
+ return NULL;
+ }
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ ulint page_no = btr_page_get_next(page, mtr);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i;
+ end_of_index = TRUE;
+ break;
+ }
+ blocks[i] = btr_block_get(space, zip_size, page_no,
+ RW_X_LATCH, index, mtr);
+ }
+
+ if (n_pages == 1) {
+ if (btr_page_get_prev(first_page, mtr) == FIL_NULL) {
+ /* last page in the index */
+ if (dict_index_get_page(index)
+ == page_get_page_no(first_page))
+ return NULL;
+ /* given page is the last page.
+ Lift the records to father. */
+ btr_lift_page_up(index, block, mtr);
+ }
+ return NULL;
+ }
+
+ /* 2. Calculate how many pages data can fit in. If not compressible,
+ return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+ // For uncompressed pages, the optimal data size is the free space of an
+ // empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ int i = 0;
+ // We estimate the optimal data size of the index use samples of
+ // data size. These samples are taken when pages failed to
+ // compress due to insertion on the page. We use the average
+ // of all samples we have as the estimation. Different pages of
+ // the same index vary in compressibility. Average gives a good
+ // enough estimation.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size = size / i;
+ optimal_page_size = min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
+ reserved_space = min((ulint)(optimal_page_size
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
+ n_new_slots = (total_data_size + optimal_page_size - 1)
+ / optimal_page_size;
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) {
+ n_defragmented ++;
+ current_block = new_block;
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++;
+ os_atomic_increment_ulint(
+ &btr_defragment_count, 1);
+ if (n_pages == n_defragmented) {
+ os_atomic_increment_ulint(
+ &btr_defragment_failures, 1);
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
+
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg) /*!< in: work queue */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* cursor;
+ dict_index_t* index;
+ mtr_t mtr;
+ buf_block_t* first_block;
+ buf_block_t* last_block;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ /* If defragmentation is disabled, sleep before
+ checking whether it's enabled. */
+ if (!srv_defragment) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+ /* The following call won't remove the item from work queue.
+ We only get a pointer to it to work on. This will make sure
+ when a user issues a kill command, all indices are in the work
+ queue to be searched. This also means that the user thread
+ cannot directly remove the item from queue (since we might be
+ using it). So user thread only marks index as removed. */
+ btr_defragment_item_t* item = btr_defragment_get_item();
+ /* If work queue is empty, sleep and check later. */
+ if (!item) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+ /* If an index is marked as removed, we remove it from the work
+ queue. No other thread could be using this item at this point so
+ it's safe to remove now. */
+ if (item->removed) {
+ btr_defragment_remove_item(item);
+ continue;
+ }
+
+ pcur = item->pcur;
+ ulonglong now = ut_timer_now();
+ ulonglong elapsed = now - item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+ /* If we see an index again before the interval
+ determined by the configured frequency is reached,
+ we just sleep until the interval passes. Since
+ defragmentation of all indices queue up on a single
+ thread, it's likely other indices that follow this one
+ don't need to sleep again. */
+ os_thread_sleep(((ulint)ut_timer_to_microseconds(
+ srv_defragment_interval - elapsed)));
+ }
+
+ now = ut_timer_now();
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
+ cursor = btr_pcur_get_btr_cur(pcur);
+ index = btr_cur_get_index(cursor);
+ first_block = btr_cur_get_block(cursor);
+ last_block = btr_defragment_n_pages(first_block, index,
+ srv_defragment_n_pages,
+ &mtr);
+ if (last_block) {
+ /* If we haven't reached the end of the index,
+ place the cursor on the last record of last page,
+ store the cursor position, and put back in queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ ut_a(page_rec_is_user_rec(rec));
+ page_cur_position(rec, last_block,
+ btr_cur_get_page_cur(cursor));
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ item->last_processed = now;
+ } else {
+ mtr_commit(&mtr);
+ /* Reaching the end of the index. */
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_save_defrag_stats(index);
+ dict_stats_save_defrag_summary(index);
+ btr_defragment_remove_item(item);
+ }
+ }
+ btr_defragment_shutdown();
+ os_thread_exit(NULL);
+ OS_THREAD_DUMMY_RETURN;
+}
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 0c83089478a..52ba11fc9ab 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -408,7 +408,7 @@ dict_table_try_drop_aborted(
if (table == NULL) {
table = dict_table_open_on_id_low(
- table_id, DICT_ERR_IGNORE_NONE);
+ table_id, DICT_ERR_IGNORE_NONE, FALSE);
} else {
ut_ad(table->id == table_id);
}
@@ -795,7 +795,8 @@ dict_table_open_on_id(
table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
- : DICT_ERR_IGNORE_NONE);
+ : DICT_ERR_IGNORE_NONE,
+ table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
if (table != NULL) {
@@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru(
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
-static
+UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
@@ -2408,6 +2409,13 @@ undo_size_ok:
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
+ new_index->stat_defrag_n_pages_freed = 0;
+ new_index->stat_defrag_n_page_split = 0;
+
+ new_index->stat_defrag_sample_next_slot = 0;
+ memset(&new_index->stat_defrag_data_size_sample,
+ 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
+
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index 928bdb3f2ef..bec0079942b 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -492,6 +492,9 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@@ -520,7 +523,9 @@ static
void
dict_stats_empty_index(
/*===================*/
- dict_index_t* index) /*!< in/out: index */
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_univ(index));
@@ -535,6 +540,34 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_n_pages_freed = 0;
+}
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_modified_counter = 0;
+ index->stat_defrag_n_page_split = 0;
}
/*********************************************************************//**
@@ -544,7 +577,9 @@ static
void
dict_stats_empty_table(
/*===================*/
- dict_table_t* table) /*!< in/out: table */
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
/* Zero the stats members */
@@ -569,7 +604,7 @@ dict_stats_empty_table(
ut_ad(!dict_index_is_univ(index));
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, empty_defrag_stats);
}
table->stat_initialized = TRUE;
@@ -704,7 +739,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
continue;
}
@@ -715,7 +750,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
} else {
n_copy_el = dst_idx->n_uniq;
}
@@ -735,6 +770,13 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+
+ dst_idx->stat_defrag_modified_counter =
+ src_idx->stat_defrag_modified_counter;
+ dst_idx->stat_defrag_n_pages_freed =
+ src_idx->stat_defrag_n_pages_freed;
+ dst_idx->stat_defrag_n_page_split =
+ src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
+dict_index_t::stat_defrag_modified_counter
+dict_index_t::stat_defrag_n_pages_freed
+dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@return incomplete table object */
@@ -807,7 +852,9 @@ dict_stats_snapshot_free(
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
-before the Persistent Statistics feature was introduced. */
+before the Persistent Statistics feature was introduced.
+This function doesn't update the defragmentation related stats.
+Only persistent statistics supports defragmentation stats. */
static
void
dict_stats_update_transient_for_index(
@@ -823,10 +870,10 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else {
mtr_t mtr;
@@ -847,7 +894,7 @@ dict_stats_update_transient_for_index(
switch (size) {
case ULINT_UNDEFINED:
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
return;
case 0:
/* The root node of the tree is a leaf */
@@ -882,7 +929,7 @@ dict_stats_update_transient(
if (dict_table_is_discarded(table)) {
/* Nothing to do. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
} else if (index == NULL) {
/* Table definition is corrupt */
@@ -892,7 +939,7 @@ dict_stats_update_transient(
fprintf(stderr, " InnoDB: table %s has no indexes. "
"Cannot calculate statistics.\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
}
@@ -904,7 +951,7 @@ dict_stats_update_transient(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -1794,7 +1841,7 @@ dict_stats_analyze_index(
DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
mtr_start(&mtr);
@@ -2059,7 +2106,7 @@ dict_stats_update_persistent(
/* Table definition is corrupt */
dict_table_stats_unlock(table, RW_X_LATCH);
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_CORRUPTION);
}
@@ -2088,7 +2135,7 @@ dict_stats_update_persistent(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_page_split") */
+ && strncasecmp("n_page_split", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_page_split = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 13 /* strlen("n_pages_freed") */
+ && strncasecmp("n_pages_freed", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_pages_freed = (ulint) stat_value;
+ arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
trx = trx_allocate_for_background();
@@ -2878,6 +2935,22 @@ dict_stats_fetch_from_ps(
}
/*********************************************************************//**
+Clear defragmentation stats modified counter for all indices in table. */
+static
+void
+dict_stats_empty_defrag_modified_counter(
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ ut_a(table);
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
UNIV_INTERN
void
@@ -2949,13 +3022,13 @@ dict_stats_update(
"because the .ibd file is missing. For help, please "
"refer to " REFMAN "innodb-troubleshooting.html\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_TABLESPACE_DELETED);
} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return(DB_SUCCESS);
}
@@ -3014,7 +3087,7 @@ dict_stats_update(
case DICT_STATS_EMPTY_TABLE:
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
/* If table is using persistent stats,
then save the stats on disk */
@@ -3073,6 +3146,7 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
+ dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
@@ -3083,7 +3157,7 @@ dict_stats_update(
copying because dict_stats_table_clone_create() does
skip corrupted indexes so our dummy object 't' may
have less indexes than the real object 'table'. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
dict_stats_copy(table, t);
@@ -3650,6 +3724,117 @@ dict_stats_rename_table(
return(ret);
}
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+ lint now = (lint) ut_time();
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_pages_freed",
+ index->stat_defrag_n_pages_freed,
+ NULL,
+ "Number of pages freed during"
+ " last defragmentation run.",
+ NULL);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ return (ret);
+}
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+
+ if (index->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ ".ibd file is missing.\n");
+ return (DB_TABLESPACE_DELETED);
+ }
+ if (dict_index_is_corrupted(index)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ "index is corrupted.\n");
+ return(DB_CORRUPTION);
+ }
+
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+
+ lint now = (lint) ut_time();
+ mtr_t mtr;
+ ulint n_leaf_pages;
+ ulint n_leaf_reserved;
+ mtr_start(&mtr);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
+ &n_leaf_pages, &mtr);
+ mtr_commit(&mtr);
+
+ if (n_leaf_reserved == ULINT_UNDEFINED) {
+ // The index name is different during fast index creation,
+ // so the stats won't be associated with the right index
+ // for later use. We just return without saving.
+ return DB_SUCCESS;
+ }
+
+ rw_lock_x_lock(&dict_operation_lock);
+
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_page_split",
+ index->stat_defrag_n_page_split,
+ NULL,
+ "Number of new page splits on leaves"
+ " since last defragmentation.",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_defrag",
+ n_leaf_pages,
+ NULL,
+ "Number of leaf pages when this stat is saved to disk",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_reserved",
+ n_leaf_reserved,
+ NULL,
+ "Number of pages reserved for this index leaves when this stat "
+ "is saved to disk",
+ NULL);
+
+end:
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ return (ret);
+}
+
/* tests @{ */
#ifdef UNIV_COMPILE_TEST_FUNCS
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index ecd723ca39a..0089f9897ae 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
+#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL;
/** This mutex protects the "recalc_pool" variable. */
static ib_mutex_t recalc_pool_mutex;
+static ib_mutex_t defrag_pool_mutex;
#ifdef HAVE_PSI_INTERFACE
static mysql_pfs_key_t recalc_pool_mutex_key;
+static mysql_pfs_key_t defrag_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
/** The number of tables that can be added to "recalc_pool" before
@@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool;
typedef recalc_pool_t::iterator recalc_pool_iterator_t;
+/** Indices whose defrag stats need to be saved to persistent storage.*/
+struct defrag_pool_item_t {
+ table_id_t table_id;
+ index_id_t index_id;
+};
+typedef std::vector<defrag_pool_item_t> defrag_pool_t;
+static defrag_pool_t defrag_pool;
+typedef defrag_pool_t::iterator defrag_pool_iterator_t;
+
/*****************************************************************//**
Initialize the recalc pool, called once during thread initialization. */
static
void
-dict_stats_recalc_pool_init()
+dict_stats_pool_init()
/*=========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
+ defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
}
/*****************************************************************//**
@@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
static
void
-dict_stats_recalc_pool_deinit()
-/*===========================*/
+dict_stats_pool_deinit()
+/*====================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
+ defrag_pool.clear();
/*
recalc_pool may still have its buffer allocated. It will free it when
its destructor is called.
@@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit()
memory. To avoid that, we force recalc_pool to surrender its buffer
to empty_pool object, which will free it when leaving this function:
*/
- recalc_pool_t empty_pool;
- recalc_pool.swap(empty_pool);
+ recalc_pool_t recalc_empty_pool;
+ defrag_pool_t defrag_empty_pool;
+ memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t));
+ memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t));
+ recalc_pool.swap(recalc_empty_pool);
+ defrag_pool.swap(defrag_empty_pool);
}
/*****************************************************************//**
@@ -188,6 +206,111 @@ dict_stats_recalc_pool_del(
}
/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mutex_exit(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+
+ mutex_exit(&defrag_pool_mutex);
+
+ os_event_set(dict_stats_event);
+}
+
+/*****************************************************************//**
+Get an index from the auto defrag pool. The returned index id is removed
+from the pool.
+@return true if the pool was non-empty and "id" was set, false otherwise */
+static
+bool
+dict_stats_defrag_pool_get(
+/*=======================*/
+ table_id_t* table_id, /*!< out: table id, or unmodified if
+ list is empty */
+ index_id_t* index_id) /*!< out: index id, or unmodified if
+ list is empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ if (defrag_pool.empty()) {
+ mutex_exit(&defrag_pool_mutex);
+ return(false);
+ }
+
+ defrag_pool_item_t& item = defrag_pool.back();
+ *table_id = item.table_id;
+ *index_id = item.index_id;
+
+ defrag_pool.pop_back();
+
+ mutex_exit(&defrag_pool_mutex);
+
+ return(true);
+}
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!< in: if given, remove
+ all entries for the table */
+ const dict_index_t* index) /*!< in: if given, remove this index */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ mutex_enter(&defrag_pool_mutex);
+
+ defrag_pool_iterator_t iter = defrag_pool.begin();
+ while (iter != defrag_pool.end()) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mutex_exit(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
Wait until background stats thread has stopped using the specified table.
The caller must have locked the data dictionary using
row_mysql_lock_data_dictionary() and this function may unlock it temporarily
@@ -237,7 +360,10 @@ dict_stats_thread_init()
mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
SYNC_STATS_AUTO_RECALC);
- dict_stats_recalc_pool_init();
+ /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
+ mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex,
+ SYNC_STATS_DEFRAG);
+ dict_stats_pool_init();
}
/*****************************************************************//**
@@ -251,11 +377,14 @@ dict_stats_thread_deinit()
ut_a(!srv_read_only_mode);
ut_ad(!srv_dict_stats_thread_active);
- dict_stats_recalc_pool_deinit();
+ dict_stats_pool_deinit();
mutex_free(&recalc_pool_mutex);
memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
+ mutex_free(&defrag_pool_mutex);
+ memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex));
+
os_event_free(dict_stats_event);
dict_stats_event = NULL;
}
@@ -333,6 +462,63 @@ dict_stats_process_entry_from_recalc_pool()
}
/*****************************************************************//**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+static
+void
+dict_stats_process_entry_from_defrag_pool()
+/*=======================================*/
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
+ /* no index in defrag pool */
+ return;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ table = dict_table_open_on_id(table_id, TRUE,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+
+ if (table == NULL) {
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+
+ /* Check whether table is corrupted */
+ if (table->corrupted) {
+ dict_table_close(table, TRUE, FALSE);
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+ mutex_exit(&dict_sys->mutex);
+
+ dict_index_t* index = dict_table_find_index_on_id(table, index_id);
+
+ if (index == NULL) {
+ return;
+ }
+
+ /* Check whether index is corrupted */
+ if (dict_index_is_corrupted(index)) {
+ dict_table_close(table, FALSE, FALSE);
+ return;
+ }
+
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, FALSE, FALSE);
+}
+
+/*****************************************************************//**
This is the thread for background stats gathering. It pops tables, from
the auto recalc list and proceeds them, eventually recalculating their
statistics.
@@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)(
dict_stats_process_entry_from_recalc_pool();
+ while (defrag_pool.size())
+ dict_stats_process_entry_from_defrag_pool();
+
os_event_reset(dict_stats_event);
}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index ead86fd3085..7887951a026 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0flu.h"
#include "buf0dblwr.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0start.h"
@@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "trx0sys.h"
-#include "mtr0mtr.h"
#include "rem0types.h"
#include "row0ins.h"
#include "row0mysql.h"
@@ -86,6 +86,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "dict0stats_bg.h"
#include "ha_prototypes.h"
#include "ut0mem.h"
+#include "ut0timer.h"
#include "ibuf0ibuf.h"
#include "dict0dict.h"
#include "srv0mon.h"
@@ -752,6 +753,14 @@ static SHOW_VAR innodb_status_variables[]= {
{"have_bzip2",
(char*) &innodb_have_bzip2, SHOW_BOOL},
+ /* Defragmentation */
+ {"defragment_compression_failures",
+ (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
+ {"defragment_failures",
+ (char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
+ {"defragment_count",
+ (char*) &export_vars.innodb_defragment_count, SHOW_LONG},
+
{NullS, NullS, SHOW_LONG}
};
@@ -2351,7 +2360,8 @@ ha_innobase::ha_innobase(
(srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) |
HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
start_of_scan(0),
- num_write_row(0)
+ num_write_row(0),
+ ha_partition_stats(NULL)
{}
/*********************************************************************//**
@@ -10678,6 +10688,71 @@ ha_innobase::delete_table(
DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
}
+/*****************************************************************//**
+Defragment table.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::defragment_table(
+/*==========================*/
+ const char* name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ bool async) /*!< in: if true, return without waiting for completion */
+{
+ char norm_name[FN_REFLEN];
+ dict_table_t* table;
+ dict_index_t* index;
+ ibool one_index = (index_name != 0);
+ int ret = 0;
+ if (!srv_defragment) {
+ return ER_FEATURE_DISABLED;
+ }
+ normalize_table_name(norm_name, name);
+ table = dict_table_open_on_name(norm_name, FALSE,
+ FALSE, DICT_ERR_IGNORE_NONE);
+ for (index = dict_table_get_first_index(table); index;
+ index = dict_table_get_next_index(index)) {
+ if (one_index && strcasecmp(index_name, index->name) != 0)
+ continue;
+ if (btr_defragment_find_index(index)) {
+ // We borrow this error code. When the same index is
+ // already in the defragmentation queue, issue another
+ // defragmentation only introduces overhead. We return
+ // an error here to let the user know this is not
+ // necessary. Note that this will fail a query that's
+ // trying to defragment a full table if one of the
+ // indices in that table is already in defragmentation.
+ // We choose this behavior so the user is aware of this
+ // rather than silently defragmenting other indices of
+ // that table.
+ ret = ER_SP_ALREADY_EXISTS;
+ break;
+ }
+ os_event_t event = btr_defragment_add_index(index, async);
+ if (!async && event) {
+ while(os_event_wait_time(event, 1000000)) {
+ if (thd_killed(current_thd)) {
+ btr_defragment_remove_index(index);
+ ret = ER_QUERY_INTERRUPTED;
+ break;
+ }
+ }
+ os_event_free(event);
+ }
+ if (ret) {
+ break;
+ }
+ if (one_index) {
+ one_index = FALSE;
+ break;
+ }
+ }
+ dict_table_close(table, FALSE, FALSE);
+ if (ret == 0 && one_index) {
+ ret = ER_NO_SUCH_INDEX;
+ }
+ return ret;
+}
/*****************************************************************//**
Removes all tables in the named database inside InnoDB. */
@@ -11816,6 +11891,27 @@ ha_innobase::optimize(
This works OK otherwise, but MySQL locks the entire table during
calls to OPTIMIZE, which is undesirable. */
+ if (srv_defragment) {
+ int err;
+
+ err = defragment_table(prebuilt->table->name, NULL, false);
+
+ if (err == 0) {
+ return (HA_ADMIN_OK);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ err,
+ "InnoDB: Cannot defragment table %s: returned error code %d\n",
+ prebuilt->table->name, err);
+
+ if(err == ER_SP_ALREADY_EXISTS) {
+ return (HA_ADMIN_OK);
+ } else {
+ return (HA_ADMIN_TRY_ALTER);
+ }
+ }
+ }
+
if (innodb_optimize_fulltext_only) {
if (prebuilt->table->fts && prebuilt->table->fts->cache
&& !dict_table_is_discarded(prebuilt->table)) {
@@ -14520,6 +14616,13 @@ innodb_max_dirty_pages_pct_lwm_update(
srv_max_dirty_pages_pct_lwm = in_val;
}
+UNIV_INTERN
+void
+ha_innobase::set_partition_owner_stats(ha_statistics *stats)
+{
+ ha_partition_stats= stats;
+}
+
/************************************************************//**
Validate the file format name and return its corresponding id.
@return valid file format id */
@@ -15773,6 +15876,23 @@ innodb_reset_all_monitor_update(
TRUE);
}
+static
+void
+innodb_defragment_frequency_update(
+/*===============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ srv_defragment_frequency = (*static_cast<const uint*>(save));
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+}
+
/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
@@ -16631,6 +16751,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
+ "defragmentation will be paused. And new defragmentation command will fail."
+ "Paused defragmentation commands will resume when this variable is set to "
+ "true again.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages considered at once when merging multiple pages to "
+ "defragment",
+ NULL, NULL, 7, 2, 32, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
+ srv_defragment_stats_accuracy,
+ PLUGIN_VAR_RQCMDARG,
+ "How many defragment stats changes there are before the stats "
+ "are written to persistent storage. Set to 0 meaning disable "
+ "defragment stats tracking.",
+ NULL, NULL, 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
+ srv_defragment_fill_factor_n_recs,
+ PLUGIN_VAR_RQCMDARG,
+ "How many records of space defragmentation should leave on the page. "
+ "This variable, together with innodb_defragment_fill_factor, is introduced "
+ "so defragmentation won't pack the page too full and cause page split on "
+ "the next insert on every page. The variable indicating more defragmentation"
+ " gain is the one effective.",
+ NULL, NULL, 20, 1, 100, 0);
+
+static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0.7, 1] that tells defragmentation how full it should "
+ "fill a page. Default is 0.9. Number below 0.7 won't make much sense."
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is "
+ "introduced so defragmentation won't pack the page too full and cause "
+ "page split on the next insert on every page. The variable indicating more "
+ "defragmentation gain is the one effective.",
+ NULL, NULL, 0.9, 0.7, 1, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "Do not defragment a single index more than this number of time per second."
+ "This controls the number of time defragmentation thread can request X_LOCK "
+ "on an index. Defragmentation thread will check whether "
+ "1/defragment_frequency (s) has passed since it worked on this index last "
+ "time, and put the index back to the queue if not enough time has passed. "
+ "The actual frequency can only be lower than this given number.",
+ NULL, innodb_defragment_frequency_update,
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
+
+
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@@ -17116,6 +17290,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_now),
MYSQL_SYSVAR(buffer_pool_load_abort),
MYSQL_SYSVAR(buffer_pool_load_at_startup),
+ MYSQL_SYSVAR(defragment),
+ MYSQL_SYSVAR(defragment_n_pages),
+ MYSQL_SYSVAR(defragment_stats_accuracy),
+ MYSQL_SYSVAR(defragment_fill_factor),
+ MYSQL_SYSVAR(defragment_fill_factor_n_recs),
+ MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 912be30c0ec..4e9586d8a3d 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -105,6 +105,8 @@ class ha_innobase: public handler
or undefined */
uint num_write_row; /*!< number of write_row() calls */
+ ha_statistics* ha_partition_stats; /*!< stats of the partition owner
+ handler (if there is one) */
uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
const uchar* record);
inline void update_thd(THD* thd);
@@ -206,6 +208,8 @@ class ha_innobase: public handler
int truncate();
int delete_table(const char *name);
int rename_table(const char* from, const char* to);
+ int defragment_table(const char* name, const char* index_name,
+ bool async);
int check(THD* thd, HA_CHECK_OPT* check_opt);
char* update_table_comment(const char* comment);
char* get_foreign_key_create_info();
@@ -309,6 +313,7 @@ class ha_innobase: public handler
Alter_inplace_info* ha_alter_info,
bool commit);
/** @} */
+ void set_partition_owner_stats(ha_statistics *stats);
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
private:
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 305acf7e322..b6f8a685ae9 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -671,6 +672,21 @@ btr_get_size(
is s-latched */
__attribute__((nonnull, warn_unused_result));
/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ __attribute__((nonnull));
+
+/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@@ -717,6 +733,33 @@ btr_page_free_low(
ulint level, /*!< in: page level */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_block(
+/*======================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
@@ -762,6 +805,60 @@ btr_validate_index(
const trx_t* trx) /*!< in: transaction or 0 */
__attribute__((nonnull(1), warn_unused_result));
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,index,mtr)
+#else /* UNIV_SYNC_DEBUG */
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,mtr)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Removes a page from the level list of pages. */
+UNIV_INTERN
+void
+btr_level_list_remove_func(
+/*=======================*/
+ ulint space, /*!< in: space where removed */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* page, /*!< in/out: page to remove */
+#ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree */
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+@return father block */
+UNIV_INTERN
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 00f50b5dcaf..40b468b200a 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -163,9 +163,10 @@ btr_page_get_next(
/*!< in: mini-transaction handle */
{
ut_ad(page && mtr);
+#ifndef UNIV_INNOCHECKSUM
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
|| mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
-
+#endif /* !UNIV_INNOCHECKSUM */
return(mach_read_from_4(page + FIL_PAGE_NEXT));
}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
new file mode 100644
index 00000000000..8fef3c6519a
--- /dev/null
+++ b/storage/innobase/include/btr0defragment.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "univ.i"
+
+#ifndef UNIV_HOTBACKUP
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES 32
+
+/** stats in btr_defragment */
+extern ulint btr_defragment_compression_failures;
+extern ulint btr_defragment_failures;
+extern ulint btr_defragment_count;
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+ btr_pcur_t* pcur; /* persistent cursor where
+ btr_defragment_n_pages should start */
+ os_event_t event; /* if not null, signal after work
+ is done */
+ bool removed; /* Mark an item as removed */
+ ulonglong last_processed; /* timestamp of last time this index
+ is processed by defragment thread */
+
+ btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+ ~btr_defragment_item_t();
+};
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index); /*!< Index to find. */
+/******************************************************************//**
+Add an index to btr_defragment_wq. Return a pointer to os_event if this
+is a synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async); /*!< whether this is an async defragmentation */
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table as removed
+in btr_defragment_wq. The difference between this function and the
+remove_index function is that this one will not NULL the event.
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< Table to be removed. */
+/******************************************************************//**
+Mark an index as removed from btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index); /*!< Index to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.*/
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index); /*!< in: index */
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+
+#endif /* !UNIV_HOTBACKUP */
+#endif
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 2a96f5299bb..7d14df09cb2 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -120,7 +120,9 @@ enum dict_table_op_t {
DICT_TABLE_OP_DROP_ORPHAN,
/** Silently load the tablespace if it does not exist,
and do not load the definitions of incomplete indexes. */
- DICT_TABLE_OP_LOAD_TABLESPACE
+ DICT_TABLE_OP_LOAD_TABLESPACE,
+ /** Open the table only if it's in table cache. */
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
};
/**********************************************************************//**
@@ -1496,6 +1498,16 @@ dict_table_get_index_on_name(
const char* name) /*!< in: name of the index to find */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return index or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+ const dict_table_t* table, /*!< in: table instance */
+ index_id_t id) /*!< in: index id */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
In case there is more than one index with the same name return the index
with the min(id).
@return index, NULL if does not exist */
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index b026210b214..ccca7af1c03 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -588,6 +588,10 @@ struct zip_pad_info_t {
rounds */
};
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
+
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t{
@@ -676,6 +680,23 @@ struct dict_index_t{
/*!< approximate number of leaf pages in the
index tree */
/* @} */
+ /** Statistics for defragmentation, these numbers are estimations and
+ could be very inaccurate at certain times, e.g. right after restart,
+ during defragmentation, etc. */
+ /* @{ */
+ ulint stat_defrag_modified_counter;
+ ulint stat_defrag_n_pages_freed;
+ /* number of pages freed by defragmentation. */
+ ulint stat_defrag_n_page_split;
+ /* number of page splits since last full index
+ defragmentation. */
+ ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+ /* data size when compression failure happened
+ the most recent 10 times. */
+ ulint stat_defrag_sample_next_slot;
+ /* in which slot the next sample should be
+ saved. */
+ /* @} */
rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
trx_id_t trx_id; /*!< id of the transaction that created this
diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h
index 9a3c8e22992..e034662aba0 100644
--- a/storage/innobase/include/dict0priv.h
+++ b/storage/innobase/include/dict0priv.h
@@ -53,8 +53,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*=====================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache);
#ifndef UNIV_NONINL
#include "dict0priv.ic"
diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic
index 30ba8fb60aa..983218af78a 100644
--- a/storage/innobase/include/dict0priv.ic
+++ b/storage/innobase/include/dict0priv.ic
@@ -74,8 +74,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*======================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache)
{
dict_table_t* table;
ulint fold;
@@ -88,7 +89,7 @@ dict_table_open_on_id_low(
HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
dict_table_t*, table, ut_ad(table->cached),
table->id == table_id);
- if (table == NULL) {
+ if (table == NULL && !open_only_if_in_cache) {
table = dict_load_table_on_id(table_id, ignore_err);
}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 186f90e3694..abf56b2f0c7 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -195,6 +195,39 @@ dict_stats_rename_table(
is returned */
size_t errstr_sz); /*!< in: errstr size */
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index); /*!< in: index */
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index); /*!< in: index */
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+
#ifndef UNIV_NONINL
#include "dict0stats.ic"
#endif
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
index e866ab419fe..32fac3015e8 100644
--- a/storage/innobase/include/dict0stats_bg.h
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -56,6 +56,28 @@ dict_stats_recalc_pool_del(
/*=======================*/
const dict_table_t* table); /*!< in: table to remove */
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+	const dict_index_t*	index);	/*!< in: index to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: if given, remove
+ all entries for the table */
+ const dict_index_t* index); /*!< in: index to remove */
+
/** Yield the data dictionary latch when waiting
for the background thread to stop accessing a table.
@param trx transaction holding the data dictionary locks */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 6d5ed35d5d8..3babc4d82fd 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -181,6 +181,16 @@ lock_update_merge_left(
const buf_block_t* right_block); /*!< in: merged index page
which will be discarded */
/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 905d4a0afa7..231537b3cde 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -335,6 +335,15 @@ extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
+/* Defragmentation. Originally, Facebook's default value is 100, but it's too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool srv_defragment;
+extern uint srv_defragment_n_pages;
+extern uint srv_defragment_stats_accuracy;
+extern uint srv_defragment_fill_factor_n_recs;
+extern double srv_defragment_fill_factor;
+extern uint srv_defragment_frequency;
+extern ulonglong srv_defragment_interval;
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@@ -888,7 +897,12 @@ struct export_var_t{
ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
ulint innodb_num_open_files; /*!< fil_n_file_opened */
ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */
- ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */
+ ulint innodb_available_undo_logs; /*!< srv_available_undo_logs
+ */
+ ulint innodb_defragment_compression_failures;
+ ulint innodb_defragment_failures;
+ ulint innodb_defragment_count;
+
#ifdef UNIV_DEBUG
ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index 7b00e16476b..f26e66f1a87 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -687,6 +687,7 @@ or row lock! */
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
+#define SYNC_STATS_DEFRAG 390
/*------------------------------------- Change buffer headers */
#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
/*------------------------------------- Change buffer tree */
diff --git a/storage/innobase/include/ut0timer.h b/storage/innobase/include/ut0timer.h
new file mode 100644
index 00000000000..f361ae79bf5
--- /dev/null
+++ b/storage/innobase/include/ut0timer.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.h
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+#ifndef ut0timer_h
+#define ut0timer_h
+
+#include "univ.i"
+#include "data0type.h"
+#include <my_rdtsc.h>
+
+/* Current timer stats */
+extern struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Function pointer to point selected timer function.
+@return timer current value */
+extern ulonglong (*ut_timer_now)(void);
+
+/**************************************************************//**
+Sets up the data required for use of my_timer_* functions.
+Selects the best timer by high frequency, and tight resolution.
+Points my_timer_now() to the selected timer function.
+Initializes my_timer struct to contain the info for selected timer.*/
+UNIV_INTERN
+void ut_init_timer(void);
+
+/**************************************************************//**
+Return time passed since time then, automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then); /*!< in: time where to calculate */
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in a seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert microseconds in a ulonglong to native timer units in a ulonglong
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+
+#ifndef UNIV_NONINL
+#include "ut0timer.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0timer.ic b/storage/innobase/include/ut0timer.ic
new file mode 100644
index 00000000000..027e89c6279
--- /dev/null
+++ b/storage/innobase/include/ut0timer.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.ic
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+/**************************************************************//**
+Return time passed since time then, automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then) /*!< in: time where to calculate */
+{
+ return (ut_timer_now() - then) - ut_timer.overhead;
+}
+
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then) /*!< in: time where to calculate */
+{
+ ulonglong now = ut_timer_now();
+ ulonglong ret = (now - (*then)) - ut_timer.overhead;
+ *then = now;
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in a seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert microseconds in a ulonglong to native timer units in a ulonglong
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = when;
+ ret *= (double)(ut_timer.frequency);
+ ret /= 1000000.0;
+ return (ulonglong)ret;
+}
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index e4db2c30751..c1e12ea2928 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -3268,6 +3268,47 @@ lock_update_merge_left(
}
/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block) /*!< in: right page from which merged */
+{
+ const rec_t* left_next_rec;
+
+ ut_a(left_block && right_block);
+ ut_a(orig_pred);
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(
+ left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM,
+ lock_get_min_heap_no(right_block));
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index f5f7e1299ce..97405261392 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
+ /* Page compress failed. If this happened on a
+ leaf page, put the data size into the sample
+ buffer. */
+ if (page_is_leaf(page)) {
+ ulint occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(
+ page_get_n_recs(page));
+ index->stat_defrag_data_size_sample[
+ index->stat_defrag_sample_next_slot] =
+ occupied;
+ index->stat_defrag_sample_next_slot =
+ (index->stat_defrag_sample_next_slot
+ + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
+ }
+
ut_ad(cursor->rec
== (pos > 1
? page_rec_get_nth(
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 93d13ea49ee..8def475e1f9 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -54,6 +54,7 @@ Created 9/17/2000 Heikki Tuuri
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "fil0fil.h"
#include "ibuf0ibuf.h"
#include "fts0fts.h"
@@ -3843,6 +3844,8 @@ row_drop_table_for_mysql(
if (!dict_table_is_temporary(table)) {
dict_stats_recalc_pool_del(table);
+ dict_stats_defrag_pool_del(table, NULL);
+ btr_defragment_remove_table(table);
/* Remove stats for this table and all of its indexes from the
persistent storage if it exists and if there are stats for this
@@ -5128,18 +5131,6 @@ end:
trx->error_state = DB_SUCCESS;
trx_rollback_to_savepoint(trx, NULL);
trx->error_state = DB_SUCCESS;
- } else {
- if (old_is_tmp && !new_is_tmp) {
- /* After ALTER TABLE the table statistics
- needs to be rebuilt. Even if we close
- table below there could be other
- transactions using this table (e.g.
- SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`),
- thus we can't remove table from dictionary cache
- here. Therefore, we initialize the
- transient statistics here. */
- dict_stats_update_transient(table);
- }
}
}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index b9cfb3544b9..7ca29d1ace1 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -68,6 +68,7 @@ Created 10/8/1995 Heikki Tuuri
#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
#include "srv0mon.h"
#include "ut0crc32.h"
+#include "btr0defragment.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
@@ -396,6 +397,15 @@ UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0;
UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0;
UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0;
+/* Defragmentation */
+UNIV_INTERN my_bool srv_defragment = FALSE;
+UNIV_INTERN uint srv_defragment_n_pages = 7;
+UNIV_INTERN uint srv_defragment_stats_accuracy = 0;
+UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20;
+UNIV_INTERN double srv_defragment_fill_factor = 0.9;
+UNIV_INTERN uint srv_defragment_frequency =
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT;
+UNIV_INTERN ulonglong srv_defragment_interval = 0;
/* Set the following to 0 if you want InnoDB to write messages on
stderr on startup/shutdown. */
@@ -1492,6 +1502,11 @@ srv_export_innodb_status(void)
export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
+ export_vars.innodb_defragment_compression_failures =
+ btr_defragment_compression_failures;
+ export_vars.innodb_defragment_failures = btr_defragment_failures;
+ export_vars.innodb_defragment_count = btr_defragment_count;
+
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
trx_id_t done_trx_no = purge_sys->done.trx_no;
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index ece16c6bd70..6a02b08c3b7 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -43,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri
#include "pars0pars.h"
#include "row0ftsort.h"
#include "ut0mem.h"
+#include "ut0timer.h"
#include "mem0mem.h"
#include "data0data.h"
#include "data0type.h"
@@ -67,6 +68,8 @@ Created 2/16/1996 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "srv0srv.h"
+#include "btr0defragment.h"
+
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
# include "os0proc.h"
@@ -1531,6 +1534,9 @@ innobase_start_or_create_for_mysql(void)
char* logfile0 = NULL;
size_t dirnamelen;
+ /* This should be initialized early */
+ ut_init_timer();
+
if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
srv_read_only_mode = true;
}
@@ -2877,6 +2883,9 @@ files_checked:
fts_optimize_init();
}
+ /* Initialize online defragmentation. */
+ btr_defragment_init();
+
srv_was_started = TRUE;
return(DB_SUCCESS);
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
index 5ef8a02fb3f..3532f513646 100644
--- a/storage/innobase/sync/sync0sync.cc
+++ b/storage/innobase/sync/sync0sync.cc
@@ -1164,6 +1164,7 @@ sync_thread_add_level(
case SYNC_IBUF_MUTEX:
case SYNC_INDEX_ONLINE_LOG:
case SYNC_STATS_AUTO_RECALC:
+ case SYNC_STATS_DEFRAG:
if (!sync_thread_levels_g(array, level, TRUE)) {
fprintf(stderr,
"InnoDB: sync_thread_levels_g(array, %lu)"
diff --git a/storage/innobase/ut/ut0timer.cc b/storage/innobase/ut/ut0timer.cc
new file mode 100644
index 00000000000..85292cce28c
--- /dev/null
+++ b/storage/innobase/ut/ut0timer.cc
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0timer.cc
+Timer rountines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+#include "data0type.h"
+#include <my_rdtsc.h>
+#include <ut0timer.h>
+
+/**************************************************************//**
+Initial timer definition
+@return 0 */
+static
+ulonglong
+ut_timer_none(void)
+/*===============*/
+{
+ return 0;
+}
+
+/**************************************************************//**
+Function pointer to point selected timer function.
+@return timer current value */
+ulonglong (*ut_timer_now)(void) = &ut_timer_none;
+
+struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Sets up the data required for use of my_timer_* functions.
+Selects the best timer by high frequency, and tight resolution.
+Points my_timer_now() to the selected timer function.
+Initializes my_timer struct to contain the info for selected timer.*/
+UNIV_INTERN
+void
+ut_init_timer(void)
+/*===============*/
+{
+ MY_TIMER_INFO all_timer_info;
+ my_timer_init(&all_timer_info);
+
+ if (all_timer_info.cycles.frequency > 1000000 &&
+ all_timer_info.cycles.resolution == 1) {
+ ut_timer = all_timer_info.cycles;
+ ut_timer_now = &my_timer_cycles;
+ } else if (all_timer_info.nanoseconds.frequency > 1000000 &&
+ all_timer_info.nanoseconds.resolution == 1) {
+ ut_timer = all_timer_info.nanoseconds;
+ ut_timer_now = &my_timer_nanoseconds;
+ } else if (all_timer_info.microseconds.frequency >= 1000000 &&
+ all_timer_info.microseconds.resolution == 1) {
+ ut_timer = all_timer_info.microseconds;
+ ut_timer_now = &my_timer_microseconds;
+
+ } else if (all_timer_info.milliseconds.frequency >= 1000 &&
+ all_timer_info.milliseconds.resolution == 1) {
+ ut_timer = all_timer_info.milliseconds;
+ ut_timer_now = &my_timer_milliseconds;
+ } else if (all_timer_info.ticks.frequency >= 1000 &&
+ /* Will probably be false */
+ all_timer_info.ticks.resolution == 1) {
+ ut_timer = all_timer_info.ticks;
+ ut_timer_now = &my_timer_ticks;
+ } else {
+ /* None are acceptable, so leave it as "None", and fill in struct */
+ ut_timer.frequency = 1; /* Avoid div-by-zero */
+ ut_timer.overhead = 0; /* Since it doesn't do anything */
+ ut_timer.resolution = 10; /* Another sign it's bad */
+ ut_timer.routine = 0; /* None */
+ }
+}
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index 528c6f87fcc..e34add61886 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -292,6 +292,7 @@ SET(INNOBASE_SOURCES
btr/btr0cur.cc
btr/btr0pcur.cc
btr/btr0sea.cc
+ btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@@ -405,7 +406,8 @@ SET(INNOBASE_SOURCES
ut/ut0rnd.cc
ut/ut0ut.cc
ut/ut0vec.cc
- ut/ut0wqueue.cc)
+ ut/ut0wqueue.cc
+ ut/ut0timer.cc)
IF(NOT XTRADB_OK)
MESSAGE(FATAL_ERROR "Percona XtraDB is not supported on this platform")
diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc
index cce91bdab6e..926c3be0fb5 100644
--- a/storage/xtradb/btr/btr0btr.cc
+++ b/storage/xtradb/btr/btr0btr.cc
@@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
+#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
@@ -1213,6 +1214,32 @@ btr_get_size(
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
+ ulint used;
+ if (flag == BTR_N_LEAF_PAGES) {
+ btr_get_size_and_reserved(index, flag, &used, mtr);
+ return used;
+ } else if (flag == BTR_TOTAL_SIZE) {
+ return btr_get_size_and_reserved(index, flag, &used, mtr);
+ } else {
+ ut_error;
+ }
+ return (ULINT_UNDEFINED);
+}
+
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
fseg_header_t* seg_header;
page_t* root;
ulint n;
@@ -1221,6 +1248,8 @@ btr_get_size(
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
+ ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
+
if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
|| *index->name == TEMP_INDEX_PREFIX) {
return(ULINT_UNDEFINED);
@@ -1228,27 +1257,16 @@ btr_get_size(
root = btr_root_get(index, mtr);
- SRV_CORRUPT_TABLE_CHECK(root,
- {
- mtr_commit(mtr);
- return(0);
- });
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
- if (flag == BTR_N_LEAF_PAGES) {
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
- fseg_n_reserved_pages(seg_header, &n, mtr);
+ n = fseg_n_reserved_pages(seg_header, used, mtr);
- } else if (flag == BTR_TOTAL_SIZE) {
+ if (flag == BTR_TOTAL_SIZE) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
- n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
-
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
- } else {
- ut_error;
+ *used += dummy;
+
}
return(n);
@@ -2013,7 +2031,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
-static __attribute__((nonnull))
+UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
@@ -2965,6 +2983,12 @@ func_start:
new_page_zip = buf_block_get_page_zip(new_block);
btr_page_create(new_block, new_page_zip, cursor->index,
btr_page_get_level(page, mtr), mtr);
+ /* Only record the leaf level page splits. */
+ if (btr_page_get_level(page, mtr) == 0) {
+ cursor->index->stat_defrag_n_page_split ++;
+ cursor->index->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index);
+ }
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@@ -3223,31 +3247,9 @@ func_exit:
return(rec);
}
-#ifdef UNIV_SYNC_DEBUG
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,index,mtr)
-#else /* UNIV_SYNC_DEBUG */
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,mtr)
-#endif /* UNIV_SYNC_DEBUG */
-
/*************************************************************//**
Removes a page from the level list of pages. */
-static __attribute__((nonnull))
+UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
@@ -3419,7 +3421,7 @@ btr_node_ptr_delete(
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-static
+UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/
diff --git a/storage/xtradb/btr/btr0defragment.cc b/storage/xtradb/btr/btr0defragment.cc
new file mode 100644
index 00000000000..a784c8c5be7
--- /dev/null
+++ b/storage/xtradb/btr/btr0defragment.cc
@@ -0,0 +1,815 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "ut0timer.h"
+
+#include <list>
+
+/**************************************************//**
+Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+// #pragma once
+namespace std
+{
+ // based on SC22/WG21/N2431 = J16/07-0301
+ struct nullptr_t
+ {
+ template<typename any> operator any * () const
+ {
+ return 0;
+ }
+ template<class any, typename T> operator T any:: * () const
+ {
+ return 0;
+ }
+
+#ifdef _MSC_VER
+ struct pad {};
+ pad __[sizeof(void*)/sizeof(pad)];
+#else
+ char __[sizeof(void*)];
+#endif
+private:
+ // nullptr_t();// {}
+ // nullptr_t(const nullptr_t&);
+ // void operator = (const nullptr_t&);
+ void operator &() const;
+ template<typename any> void operator +(any) const
+ {
+ /*I Love MSVC 2005!*/
+ }
+ template<typename any> void operator -(any) const
+ {
+ /*I Love MSVC 2005!*/
+ }
+ };
+static const nullptr_t __nullptr = {};
+}
+
+#ifndef nullptr
+#define nullptr std::__nullptr
+#endif
+/**************************************************//**
+End of Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when compression failure happens
+during defragmentaiton. 512 is chosen because it's a power of 2 and it is about
+3% of the page size. When there are compression failures in defragmentation,
+our goal is to get a decent defrag ratio with as few compression failure as
+possible. From experimentation it seems that reduce the target size by 512 every
+time will make sure the page is compressible within a couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+ib_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+ulint btr_defragment_compression_failures = 0;
+/* Number of btr_defragment_n_pages calls that altered page but didn't
+manage to release any page. */
+ulint btr_defragment_failures = 0;
+/* Total number of btr_defragment_n_pages calls that altered page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+ulint btr_defragment_count = 0;
+
+/******************************************************************//**
+Constructor for btr_defragment_item_t. */
+btr_defragment_item_t::btr_defragment_item_t(
+ btr_pcur_t* pcur,
+ os_event_t event)
+{
+ this->pcur = pcur;
+ this->event = event;
+ this->removed = false;
+ this->last_processed = 0;
+}
+
+/******************************************************************//**
+Destructor for btr_defragment_item_t. */
+btr_defragment_item_t::~btr_defragment_item_t() {
+ if (this->pcur) {
+ btr_pcur_free_for_mysql(this->pcur);
+ }
+ if (this->event) {
+ os_event_set(this->event);
+ }
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+ mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex,
+ SYNC_ANY_LATCH);
+ os_thread_create(btr_defragment_thread, NULL, NULL);
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ delete item;
+ }
+ mutex_exit(&btr_defragment_mutex);
+ mutex_free(&btr_defragment_mutex);
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index) /*!< Index to find. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ mutex_exit(&btr_defragment_mutex);
+ return true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+ return false;
+}
+
+/******************************************************************//**
+Query thread uses this function to add an index to btr_defragment_wq.
+Return a pointer to os_event for the query thread to wait on if this is a
+synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async) /*!< whether this is an async defragmentation */
+{
+ mtr_t mtr;
+ ulint space = dict_index_get_space(index);
+ ulint zip_size = dict_table_zip_size(index->table);
+ ulint page_no = dict_index_get_page(index);
+ mtr_start(&mtr);
+ // Load index rood page.
+ page_t* page = btr_page_get(space, zip_size, page_no,
+ RW_NO_LATCH, index, &mtr);
+ if (btr_page_get_level(page, &mtr) == 0) {
+ // Index root is a leaf page, no need to defragment.
+ mtr_commit(&mtr);
+ return NULL;
+ }
+ btr_pcur_t* pcur = btr_pcur_create_for_mysql();
+ os_event_t event = NULL;
+ if (!async) {
+ event = os_event_create();
+ }
+ btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
+ true, 0, &mtr);
+ btr_pcur_move_to_next(pcur, &mtr);
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ dict_stats_empty_defrag_summary(index);
+ btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
+ mutex_enter(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(item);
+ mutex_exit(&btr_defragment_mutex);
+ return event;
+}
+
+/******************************************************************//**
+When table is dropped, this function is called to mark a table as removed in
+btr_efragment_wq. The difference between this function and the remove_index
+function is this will not NULL the event. */
+void
+btr_defragment_remove_table(
+ dict_table_t* table) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (table->id == idx->table->id) {
+ item->removed = true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Query thread uses this function to mark an index as removed in
+btr_efragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ item->removed = true;
+ item->event = NULL;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Functions used by defragmentation thread: btr_defragment_xxx_item.
+Defragmentation thread operates on the work *item*. It gets/removes
+item from the work queue. */
+/******************************************************************//**
+Defragment thread uses this to remove an item from btr_defragment_wq.
+When an item is removed from the work queue, all resources associated with it
+are free as well. */
+void
+btr_defragment_remove_item(
+ btr_defragment_item_t* item) /*!< Item to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ if (item == *iter) {
+ btr_defragment_wq.erase(iter);
+ delete item;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Defragment thread uses this to get an item from btr_defragment_wq to work on.
+The item is not removed from the work queue so query threads can still access
+this item. We keep it this way so query threads can find and kill a
+defragmentation even if that index is being worked on. Be aware that while you
+work on this item you have no lock protection on it whatsoever. This is OK as
+long as the query threads and defragment thread won't modify the same fields
+without lock protection.
+*/
+btr_defragment_item_t*
+btr_defragment_get_item()
+{
+ if (btr_defragment_wq.empty()) {
+ return nullptr;
+ }
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ if (iter == btr_defragment_wq.end()) {
+ iter = btr_defragment_wq.begin();
+ }
+ btr_defragment_item_t* item = *iter;
+ iter++;
+ mutex_exit(&btr_defragment_mutex);
+ return item;
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently we save the stats to persistent storage every 100 updates. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index) /*!< in: index */
+{
+ if (srv_defragment_stats_accuracy != 0 // stats tracking disabled
+ && dict_index_get_space(index) != 0 // do not track system tables
+ && index->stat_defrag_modified_counter
+ >= srv_defragment_stats_accuracy) {
+ dict_stats_defrag_pool_add(index);
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
+Main defragment functionalities used by defragment thread.*/
+/*************************************************************//**
+Calculate number of records from beginning of block that can
+fit into size_limit
+@return number of records */
+UNIV_INTERN
+ulint
+btr_defragment_calc_n_recs_for_size(
+ buf_block_t* block, /*!< in: B-tree page */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint size_limit, /*!< in: size limit to fit records in */
+ ulint* n_recs_size) /*!< out: actual size of the records that fit
+ in size_limit. */
+{
+ page_t* page = buf_block_get_frame(block);
+ ulint n_recs = 0;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ ulint size = 0;
+ page_cur_t cur;
+
+ page_cur_set_before_first(block, &cur);
+ page_cur_move_to_next(&cur);
+ while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
+ rec_t* cur_rec = page_cur_get_rec(&cur);
+ offsets = rec_get_offsets(cur_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ulint rec_size = rec_offs_size(offsets);
+ size += rec_size;
+ if (size > size_limit) {
+ size = size - rec_size;
+ break;
+ }
+ n_recs ++;
+ page_cur_move_to_next(&cur);
+ }
+ *n_recs_size = size;
+ return n_recs;
+}
+
+/*************************************************************//**
+Merge as many records from the from_block to the to_block. Delete
+the from_block if all records are successfully merged to to_block.
+@return the to_block to target for next merge operation. */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: zip size of the block */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint space = dict_index_get_space(index);
+ ulint level = btr_page_get_level(from_page, mtr);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0;
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
+ ulint page_diff = UNIV_PAGE_SIZE - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ if (!btr_page_reorganize_block(false, page_zip_level,
+ to_block, index,
+ mtr)) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+ // If reorganization fails, that means page is
+ // not compressable. There's no point to try
+ // merging into this page. Continue to the
+ // next page.
+ return from_block;
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ ut_a(max_ins_size >= move_size);
+ }
+
+ // Move records to pack to_page more full.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ while (n_recs_to_move > 0) {
+ rec = page_rec_get_nth(from_page,
+ n_recs_to_move + 1);
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr);
+ if (orig_pred)
+ break;
+ // If we reach here, that means compression failed after packing
+ // n_recs_to_move number of records to to_page. We try to reduce
+ // the targeted data size on the to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ os_atomic_increment_ulint(
+ &btr_defragment_compression_failures, 1);
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+ // If less than target_n_recs are moved, it means there are
+ // compression failures during page_copy_rec_list_start. Adjust
+ // the max_data_size estimation to reduce compression failures
+ // in the following runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ UNIV_PAGE_SIZE,
+ ULINT_UNDEFINED);
+ }
+ }
+ if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(to_block, orig_pred,
+ from_block);
+ btr_search_drop_page_hash_index(from_block);
+ btr_level_list_remove(space, zip_size, from_page,
+ index, mtr);
+ btr_node_ptr_delete(index, from_block, mtr);
+ btr_blob_dbg_remove(from_page, index,
+ "btr_defragment_n_pages");
+ btr_page_free(index, from_block, mtr);
+ } else {
+ // There are still records left on the page, so
+ // increment n_defragmented. Node pointer will be changed
+ // so remove the old node pointer.
+ if (n_recs_to_move > 0) {
+ // Part of the page is merged to left, remove
+ // the merged records, update record locks and
+ // node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ btr_node_ptr_delete(index, from_block, mtr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page));
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level + 1);
+ btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr);
+ }
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge N consecutive pages, starting from the page pointed by the
+cursor. Skip space 0. Only consider leaf pages.
+This function first loads all N pages into memory, then for each of
+the pages other than the first page, it tries to move as many records
+as possible to the left sibling to keep the left sibling full. During
+the process, if any page becomes empty, that page will be removed from
+the level list. Record locks, hash, and node pointers are updated after
+page reorganization.
+@return pointer to the last block processed, or NULL if reaching end of index */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint space;
+ ulint zip_size;
+ /* We will need to load the n+1 block because if the last page is freed
+ and we need to modify the prev_page_no of that block. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint level;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ space = dict_index_get_space(index);
+ if (space == 0) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ zip_size = dict_table_zip_size(index->table);
+ first_page = buf_block_get_frame(block);
+ level = btr_page_get_level(first_page, mtr);
+
+ if (level != 0) {
+ return NULL;
+ }
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ ulint page_no = btr_page_get_next(page, mtr);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i;
+ end_of_index = TRUE;
+ break;
+ }
+ blocks[i] = btr_block_get(space, zip_size, page_no,
+ RW_X_LATCH, index, mtr);
+ }
+
+ if (n_pages == 1) {
+ if (btr_page_get_prev(first_page, mtr) == FIL_NULL) {
+ /* last page in the index */
+ if (dict_index_get_page(index)
+ == page_get_page_no(first_page))
+ return NULL;
+ /* given page is the last page.
+ Lift the records to father. */
+ btr_lift_page_up(index, block, mtr);
+ }
+ return NULL;
+ }
+
+ /* 2. Calculate how many pages the data can fit in. If not compressible,
+ return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+ // For uncompressed pages, the optimal data size is the free space of an
+ // empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ int i = 0;
+ // We estimate the optimal data size of the index using samples of
+ // data size. These samples are taken when pages failed to
+ // compress due to insertion on the page. We use the average
+ // of all samples we have as the estimation. Different pages of
+ // the same index vary in compressibility. Average gives a good
+ // enough estimation.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size = size / i;
+ optimal_page_size = min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
+ reserved_space = min((ulint)(optimal_page_size
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
+ n_new_slots = (total_data_size + optimal_page_size - 1)
+ / optimal_page_size;
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) {
+ n_defragmented ++;
+ current_block = new_block;
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++;
+ os_atomic_increment_ulint(
+ &btr_defragment_count, 1);
+ if (n_pages == n_defragmented) {
+ os_atomic_increment_ulint(
+ &btr_defragment_failures, 1);
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
+
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg) /*!< in: work queue */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* cursor;
+ dict_index_t* index;
+ mtr_t mtr;
+ buf_block_t* first_block;
+ buf_block_t* last_block;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ /* If defragmentation is disabled, sleep before
+ checking whether it's enabled. */
+ if (!srv_defragment) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+ /* The following call won't remove the item from work queue.
+ We only get a pointer to it to work on. This will make sure
+ when the user issues a kill command, all indices are in the work
+ queue to be searched. This also means that the user thread
+ cannot directly remove the item from queue (since we might be
+ using it). So user thread only marks index as removed. */
+ btr_defragment_item_t* item = btr_defragment_get_item();
+ /* If work queue is empty, sleep and check later. */
+ if (!item) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+ /* If an index is marked as removed, we remove it from the work
+ queue. No other thread could be using this item at this point so
+ it's safe to remove now. */
+ if (item->removed) {
+ btr_defragment_remove_item(item);
+ continue;
+ }
+
+ pcur = item->pcur;
+ ulonglong now = ut_timer_now();
+ ulonglong elapsed = now - item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+ /* If we see an index again before the interval
+ determined by the configured frequency is reached,
+ we just sleep until the interval passes. Since
+ defragmentation of all indices queue up on a single
+ thread, it's likely other indices that follow this one
+ don't need to sleep again. */
+ os_thread_sleep(((ulint)ut_timer_to_microseconds(
+ srv_defragment_interval - elapsed)));
+ }
+
+ now = ut_timer_now();
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
+ cursor = btr_pcur_get_btr_cur(pcur);
+ index = btr_cur_get_index(cursor);
+ first_block = btr_cur_get_block(cursor);
+ last_block = btr_defragment_n_pages(first_block, index,
+ srv_defragment_n_pages,
+ &mtr);
+ if (last_block) {
+ /* If we haven't reached the end of the index,
+ place the cursor on the last record of last page,
+ store the cursor position, and put back in queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ ut_a(page_rec_is_user_rec(rec));
+ page_cur_position(rec, last_block,
+ btr_cur_get_page_cur(cursor));
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ item->last_processed = now;
+ } else {
+ mtr_commit(&mtr);
+ /* Reaching the end of the index. */
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_save_defrag_stats(index);
+ dict_stats_save_defrag_summary(index);
+ btr_defragment_remove_item(item);
+ }
+ }
+ btr_defragment_shutdown();
+ os_thread_exit(NULL);
+ OS_THREAD_DUMMY_RETURN;
+}
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc
index af3518337d4..e8576177967 100644
--- a/storage/xtradb/dict/dict0dict.cc
+++ b/storage/xtradb/dict/dict0dict.cc
@@ -408,7 +408,7 @@ dict_table_try_drop_aborted(
if (table == NULL) {
table = dict_table_open_on_id_low(
- table_id, DICT_ERR_IGNORE_NONE);
+ table_id, DICT_ERR_IGNORE_NONE, FALSE);
} else {
ut_ad(table->id == table_id);
}
@@ -795,7 +795,8 @@ dict_table_open_on_id(
table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
- : DICT_ERR_IGNORE_NONE);
+ : DICT_ERR_IGNORE_NONE,
+ table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
if (table != NULL) {
@@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru(
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
-static
+UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
@@ -2408,6 +2409,13 @@ undo_size_ok:
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
+ new_index->stat_defrag_n_pages_freed = 0;
+ new_index->stat_defrag_n_page_split = 0;
+
+ new_index->stat_defrag_sample_next_slot = 0;
+ memset(&new_index->stat_defrag_data_size_sample,
+ 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
+
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc
index 928bdb3f2ef..bec0079942b 100644
--- a/storage/xtradb/dict/dict0stats.cc
+++ b/storage/xtradb/dict/dict0stats.cc
@@ -492,6 +492,9 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@@ -520,7 +523,9 @@ static
void
dict_stats_empty_index(
/*===================*/
- dict_index_t* index) /*!< in/out: index */
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_univ(index));
@@ -535,6 +540,34 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_n_pages_freed = 0;
+}
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_modified_counter = 0;
+ index->stat_defrag_n_page_split = 0;
}
/*********************************************************************//**
@@ -544,7 +577,9 @@ static
void
dict_stats_empty_table(
/*===================*/
- dict_table_t* table) /*!< in/out: table */
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
/* Zero the stats members */
@@ -569,7 +604,7 @@ dict_stats_empty_table(
ut_ad(!dict_index_is_univ(index));
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, empty_defrag_stats);
}
table->stat_initialized = TRUE;
@@ -704,7 +739,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
continue;
}
@@ -715,7 +750,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
} else {
n_copy_el = dst_idx->n_uniq;
}
@@ -735,6 +770,13 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+
+ dst_idx->stat_defrag_modified_counter =
+ src_idx->stat_defrag_modified_counter;
+ dst_idx->stat_defrag_n_pages_freed =
+ src_idx->stat_defrag_n_pages_freed;
+ dst_idx->stat_defrag_n_page_split =
+ src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
+dict_index_t::stat_defrag_modified_counter
+dict_index_t::stat_defrag_n_pages_freed
+dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@return incomplete table object */
@@ -807,7 +852,9 @@ dict_stats_snapshot_free(
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
-before the Persistent Statistics feature was introduced. */
+before the Persistent Statistics feature was introduced.
+This function doesn't update the defragmentation related stats.
+Only persistent statistics supports defragmentation stats. */
static
void
dict_stats_update_transient_for_index(
@@ -823,10 +870,10 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else {
mtr_t mtr;
@@ -847,7 +894,7 @@ dict_stats_update_transient_for_index(
switch (size) {
case ULINT_UNDEFINED:
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
return;
case 0:
/* The root node of the tree is a leaf */
@@ -882,7 +929,7 @@ dict_stats_update_transient(
if (dict_table_is_discarded(table)) {
/* Nothing to do. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
} else if (index == NULL) {
/* Table definition is corrupt */
@@ -892,7 +939,7 @@ dict_stats_update_transient(
fprintf(stderr, " InnoDB: table %s has no indexes. "
"Cannot calculate statistics.\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
}
@@ -904,7 +951,7 @@ dict_stats_update_transient(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -1794,7 +1841,7 @@ dict_stats_analyze_index(
DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
mtr_start(&mtr);
@@ -2059,7 +2106,7 @@ dict_stats_update_persistent(
/* Table definition is corrupt */
dict_table_stats_unlock(table, RW_X_LATCH);
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_CORRUPTION);
}
@@ -2088,7 +2135,7 @@ dict_stats_update_persistent(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_page_split") */
+ && strncasecmp("n_page_split", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_page_split = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 13 /* strlen("n_pages_freed") */
+ && strncasecmp("n_pages_freed", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_pages_freed = (ulint) stat_value;
+ arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
trx = trx_allocate_for_background();
@@ -2878,6 +2935,22 @@ dict_stats_fetch_from_ps(
}
/*********************************************************************//**
+Clear defragmentation stats modified counter for all indices in table. */
+static
+void
+dict_stats_empty_defrag_modified_counter(
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ ut_a(table);
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
UNIV_INTERN
void
@@ -2949,13 +3022,13 @@ dict_stats_update(
"because the .ibd file is missing. For help, please "
"refer to " REFMAN "innodb-troubleshooting.html\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_TABLESPACE_DELETED);
} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return(DB_SUCCESS);
}
@@ -3014,7 +3087,7 @@ dict_stats_update(
case DICT_STATS_EMPTY_TABLE:
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
/* If table is using persistent stats,
then save the stats on disk */
@@ -3073,6 +3146,7 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
+ dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
@@ -3083,7 +3157,7 @@ dict_stats_update(
copying because dict_stats_table_clone_create() does
skip corrupted indexes so our dummy object 't' may
have less indexes than the real object 'table'. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
dict_stats_copy(table, t);
@@ -3650,6 +3724,117 @@ dict_stats_rename_table(
return(ret);
}
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+ lint now = (lint) ut_time();
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_pages_freed",
+ index->stat_defrag_n_pages_freed,
+ NULL,
+ "Number of pages freed during"
+ " last defragmentation run.",
+ NULL);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ return (ret);
+}
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+
+ if (index->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ ".ibd file is missing.\n");
+ return (DB_TABLESPACE_DELETED);
+ }
+ if (dict_index_is_corrupted(index)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ "index is corrupted.\n");
+ return(DB_CORRUPTION);
+ }
+
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+
+ lint now = (lint) ut_time();
+ mtr_t mtr;
+ ulint n_leaf_pages;
+ ulint n_leaf_reserved;
+ mtr_start(&mtr);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
+ &n_leaf_pages, &mtr);
+ mtr_commit(&mtr);
+
+ if (n_leaf_reserved == ULINT_UNDEFINED) {
+ // The index name is different during fast index creation,
+ // so the stats won't be associated with the right index
+ // for later use. We just return without saving.
+ return DB_SUCCESS;
+ }
+
+ rw_lock_x_lock(&dict_operation_lock);
+
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_page_split",
+ index->stat_defrag_n_page_split,
+ NULL,
+ "Number of new page splits on leaves"
+ " since last defragmentation.",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_defrag",
+ n_leaf_pages,
+ NULL,
+ "Number of leaf pages when this stat is saved to disk",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_reserved",
+ n_leaf_reserved,
+ NULL,
+ "Number of pages reserved for this index leaves when this stat "
+ "is saved to disk",
+ NULL);
+
+end:
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ return (ret);
+}
+
/* tests @{ */
#ifdef UNIV_COMPILE_TEST_FUNCS
diff --git a/storage/xtradb/dict/dict0stats_bg.cc b/storage/xtradb/dict/dict0stats_bg.cc
index 9e1f75a13a9..2cf8aff1e30 100644
--- a/storage/xtradb/dict/dict0stats_bg.cc
+++ b/storage/xtradb/dict/dict0stats_bg.cc
@@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
+#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL;
/** This mutex protects the "recalc_pool" variable. */
static ib_mutex_t recalc_pool_mutex;
+static ib_mutex_t defrag_pool_mutex;
#ifdef HAVE_PSI_INTERFACE
static mysql_pfs_key_t recalc_pool_mutex_key;
+static mysql_pfs_key_t defrag_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
/** The number of tables that can be added to "recalc_pool" before
@@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool;
typedef recalc_pool_t::iterator recalc_pool_iterator_t;
+/** Indices whose defrag stats need to be saved to persistent storage.*/
+struct defrag_pool_item_t {
+ table_id_t table_id;
+ index_id_t index_id;
+};
+typedef std::vector<defrag_pool_item_t> defrag_pool_t;
+static defrag_pool_t defrag_pool;
+typedef defrag_pool_t::iterator defrag_pool_iterator_t;
+
/*****************************************************************//**
Initialize the recalc pool, called once during thread initialization. */
static
void
-dict_stats_recalc_pool_init()
+dict_stats_pool_init()
/*=========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
+ defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
}
/*****************************************************************//**
@@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
static
void
-dict_stats_recalc_pool_deinit()
+dict_stats_pool_deinit()
/*===========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
+ defrag_pool.clear();
}
/*****************************************************************//**
@@ -178,6 +192,111 @@ dict_stats_recalc_pool_del(
}
/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mutex_exit(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+
+ mutex_exit(&defrag_pool_mutex);
+
+ os_event_set(dict_stats_event);
+}
+
+/*****************************************************************//**
+Get an index from the auto defrag pool. The returned index id is removed
+from the pool.
+@return true if the pool was non-empty and "id" was set, false otherwise */
+static
+bool
+dict_stats_defrag_pool_get(
+/*=======================*/
+ table_id_t* table_id, /*!< out: table id, or unmodified if
+ list is empty */
+ index_id_t* index_id) /*!< out: index id, or unmodified if
+ list is empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ if (defrag_pool.empty()) {
+ mutex_exit(&defrag_pool_mutex);
+ return(false);
+ }
+
+ defrag_pool_item_t& item = defrag_pool.back();
+ *table_id = item.table_id;
+ *index_id = item.index_id;
+
+ defrag_pool.pop_back();
+
+ mutex_exit(&defrag_pool_mutex);
+
+ return(true);
+}
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!<in: if given, remove
+ all entries for the table */
+ const dict_index_t* index) /*!< in: if given, remove this index */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ mutex_enter(&defrag_pool_mutex);
+
+ defrag_pool_iterator_t iter = defrag_pool.begin();
+ while (iter != defrag_pool.end()) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mutex_exit(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
Wait until background stats thread has stopped using the specified table.
The caller must have locked the data dictionary using
row_mysql_lock_data_dictionary() and this function may unlock it temporarily
@@ -227,7 +346,10 @@ dict_stats_thread_init()
mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
SYNC_STATS_AUTO_RECALC);
- dict_stats_recalc_pool_init();
+ /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
+ mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex,
+ SYNC_STATS_DEFRAG);
+ dict_stats_pool_init();
}
/*****************************************************************//**
@@ -241,11 +363,14 @@ dict_stats_thread_deinit()
ut_a(!srv_read_only_mode);
ut_ad(!srv_dict_stats_thread_active);
- dict_stats_recalc_pool_deinit();
+ dict_stats_pool_deinit();
mutex_free(&recalc_pool_mutex);
memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
+ mutex_free(&defrag_pool_mutex);
+ memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex));
+
os_event_free(dict_stats_event);
dict_stats_event = NULL;
}
@@ -323,6 +448,63 @@ dict_stats_process_entry_from_recalc_pool()
}
/*****************************************************************//**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+static
+void
+dict_stats_process_entry_from_defrag_pool()
+/*=======================================*/
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
+ /* no index in defrag pool */
+ return;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ table = dict_table_open_on_id(table_id, TRUE,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+
+ if (table == NULL) {
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+
+ /* Check whether table is corrupted */
+ if (table->corrupted) {
+ dict_table_close(table, TRUE, FALSE);
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+ mutex_exit(&dict_sys->mutex);
+
+ dict_index_t* index = dict_table_find_index_on_id(table, index_id);
+
+ if (index == NULL) {
+ return;
+ }
+
+ /* Check whether index is corrupted */
+ if (dict_index_is_corrupted(index)) {
+ dict_table_close(table, FALSE, FALSE);
+ return;
+ }
+
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, FALSE, FALSE);
+}
+
+/*****************************************************************//**
This is the thread for background stats gathering. It pops tables, from
the auto recalc list and proceeds them, eventually recalculating their
statistics.
@@ -354,6 +536,9 @@ DECLARE_THREAD(dict_stats_thread)(
dict_stats_process_entry_from_recalc_pool();
+ while (defrag_pool.size())
+ dict_stats_process_entry_from_defrag_pool();
+
os_event_reset(dict_stats_event);
}
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index fb3e097491d..8f3bdcf0614 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -58,6 +58,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0flu.h"
#include "buf0dblwr.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0start.h"
@@ -66,7 +67,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "trx0sys.h"
-#include "mtr0mtr.h"
#include "rem0types.h"
#include "row0ins.h"
#include "row0mysql.h"
@@ -88,6 +88,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "dict0stats_bg.h"
#include "ha_prototypes.h"
#include "ut0mem.h"
+#include "ut0timer.h"
#include "ibuf0ibuf.h"
#include "dict0dict.h"
#include "srv0mon.h"
@@ -946,6 +947,14 @@ static SHOW_VAR innodb_status_variables[]= {
{"have_bzip2",
(char*) &innodb_have_bzip2, SHOW_BOOL},
+ /* Defragment */
+ {"defragment_compression_failures",
+ (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
+ {"defragment_failures",
+ (char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
+ {"defragment_count",
+ (char*) &export_vars.innodb_defragment_count, SHOW_LONG},
+
{NullS, NullS, SHOW_LONG}
};
@@ -2700,7 +2709,8 @@ ha_innobase::ha_innobase(
(srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) |
HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
start_of_scan(0),
- num_write_row(0)
+ num_write_row(0),
+ ha_partition_stats(NULL)
{}
/*********************************************************************//**
@@ -11223,6 +11233,72 @@ ha_innobase::delete_table(
}
/*****************************************************************//**
+Defragment table.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::defragment_table(
+/*==========================*/
+ const char* name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ bool async) /*!< in: whether to defragment asynchronously; if false, wait until the defragmentation finishes */
+{
+ char norm_name[FN_REFLEN];
+ dict_table_t* table;
+ dict_index_t* index;
+ ibool one_index = (index_name != 0);
+ int ret = 0;
+ if (!srv_defragment) {
+ return ER_FEATURE_DISABLED;
+ }
+ normalize_table_name(norm_name, name);
+ table = dict_table_open_on_name(norm_name, FALSE,
+ FALSE, DICT_ERR_IGNORE_NONE);
+ for (index = dict_table_get_first_index(table); index;
+ index = dict_table_get_next_index(index)) {
+ if (one_index && strcasecmp(index_name, index->name) != 0)
+ continue;
+ if (btr_defragment_find_index(index)) {
+ // We borrow this error code. When the same index is
+ // already in the defragmentation queue, issuing another
+ // defragmentation only introduces overhead. We return
+ // an error here to let the user know this is not
+ // necessary. Note that this will fail a query that's
+ // trying to defragment a full table if one of the
+ // indices in that table is already in defragmentation.
+ // We choose this behavior so the user is aware of this
+ // rather than silently defragmenting other indices of
+ // that table.
+ ret = ER_SP_ALREADY_EXISTS;
+ break;
+ }
+ os_event_t event = btr_defragment_add_index(index, async);
+ if (!async && event) {
+ while(os_event_wait_time(event, 1000000)) {
+ if (thd_killed(current_thd)) {
+ btr_defragment_remove_index(index);
+ ret = ER_QUERY_INTERRUPTED;
+ break;
+ }
+ }
+ os_event_free(event);
+ }
+ if (ret) {
+ break;
+ }
+ if (one_index) {
+ one_index = FALSE;
+ break;
+ }
+ }
+ dict_table_close(table, FALSE, FALSE);
+ if (ret == 0 && one_index) {
+ ret = ER_NO_SUCH_INDEX;
+ }
+ return ret;
+}
+
+/*****************************************************************//**
Removes all tables in the named database inside InnoDB. */
static
void
@@ -12389,6 +12465,27 @@ ha_innobase::optimize(
This works OK otherwise, but MySQL locks the entire table during
calls to OPTIMIZE, which is undesirable. */
+ if (srv_defragment) {
+ int err;
+
+ err = defragment_table(prebuilt->table->name, NULL, false);
+
+ if (err == 0) {
+ return (HA_ADMIN_OK);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ err,
+ "InnoDB: Cannot defragment table %s: returned error code %d\n",
+ prebuilt->table->name, err);
+
+ if(err == ER_SP_ALREADY_EXISTS) {
+ return (HA_ADMIN_OK);
+ } else {
+ return (HA_ADMIN_TRY_ALTER);
+ }
+ }
+ }
+
if (innodb_optimize_fulltext_only) {
if (prebuilt->table->fts && prebuilt->table->fts->cache
&& !dict_table_is_discarded(prebuilt->table)) {
@@ -15190,6 +15287,13 @@ innodb_max_dirty_pages_pct_lwm_update(
srv_max_dirty_pages_pct_lwm = in_val;
}
+UNIV_INTERN
+void
+ha_innobase::set_partition_owner_stats(ha_statistics *stats)
+{
+ ha_partition_stats= stats;
+}
+
/************************************************************//**
Validate the file format name and return its corresponding id.
@return valid file format id */
@@ -16448,6 +16552,23 @@ innodb_reset_all_monitor_update(
TRUE);
}
+static
+void
+innodb_defragment_frequency_update(
+/*===============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ srv_defragment_frequency = (*static_cast<const uint*>(save));
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+}
+
/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
@@ -17735,6 +17856,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
+ "defragmentation will be paused. And new defragmentation command will fail."
+ "Paused defragmentation commands will resume when this variable is set to "
+ "true again.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages considered at once when merging multiple pages to "
+ "defragment",
+ NULL, NULL, 7, 2, 32, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
+ srv_defragment_stats_accuracy,
+ PLUGIN_VAR_RQCMDARG,
+ "How many defragment stats changes there are before the stats "
+ "are written to persistent storage. Set to 0 meaning disable "
+ "defragment stats tracking.",
+ NULL, NULL, 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
+ srv_defragment_fill_factor_n_recs,
+ PLUGIN_VAR_RQCMDARG,
+ "How many records of space defragmentation should leave on the page. "
+ "This variable, together with innodb_defragment_fill_factor, is introduced "
+ "so defragmentation won't pack the page too full and cause page split on "
+ "the next insert on every page. The variable indicating more defragmentation"
+ " gain is the one effective.",
+ NULL, NULL, 20, 1, 100, 0);
+
+static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0.7, 1] that tells defragmentation how full it should "
+ "fill a page. Default is 0.9. Number below 0.7 won't make much sense."
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is "
+ "introduced so defragmentation won't pack the page too full and cause "
+ "page split on the next insert on every page. The variable indicating more "
+ "defragmentation gain is the one effective.",
+ NULL, NULL, 0.9, 0.7, 1, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "Do not defragment a single index more than this number of time per second."
+ "This controls the number of time defragmentation thread can request X_LOCK "
+ "on an index. Defragmentation thread will check whether "
+ "1/defragment_frequency (s) has passed since it worked on this index last "
+ "time, and put the index back to the queue if not enough time has passed. "
+ "The actual frequency can only be lower than this given number.",
+ NULL, innodb_defragment_frequency_update,
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
+
+
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@@ -18291,6 +18466,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_now),
MYSQL_SYSVAR(buffer_pool_load_abort),
MYSQL_SYSVAR(buffer_pool_load_at_startup),
+ MYSQL_SYSVAR(defragment),
+ MYSQL_SYSVAR(defragment_n_pages),
+ MYSQL_SYSVAR(defragment_stats_accuracy),
+ MYSQL_SYSVAR(defragment_fill_factor),
+ MYSQL_SYSVAR(defragment_fill_factor_n_recs),
+ MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index b4df711356c..19356750640 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -105,6 +105,8 @@ class ha_innobase: public handler
or undefined */
uint num_write_row; /*!< number of write_row() calls */
+ ha_statistics* ha_partition_stats; /*!< stats of the partition owner
+ handler (if there is one) */
uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
const uchar* record);
inline void update_thd(THD* thd);
@@ -207,6 +209,8 @@ class ha_innobase: public handler
int truncate();
int delete_table(const char *name);
int rename_table(const char* from, const char* to);
+ int defragment_table(const char* name, const char* index_name,
+ bool async);
int check(THD* thd, HA_CHECK_OPT* check_opt);
char* update_table_comment(const char* comment);
char* get_foreign_key_create_info();
@@ -310,6 +314,7 @@ class ha_innobase: public handler
Alter_inplace_info* ha_alter_info,
bool commit);
/** @} */
+ void set_partition_owner_stats(ha_statistics *stats);
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
bool check_if_supported_virtual_columns(void) { return TRUE; }
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
index a3f7cee2733..001e1af7d2d 100644
--- a/storage/xtradb/include/btr0btr.h
+++ b/storage/xtradb/include/btr0btr.h
@@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -674,6 +675,21 @@ btr_get_size(
is s-latched */
__attribute__((nonnull, warn_unused_result));
/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ __attribute__((nonnull));
+
+/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@@ -720,6 +736,33 @@ btr_page_free_low(
ulint level, /*!< in: page level */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_block(
+/*======================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
@@ -765,6 +808,60 @@ btr_validate_index(
const trx_t* trx) /*!< in: transaction or 0 */
__attribute__((nonnull(1), warn_unused_result));
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,index,mtr)
+#else /* UNIV_SYNC_DEBUG */
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,mtr)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Removes a page from the level list of pages. */
+UNIV_INTERN
+void
+btr_level_list_remove_func(
+/*=======================*/
+ ulint space, /*!< in: space where removed */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* page, /*!< in/out: page to remove */
+#ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree */
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+@return father block */
+UNIV_INTERN
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
index 9cc611ee450..40b468b200a 100644
--- a/storage/xtradb/include/btr0btr.ic
+++ b/storage/xtradb/include/btr0btr.ic
@@ -28,7 +28,7 @@ Created 6/2/1994 Heikki Tuuri
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "page0zip.h"
-#include "srv0srv.h"
+
#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
(not really a hard limit).
Used in debug assertions
@@ -59,9 +59,7 @@ btr_block_get_func(
block = buf_page_get_gen(space, zip_size, page_no, mode,
NULL, BUF_GET, file, line, mtr);
- SRV_CORRUPT_TABLE_CHECK(block, ; /* do nothing */);
-
- if (block && mode != RW_NO_LATCH) {
+ if (mode != RW_NO_LATCH) {
buf_block_dbg_add_level(
block, index != NULL && dict_index_is_ibuf(index)
@@ -165,9 +163,10 @@ btr_page_get_next(
/*!< in: mini-transaction handle */
{
ut_ad(page && mtr);
+#ifndef UNIV_INNOCHECKSUM
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
|| mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
-
+#endif /* !UNIV_INNOCHECKSUM */
return(mach_read_from_4(page + FIL_PAGE_NEXT));
}
diff --git a/storage/xtradb/include/btr0defragment.h b/storage/xtradb/include/btr0defragment.h
new file mode 100644
index 00000000000..99beb0a24ba
--- /dev/null
+++ b/storage/xtradb/include/btr0defragment.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "univ.i"
+
+#ifndef UNIV_HOTBACKUP
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES 32
+
+/** stats in btr_defragment */
+extern ulint btr_defragment_compression_failures;
+extern ulint btr_defragment_failures;
+extern ulint btr_defragment_count;
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+ btr_pcur_t* pcur; /* persistent cursor where
+ btr_defragment_n_pages should start */
+ os_event_t event; /* if not null, signal after work
+ is done */
+ bool removed; /* Mark an item as removed */
+ ulonglong last_processed; /* timestamp of the last time this index
+ was processed by the defragment thread */
+
+ btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+ ~btr_defragment_item_t();
+};
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index); /*!< Index to find. */
+/******************************************************************//**
+Add an index to btr_defragment_wq. Return a pointer to os_event if this
+is a synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async); /*!< whether this is an async defragmentation */
+/******************************************************************//**
+When a table is dropped, this function is called to mark a table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is that this one will not NULL the event.
+void
+btr_defragment_remove_table(
+ dict_table_t* table); /*!< Table to be removed. */
+/******************************************************************//**
+Mark an index as removed from btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index); /*!< Index to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index); /*!< in: index */
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+#endif /* !UNIV_HOTBACKUP */
+#endif
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
index 47790a158da..52ac5eee86b 100644
--- a/storage/xtradb/include/dict0dict.h
+++ b/storage/xtradb/include/dict0dict.h
@@ -120,7 +120,9 @@ enum dict_table_op_t {
DICT_TABLE_OP_DROP_ORPHAN,
/** Silently load the tablespace if it does not exist,
and do not load the definitions of incomplete indexes. */
- DICT_TABLE_OP_LOAD_TABLESPACE
+ DICT_TABLE_OP_LOAD_TABLESPACE,
+ /** Open the table only if it's in table cache. */
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
};
/**********************************************************************//**
@@ -1495,6 +1497,16 @@ dict_table_get_index_on_name(
const char* name) /*!< in: name of the index to find */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return index or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+ const dict_table_t* table, /*!< in: table instance */
+ index_id_t id) /*!< in: index id */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
In case there is more than one index with the same name return the index
with the min(id).
@return index, NULL if does not exist */
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index a347a75ea42..68cf7f1ba61 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -597,6 +597,10 @@ struct zip_pad_info_t {
rounds */
};
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
+
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t{
@@ -689,6 +693,23 @@ struct dict_index_t{
/*!< approximate number of leaf pages in the
index tree */
/* @} */
+ /** Statistics for defragmentation, these numbers are estimations and
+ could be very inaccurate at certain times, e.g. right after restart,
+ during defragmentation, etc. */
+ /* @{ */
+ ulint stat_defrag_modified_counter;
+ ulint stat_defrag_n_pages_freed;
+ /* number of pages freed by defragmentation. */
+ ulint stat_defrag_n_page_split;
+ /* number of page splits since last full index
+ defragmentation. */
+ ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+ /* data size when compression failure happened
+ the most recent 10 times. */
+ ulint stat_defrag_sample_next_slot;
+ /* in which slot the next sample should be
+ saved. */
+ /* @} */
prio_rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
trx_id_t trx_id; /*!< id of the transaction that created this
diff --git a/storage/xtradb/include/dict0priv.h b/storage/xtradb/include/dict0priv.h
index 9a3c8e22992..e034662aba0 100644
--- a/storage/xtradb/include/dict0priv.h
+++ b/storage/xtradb/include/dict0priv.h
@@ -53,8 +53,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*=====================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache);
#ifndef UNIV_NONINL
#include "dict0priv.ic"
diff --git a/storage/xtradb/include/dict0priv.ic b/storage/xtradb/include/dict0priv.ic
index 30ba8fb60aa..983218af78a 100644
--- a/storage/xtradb/include/dict0priv.ic
+++ b/storage/xtradb/include/dict0priv.ic
@@ -74,8 +74,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*======================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache)
{
dict_table_t* table;
ulint fold;
@@ -88,7 +89,7 @@ dict_table_open_on_id_low(
HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
dict_table_t*, table, ut_ad(table->cached),
table->id == table_id);
- if (table == NULL) {
+ if (table == NULL && !open_only_if_in_cache) {
table = dict_load_table_on_id(table_id, ignore_err);
}
diff --git a/storage/xtradb/include/dict0stats.h b/storage/xtradb/include/dict0stats.h
index 186f90e3694..abf56b2f0c7 100644
--- a/storage/xtradb/include/dict0stats.h
+++ b/storage/xtradb/include/dict0stats.h
@@ -195,6 +195,39 @@ dict_stats_rename_table(
is returned */
size_t errstr_sz); /*!< in: errstr size */
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index); /*!< in: index */
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index); /*!< in: index */
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+
#ifndef UNIV_NONINL
#include "dict0stats.ic"
#endif
diff --git a/storage/xtradb/include/dict0stats_bg.h b/storage/xtradb/include/dict0stats_bg.h
index e866ab419fe..32fac3015e8 100644
--- a/storage/xtradb/include/dict0stats_bg.h
+++ b/storage/xtradb/include/dict0stats_bg.h
@@ -56,6 +56,28 @@ dict_stats_recalc_pool_del(
/*=======================*/
const dict_table_t* table); /*!< in: table to remove */
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index); /*!< in: index to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!<in: if given, remove
+ all entries for the table */
+ const dict_index_t* index); /*!< in: index to remove */
+
/** Yield the data dictionary latch when waiting
for the background thread to stop accessing a table.
@param trx transaction holding the data dictionary locks */
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
index 633e4f6626b..8d5515b5eb5 100644
--- a/storage/xtradb/include/lock0lock.h
+++ b/storage/xtradb/include/lock0lock.h
@@ -183,6 +183,16 @@ lock_update_merge_left(
const buf_block_t* right_block); /*!< in: merged index page
which will be discarded */
/*************************************************************//**
+Updates the lock table when a page is split and merged into
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index a02c8a96e1a..57f9b2c72e5 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -397,6 +397,15 @@ extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
+/* Defragmentation. Originally Facebook's default value was 100, but it is too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool srv_defragment;
+extern uint srv_defragment_n_pages;
+extern uint srv_defragment_stats_accuracy;
+extern uint srv_defragment_fill_factor_n_recs;
+extern double srv_defragment_fill_factor;
+extern uint srv_defragment_frequency;
+extern ulonglong srv_defragment_interval;
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@@ -1099,6 +1108,9 @@ struct export_var_t{
ib_int64_t innodb_x_lock_os_waits;
ib_int64_t innodb_x_lock_spin_rounds;
ib_int64_t innodb_x_lock_spin_waits;
+ ulint innodb_defragment_compression_failures;
+ ulint innodb_defragment_failures;
+ ulint innodb_defragment_count;
#ifdef UNIV_DEBUG
ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
index 788f765f919..72cfbf61dd8 100644
--- a/storage/xtradb/include/sync0sync.h
+++ b/storage/xtradb/include/sync0sync.h
@@ -864,6 +864,7 @@ or row lock! */
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
+#define SYNC_STATS_DEFRAG 390
/*------------------------------------- Change buffer headers */
#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
/*------------------------------------- Change buffer tree */
diff --git a/storage/xtradb/include/ut0timer.h b/storage/xtradb/include/ut0timer.h
new file mode 100644
index 00000000000..f361ae79bf5
--- /dev/null
+++ b/storage/xtradb/include/ut0timer.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.h
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+#ifndef ut0timer_h
+#define ut0timer_h
+
+#include "univ.i"
+#include "data0type.h"
+#include <my_rdtsc.h>
+
+/* Current timer stats */
+extern struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Function pointer to point selected timer function.
+@return timer current value */
+extern ulonglong (*ut_timer_now)(void);
+
+/**************************************************************//**
+Sets up the data required for use of my_timer_* functions.
+Selects the best timer by high frequency, and tight resolution.
+Points my_timer_now() to the selected timer function.
+Initializes my_timer struct to contain the info for selected timer.*/
+UNIV_INTERN
+void ut_init_timer(void);
+
+/**************************************************************//**
+Return time passed since time then, automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then); /*!< in: time where to calculate */
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert microseconds in a double to native timer units in a ulonglong
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+
+#ifndef UNIV_NONINL
+#include "ut0timer.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0timer.ic b/storage/xtradb/include/ut0timer.ic
new file mode 100644
index 00000000000..62e17a10fb1
--- /dev/null
+++ b/storage/xtradb/include/ut0timer.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.ic
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+/**************************************************************//**
+Return time passed since time then, automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then) /*!< in: time where to calculate */
+{
+ return (ut_timer_now() - then) - ut_timer.overhead;
+}
+
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then) /*!< in: time where to calculate */
+{
+ ulonglong now = ut_timer_now();
+ ulonglong ret = (now - (*then)) - ut_timer.overhead;
+ *then = now;
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert microseconds in a double to native timer units in a ulonglong
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = when;
+ ret *= (double)(ut_timer.frequency);
+ ret /= 1000000.0;
+ return (ulonglong)ret;
+}
diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc
index 4f9395e27d8..d6f7b4217c3 100644
--- a/storage/xtradb/lock/lock0lock.cc
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -3291,6 +3291,47 @@ lock_update_merge_left(
}
/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block) /*!< in: right page from which merged */
+{
+ const rec_t* left_next_rec;
+
+ ut_a(left_block && right_block);
+ ut_a(orig_pred);
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(
+ left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM,
+ lock_get_min_heap_no(right_block));
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
diff --git a/storage/xtradb/page/page0cur.cc b/storage/xtradb/page/page0cur.cc
index f5f7e1299ce..97405261392 100644
--- a/storage/xtradb/page/page0cur.cc
+++ b/storage/xtradb/page/page0cur.cc
@@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
+ /* Page compress failed. If this happened on a
+ leaf page, put the data size into the sample
+ buffer. */
+ if (page_is_leaf(page)) {
+ ulint occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(
+ page_get_n_recs(page));
+ index->stat_defrag_data_size_sample[
+ index->stat_defrag_sample_next_slot] =
+ occupied;
+ index->stat_defrag_sample_next_slot =
+ (index->stat_defrag_sample_next_slot
+ + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
+ }
+
ut_ad(cursor->rec
== (pos > 1
? page_rec_get_nth(
diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc
index c65c39b7971..86de2eeb14c 100644
--- a/storage/xtradb/row/row0mysql.cc
+++ b/storage/xtradb/row/row0mysql.cc
@@ -53,6 +53,7 @@ Created 9/17/2000 Heikki Tuuri
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "fil0fil.h"
#include "ibuf0ibuf.h"
#include "fts0fts.h"
@@ -3857,6 +3858,8 @@ row_drop_table_for_mysql(
if (!dict_table_is_temporary(table)) {
dict_stats_recalc_pool_del(table);
+ dict_stats_defrag_pool_del(table, NULL);
+ btr_defragment_remove_table(table);
/* Remove stats for this table and all of its indexes from the
persistent storage if it exists and if there are stats for this
diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc
index 8e01ea7402e..bec8c9b95c3 100644
--- a/storage/xtradb/srv/srv0srv.cc
+++ b/storage/xtradb/srv/srv0srv.cc
@@ -70,10 +70,11 @@ Created 10/8/1995 Heikki Tuuri
#include "srv0mon.h"
#include "ut0crc32.h"
#include "os0file.h"
-
+#include "btr0defragment.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
#include "fil0pagecompress.h"
+#include <my_rdtsc.h>
/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */
ibool innobase_thd_is_idle(const void* thd);
@@ -280,6 +281,16 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
+/* Defragmentation */
+UNIV_INTERN my_bool srv_defragment = FALSE;
+UNIV_INTERN uint srv_defragment_n_pages = 7;
+UNIV_INTERN uint srv_defragment_stats_accuracy = 0;
+UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20;
+UNIV_INTERN double srv_defragment_fill_factor = 0.9;
+UNIV_INTERN uint srv_defragment_frequency =
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT;
+UNIV_INTERN ulonglong srv_defragment_interval = 0;
+
/** Query thread preflush algorithm */
UNIV_INTERN ulong srv_foreground_preflush
= SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF;
@@ -1876,6 +1887,11 @@ srv_export_innodb_status(void)
export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
+ export_vars.innodb_defragment_compression_failures =
+ btr_defragment_compression_failures;
+ export_vars.innodb_defragment_failures = btr_defragment_failures;
+ export_vars.innodb_defragment_count = btr_defragment_count;
+
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
trx_id_t done_trx_no = purge_sys->done.trx_no;
diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc
index 3aedede7c97..cb7aa9bc3c7 100644
--- a/storage/xtradb/srv/srv0start.cc
+++ b/storage/xtradb/srv/srv0start.cc
@@ -69,6 +69,8 @@ Created 2/16/1996 Heikki Tuuri
#include "srv0start.h"
#include "srv0srv.h"
#include "buf0flu.h"
+#include "btr0defragment.h"
+#include "ut0timer.h"
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
@@ -1575,6 +1577,9 @@ innobase_start_or_create_for_mysql(void)
char* logfile0 = NULL;
size_t dirnamelen;
+ /* This should be initialized early */
+ ut_init_timer();
+
if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
srv_read_only_mode = true;
}
@@ -2960,6 +2965,9 @@ files_checked:
fts_optimize_init();
}
+ /* Initialize online defragmentation. */
+ btr_defragment_init();
+
srv_was_started = TRUE;
return(DB_SUCCESS);
diff --git a/storage/xtradb/sync/sync0sync.cc b/storage/xtradb/sync/sync0sync.cc
index e698b7dcf10..1c5b144eb24 100644
--- a/storage/xtradb/sync/sync0sync.cc
+++ b/storage/xtradb/sync/sync0sync.cc
@@ -1272,6 +1272,7 @@ sync_thread_add_level(
case SYNC_IBUF_MUTEX:
case SYNC_INDEX_ONLINE_LOG:
case SYNC_STATS_AUTO_RECALC:
+ case SYNC_STATS_DEFRAG:
if (!sync_thread_levels_g(array, level, TRUE)) {
fprintf(stderr,
"InnoDB: sync_thread_levels_g(array, %lu)"
diff --git a/storage/xtradb/ut/ut0timer.cc b/storage/xtradb/ut/ut0timer.cc
new file mode 100644
index 00000000000..85292cce28c
--- /dev/null
+++ b/storage/xtradb/ut/ut0timer.cc
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0timer.cc
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+#include "data0type.h"
+#include <my_rdtsc.h>
+#include <ut0timer.h>
+
+/**************************************************************//**
+Initial timer definition
+@return 0 */
+static
+ulonglong
+ut_timer_none(void)
+/*===============*/
+{
+ return 0;
+}
+
+/**************************************************************//**
+Function pointer to point selected timer function.
+@return timer current value */
+ulonglong (*ut_timer_now)(void) = &ut_timer_none;
+
+struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Sets up the data required for use of the ut_timer_* functions.
+Selects the best timer by high frequency, and tight resolution.
+Points ut_timer_now() to the selected timer function.
+Initializes the ut_timer struct to contain the info for the selected timer.*/
+UNIV_INTERN
+void
+ut_init_timer(void)
+/*===============*/
+{
+ MY_TIMER_INFO all_timer_info;
+ my_timer_init(&all_timer_info);
+
+ if (all_timer_info.cycles.frequency > 1000000 &&
+ all_timer_info.cycles.resolution == 1) {
+ ut_timer = all_timer_info.cycles;
+ ut_timer_now = &my_timer_cycles;
+ } else if (all_timer_info.nanoseconds.frequency > 1000000 &&
+ all_timer_info.nanoseconds.resolution == 1) {
+ ut_timer = all_timer_info.nanoseconds;
+ ut_timer_now = &my_timer_nanoseconds;
+ } else if (all_timer_info.microseconds.frequency >= 1000000 &&
+ all_timer_info.microseconds.resolution == 1) {
+ ut_timer = all_timer_info.microseconds;
+ ut_timer_now = &my_timer_microseconds;
+
+ } else if (all_timer_info.milliseconds.frequency >= 1000 &&
+ all_timer_info.milliseconds.resolution == 1) {
+ ut_timer = all_timer_info.milliseconds;
+ ut_timer_now = &my_timer_milliseconds;
+ } else if (all_timer_info.ticks.frequency >= 1000 &&
+ /* Will probably be false */
+ all_timer_info.ticks.resolution == 1) {
+ ut_timer = all_timer_info.ticks;
+ ut_timer_now = &my_timer_ticks;
+ } else {
+ /* None are acceptable, so leave it as "None", and fill in struct */
+ ut_timer.frequency = 1; /* Avoid div-by-zero */
+ ut_timer.overhead = 0; /* Since it doesn't do anything */
+ ut_timer.resolution = 10; /* Another sign it's bad */
+ ut_timer.routine = 0; /* None */
+ }
+}