diff options
Diffstat (limited to 'storage/innobase')
93 files changed, 10943 insertions, 802 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 425d0bd0f1c..964294a962d 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -18,6 +18,15 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) +INCLUDE(lzo) +INCLUDE(lzma) +INCLUDE(bzip2) + +MYSQL_CHECK_LZ4() +MYSQL_CHECK_LZO() +MYSQL_CHECK_LZMA() +MYSQL_CHECK_BZIP2() # OS tests IF(UNIX) @@ -328,6 +337,7 @@ SET(INNOBASE_SOURCES btr/btr0cur.cc btr/btr0pcur.cc btr/btr0sea.cc + btr/btr0defragment.cc buf/buf0buddy.cc buf/buf0buf.cc buf/buf0dblwr.cc @@ -336,6 +346,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -349,6 +360,7 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc @@ -436,7 +448,8 @@ SET(INNOBASE_SOURCES ut/ut0rnd.cc ut/ut0ut.cc ut/ut0vec.cc - ut/ut0wqueue.cc) + ut/ut0wqueue.cc + ut/ut0timer.cc) IF(WITH_INNODB) # Legacy option diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 79b533481b7..ff27b470974 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri #include "btr0cur.h" #include "btr0sea.h" #include "btr0pcur.h" +#include "btr0defragment.h" #include "rem0cmp.h" #include "lock0lock.h" #include "ibuf0ibuf.h" @@ -1193,6 +1194,32 @@ btr_get_size( mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { + ulint used; + if (flag == BTR_N_LEAF_PAGES) { + btr_get_size_and_reserved(index, flag, &used, mtr); + return used; + } else if (flag == BTR_TOTAL_SIZE) { + return btr_get_size_and_reserved(index, flag, &used, mtr); + } else { + ut_error; + } + return (ULINT_UNDEFINED); +} + +/**************************************************************//** +Gets the number of reserved and used 
pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ fseg_header_t* seg_header; page_t* root; ulint n; @@ -1201,6 +1228,8 @@ btr_get_size( ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) || *index->name == TEMP_INDEX_PREFIX) { return(ULINT_UNDEFINED); @@ -1208,21 +1237,16 @@ btr_get_size( root = btr_root_get(index, mtr); - if (flag == BTR_N_LEAF_PAGES) { - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - fseg_n_reserved_pages(seg_header, &n, mtr); + n = fseg_n_reserved_pages(seg_header, used, mtr); - } else if (flag == BTR_TOTAL_SIZE) { + if (flag == BTR_TOTAL_SIZE) { seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; - n = fseg_n_reserved_pages(seg_header, &dummy, mtr); - - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - n += fseg_n_reserved_pages(seg_header, &dummy, mtr); - } else { - ut_error; + *used += dummy; + } return(n); @@ -1971,7 +1995,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization. 
@retval true if the operation was successful @retval false if it is a compressed page, and recompression failed */ -static __attribute__((nonnull)) +UNIV_INTERN bool btr_page_reorganize_block( /*======================*/ @@ -2031,7 +2055,7 @@ btr_parse_page_reorganize( buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { - ulint level; + ulint level = page_zip_level; ut_ad(ptr && end_ptr); @@ -3059,6 +3083,12 @@ func_start: new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, btr_page_get_level(page, mtr), mtr); + /* Only record the leaf level page splits. */ + if (btr_page_get_level(page, mtr) == 0) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the @@ -3317,31 +3347,9 @@ func_exit: return(rec); } -#ifdef UNIV_SYNC_DEBUG -/*************************************************************//** -Removes a page from the level list of pages. -@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,index,mtr) -#else /* UNIV_SYNC_DEBUG */ -/*************************************************************//** -Removes a page from the level list of pages. 
-@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,mtr) -#endif /* UNIV_SYNC_DEBUG */ - /*************************************************************//** Removes a page from the level list of pages. */ -static __attribute__((nonnull)) +UNIV_INTERN void btr_level_list_remove_func( /*=======================*/ @@ -3513,7 +3521,7 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. @return father block */ -static +UNIV_INTERN buf_block_t* btr_lift_page_up( /*=============*/ diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index acc3a4d1c98..b030fd7da79 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1873,9 +1873,13 @@ btr_cur_update_alloc_zip_func( false=update-in-place */ mtr_t* mtr) /*!< in/out: mini-transaction */ { + + /* Have a local copy of the variables as these can change + dynamically. */ const page_t* page = page_cur_get_page(cursor); ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(page_zip); ut_ad(!dict_index_is_ibuf(index)); ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc new file mode 100644 index 00000000000..dfb2cd8dffd --- /dev/null +++ b/storage/innobase/btr/btr0defragment.cc @@ -0,0 +1,818 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, SkySQL Ab. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file btr/btr0defragment.cc +Index defragmentation. + +Created 05/29/2014 Rongrong Zhong +Modified 16/07/2014 Sunguck Lee +Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +*******************************************************/ + +#include "btr0defragment.h" +#ifndef UNIV_HOTBACKUP +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "srv0start.h" +#include "ut0timer.h" + +#include <list> + +/**************************************************//** +Custom nullptr implementation for under g++ 4.6 +*******************************************************/ +// #pragma once +/* +namespace std +{ + // based on SC22/WG21/N2431 = J16/07-0301 + struct nullptr_t + { + template<typename any> operator any * () const + { + return 0; + } + template<class any, typename T> operator T any:: * () const + { + return 0; + } + +#ifdef _MSC_VER + struct pad {}; + pad __[sizeof(void*)/sizeof(pad)]; +#else + char __[sizeof(void*)]; +#endif +private: + // nullptr_t();// {} + // nullptr_t(const nullptr_t&); + // void operator = (const nullptr_t&); + void operator &() const; + 
template<typename any> void operator +(any) const + { + // I Love MSVC 2005! + } + template<typename any> void operator -(any) const + { + // I Love MSVC 2005! + } + }; +static const nullptr_t __nullptr = {}; +} + +#ifndef nullptr +#define nullptr std::__nullptr +#endif +*/ + +/**************************************************//** +End of Custom nullptr implementation for under g++ 4.6 +*******************************************************/ + +/* When there's no work, either because defragment is disabled, or because no +query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ +#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 +/* Reduce the target page size by this amount when compression failure happens +during defragmentaiton. 512 is chosen because it's a power of 2 and it is about +3% of the page size. When there are compression failures in defragmentation, +our goal is to get a decent defrag ratio with as few compression failure as +possible. From experimentation it seems that reduce the target size by 512 every +time will make sure the page is compressible within a couple of iterations. */ +#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 + +/* Work queue for defragmentation. */ +typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; +static btr_defragment_wq_t btr_defragment_wq; + +/* Mutex protecting the defragmentation work queue.*/ +ib_mutex_t btr_defragment_mutex; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Number of compression failures caused by defragmentation since server +start. */ +ulint btr_defragment_compression_failures = 0; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +ulint btr_defragment_failures = 0; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. 
*/ +ulint btr_defragment_count = 0; + +/******************************************************************//** +Constructor for btr_defragment_item_t. */ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init() +{ + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); + mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex, + SYNC_ANY_LATCH); + os_thread_create(btr_defragment_thread, NULL, NULL); +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async) /*!< whether this is an async defragmentation */ +{ + mtr_t mtr; + ulint space = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(index->table); + ulint page_no = dict_index_get_page(index); + mtr_start(&mtr); + // Load index rood page. + page_t* page = btr_page_get(space, zip_size, page_no, + RW_NO_LATCH, index, &mtr); + if (btr_page_get_level(page, &mtr) == 0) { + // Index root is a leaf page, no need to defragment. 
+ mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = NULL; + if (!async) { + event = os_event_create(); + } + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. 
+*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return NULL; + //return nullptr; + } + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. */ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && dict_index_get_space(index) != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. 
*/ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. +@return the to_block to target for next merge operation. */ +UNIV_INTERN +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: zip size of the block */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. 
*/ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint space = dict_index_get_space(index); + ulint level = btr_page_get_level(from_page, mtr); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. + if (zip_size) { + ulint page_diff = UNIV_PAGE_SIZE - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. + if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(false, page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. 
+ orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + os_atomic_increment_ulint( + &btr_defragment_compression_failures, 1); + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. 
*/ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(space, zip_size, from_page, + index, mtr); + btr_node_ptr_delete(index, from_block, mtr); + btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + btr_node_ptr_delete(index, from_block, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level + 1); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. 
+@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint space; + ulint zip_size; + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint level; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + space = dict_index_get_space(index); + if (space == 0) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + zip_size = dict_table_zip_size(index->table); + first_page = buf_block_get_frame(block); + level = btr_page_get_level(first_page, mtr); + + if (level != 0) { + return NULL; + } + + /* 1. Load the pages and calculate the total data size. 
*/ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + ulint page_no = btr_page_get_next(page, mtr); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + blocks[i] = btr_block_get(space, zip_size, page_no, + RW_X_LATCH, index, mtr); + } + + if (n_pages == 1) { + if (btr_page_get_prev(first_page, mtr) == FIL_NULL) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. + if (zip_size) { + ulint size = 0; + int i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. 
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size = size / i; + optimal_page_size = min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = min((ulint)(optimal_page_size + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = (total_data_size + optimal_page_size - 1) + / optimal_page_size; + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + os_atomic_increment_ulint( + &btr_defragment_count, 1); + if (n_pages == n_defragmented) { + os_atomic_increment_ulint( + &btr_defragment_failures, 1); + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. 
*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg) /*!< in: work queue */ +{ + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + /* If defragmentation is disabled, sleep before + checking whether it's enabled. */ + if (!srv_defragment) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* The following call won't remove the item from work queue. + We only get a pointer to it to work on. This will make sure + when user issue a kill command, all indices are in the work + queue to be searched. This also means that the user thread + cannot directly remove the item from queue (since we might be + using it). So user thread only marks index as removed. */ + btr_defragment_item_t* item = btr_defragment_get_item(); + /* If work queue is empty, sleep and check later. */ + if (!item) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + if (item->removed) { + btr_defragment_remove_item(item); + continue; + } + + pcur = item->pcur; + ulonglong now = ut_timer_now(); + ulonglong elapsed = now - item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. 
*/ + os_thread_sleep(((ulint)ut_timer_to_microseconds( + srv_defragment_interval - elapsed))); + } + + now = ut_timer_now(); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + first_block = btr_cur_get_block(cursor); + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + item->last_processed = now; + } else { + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + dict_stats_save_defrag_stats(index); + dict_stats_save_defrag_summary(index); + btr_defragment_remove_item(item); + } + } + btr_defragment_shutdown(); + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 9fceae0f880..33c9eb7a0f2 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -54,6 +55,8 @@ Created 11/5/1995 Heikki Tuuri #include "srv0mon.h" #include "buf0checksum.h" +#include <new> + /* IMPLEMENTATION OF THE BUFFER POOL ================================= @@ -829,6 +832,11 @@ buf_page_print( mach_read_from_4(read_buf + FIL_PAGE_OFFSET), mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + + ulint page_type = mach_read_from_4(read_buf + FIL_PAGE_TYPE); + + fprintf(stderr, "InnoDB: page type %ld meaning %s\n", page_type, + fil_get_page_type_name(page_type)); } #ifndef UNIV_HOTBACKUP @@ -1323,6 +1331,19 @@ buf_pool_init_instance( buf_pool->try_LRU_scan = TRUE; + /* Initialize the hazard pointer for flush_list batches */ + new(&buf_pool->flush_hp) + FlushHp(buf_pool, &buf_pool->flush_list_mutex); + + /* Initialize the hazard pointer for LRU batches */ + new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex); + + /* Initialize the iterator for LRU scan search */ + new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); + + /* Initialize the iterator for single page scan search */ + new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); + buf_pool_mutex_exit(buf_pool); return(DB_SUCCESS); @@ -1413,6 +1434,8 @@ buf_pool_init( btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64); + buf_flush_event = os_event_create(); + return(DB_SUCCESS); } @@ -1529,6 +1552,10 @@ buf_relocate( memcpy(dpage, bpage, sizeof *dpage); + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + buf_LRU_adjust_hp(buf_pool, bpage); + ut_d(bpage->in_LRU_list = FALSE); ut_d(bpage->in_page_hash = FALSE); @@ -1567,6 +1594,84 @@ buf_relocate( HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); } +/** Hazard Pointer implementation. 
*/ + +/** Set current value +@param bpage buffer block to be set as hp */ +void +HazardPointer::set(buf_page_t* bpage) +{ + ut_ad(mutex_own(m_mutex)); + ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool); + ut_ad(!bpage || buf_page_in_file(bpage)); + + m_hp = bpage; +} + +/** Checks if a bpage is the hp +@param bpage buffer block to be compared +@return true if it is hp */ + +bool +HazardPointer::is_hp(const buf_page_t* bpage) +{ + ut_ad(mutex_own(m_mutex)); + ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool); + ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool); + + return(bpage == m_hp); +} + +/** Adjust the value of hp. This happens when some other thread working +on the same list attempts to remove the hp from the list. +@param bpage buffer block to be compared */ + +void +FlushHp::adjust(const buf_page_t* bpage) +{ + ut_ad(bpage != NULL); + + /** We only support reverse traversal for now. */ + if (is_hp(bpage)) { + m_hp = UT_LIST_GET_PREV(list, m_hp); + } + + ut_ad(!m_hp || m_hp->in_flush_list); +} + +/** Adjust the value of hp. This happens when some other thread working +on the same list attempts to remove the hp from the list. +@param bpage buffer block to be compared */ + +void +LRUHp::adjust(const buf_page_t* bpage) +{ + ut_ad(bpage); + + /** We only support reverse traversal for now. */ + if (is_hp(bpage)) { + m_hp = UT_LIST_GET_PREV(LRU, m_hp); + } + + ut_ad(!m_hp || m_hp->in_LRU_list); +} + +/** Selects from where to start a scan. If we have scanned too deep into +the LRU list it resets the value to the tail of the LRU list. +@return buf_page_t from where to start scan. */ + +buf_page_t* +LRUItr::start() +{ + ut_ad(mutex_own(m_mutex)); + + if (!m_hp || m_hp->old) { + m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU); + } + + return(m_hp); +} + /********************************************************************//** Determine if a block is a sentinel for a buffer pool watch. 
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */ @@ -3363,6 +3468,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG bpage->file_page_was_freed = FALSE; @@ -4050,7 +4156,10 @@ UNIV_INTERN bool buf_page_io_complete( /*=================*/ - buf_page_t* bpage) /*!< in: pointer to the block in question */ + buf_page_t* bpage, /*!< in: pointer to the block in question */ + bool evict) /*!< in: whether or not to evict the page + from LRU list. */ + { enum buf_io_fix io_type; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -4232,6 +4341,7 @@ corrupt: id. */ buf_page_set_io_fix(bpage, BUF_IO_NONE); + buf_page_monitor(bpage, io_type); switch (io_type) { case BUF_IO_READ: @@ -4248,6 +4358,8 @@ corrupt: BUF_IO_READ); } + mutex_exit(buf_page_get_mutex(bpage)); + break; case BUF_IO_WRITE: @@ -4263,14 +4375,30 @@ corrupt: buf_pool->stat.n_pages_written++; + /* In case of flush batches i.e.: BUF_FLUSH_LIST and + BUF_FLUSH_LRU this function is always called from IO + helper thread. In this case, we decide whether or not + to evict the page based on flush type. The value + passed as evict is the default value in function + definition which is false. + We always evict in case of LRU batch and never evict + in case of flush list batch. For single page flush + the caller sets the appropriate value. 
*/ + if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) { + evict = true; + } + + mutex_exit(buf_page_get_mutex(bpage)); + if (evict) { + buf_LRU_free_page(bpage, true); + } + break; default: ut_error; } - buf_page_monitor(bpage, io_type); - #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Has %s page space %lu page no %lu\n", @@ -4280,7 +4408,6 @@ corrupt: } #endif /* UNIV_DEBUG */ - mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); return(true); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 62222993622..c903f5fbffa 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -382,7 +383,7 @@ buf_dblwr_init_or_load_pages( /* Read the trx sys header to check if we are using the doublewrite buffer */ off_t trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE; - os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE); + os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE, FALSE); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; @@ -419,9 +420,9 @@ buf_dblwr_init_or_load_pages( block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; - os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes); + os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes, FALSE); os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE, - block_bytes); + block_bytes, FALSE); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -451,7 +452,6 @@ buf_dblwr_init_or_load_pages( os_file_write(path, file, page, source_page_no * UNIV_PAGE_SIZE, 
UNIV_PAGE_SIZE); - } else if (load_corrupt_pages) { recv_dblwr.add(page); @@ -514,7 +514,7 @@ buf_dblwr_process() fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -566,7 +566,7 @@ buf_dblwr_process() fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -586,7 +586,7 @@ buf_dblwr_process() zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); } } } @@ -798,7 +798,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -810,8 +810,7 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); - + (void*) block->frame, (void*) block, (ulint *)&bpage->write_size); } /********************************************************************//** @@ -905,7 +904,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -921,7 +920,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1150,14 +1149,14 @@ retry: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. 
Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 6b219262207..32c3c816a85 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -44,10 +47,12 @@ Created 11/11/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "os0sync.h" #include "trx0sys.h" #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ static ulint buf_lru_flush_page_count = 0; @@ -59,14 +64,13 @@ need to protect it by a mutex. It is only ever read by the thread doing the shutdown */ UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE; -/** LRU flush batch is further divided into this chunk size to -reduce the wait time for the threads waiting for a clean block */ -#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100 - #ifdef UNIV_PFS_THREAD UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key; #endif /* UNIV_PFS_THREAD */ +/** Event to synchronise with the flushing. 
*/ + os_event_t buf_flush_event; + /** If LRU list of a buf_pool is less than this size then LRU eviction should not happen. This is because when we do LRU flushing we also put the blocks on free list. If LRU list is very small then we can end up @@ -75,15 +79,6 @@ in thrashing. */ /* @} */ -/** Handled page counters for a single flush */ -struct flush_counters_t { - ulint flushed; /*!< number of dirty pages flushed */ - ulint evicted; /*!< number of clean pages evicted, including - evicted uncompressed page images */ - ulint unzip_LRU_evicted;/*!< number of uncompressed page images - evicted */ -}; - /******************************************************************//** Increases flush_list size in bytes with zip_size for compressed page, UNIV_PAGE_SIZE for uncompressed page in inline function */ @@ -139,60 +134,6 @@ buf_flush_validate_skip( } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -/*******************************************************************//** -Sets hazard pointer during flush_list iteration. 
*/ -UNIV_INLINE -void -buf_flush_set_hp( -/*=============*/ - buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */ - const buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL); - ut_ad(!bpage || buf_page_in_file(bpage)); - ut_ad(!bpage || bpage->in_flush_list); - ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool); - - buf_pool->flush_list_hp = bpage; -} - -/*******************************************************************//** -Checks if the given block is a hazard pointer -@return true if bpage is hazard pointer */ -UNIV_INLINE -bool -buf_flush_is_hp( -/*============*/ - buf_pool_t* buf_pool,/*!< in: buffer pool instance */ - const buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - - return(buf_pool->flush_list_hp == bpage); -} - -/*******************************************************************//** -Whenever we move a block in flush_list (either to remove it or to -relocate it) we check the hazard pointer set by some other thread -doing the flush list scan. If the hazard pointer is the same as the -one we are about going to move then we set it to NULL to force a rescan -in the thread doing the batch. */ -UNIV_INLINE -void -buf_flush_update_hp( -/*================*/ - buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - - if (buf_flush_is_hp(buf_pool, bpage)) { - buf_flush_set_hp(buf_pool, NULL); - MONITOR_INC(MONITOR_FLUSH_HP_RESCAN); - } -} - /******************************************************************//** Insert a block in the flush_rbt and returns a pointer to its predecessor or NULL if no predecessor. 
The ordering is maintained @@ -591,6 +532,10 @@ buf_flush_remove( buf_flush_list_mutex_enter(buf_pool); + /* Important that we adjust the hazard pointer before removing + the bpage from flush list. */ + buf_pool->flush_hp.adjust(bpage); + switch (buf_page_get_state(bpage)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -631,7 +576,6 @@ buf_flush_remove( ut_a(buf_flush_validate_skip(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_update_hp(buf_pool, bpage); buf_flush_list_mutex_exit(buf_pool); } @@ -682,6 +626,10 @@ buf_flush_relocate_on_flush_list( prev_b = buf_flush_insert_in_flush_rbt(dpage); } + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool->flush_hp.adjust(bpage); + /* Must be done after we have removed it from the flush_rbt because we assert on in_flush_list in comparison function. */ ut_d(bpage->in_flush_list = FALSE); @@ -710,7 +658,6 @@ buf_flush_relocate_on_flush_list( ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_update_hp(buf_pool, bpage); buf_flush_list_mutex_exit(buf_pool); } @@ -732,8 +679,10 @@ buf_flush_write_complete( flush_type = buf_page_get_flush_type(bpage); buf_pool->n_flush[flush_type]--; +#ifdef UNIV_DEBUG /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); */ +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -891,6 +840,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -967,12 +918,28 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + ut_ad(!sync); + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -981,7 +948,10 @@ buf_flush_write_block_low( if (sync) { ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); fil_flush(buf_page_get_space(bpage)); - buf_page_io_complete(bpage); + + /* true means we want to evict this page from the + LRU list as well. */ + buf_page_io_complete(bpage, true); } /* Increment the counter of I/O operations used @@ -1075,10 +1045,10 @@ buf_flush_page( rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE); } - /* Even though bpage is not protected by any mutex at this - point, it is safe to access bpage, because it is io_fixed and - oldest_modification != 0. Thus, it cannot be relocated in the - buffer pool or removed from flush_list or LRU_list. */ + /* Even though bpage is not protected by any mutex at this + point, it is safe to access bpage, because it is io_fixed and + oldest_modification != 0. Thus, it cannot be relocated in the + buffer pool or removed from flush_list or LRU_list. 
*/ buf_flush_write_block_low(bpage, flush_type, sync); } @@ -1232,7 +1202,9 @@ buf_flush_try_neighbors( } } +#ifdef UNIV_DEBUG /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ +#endif if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -1440,9 +1412,8 @@ This utility flushes dirty blocks from the end of the LRU list. The calling thread is not allowed to own any latches on pages! It attempts to make 'max' blocks available in the free list. Note that it is a best effort attempt and it is not guaranteed that after a call -to this function there will be 'max' blocks in the free list. -@return number of blocks for which the write request was queued. */ -static +to this function there will be 'max' blocks in the free list.*/ +__attribute__((nonnull)) void buf_flush_LRU_list_batch( /*=====================*/ @@ -1453,96 +1424,54 @@ buf_flush_LRU_list_batch( counts */ { buf_page_t* bpage; - ulint count = 0; ulint scanned = 0; ulint free_len = UT_LIST_GET_LEN(buf_pool->free); ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); - ut_ad(buf_pool_mutex_own(buf_pool)); - n->flushed = 0; n->evicted = 0; n->unzip_LRU_evicted = 0; - bpage = UT_LIST_GET_LAST(buf_pool->LRU); - while (bpage != NULL && count < max - && (n->flushed + n->evicted) < max - && free_len < srv_LRU_scan_depth - && lru_len > BUF_LRU_MIN_LEN) { + ut_ad(buf_pool_mutex_own(buf_pool)); - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); - ibool evict; + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage != NULL && (n->evicted + n->flushed) < max + && free_len < srv_LRU_scan_depth + && lru_len > BUF_LRU_MIN_LEN; + ++scanned, + bpage = buf_pool->lru_hp.get()) { + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->lru_hp.set(prev); + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); - evict = buf_flush_ready_for_replace(bpage); + bool evict = buf_flush_ready_for_replace(bpage); mutex_exit(block_mutex); - ++scanned; - - /* If the 
block is ready to be replaced we try to - free it i.e.: put it on the free list. - Otherwise we try to flush the block and its - neighbors. In this case we'll put it on the - free list in the next pass. We do this extra work - of putting blocks to the free list instead of - just flushing them because after every flush - we have to restart the scan from the tail of - the LRU list and if we don't clear the tail - of the flushed pages then the scan becomes - O(n*n). */ if (evict) { + /* block is ready for eviction i.e., it is + clean and is not IO-fixed or buffer fixed. */ if (buf_LRU_free_page(bpage, true)) { - /* buf_pool->mutex was potentially - released and reacquired. */ n->evicted++; - bpage = UT_LIST_GET_LAST(buf_pool->LRU); - } else { - bpage = UT_LIST_GET_PREV(LRU, bpage); } } else { - ulint space; - ulint offset; - buf_page_t* prev_bpage; - - prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - - /* Save the previous bpage */ - - if (prev_bpage != NULL) { - space = prev_bpage->space; - offset = prev_bpage->offset; - } else { - space = ULINT_UNDEFINED; - offset = ULINT_UNDEFINED; - } - - if (!buf_flush_page_and_try_neighbors( - bpage, BUF_FLUSH_LRU, max, &n->flushed)) { - - bpage = prev_bpage; - } else { - /* buf_pool->mutex was released. - reposition the iterator. Note: the - prev block could have been repositioned - too but that should be rare. */ - - if (prev_bpage != NULL) { - - ut_ad(space != ULINT_UNDEFINED); - ut_ad(offset != ULINT_UNDEFINED); - - prev_bpage = buf_page_hash_get( - buf_pool, space, offset); - } - - bpage = prev_bpage; - } + /* Block is ready for flush. Dispatch an IO + request. The IO helper thread will put it on + free list in IO completion routine. 
*/ + buf_flush_page_and_try_neighbors( + bpage, BUF_FLUSH_LRU, max, &n->flushed); } + ut_ad(!mutex_own(block_mutex)); + ut_ad(buf_pool_mutex_own(buf_pool)); + free_len = UT_LIST_GET_LEN(buf_pool->free); lru_len = UT_LIST_GET_LEN(buf_pool->LRU); } + buf_pool->lru_hp.set(NULL); + /* We keep track of all flushes happening as part of LRU flush. When estimating the desired rate at which flush_list should be flushed, we factor in this value. */ @@ -1561,10 +1490,8 @@ buf_flush_LRU_list_batch( /*******************************************************************//** Flush and move pages from LRU or unzip_LRU list to the free list. -Whether LRU or unzip_LRU is used depends on the state of the system. -@return number of blocks for which either the write request was queued -or in case of unzip_LRU the number of blocks actually moved to the -free list */ +Whether LRU or unzip_LRU is used depends on the state of the system.*/ +__attribute__((nonnull)) static void buf_do_LRU_batch( @@ -1575,7 +1502,6 @@ buf_do_LRU_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max); } else { @@ -1588,6 +1514,10 @@ buf_do_LRU_batch( n->evicted = 0; n->flushed = 0; } + + /* Add evicted pages from unzip_LRU to the evicted pages from + the simple LRU. 
*/ + n->evicted += n->unzip_LRU_evicted; } /*******************************************************************//** @@ -1629,6 +1559,7 @@ buf_do_flush_list_batch( for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list); count < min_n && bpage != NULL && len > 0 && bpage->oldest_modification < lsn_limit; + bpage = buf_pool->flush_hp.get(), ++scanned) { buf_page_t* prev; @@ -1637,8 +1568,7 @@ buf_do_flush_list_batch( ut_ad(bpage->in_flush_list); prev = UT_LIST_GET_PREV(list, bpage); - buf_flush_set_hp(buf_pool, prev); - + buf_pool->flush_hp.set(prev); buf_flush_list_mutex_exit(buf_pool); #ifdef UNIV_DEBUG @@ -1649,23 +1579,12 @@ buf_do_flush_list_batch( buf_flush_list_mutex_enter(buf_pool); - ut_ad(flushed || buf_flush_is_hp(buf_pool, prev)); + ut_ad(flushed || buf_pool->flush_hp.is_hp(prev)); - if (!buf_flush_is_hp(buf_pool, prev)) { - /* The hazard pointer was reset by some other - thread. Restart the scan. */ - ut_ad(buf_flush_is_hp(buf_pool, NULL)); - bpage = UT_LIST_GET_LAST(buf_pool->flush_list); - len = UT_LIST_GET_LEN(buf_pool->flush_list); - } else { - bpage = prev; - --len; - buf_flush_set_hp(buf_pool, NULL); - } - - ut_ad(!bpage || bpage->in_flush_list); + --len; } + buf_pool->flush_hp.set(NULL); buf_flush_list_mutex_exit(buf_pool); MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, @@ -1683,9 +1602,8 @@ This utility flushes dirty blocks from the end of the LRU list or flush_list. NOTE 1: in the case of an LRU flush the calling thread may own latches to pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, -the calling thread is not allowed to own any latches on pages! -@return number of blocks for which the write request was queued */ -static +the calling thread is not allowed to own any latches on pages! 
*/ +__attribute__((nonnull)) void buf_flush_batch( /*============*/ @@ -1705,7 +1623,6 @@ buf_flush_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); #ifdef UNIV_SYNC_DEBUG ut_ad((flush_type != BUF_FLUSH_LIST) @@ -1742,7 +1659,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1767,7 +1683,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1796,7 +1711,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1852,40 +1766,6 @@ buf_flush_wait_batch_end( } /*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list and also -puts replaceable clean pages from the end of the LRU list to the free -list. -NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully. false if another batch -of same type was already running. 
*/ -static -bool -buf_flush_LRU( -/*==========*/ - buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - flush_counters_t *n) /*!< out: flushed/evicted page - counts */ -{ - if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - n->flushed = 0; - n->evicted = 0; - n->unzip_LRU_evicted = 0; - return(false); - } - - buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, n); - - buf_flush_end(buf_pool, BUF_FLUSH_LRU); - - buf_flush_common(BUF_FLUSH_LRU, n->flushed); - - return(true); -} - -/*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. NOTE: The calling thread is not allowed to own any latches on pages! @@ -1912,6 +1792,10 @@ buf_flush_list( ulint i; bool success = true; + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); + } + if (n_processed) { *n_processed = 0; } @@ -1927,8 +1811,8 @@ buf_flush_list( /* Flush to lsn_limit in all buffer pool instances */ for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - flush_counters_t n; + buf_pool_t* buf_pool; + flush_counters_t n; buf_pool = buf_pool_from_array(i); @@ -1972,12 +1856,12 @@ buf_flush_list( } /******************************************************************//** -This function picks up a single dirty page from the tail of the LRU -list, flushes it, removes it from page_hash and LRU list and puts -it on the free list. It is called from user threads when they are -unable to find a replaceable page at the tail of the LRU list i.e.: -when the background LRU flushing in the page_cleaner thread is not -fast enough to keep pace with the workload. 
+This function picks up a single page from the tail of the LRU +list, flushes it (if it is dirty), removes it from page_hash and LRU +list and puts it on the free list. It is called from user threads when +they are unable to find a replaceable page at the tail of the LRU +list i.e.: when the background LRU flushing in the page_cleaner thread +is not fast enough to keep pace with the workload. @return TRUE if success. */ UNIV_INTERN ibool @@ -1987,84 +1871,67 @@ buf_flush_single_page_from_LRU( { ulint scanned; buf_page_t* bpage; + ibool freed; buf_pool_mutex_enter(buf_pool); - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1; + for (bpage = buf_pool->single_scan_itr.start(), + scanned = 0, freed = FALSE; bpage != NULL; - bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) { + ++scanned, bpage = buf_pool->single_scan_itr.get()) { - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); - mutex_enter(block_mutex); - - if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { - - /* The following call will release the buffer pool - and block mutex. */ + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->single_scan_itr.set(prev); - ibool flushed = buf_flush_page( - buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); - if (flushed) { - /* buf_flush_page() will release the - block mutex */ + if (buf_flush_ready_for_replace(bpage)) { + /* block is ready for eviction i.e., it is + clean and is not IO-fixed or buffer fixed. */ + mutex_exit(block_mutex); + if (buf_LRU_free_page(bpage, true)) { + buf_pool_mutex_exit(buf_pool); + freed = TRUE; + break; + } + } else if (buf_flush_ready_for_flush( + bpage, BUF_FLUSH_SINGLE_PAGE)) { + /* Block is ready for flush. Dispatch an IO + request. We'll put it on free list in IO + completion routine. The following call, if + successful, will release the buffer pool and + block mutex. 
*/ + freed = buf_flush_page(buf_pool, bpage, + BUF_FLUSH_SINGLE_PAGE, true); + if (freed) { + /* block and buffer pool mutex have + already been reelased. */ break; } + mutex_exit(block_mutex); + } else { + mutex_exit(block_mutex); } - - mutex_exit(block_mutex); } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_SINGLE_FLUSH_SCANNED, - MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, - MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, - scanned); - - if (bpage == NULL) { + if (!freed) { /* Can't find a single flushable page. */ + ut_ad(!bpage); buf_pool_mutex_exit(buf_pool); - return(FALSE); } - - ibool freed = FALSE; - - /* At this point the page has been written to the disk. - As we are not holding buffer pool or block mutex therefore - we cannot use the bpage safely. It may have been plucked out - of the LRU list by some other thread or it may even have - relocated in case of a compressed page. We need to start - the scan of LRU list again to remove the block from the LRU - list and put it on the free list. 
*/ - buf_pool_mutex_enter(buf_pool); - - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); - bpage != NULL; - bpage = UT_LIST_GET_PREV(LRU, bpage)) { - - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - ibool ready = buf_flush_ready_for_replace(bpage); - - mutex_exit(block_mutex); - - if (ready) { - bool evict_zip; - - evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);; - - freed = buf_LRU_free_page(bpage, evict_zip); - - break; - } + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, + scanned); } - buf_pool_mutex_exit(buf_pool); - + ut_ad(!buf_pool_mutex_own(buf_pool)); return(freed); } @@ -2082,10 +1949,16 @@ buf_flush_LRU_tail(void) { ulint total_flushed = 0; + if(buf_mtflu_init_done()) + { + return(buf_mtflu_flush_LRU_tail()); + } + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = buf_pool_from_array(i); ulint scan_depth; + flush_counters_t n; /* srv_LRU_scan_depth can be arbitrarily large value. We cap it with current LRU size. */ @@ -2095,44 +1968,37 @@ buf_flush_LRU_tail(void) scan_depth = ut_min(srv_LRU_scan_depth, scan_depth); - /* We divide LRU flush into smaller chunks because - there may be user threads waiting for the flush to - end in buf_LRU_get_free_block(). */ - for (ulint j = 0; - j < scan_depth; - j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) { - - flush_counters_t n; - - /* Currently page_cleaner is the only thread - that can trigger an LRU flush. It is possible - that a batch triggered during last iteration is - still running, */ - if (buf_flush_LRU(buf_pool, - PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE, - &n)) { - - /* Allowed only one batch per - buffer pool instance. */ - buf_flush_wait_batch_end( - buf_pool, BUF_FLUSH_LRU); - } + /* Currently page_cleaner is the only thread + that can trigger an LRU flush. 
It is possible + that a batch triggered during last iteration is + still running, */ + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + continue; + } - if (n.flushed) { - total_flushed += n.flushed; - } else { - /* Nothing to flush */ - break; - } + buf_flush_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, 0, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, n.flushed); + + if (n.flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + n.flushed); } - } - if (total_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, - total_flushed); + if (n.evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + n.evicted); + } + + total_flushed += (n.flushed + n.evicted); } return(total_flushed); @@ -2390,14 +2256,19 @@ page_cleaner_sleep_if_needed( if (next_loop_time > cur_time) { /* Get sleep interval in micro seconds. We use - ut_min() to avoid long sleep in case of - wrap around. */ - os_thread_sleep(ut_min(1000000, - (next_loop_time - cur_time) - * 1000)); + ut_min() to avoid long sleep in case of wrap around. */ + ulint sleep_us; + + sleep_us = ut_min(1000000, (next_loop_time - cur_time) * 1000); + + ib_int64_t sig_count = os_event_reset(buf_flush_event); + + os_event_wait_time_low(buf_flush_event, sleep_us, sig_count); } } + + /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one instance of this thread. 
@@ -2424,7 +2295,6 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n", os_thread_pf(os_thread_get_curr_id())); #endif /* UNIV_DEBUG_THREAD_CREATION */ - buf_page_cleaner_is_active = TRUE; while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2437,12 +2307,12 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( last_activity = srv_get_activity_count(); /* Flush pages from flush_list if required */ - page_cleaner_flush_pages_if_needed(); - n_flushed = 0; - } else { + n_flushed += page_cleaner_flush_pages_if_needed(); + + } else if (srv_idle_flush_pct) { n_flushed = page_cleaner_do_flush_batch( - PCT_IO(100), - LSN_MAX); + PCT_IO(100), + LSN_MAX); if (n_flushed) { MONITOR_INC_VALUE_CUMULATIVE( @@ -2454,10 +2324,11 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( } /* Flush pages from end of LRU if required */ - n_flushed = buf_flush_LRU_tail(); + buf_flush_LRU_tail(); } ut_ad(srv_shutdown_state > 0); + if (srv_fast_shutdown == 2) { /* In very fast shutdown we simulate a crash of buffer pool. We are not required to do any flushing */ @@ -2522,6 +2393,8 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( thread_exit: buf_page_cleaner_is_active = FALSE; + os_event_free(buf_flush_event); + /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. 
*/ os_thread_exit(NULL); @@ -2623,9 +2496,11 @@ buf_flush_validate( return(ret); } + #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + #ifdef UNIV_DEBUG /******************************************************************//** Check if there are any dirty pages that belong to a space id in the flush diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 36eae54c17f..952f0fc3083 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -81,6 +81,10 @@ are not blocked for extended period of time when using very large buffer pools. */ #define BUF_LRU_DROP_SEARCH_SIZE 1024 +/** We scan these many blocks when looking for a clean page to evict +during LRU eviction. */ +#define BUF_LRU_SEARCH_SCAN_THRESHOLD 100 + /** If we switch on the InnoDB monitor because there are too few available frames in the buffer pool, we set this to TRUE */ static ibool buf_lru_switched_on_innodb_mon = FALSE; @@ -961,7 +965,7 @@ buf_LRU_free_from_unzip_LRU_list( } for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU), - scanned = 1, freed = FALSE; + scanned = 0, freed = FALSE; block != NULL && !freed && (scan_all || scanned < srv_LRU_scan_depth); ++scanned) { @@ -978,11 +982,13 @@ buf_LRU_free_from_unzip_LRU_list( block = prev_block; } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_UNZIP_SEARCH_SCANNED, - MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, - MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, - scanned); + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + } return(freed); } @@ -1004,21 +1010,30 @@ buf_LRU_free_from_common_LRU_list( ut_ad(buf_pool_mutex_own(buf_pool)); - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), - scanned = 1, freed = FALSE; + for (bpage = buf_pool->lru_scan_itr.start(), + scanned = 0, freed = false; bpage != NULL && !freed - && (scan_all || scanned < 
srv_LRU_scan_depth); - ++scanned) { + && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD); + ++scanned, bpage = buf_pool->lru_scan_itr.get()) { - unsigned accessed; - buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, - bpage); + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->lru_scan_itr.set(prev); + + ib_mutex_t* mutex = buf_page_get_mutex(bpage); + mutex_enter(mutex); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_page(bpage, true); + unsigned accessed = buf_page_is_accessed(bpage); + + if (buf_flush_ready_for_replace(bpage)) { + mutex_exit(mutex); + freed = buf_LRU_free_page(bpage, true); + } else { + mutex_exit(mutex); + } + if (freed && !accessed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of @@ -1026,14 +1041,17 @@ buf_LRU_free_from_common_LRU_list( ++buf_pool->stat.n_ra_pages_evicted; } - bpage = prev_bpage; + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(mutex)); } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_SEARCH_SCANNED, - MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, - MONITOR_LRU_SEARCH_SCANNED_PER_CALL, - scanned); + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + scanned); + } return(freed); } @@ -1217,8 +1235,6 @@ the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. * iteration 0: * get a block from free list, success:done - * if there is an LRU flush batch in progress: - * wait for batch to end: retry free list * if buf_pool->try_LRU_scan is set * scan LRU up to srv_LRU_scan_depth to find a clean block * the above will put the block on free list @@ -1231,7 +1247,7 @@ we put it to free list to be used. 
* scan whole LRU list * scan LRU list even if buf_pool->try_LRU_scan is not set * iteration > 1: - * same as iteration 1 but sleep 100ms + * same as iteration 1 but sleep 10ms @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* @@ -1269,20 +1285,6 @@ loop: return(block); } - if (buf_pool->init_flush[BUF_FLUSH_LRU] - && srv_use_doublewrite_buf - && buf_dblwr != NULL) { - - /* If there is an LRU flush happening in the background - then we wait for it to end instead of trying a single - page flush. If, however, we are not using doublewrite - buffer then it is better to do our own single page - flush instead of waiting for LRU flush to end. */ - buf_pool_mutex_exit(buf_pool); - buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); - goto loop; - } - freed = FALSE; if (buf_pool->try_LRU_scan || n_iterations > 0) { /* If no block was in the free list, search from the @@ -1299,6 +1301,10 @@ loop: TRUE again when we flush a batch from this buffer pool. */ buf_pool->try_LRU_scan = FALSE; + + /* Also tell the page_cleaner thread that + there is work for it to do. */ + os_event_set(buf_flush_event); } } @@ -1347,12 +1353,10 @@ loop: /* If we have scanned the whole LRU and still are unable to find a free block then we should sleep here to let the - page_cleaner do an LRU batch for us. - TODO: It'd be better if we can signal the page_cleaner. Perhaps - we should use timed wait for page_cleaner. */ - if (n_iterations > 1) { + page_cleaner do an LRU batch for us. */ - os_thread_sleep(100000); + if (n_iterations > 1) { + os_thread_sleep(10000); } /* No free block was found: try to flush the LRU list. @@ -1503,6 +1507,20 @@ buf_unzip_LRU_remove_block_if_needed( } /******************************************************************//** +Adjust LRU hazard pointers if needed. 
*/ + +void +buf_LRU_adjust_hp( +/*==============*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_page_t* bpage) /*!< in: control block */ +{ + buf_pool->lru_hp.adjust(bpage); + buf_pool->lru_scan_itr.adjust(bpage); + buf_pool->single_scan_itr.adjust(bpage); +} + +/******************************************************************//** Removes a block from the LRU list. */ UNIV_INLINE void @@ -1521,6 +1539,10 @@ buf_LRU_remove_block( ut_ad(bpage->in_LRU_list); + /* Important that we adjust the hazard pointers before removing + bpage from the LRU list. */ + buf_LRU_adjust_hp(buf_pool, bpage); + /* If the LRU_old pointer is defined and points to just this block, move it backward one step */ diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc new file mode 100644 index 00000000000..c14f9048ae5 --- /dev/null +++ b/storage/innobase/buf/buf0mtflu.cc @@ -0,0 +1,746 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" + +#define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. 
*/ +#define MT_WAIT_IN_USECS 5000000 + +/* Work item status */ +typedef enum wrk_status { + WRK_ITEM_UNSET=0, /*!< Work item is not set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + +/* Work thread status */ +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + buf_flush_t flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; + +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; + +/* Work item */ +typedef struct wrk_itm +{ + mt_wrk_tsk_t tsk; /*!< Task type. 
Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Number of flushed pages */ + ulint n_evicted; /*!< Number of evicted pages */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ + mem_heap_t *rheap; +} wrk_t; + +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ + ulint n_threads; /*!< Number of threads */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + +} thread_sync_t; + +static int mtflush_work_initialized = -1; +static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; + +/******************************************************************//** +Set multi-threaded flush work initialized. 
*/ +static inline +void +buf_mtflu_work_init(void) +/*=====================*/ +{ + mtflush_work_initialized = 1; +} + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized */ +bool +buf_mtflu_init_done(void) +/*=====================*/ +{ + return(mtflush_work_initialized == 1); +} + +/******************************************************************//** +Fush buffer pool instance. +@return number of flushed pages, or 0 if error happened +*/ +static +ulint +buf_mtflu_flush_pool_instance( +/*==========================*/ + wrk_t *work_item) /*!< inout: work item to be flushed */ +{ + flush_counters_t n; + ut_a(work_item != NULL); + ut_a(work_item->wr.buf_pool != NULL); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif + return 0; + } + + + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. 
+ */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit, + &n); + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, n.flushed); + work_item->n_flushed = n.flushed; + work_item->n_evicted = n.evicted; + + return work_item->n_flushed; +} + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush + syncronization data */ + thread_data_t* thread_data) /* Thread status data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + } + + if (work_item) { + thread_data->wt_status = WTHR_RUNNING; + } else { + /* Thread did not get any work */ + thread_data->wt_status = WTHR_NO_WORK; + return; + } + + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. 
+ * in future we may have others. + */ + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_EXIT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; + break; + + case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + break; + + case MT_WRK_READ: + ut_a(0); + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + break; + } +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + thread_data_t *this_thread_data = NULL; + ulint i; + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); + + while (TRUE) { + +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_MTFLUSH_DEBUG */ + + mtflush_service_io(mtflush_io, this_thread_data); + + + if (this_thread_data->wt_status == WTHR_KILL_IT) { + break; + } + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void) +/*==========================*/ +{ + ulint i; + thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item = NULL; + + ut_a(mtflush_io != NULL); + + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->gwt_status == WTHR_KILL_IT) { + return; + } + + mtflush_io->gwt_status = WTHR_KILL_IT; + + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", + srv_mtflush_threads); + + /* This lock is to safequard against timing bug: flush request take + this mutex before sending work items to be processed by flush + threads. Inside flush thread we assume that work queue contains only + a constant number of items. Thus, we may not install new work items + below before all previous ones are processed. This mutex is released + by flush request after all work items sent to flush threads have + been processed. Thus, we can get this mutex if and only if work + queue is empty. 
*/ + + os_fast_mutex_lock(&mtflush_mtx); + + /* Make sure the work queue is empty */ + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Send one exit work item/thread */ + for (i=0; i < (ulint)srv_mtflush_threads; i++) { + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait */ + os_thread_sleep(MT_WAIT_IN_USECS); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < (ulint)srv_mtflush_threads;) { + wrk_t* work_item = NULL; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); + + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { + i++; + } + } + + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(MT_WAIT_IN_USECS); + + while(!ib_wqueue_is_empty(mtflush_io->wq)) + { + ib_wqueue_nowait(mtflush_io->wq); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + /* Requests sent */ + os_fast_mutex_unlock(&mtflush_mtx); + os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. 
+@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + mem_heap_t* mtflush_heap2; + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. */ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *) mtflush_ctx), + &new_thread_id); + + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. 
*/ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + flush_counters_t *per_pool_cnt, /*!< out: Number of pages + flushed or evicted /instance */ + buf_flush_t flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + mem_heap_t* work_heap; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + reply_heap = mem_heap_create(0); + + + for(i=0;i<buf_pool_inst; i++) { + work_item[i].tsk = MT_WRK_WRITE; + work_item[i].wr.buf_pool = buf_pool_from_array(i); + work_item[i].wr.flush_type = flush_type; + work_item[i].wr.min = min_n; + work_item[i].wr.lsn_limit = lsn_limit; + work_item[i].wi_status = WRK_ITEM_UNSET; + work_item[i].wheap = work_heap; + work_item[i].rheap = reply_heap; + work_item[i].n_flushed = 0; + work_item[i].n_evicted = 0; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_ctx->wq, + (void *)(work_item + i), + work_heap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); + + if (done_wi != NULL) { + per_pool_cnt[i].flushed = done_wi->n_flushed; + per_pool_cnt[i].evicted = done_wi->n_evicted; + +#ifdef UNIV_MTFLUSH_DEBUG + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { + fprintf(stderr, + "**Set/Unused work_item[%lu] flush_type=%d\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } +#endif + + n_flushed+= done_wi->n_flushed+done_wi->n_evicted; + i++; + } + } + + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + 
mem_heap_free(reply_heap); + + return(n_flushed); +} + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + flush_counters_t cnt[MTFLUSH_MAX_WORKER]; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* This lock is to safequard against re-entry if any. 
*/ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt[i].flushed+cnt[i].evicted; + } + + if (cnt[i].flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt[i].flushed); + } + + if(cnt[i].evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + cnt[i].evicted); + } + } +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); +#endif + return(success); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i; + flush_counters_t cnt[MTFLUSH_MAX_WORKER]; + + ut_a(buf_mtflu_init_done()); + + /* At shutdown do not send requests anymore */ + if (!mtflush_ctx || mtflush_ctx->gwt_status == WTHR_KILL_IT) { + return (total_flushed); + } + + /* This lock is to safeguard against re-entry if any */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + total_flushed += cnt[i].flushed+cnt[i].evicted; + + if (cnt[i].flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + cnt[i].flushed); + } + + if(cnt[i].evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + cnt[i].evicted); + } + } + +#if UNIV_MTFLUSH_DEBUG + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); +#endif + + return(total_flushed); +} + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. 
*/ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!<in: Number of threads to fill */ + void* ctx, /*!<in: thread context */ + os_thread_id_t* thread_ids) /*!<in: thread id array */ +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)ctx); + ulint i; + ut_a(mtflush_io != NULL); + ut_a(thread_ids != NULL); + + for(i = 0; i < n_threads; i++) { + thread_ids[i] = mtflush_io->thread_data[i].wthread_id; + } +} diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 7c8369c0c09..9e81d010d0f 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -184,14 +184,15 @@ buf_read_page_low( *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage); + bpage->zip.data, bpage, &bpage->write_size); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage); + ((buf_block_t*) bpage)->frame, bpage, + &bpage->write_size); } if (sync) { diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 947476e0e02..b13f68a08a7 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -498,7 +499,7 @@ dict_table_try_drop_aborted( if (table == NULL) { table = dict_table_open_on_id_low( - table_id, DICT_ERR_IGNORE_NONE); + table_id, DICT_ERR_IGNORE_NONE, FALSE); } else { ut_ad(table->id == table_id); } @@ -747,17 +748,24 @@ dict_index_get_nth_col_or_prefix_pos( /*=================================*/ const dict_index_t* index, /*!< in: index */ ulint n, /*!< in: column number */ - ibool inc_prefix) /*!< in: TRUE=consider + ibool inc_prefix, /*!< in: TRUE=consider column prefixes too */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ { const dict_field_t* field; const dict_col_t* col; ulint pos; ulint n_fields; + ulint prefixed_pos_dummy; ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + if (!prefix_col_pos) { + prefix_col_pos = &prefixed_pos_dummy; + } + *prefix_col_pos = ULINT_UNDEFINED; + col = dict_table_get_nth_col(index->table, n); if (dict_index_is_clust(index)) { @@ -770,10 +778,11 @@ dict_index_get_nth_col_or_prefix_pos( for (pos = 0; pos < n_fields; pos++) { field = dict_index_get_nth_field(index, pos); - if (col == field->col - && (inc_prefix || field->prefix_len == 0)) { - - return(pos); + if (col == field->col) { + *prefix_col_pos = pos; + if (inc_prefix || field->prefix_len == 0) { + return(pos); + } } } @@ -885,7 +894,8 @@ dict_table_open_on_id( table_id, table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK - : DICT_ERR_IGNORE_NONE); + : DICT_ERR_IGNORE_NONE, + table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); if (table != NULL) { @@ -917,7 +927,7 @@ dict_table_get_nth_col_pos( ulint n) /*!< in: column number */ { return(dict_index_get_nth_col_pos(dict_table_get_first_index(table), - n)); + n, NULL)); } /********************************************************************//** @@ -1403,7 +1413,7 @@ dict_table_move_from_non_lru_to_lru( /**********************************************************************//** Looks for an index with the given id given a table instance. @return index or NULL */ -static +UNIV_INTERN dict_index_t* dict_table_find_index_on_id( /*========================*/ @@ -2525,6 +2535,13 @@ undo_size_ok: new_index->stat_index_size = 1; new_index->stat_n_leaf_pages = 1; + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + /* Add the new index as the last index for the table */ UT_LIST_ADD_LAST(indexes, table->indexes, new_index); @@ -3334,7 +3351,29 @@ dict_foreign_find_index( return(NULL); } - +#ifdef WITH_WSREP +dict_index_t* +wsrep_dict_foreign_find_index( +/*====================*/ + dict_table_t* table, /*!< in: table */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + dict_index_t* types_idx, /*!< in: NULL or an index to whose types the + column types must match */ + ibool check_charsets, + /*!< in: whether to check charsets. 
+ only has an effect if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of the columns must + be declared NOT NULL */ +{ + return dict_foreign_find_index( + table, col_names, columns, n_cols, types_idx, check_charsets, + check_null); +} +#endif /* WITH_WSREP */ /**********************************************************************//** Report an error in a foreign key definition. */ static diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 1eac9e0df51..001623a49bc 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -194,7 +194,7 @@ dict_stats_persistent_storage_check( {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, + {"last_update", DATA_FIXBINARY, DATA_NOT_NULL, 4}, {"n_rows", DATA_INT, @@ -225,7 +225,7 @@ dict_stats_persistent_storage_check( {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, + {"last_update", DATA_FIXBINARY, DATA_NOT_NULL, 4}, {"stat_name", DATA_VARMYSQL, @@ -496,6 +496,9 @@ dict_stats_table_clone_create( heap, idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + idx->stat_defrag_n_page_split = 0; + idx->stat_defrag_n_pages_freed = 0; } ut_d(t->magic_n = DICT_TABLE_MAGIC_N); @@ -525,7 +528,9 @@ static void dict_stats_empty_index( /*===================*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { ut_ad(!(index->type & DICT_FTS)); ut_ad(!dict_index_is_univ(index)); @@ -540,6 +545,34 @@ dict_stats_empty_index( index->stat_index_size = 1; index->stat_n_leaf_pages = 1; + + if (empty_defrag_stats) { + dict_stats_empty_defrag_stats(index); + dict_stats_empty_defrag_summary(index); + } +} + +/**********************************************************************//** +Clear defragmentation summary. 
*/ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_n_pages_freed = 0; +} + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; } /*********************************************************************//** @@ -549,7 +582,9 @@ static void dict_stats_empty_table( /*===================*/ - dict_table_t* table) /*!< in/out: table */ + dict_table_t* table, /*!< in/out: table */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { /* Zero the stats members */ @@ -574,7 +609,7 @@ dict_stats_empty_table( ut_ad(!dict_index_is_univ(index)); - dict_stats_empty_index(index); + dict_stats_empty_index(index, empty_defrag_stats); } table->stat_initialized = TRUE; @@ -709,7 +744,7 @@ dict_stats_copy( } if (!INDEX_EQ(src_idx, dst_idx)) { - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); continue; } @@ -720,7 +755,7 @@ dict_stats_copy( /* Since src is smaller some elements in dst will remain untouched by the following memmove(), thus we init all of them here. 
*/ - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { n_copy_el = dst_idx->n_uniq; } @@ -740,6 +775,13 @@ dict_stats_copy( dst_idx->stat_index_size = src_idx->stat_index_size; dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; } dst->stat_initialized = TRUE; @@ -763,6 +805,9 @@ dict_index_t::stat_n_sample_sizes[] dict_index_t::stat_n_non_null_key_vals[] dict_index_t::stat_index_size dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split The returned object should be freed with dict_stats_snapshot_free() when no longer needed. @return incomplete table object */ @@ -812,7 +857,9 @@ dict_stats_snapshot_free( Calculates new estimates for index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics -before the Persistent Statistics feature was introduced. */ +before the Persistent Statistics feature was introduced. +This function doesn't update the defragmentation related stats. +Only persistent statistics supports defragmentation stats. */ static void dict_stats_update_transient_for_index( @@ -828,10 +875,10 @@ dict_stats_update_transient_for_index( Initialize some bogus index cardinality statistics, so that the data can be queried in various means, also via secondary indexes. 
*/ - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG } else if (ibuf_debug && !dict_index_is_clust(index)) { - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ } else { mtr_t mtr; @@ -852,7 +899,7 @@ dict_stats_update_transient_for_index( switch (size) { case ULINT_UNDEFINED: - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); return; case 0: /* The root node of the tree is a leaf */ @@ -887,7 +934,7 @@ dict_stats_update_transient( if (dict_table_is_discarded(table)) { /* Nothing to do. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } else if (index == NULL) { /* Table definition is corrupt */ @@ -897,7 +944,7 @@ dict_stats_update_transient( fprintf(stderr, " InnoDB: table %s has no indexes. " "Cannot calculate statistics.\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } @@ -909,7 +956,7 @@ dict_stats_update_transient( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -1903,7 +1950,7 @@ dict_stats_analyze_index( DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); mtr_start(&mtr); @@ -2201,7 +2248,7 @@ dict_stats_update_persistent( /* Table definition is corrupt */ dict_table_stats_unlock(table, RW_X_LATCH); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_CORRUPTION); } @@ -2230,7 +2277,7 @@ dict_stats_update_persistent( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -2803,6 +2850,16 @@ dict_stats_fetch_index_stats_step( == 0) { index->stat_n_leaf_pages = (ulint) stat_value; 
arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { @@ -2922,7 +2979,7 @@ dict_stats_fetch_from_ps( the persistent storage contains incomplete stats (e.g. missing stats for some index) then we would end up with (partially) uninitialized stats. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); trx = trx_allocate_for_background(); @@ -3024,6 +3081,22 @@ dict_stats_fetch_from_ps( } /*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** Fetches or calculates new estimates for index statistics. */ UNIV_INTERN void @@ -3099,13 +3172,13 @@ dict_stats_update( "because the .ibd file is missing. 
For help, please " "refer to " REFMAN "innodb-troubleshooting.html\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_TABLESPACE_DELETED); } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return(DB_SUCCESS); } @@ -3168,7 +3241,7 @@ dict_stats_update( case DICT_STATS_EMPTY_TABLE: - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); /* If table is using persistent stats, then save the stats on disk */ @@ -3231,6 +3304,7 @@ dict_stats_update( t->stats_last_recalc = table->stats_last_recalc; t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); switch (err) { case DB_SUCCESS: @@ -3241,7 +3315,7 @@ dict_stats_update( copying because dict_stats_table_clone_create() does skip corrupted indexes so our dummy object 't' may have less indexes than the real object 'table'. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); dict_stats_copy(table, t); @@ -3811,6 +3885,117 @@ dict_stats_rename_table( return(ret); } +/*********************************************************************//** +Save defragmentation result. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + lint now = (lint) ut_time(); + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_pages_freed", + index->stat_defrag_n_pages_freed, + NULL, + "Number of pages freed during" + " last defragmentation run.", + NULL); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return (ret); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + + if (index->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + ".ibd file is missing.\n"); + return (DB_TABLESPACE_DELETED); + } + if (dict_index_is_corrupted(index)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + "index is corrupted.\n"); + return(DB_CORRUPTION); + } + + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + + lint now = (lint) ut_time(); + mtr_t mtr; + ulint n_leaf_pages; + ulint n_leaf_reserved; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr_commit(&mtr); + + if (n_leaf_reserved == ULINT_UNDEFINED) { + // The index name is different during fast index creation, + // so the stats won't be associated with the right index + // for later use. We just return without saving. 
+ return DB_SUCCESS; + } + + rw_lock_x_lock(&dict_operation_lock); + + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, + NULL, + "Number of new page splits on leaves" + " since last defragmentation.", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_defrag", + n_leaf_pages, + NULL, + "Number of leaf pages when this stat is saved to disk", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_reserved", + n_leaf_reserved, + NULL, + "Number of pages reserved for this index leaves when this stat " + "is saved to disk", + NULL); + +end: + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + return (ret); +} + /* tests @{ */ #ifdef UNIV_COMPILE_TEST_FUNCS diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index ecd723ca39a..0089f9897ae 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov #include "row0mysql.h" #include "srv0start.h" +#include "dict0dict.h" #include "dict0stats.h" #include "dict0stats_bg.h" @@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL; /** This mutex protects the "recalc_pool" variable. 
*/ static ib_mutex_t recalc_pool_mutex; +static ib_mutex_t defrag_pool_mutex; #ifdef HAVE_PSI_INTERFACE static mysql_pfs_key_t recalc_pool_mutex_key; +static mysql_pfs_key_t defrag_pool_mutex_key; #endif /* HAVE_PSI_INTERFACE */ /** The number of tables that can be added to "recalc_pool" before @@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool; typedef recalc_pool_t::iterator recalc_pool_iterator_t; +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; +typedef std::vector<defrag_pool_item_t> defrag_pool_t; +static defrag_pool_t defrag_pool; +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + /*****************************************************************//** Initialize the recalc pool, called once during thread initialization. */ static void -dict_stats_recalc_pool_init() +dict_stats_pool_init() /*=========================*/ { ut_ad(!srv_read_only_mode); recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); + defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS); } /*****************************************************************//** @@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during thread de-initialization. */ static void -dict_stats_recalc_pool_deinit() -/*===========================*/ +dict_stats_pool_deinit() +/*====================*/ { ut_ad(!srv_read_only_mode); recalc_pool.clear(); + defrag_pool.clear(); /* recalc_pool may still have its buffer allocated. It will free it when its destructor is called. @@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit() memory. 
To avoid that, we force recalc_pool to surrender its buffer to empty_pool object, which will free it when leaving this function: */ - recalc_pool_t empty_pool; - recalc_pool.swap(empty_pool); + recalc_pool_t recalc_empty_pool; + defrag_pool_t defrag_empty_pool; + memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t)); + memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t)); + recalc_pool.swap(recalc_empty_pool); + defrag_pool.swap(defrag_empty_pool); } /*****************************************************************//** @@ -188,6 +206,111 @@ dict_stats_recalc_pool_del( } /*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index) /*!< in: table to add */ +{ + defrag_pool_item_t item; + + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + /* quit if already in the list */ + for (defrag_pool_iterator_t iter = defrag_pool.begin(); + iter != defrag_pool.end(); + ++iter) { + if ((*iter).table_id == index->table->id + && (*iter).index_id == index->id) { + mutex_exit(&defrag_pool_mutex); + return; + } + } + + item.table_id = index->table->id; + item.index_id = index->id; + defrag_pool.push_back(item); + + mutex_exit(&defrag_pool_mutex); + + os_event_set(dict_stats_event); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. 
+@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mutex_exit(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mutex_exit(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index) /*!< in: if given, remove this index */ +{ + ut_a((table && !index) || (!table && index)); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&defrag_pool_mutex); + + defrag_pool_iterator_t iter = defrag_pool.begin(); + while (iter != defrag_pool.end()) { + if ((table && (*iter).table_id == table->id) + || (index + && (*iter).table_id == index->table->id + && (*iter).index_id == index->id)) { + /* erase() invalidates the iterator */ + iter = defrag_pool.erase(iter); + if (index) + break; + } else { + iter++; + } + } + + mutex_exit(&defrag_pool_mutex); +} + +/*****************************************************************//** Wait until background stats thread has stopped using the specified table. 
The caller must have locked the data dictionary using row_mysql_lock_data_dictionary() and this function may unlock it temporarily @@ -237,7 +360,10 @@ dict_stats_thread_init() mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, SYNC_STATS_AUTO_RECALC); - dict_stats_recalc_pool_init(); + /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */ + mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex, + SYNC_STATS_DEFRAG); + dict_stats_pool_init(); } /*****************************************************************//** @@ -251,11 +377,14 @@ dict_stats_thread_deinit() ut_a(!srv_read_only_mode); ut_ad(!srv_dict_stats_thread_active); - dict_stats_recalc_pool_deinit(); + dict_stats_pool_deinit(); mutex_free(&recalc_pool_mutex); memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + mutex_free(&defrag_pool_mutex); + memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex)); + os_event_free(dict_stats_event); dict_stats_event = NULL; } @@ -333,6 +462,63 @@ dict_stats_process_entry_from_recalc_pool() } /*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +static +void +dict_stats_process_entry_from_defrag_pool() +/*=======================================*/ +{ + table_id_t table_id; + index_id_t index_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first index from the auto defrag pool */ + if (!dict_stats_defrag_pool_get(&table_id, &index_id)) { + /* no index in defrag pool */ + return; + } + + dict_table_t* table; + + mutex_enter(&dict_sys->mutex); + + /* If the table is no longer cached, we've already lost the in + memory stats so there's nothing really to write to disk. 
*/ + table = dict_table_open_on_id(table_id, TRUE, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + + if (table == NULL) { + mutex_exit(&dict_sys->mutex); + return; + } + + /* Check whether table is corrupted */ + if (table->corrupted) { + dict_table_close(table, TRUE, FALSE); + mutex_exit(&dict_sys->mutex); + return; + } + mutex_exit(&dict_sys->mutex); + + dict_index_t* index = dict_table_find_index_on_id(table, index_id); + + if (index == NULL) { + return; + } + + /* Check whether index is corrupted */ + if (dict_index_is_corrupted(index)) { + dict_table_close(table, FALSE, FALSE); + return; + } + + dict_stats_save_defrag_stats(index); + dict_table_close(table, FALSE, FALSE); +} + +/*****************************************************************//** This is the thread for background stats gathering. It pops tables, from the auto recalc list and proceeds them, eventually recalculating their statistics. @@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)( dict_stats_process_entry_from_recalc_pool(); + while (defrag_pool.size()) + dict_stats_process_entry_from_defrag_pool(); + os_event_reset(dict_stats_event); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index ba0476b1772..bc12774d475 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,6 +25,8 @@ Created 10/25/1995 Heikki Tuuri *******************************************************/ #include "fil0fil.h" +#include "fil0pagecompress.h" +#include "fsp0pagecompress.h" #include <debug_sync.h> #include <my_dbug.h> @@ -45,6 +48,7 @@ Created 10/25/1995 Heikki Tuuri #include "page0zip.h" #include "trx0sys.h" #include "row0mysql.h" +#include "os0file.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -54,6 +58,13 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -260,11 +271,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -289,18 +305,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -318,6 +338,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. 
*/ UNIV_INLINE @@ -538,8 +571,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -556,7 +590,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -573,6 +607,8 @@ fil_node_open_file( size_bytes = os_file_get_size(node->handle); ut_a(size_bytes != (os_offset_t) -1); + + node->file_block_size = os_file_get_block_size(node->handle, node->name); #ifdef UNIV_HOTBACKUP if (space->id == 0) { node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); @@ -604,10 +640,14 @@ fil_node_open_file( set */ page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); - success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE); + success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE, + space->flags); + space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); + ut_free(buf2); @@ -658,13 +698,28 @@ fil_node_open_file( ut_error; } - if (size_bytes >= 1024 * 1024) { - /* Truncate the size to whole megabytes. */ - size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + /* Truncate the size to whole extent size. 
*/ + size_bytes = ut_2pow_round(size_bytes, + FSP_EXTENT_SIZE * + UNIV_PAGE_SIZE); } if (!fsp_flags_is_compressed(flags)) { - node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + node->size = (ulint) + (size_bytes + / fsp_flags_get_page_size(flags)); } else { node->size = (ulint) (size_bytes @@ -677,6 +732,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -687,18 +744,22 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); + } + + if (node->file_block_size == 0) { + node->file_block_size = os_file_get_block_size(node->handle, node->name); } ut_a(ret); @@ -1069,7 +1130,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. 
*/ do { @@ -1723,12 +1783,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -1816,6 +1876,9 @@ fil_check_first_page( flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) { + fprintf(stderr, "InnoDB: Error: Current page size %lu != page size on page %lu\n", + UNIV_PAGE_SIZE, fsp_flags_get_page_size(flags)); + return("innodb-page-size mismatch"); } @@ -1868,8 +1931,10 @@ fil_read_first_page( #endif /* UNIV_LOG_ARCHIVE */ lsn_t* min_flushed_lsn, /*!< out: min of flushed lsn values in data files */ - lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn_t* max_flushed_lsn, /*!< out: max of flushed lsn values in data files */ + ulint orig_space_id) /*!< in: original file space + id */ { byte* buf; byte* page; @@ -1882,7 +1947,10 @@ fil_read_first_page( page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); - os_file_read(data_file, page, 0, UNIV_PAGE_SIZE); + os_file_read(data_file, page, 0, UNIV_PAGE_SIZE, + orig_space_id != ULINT_UNDEFINED ? + fil_space_is_page_compressed(orig_space_id) : + FALSE); /* The FSP_HEADER on page 0 is only valid for the first file in a tablespace. So if this is not the first datafile, leave @@ -1891,7 +1959,16 @@ fil_read_first_page( if (!one_read_already) { *flags = fsp_header_get_flags(page); *space_id = fsp_header_get_space_id(page); + } + /* Page is page compressed page, need to decompress, before + continue. 
*/ + if (fil_page_is_compressed(page)) { + ulint write_size=0; + fil_decompress_page(NULL, page, UNIV_PAGE_SIZE, &write_size); + } + + if (!one_read_already) { check_msg = fil_check_first_page(page); } @@ -3022,7 +3099,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3038,10 +3115,10 @@ fil_create_link_file( ut_print_filename(stderr, filepath); fputs(" already exists.\n", stderr); err = DB_TABLESPACE_EXISTS; - } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; - + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3052,7 +3129,7 @@ fil_create_link_file( } if (!os_file_write(link_filepath, file, filepath, 0, - strlen(filepath))) { + strlen(filepath))) { err = DB_ERROR; } @@ -3131,8 +3208,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3146,7 +3224,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3201,6 +3279,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3233,7 +3312,8 
@@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3260,6 +3340,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3298,6 +3383,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3474,16 +3560,25 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex))); - if (!fsp_flags_is_valid(flags)) { + /* Table flags can be ULINT_UNDEFINED if + dict_tf_to_fsp_flags_failure is set. */ + if (flags != ULINT_UNDEFINED) { + if (!fsp_flags_is_valid(flags)) { + return(DB_CORRUPTION); + } + } else { return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; @@ -3508,7 +3603,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. 
*/ @@ -3536,7 +3631,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3548,7 +3643,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -3567,7 +3662,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &space_arch_log_no, &space_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &def.lsn, &def.lsn); + &def.lsn, &def.lsn, id); def.valid = !def.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3592,7 +3687,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &remote.arch_log_no, &remote.arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &remote.lsn, &remote.lsn); + &remote.lsn, &remote.lsn, id); remote.valid = !remote.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3618,7 +3713,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &dict.arch_log_no, &dict.arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &dict.lsn, &dict.lsn); + &dict.lsn, &dict.lsn, id); dict.valid = !dict.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3882,7 +3977,8 @@ fil_user_tablespace_find_space_id( for (ulint j = 0; j < page_count; ++j) { - st = os_file_read(fsp->file, page, (j* page_size), page_size); + st = os_file_read(fsp->file, page, (j* page_size), page_size, + fsp_flags_is_page_compressed(fsp->flags)); if (!st) { ib_logf(IB_LOG_LEVEL_INFO, @@ -3995,7 +4091,7 @@ fil_user_tablespace_restore_page( err = os_file_write(fsp->filepath, fsp->file, page, (zip_size ? 
zip_size : page_size) * page_no, - buflen); + buflen); os_file_flush(fsp->file); out: @@ -4022,7 +4118,7 @@ check_first_page: #ifdef UNIV_LOG_ARCHIVE &fsp->arch_log_no, &fsp->arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &fsp->lsn, &fsp->lsn)) { + &fsp->lsn, &fsp->lsn, ULINT_UNDEFINED)) { ib_logf(IB_LOG_LEVEL_ERROR, "%s in tablespace %s (table %s)", check_msg, fsp->filepath, tablename); @@ -4095,9 +4191,7 @@ fil_load_single_table_tablespace( fsp_open_info def; fsp_open_info remote; os_offset_t size; -#ifdef UNIV_HOTBACKUP fil_space_t* space; -#endif memset(&def, 0, sizeof(def)); memset(&remote, 0, sizeof(remote)); @@ -4119,7 +4213,8 @@ fil_load_single_table_tablespace( one of them is sent to this function. So if this table has already been loaded, there is nothing to do.*/ mutex_enter(&fil_system->mutex); - if (fil_space_get_by_name(tablename)) { + space = fil_space_get_by_name(tablename); + if (space) { mem_free(tablename); mutex_exit(&fil_system->mutex); return; @@ -4144,7 +4239,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4159,7 +4254,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. 
*/ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &def.success); + OS_FILE_READ_WRITE, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -4887,6 +4982,7 @@ retry: } page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { page_size = UNIV_PAGE_SIZE; } @@ -4924,6 +5020,11 @@ retry: start_page_no = space->size; file_start_page_no = space->size - node->size; + /* Determine correct file block size */ + if (node->file_block_size == 0) { + node->file_block_size = os_file_get_block_size(node->handle, node->name); + } + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { os_offset_t start_offset = start_page_no * page_size; @@ -4935,16 +5036,18 @@ retry: "space for file \'%s\' failed. Current size " INT64PF ", desired size " INT64PF "\n", node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); success = FALSE; } else { success = TRUE; } mutex_enter(&fil_system->mutex); + if (success) { - node->size += n_pages; - space->size += n_pages; + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); + os_has_said_disk_full = FALSE; } @@ -4980,7 +5083,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL); + node, NULL, 0, FALSE, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5074,7 +5177,7 @@ fil_extend_tablespaces_to_stored_len(void) single-threaded operation */ error = fil_read(TRUE, space->id, fsp_flags_get_zip_size(space->flags), - 0, 0, UNIV_PAGE_SIZE, buf, NULL); + 0, 0, UNIV_PAGE_SIZE, buf, NULL, 0); ut_a(error == DB_SUCCESS); size_in_header = fsp_get_size_low(buf); @@ -5354,8 +5457,13 @@ fil_io( void* buf, /*!< in/out: buffer 
where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ulint mode; fil_space_t* space; @@ -5365,6 +5473,8 @@ fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5418,6 +5528,11 @@ fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5543,6 +5658,9 @@ fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); + #ifdef UNIV_HOTBACKUP /* In mysqlbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -5555,7 +5673,8 @@ fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message); + offset, len, node, message, write_size, + page_compressed, page_compression_level); #endif /* UNIV_HOTBACKUP */ @@ -6095,7 +6214,8 @@ fil_iterate( ut_ad(!(n_bytes % iter.page_size)); if (!os_file_read(iter.file, io_buffer, offset, - (ulint) n_bytes)) { + (ulint) n_bytes, + fil_space_is_page_compressed(space_id))) { ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed"); @@ -6182,7 +6302,7 @@ fil_tablespace_iterate( file = 
os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6234,7 +6354,8 @@ fil_tablespace_iterate( /* Read the first page and determine the page and zip size. */ - if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) { + if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE, + dict_tf_get_page_compression(table->flags))) { err = DB_IO_ERROR; @@ -6400,3 +6521,87 @@ fil_mtr_rename_log( 0, 0, new_name, old_name, mtr); } } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} + +/*******************************************************************//** +Return page type name */ +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type) /*!< in: FIL_PAGE_TYPE */ +{ + switch(page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + return (const char*)"PAGE_COMPRESSED"; + case FIL_PAGE_INDEX: + return (const char*)"INDEX"; + case FIL_PAGE_UNDO_LOG: + return (const char*)"UNDO LOG"; + case FIL_PAGE_INODE: + return (const char*)"INODE"; + case FIL_PAGE_IBUF_FREE_LIST: + return (const char*)"IBUF_FREE_LIST"; + case FIL_PAGE_TYPE_ALLOCATED: + return (const char*)"ALLOCATED"; + case FIL_PAGE_IBUF_BITMAP: + return (const char*)"IBUF_BITMAP"; + case FIL_PAGE_TYPE_SYS: + return (const 
char*)"SYS"; + case FIL_PAGE_TYPE_TRX_SYS: + return (const char*)"TRX_SYS"; + case FIL_PAGE_TYPE_FSP_HDR: + return (const char*)"FSP_HDR"; + case FIL_PAGE_TYPE_XDES: + return (const char*)"XDES"; + case FIL_PAGE_TYPE_BLOB: + return (const char*)"BLOB"; + case FIL_PAGE_TYPE_ZBLOB: + return (const char*)"ZBLOB"; + case FIL_PAGE_TYPE_ZBLOB2: + return (const char*)"ZBLOB2"; + case FIL_PAGE_TYPE_COMPRESSED: + return (const char*)"ORACLE PAGE COMPRESSED"; + default: + return (const char*)"PAGE TYPE CORRUPTED"; + } +} +/****************************************************************//** +Get block size from fil node +@return block size*/ +ulint +fil_node_get_block_size( +/*====================*/ + fil_node_t* node) /*!< in: Node where to get block + size */ +{ + return (node->file_block_size); +} diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..2b0196c9017 --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,740 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include <debug_sync.h> +#include <my_dbug.h> + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" +#ifdef HAVE_LZ4 +#include "lz4.h" +#endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif +#ifdef HAVE_LZMA +#include "lzma.h" +#endif +#ifdef HAVE_BZIP2 +#include "bzlib.h" +#endif + +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. 
*/ +static +void +fil_decompress_page_2( +/*==================*/ + byte* page_buf, /*!< out: destination buffer for + uncompressed data */ + byte* buf, /*!< in: source compressed data */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ +{ + ulint page_type = mach_read_from_2(buf + FIL_PAGE_TYPE); + + if (page_type != FIL_PAGE_TYPE_COMPRESSED) { + /* It is not a compressed page */ + return; + } + + byte* ptr = buf + FIL_PAGE_DATA; + ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION); + int err = 0; + + ut_a(version == 1); + + /* Read the original page type, before we compressed the data. */ + page_type = mach_read_from_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1); + + ulint original_len = mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1); + + if (original_len < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8) + || original_len > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA + || len < original_len + FIL_PAGE_DATA) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: Original len %lu len %lu.\n", + original_len, len); + + fflush(stderr); + ut_error; + + } + + ulint algorithm = mach_read_from_1(buf + FIL_PAGE_ALGORITHM_V1); + + switch(algorithm) { + case PAGE_ZLIB_ALGORITHM: { + + fprintf(stderr, "InnoDB: [Note]: zlib\n"); + + err = uncompress(page_buf, &len, ptr, original_len); + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, original_len, len); + + fflush(stderr); + + ut_error; + } + + break; + } +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: { + fprintf(stderr, "InnoDB: [Note]: lz4\n"); + err = LZ4_decompress_fast( + (const char*) ptr, (char*) (page_buf), original_len); + + if (err < 0) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but 
decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, original_len, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZ4 */ + +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + + lzma_ret ret; + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + fprintf(stderr, "InnoDB: [Note]: lzma\n"); + ret = lzma_stream_buffer_decode( + &memlimit, + 0, + NULL, + ptr, + &src_pos, + original_len, + (page_buf), + &dst_pos, + len); + + + if (ret != LZMA_OK || (dst_pos <= 0 || dst_pos > len)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + dst_pos, original_len, len); + fflush(stderr); + + ut_error; + } + + break; + } +#endif /* HAVE_LZMA */ + +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: { + ulint olen = 0; + fprintf(stderr, "InnoDB: [Note]: lzo \n"); + err = lzo1x_decompress((const unsigned char *)ptr, + original_len,(unsigned char *)(page_buf), &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, original_len, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZO */ + + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(algorithm)); + + fflush(stderr); + ut_error; + break; + } + + /* Leave the header alone */ + memmove(buf+FIL_PAGE_DATA, page_buf, original_len); + + mach_write_to_2(buf + FIL_PAGE_TYPE, page_type); + + ut_ad(memcmp(buf + FIL_PAGE_LSN + 4, + buf + (original_len + FIL_PAGE_DATA) + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0); +} + +/****************************************************************//** +For page compressed pages compress the page 
before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint block_size, /*!< in: block size */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + ulint comp_method = innodb_compression_algorithm; /* Cache to avoid + change during + function execution */ + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + write_size = UNIV_PAGE_SIZE - header_len; + + switch(comp_method) { +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; +#endif 
/* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + size_t out_pos=0; + + err = lzma_easy_buffer_encode( + compression_level, + LZMA_CHECK_NONE, + NULL, /* No custom allocator, use malloc/free */ + reinterpret_cast<uint8_t*>(buf), + len, + reinterpret_cast<uint8_t*>(out_buf + header_len), + &out_pos, + (size_t)write_size); + + if (err != LZMA_OK || out_pos > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, out_pos); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + write_size = out_pos; + + break; + } +#endif /* HAVE_LZMA */ + +#ifdef HAVE_BZIP2 + case PAGE_BZIP2_ALGORITHM: { + + err = BZ2_bzBuffToBuffCompress( + (char *)(out_buf + header_len), + (unsigned int *)&write_size, + (char *)buf, + len, + 1, + 0, + 0); + + if (err != BZ_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + } +#endif /* HAVE_BZIP2 */ + + case PAGE_ZLIB_ALGORITHM: + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression 
failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + + case PAGE_UNCOMPRESSED: + *out_len = len; + return (buf); + break; + + default: + ut_error; + break; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, comp_method); + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == (ulint)comp_method); + + /* Verify that page can be decompressed */ + { + byte *comp_page; + byte *uncomp_page; + + comp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + uncomp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + memcpy(comp_page, out_buf, UNIV_PAGE_SIZE); + + fil_decompress_page(uncomp_page, comp_page, len, NULL); + if(buf_page_is_corrupted(false, uncomp_page, 0)) { + buf_page_print(uncomp_page, 0, BUF_PAGE_PRINT_NO_CRASH); + ut_error; + } + ut_free(comp_page); + ut_free(uncomp_page); + } +#endif /* UNIV_DEBUG */ + + write_size+=header_len; + + /* Actual write needs to be alligned on block size */ + if (write_size % block_size) { +#ifdef UNIV_DEBUG + size_t tmp = write_size; + ut_a(block_size > 0); +#endif + write_size = (size_t)ut_uint64_align_up((ib_uint64_t)write_size, block_size); +#ifdef UNIV_DEBUG + ut_a(write_size > 0 && ((write_size % 
block_size) == 0)); + ut_a(write_size >= tmp); +#endif + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + srv_stats.page_compression_saved.add((len - write_size)); + srv_stats.pages_page_compressed.inc(); + +#if defined (__linux__) && (!defined(FALLOC_FL_PUNCH_HOLE) || !defined (FALLOC_FL_KEEP_SIZE)) + if (srv_use_trim) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] System does not support FALLOC_FL_PUNCH_HOLE || FALLOC_FL_KEEP_SIZE.\n" + " InnoDB: Disabling trim for now.\n"); + srv_use_trim = FALSE; + } +#endif + + if (!srv_use_trim) { + /* If persistent trims are not used we always write full + page */ + write_size = len; + } + + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. 
*/ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + ulint ptype; + + ut_ad(buf); + ut_ad(len); + + ptype = mach_read_from_2(buf+FIL_PAGE_TYPE); + + /* Do not try to uncompressed pages that are not compressed */ + if (ptype != FIL_PAGE_PAGE_COMPRESSED && ptype != FIL_PAGE_TYPE_COMPRESSED) { + return; + } + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + } else { + in_buf = page_buf; + } + + if (ptype == FIL_PAGE_TYPE_COMPRESSED) { + + fil_decompress_page_2(in_buf, buf, len, write_size); + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } + return; + } + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + /* Store actual payload size of the compressed data. 
This pointer + points to buffer pool. */ + if (write_size) { + *write_size = actual_size; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + break; + +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, len); + + if (err != (int)actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + { + ulint olen=0; + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZO */ +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + + lzma_ret ret; + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + ret = lzma_stream_buffer_decode( + &memlimit, + 0, + NULL, + buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + &src_pos, + actual_size, + in_buf, 
+ &dst_pos, + len); + + + if (ret != LZMA_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + dst_pos, actual_size, len); + fflush(stderr); + + ut_error; + } + + break; + } +#endif /* HAVE_LZMA */ +#ifdef HAVE_BZIP2 + case PAGE_BZIP2_ALGORITHM: { + unsigned int dst_pos = UNIV_PAGE_SIZE; + + err = BZ2_bzBuffToBuffDecompress( + (char *)in_buf, + &dst_pos, + (char *)(buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE), + actual_size, + 1, + 0); + + if (err != BZ_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %du bytes.\n" + "InnoDB: size %lu len %lu err %d\n", + dst_pos, actual_size, len, err); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_BZIP2 */ + + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + break; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 3c52a35b2b4..2d6b9881bd3 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,7 +4,7 @@ Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. 
Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2014 SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "buf0flu.h" #include "buf0dblwr.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "os0file.h" #include "os0thread.h" #include "srv0start.h" @@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0trx.h" #include "trx0sys.h" -#include "mtr0mtr.h" #include "rem0types.h" #include "row0ins.h" #include "row0mysql.h" @@ -86,6 +86,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "dict0stats_bg.h" #include "ha_prototypes.h" #include "ut0mem.h" +#include "ut0timer.h" #include "ibuf0ibuf.h" #include "dict0dict.h" #include "srv0mon.h" @@ -101,6 +102,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -112,10 +114,40 @@ this program; if not, write to the Free Software Foundation, Inc., #include "ha_innodb.h" #include "i_s.h" +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> + # ifndef MYSQL_PLUGIN_IMPORT # define MYSQL_PLUGIN_IMPORT /* nothing */ # endif /* MYSQL_PLUGIN_IMPORT */ +#ifdef WITH_WSREP +#include "dict0priv.h" +#include "../storage/innobase/include/ut0byte.h" +#include <mysql/service_md5.h> + +class binlog_trx_data; +extern handlerton *binlog_hton; + +extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log; + +static inline wsrep_ws_handle_t* +wsrep_ws_handle(THD* thd, const trx_t* trx) { + return wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), + (wsrep_trx_id_t)trx->id); 
+} + +extern TC_LOG* tc_log; +extern void wsrep_cleanup_transaction(THD *thd); +static int +wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, + my_bool signal); +static void +wsrep_fake_trx_id(handlerton* hton, THD *thd); +static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); +#endif /* WITH_WSREP */ + /** to protect innobase_open_files */ static mysql_mutex_t innobase_share_mutex; /** to force correct commit order in binlog */ @@ -224,12 +256,12 @@ static TYPELIB innodb_stats_method_typelib = { /** Possible values for system variable "innodb_checksum_algorithm". */ static const char* innodb_checksum_algorithm_names[] = { - "crc32", - "strict_crc32", - "innodb", - "strict_innodb", - "none", - "strict_none", + "CRC32", + "STRICT_CRC32", + "INNODB", + "STRICT_INNODB", + "NONE", + "STRICT_NONE", NullS }; @@ -501,6 +533,28 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cursor_stmt_begin }; +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option user can enable atomic writes feature for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. 
@@ -536,7 +590,27 @@ static inline ulint innobase_map_isolation_level( /*=========================*/ - enum_tx_isolation iso); /*!< in: MySQL isolation level code */ + enum_tx_isolation iso); /*!< in: MySQL isolation level code + */ + +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. */ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static ibool innodb_have_lzo=IF_LZO(1, 0); +static ibool innodb_have_lz4=IF_LZ4(1, 0); +static ibool innodb_have_lzma=IF_LZMA(1, 0); +static ibool innodb_have_bzip2=IF_BZIP2(1, 0); static const char innobase_hton_name[]= "InnoDB"; @@ -689,6 +763,68 @@ static SHOW_VAR innodb_status_variables[]= { {"purge_view_trx_id_age", (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG}, #endif /* UNIV_DEBUG */ + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect1024", + (char*) &export_vars.innodb_page_compression_trim_sect1024, SHOW_LONGLONG}, + {"page_compression_trim_sect2048", + (char*) &export_vars.innodb_page_compression_trim_sect2048, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"page_compression_trim_sect8192", + (char*) &export_vars.innodb_page_compression_trim_sect8192, SHOW_LONGLONG}, + {"page_compression_trim_sect16384", + (char*) &export_vars.innodb_page_compression_trim_sect16384, SHOW_LONGLONG}, + 
{"page_compression_trim_sect32768", + (char*) &export_vars.innodb_page_compression_trim_sect32768, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {"have_lz4", + (char*) &innodb_have_lz4, SHOW_BOOL}, + {"have_lzo", + (char*) &innodb_have_lzo, SHOW_BOOL}, + {"have_lzma", + (char*) &innodb_have_lzma, SHOW_BOOL}, + {"have_bzip2", + (char*) &innodb_have_bzip2, SHOW_BOOL}, + + /* Defragmentation */ + {"defragment_compression_failures", + (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG}, + {"defragment_failures", + (char*) &export_vars.innodb_defragment_failures, SHOW_LONG}, + {"defragment_count", + (char*) &export_vars.innodb_defragment_count, SHOW_LONG}, + + /* Online alter table status variables */ + {"onlineddl_rowlog_rows", + (char*) &export_vars.innodb_onlineddl_rowlog_rows, SHOW_LONG}, + {"onlineddl_rowlog_pct_used", + (char*) &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_LONG}, + {"onlineddl_pct_progress", + (char*) &export_vars.innodb_onlineddl_pct_progress, SHOW_LONG}, + + /* Times secondary index lookup triggered cluster lookup and + times prefix optimization avoided triggering cluster lookup */ + {"secondary_index_triggered_cluster_reads", + (char*) &export_vars.innodb_sec_rec_cluster_reads, SHOW_LONG}, + {"secondary_index_triggered_cluster_reads_avoided", + (char*) &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_LONG}, + + {NullS, 
NullS, SHOW_LONG} }; @@ -1190,6 +1326,10 @@ innobase_srv_conc_enter_innodb( /*===========================*/ trx_t* trx) /*!< in: transaction handle */ { +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ if (srv_thread_concurrency) { if (trx->n_tickets_to_enter_innodb > 0) { @@ -1224,6 +1364,10 @@ innobase_srv_conc_exit_innodb( #ifdef UNIV_SYNC_DEBUG ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ /* This is to avoid making an unnecessary function call. */ if (trx->declared_to_be_inside_innodb @@ -1344,6 +1488,15 @@ thd_to_trx( { return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr)); } +#ifdef WITH_WSREP +ulonglong +thd_to_trx_id( +/*=======*/ + THD* thd) /*!< in: MySQL thread */ +{ + return(thd_to_trx(thd)->id); +} +#endif /* WITH_WSREP */ /********************************************************************//** Call this function when mysqld passes control to the client. That is to @@ -1829,6 +1982,9 @@ int innobase_mysql_tmpfile(void) /*========================*/ { +#ifdef WITH_INNODB_DISALLOW_WRITES + os_event_wait(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ int fd2 = -1; File fd; @@ -2285,9 +2441,11 @@ ha_innobase::ha_innobase( HA_BINLOG_ROW_CAPABLE | HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + (srv_force_primary_key ? 
HA_REQUIRE_PRIMARY_KEY : 0 ) | HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), start_of_scan(0), - num_write_row(0) + num_write_row(0), + ha_partition_stats(NULL) {} /*********************************************************************//** @@ -2907,11 +3065,19 @@ innobase_init( innobase_hton->release_temporary_latches = innobase_release_temporary_latches; +#ifdef WITH_WSREP + innobase_hton->abort_transaction=wsrep_abort_transaction; + innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint; + innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint; + innobase_hton->fake_trx_id=wsrep_fake_trx_id; +#endif /* WITH_WSREP */ innobase_hton->kill_query = innobase_kill_query; if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -2946,6 +3112,58 @@ innobase_init( } } + if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_DEF) { + fprintf(stderr, + "InnoDB: Warning: innodb_page_size has been " + "changed from default value %d to %ldd. (###EXPERIMENTAL### " + "operation)\n", UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE); + + /* There is hang on buffer pool when trying to get a new + page if buffer pool size is too small for large page sizes */ + if (innobase_buffer_pool_size < (24 * 1024 * 1024)) { + fprintf(stderr, "InnoDB: Error: innobase_page_size %lu requires " + "innodb_buffer_pool_size > 24M current %lld", + UNIV_PAGE_SIZE, innobase_buffer_pool_size); + goto error; + } + } + +#ifndef HAVE_LZ4 + if (innodb_compression_algorithm == PAGE_LZ4_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblz4 is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_LZO + if (innodb_compression_algorithm == PAGE_LZO_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzo is not installed. 
\n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_LZMA + if (innodb_compression_algorithm == PAGE_LZMA_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzma is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_BZIP2 + if (innodb_compression_algorithm == PAGE_BZIP2_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: libbz2 is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + os_innodb_umask = (ulint) my_umask; /* First calculate the default path for innodb_data_home_dir etc., @@ -3519,10 +3737,30 @@ innobase_commit_low( /*================*/ trx_t* trx) /*!< in: transaction handle */ { +#ifdef WITH_WSREP + THD* thd = (THD*)trx->mysql_thd; + const char* tmp = 0; + if (wsrep_on(thd)) { +#ifdef WSREP_PROC_INFO + char info[64]; + info[sizeof(info) - 1] = '\0'; + snprintf(info, sizeof(info) - 1, + "innobase_commit_low():trx_commit_for_mysql(%lld)", + (long long) wsrep_thd_trx_seqno(thd)); + tmp = thd_proc_info(thd, info); + +#else + tmp = thd_proc_info(thd, "innobase_commit_low()"); +#endif /* WSREP_PROC_INFO */ + } +#endif /* WITH_WSREP */ if (trx_is_started(trx)) { trx_commit_for_mysql(trx); } +#ifdef WITH_WSREP + if (wsrep_on(thd)) { thd_proc_info(thd, tmp); } +#endif /* WITH_WSREP */ } /*****************************************************************//** @@ -4244,6 +4482,20 @@ innobase_kill_query( DBUG_ENTER("innobase_kill_query"); DBUG_ASSERT(hton == innodb_hton_ptr); +#ifdef WITH_WSREP + wsrep_thd_LOCK(thd); + if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) { + /* if victim has been signaled by BF thread and/or aborting + is already progressing, following query aborting is not necessary + any more. 
+ Also, BF thread should own trx mutex for the victim, which would + conflict with trx_mutex_enter() below + */ + wsrep_thd_UNLOCK(thd); + DBUG_VOID_RETURN; + } + wsrep_thd_UNLOCK(thd); +#endif /* WITH_WSREP */ trx = thd_to_trx(thd); if (trx) { @@ -4251,7 +4503,7 @@ innobase_kill_query( THD *owner = trx->current_lock_mutex_owner; /* Cancel a pending lock request. */ - if (owner != cur) { + if (!owner || owner != cur) { lock_mutex_enter(); } trx_mutex_enter(trx); @@ -4259,7 +4511,7 @@ innobase_kill_query( lock_cancel_waiting_and_release(trx->lock.wait_lock); } trx_mutex_exit(trx); - if (owner != cur) { + if (!owner || owner != cur) { lock_mutex_exit(); } } @@ -4418,7 +4670,11 @@ ha_innobase::max_supported_key_length() const case 8192: return(1536); default: +#ifdef WITH_WSREP + return(3500); +#else return(3500); +#endif } } @@ -5525,6 +5781,117 @@ get_field_offset( return((uint) (field->ptr - table->record[0])); } +#ifdef WITH_WSREP +UNIV_INTERN +int +wsrep_innobase_mysql_sort( +/*===============*/ + /* out: str contains sort string */ + int mysql_type, /* in: MySQL type */ + uint charset_number, /* in: number of the charset */ + unsigned char* str, /* in: data field */ + unsigned int str_length, /* in: data field length, + not UNIV_SQL_NULL */ + unsigned int buf_length) /* in: total str buffer length */ + +{ + CHARSET_INFO* charset; + enum_field_types mysql_tp; + int ret_length = str_length; + + DBUG_ASSERT(str_length != UNIV_SQL_NULL); + + mysql_tp = (enum_field_types) mysql_type; + + switch (mysql_tp) { + + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + { + uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN; + + /* Use the charset number to pick the right charset struct for + the comparison. 
Since the MySQL function get_charset may be + slow before Bar removes the mutex operation there, we first + look at 2 common charsets directly. */ + + if (charset_number == default_charset_info->number) { + charset = default_charset_info; + } else if (charset_number == my_charset_latin1.number) { + charset = &my_charset_latin1; + } else { + charset = get_charset(charset_number, MYF(MY_WME)); + + if (charset == NULL) { + sql_print_error("InnoDB needs charset %lu for doing " + "a comparison, but MySQL cannot " + "find that charset.", + (ulong) charset_number); + ut_a(0); + } + } + + ut_a(str_length <= tmp_length); + memcpy(tmp_str, str, str_length); + + tmp_length = charset->coll->strnxfrm(charset, str, str_length, + str_length, tmp_str, + tmp_length, 0); + DBUG_ASSERT(tmp_length <= str_length); + if (wsrep_protocol_version < 3) { + tmp_length = charset->coll->strnxfrm( + charset, str, str_length, + str_length, tmp_str, tmp_length, 0); + DBUG_ASSERT(tmp_length <= str_length); + } else { + /* strnxfrm will expand the destination string, + protocols < 3 truncated the sorted sring + protocols >= 3 gets full sorted sring + */ + tmp_length = charset->coll->strnxfrm( + charset, str, buf_length, + str_length, tmp_str, str_length, 0); + DBUG_ASSERT(tmp_length <= buf_length); + ret_length = tmp_length; + } + + break; + } + case MYSQL_TYPE_DECIMAL : + case MYSQL_TYPE_TINY : + case MYSQL_TYPE_SHORT : + case MYSQL_TYPE_LONG : + case MYSQL_TYPE_FLOAT : + case MYSQL_TYPE_DOUBLE : + case MYSQL_TYPE_NULL : + case MYSQL_TYPE_TIMESTAMP : + case MYSQL_TYPE_LONGLONG : + case MYSQL_TYPE_INT24 : + case MYSQL_TYPE_DATE : + case MYSQL_TYPE_TIME : + case MYSQL_TYPE_DATETIME : + case MYSQL_TYPE_YEAR : + case MYSQL_TYPE_NEWDATE : + case MYSQL_TYPE_NEWDECIMAL : + case MYSQL_TYPE_ENUM : + case MYSQL_TYPE_SET : + case MYSQL_TYPE_GEOMETRY : + break; + default: + break; + } + + return ret_length; +} +#endif /* WITH_WSREP */ + /*************************************************************//** InnoDB 
uses this function to compare two data fields for which the data type is such that we must use MySQL code to compare them. NOTE that the prototype @@ -6025,11 +6392,313 @@ innobase_read_from_2_little_endian( return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); } +#ifdef WITH_WSREP /*******************************************************************//** Stores a key value for a row to a buffer. @return key value length as stored in buff */ UNIV_INTERN uint +wsrep_store_key_val_for_row( +/*===============================*/ + THD* thd, + TABLE* table, + uint keynr, /*!< in: key number */ + char* buff, /*!< in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/*!< in: buffer length */ + const uchar* record, + ibool* key_is_null)/*!< out: full key was null */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + uint buff_space = buff_len; + + DBUG_ENTER("wsrep_store_key_val_for_row"); + + memset(buff, 0, buff_len); + *key_is_null = TRUE; + + for (; key_part != end; key_part++) { + + uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + ibool part_is_null = FALSE; + + if (key_part->null_bit) { + if (buff_space > 0) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + part_is_null = TRUE; + } else { + *buff = 0; + } + buff++; + buff_space--; + } else { + fprintf (stderr, "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + } + } + if (!part_is_null) *key_is_null = FALSE; + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + const CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > 
buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*)field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) data, + (const char *) data + len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (wsrep_protocol_version > 1) { + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the bzero() call above. */ + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB + /* MYSQL_TYPE_GEOMETRY data is treated + as BLOB data in innodb. 
*/ + || mysql_type == MYSQL_TYPE_GEOMETRY) { + + const CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) blob_data, + (const char *) blob_data + + blob_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, blob_data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + if (wsrep_protocol_version > 1) { + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + memcpy(buff, sorted, true_len); + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. 
*/ + + const CHARSET_INFO* cs = NULL; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. */ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + cs->cset->well_formed_len(cs, + (const char *)src_start, + (const char *)src_start + + key_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + memcpy(sorted, src_start, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + } else { + memcpy(buff, src_start, true_len); + } + buff += true_len; + buff_space -= true_len; + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN((uint)(buff - buff_start)); +} +#endif /* WITH_WSREP */ +UNIV_INTERN +uint ha_innobase::store_key_val_for_row( /*===============================*/ uint keynr, /*!< in: key number */ @@ -6411,11 +7080,20 @@ build_template_field( templ->col_no = i; templ->clust_rec_field_no = dict_col_get_clust_pos(col, 
clust_index); ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED); + templ->rec_field_is_prefix = FALSE; if (dict_index_is_clust(index)) { templ->rec_field_no = templ->clust_rec_field_no; + templ->rec_prefix_field_no = ULINT_UNDEFINED; } else { - templ->rec_field_no = dict_index_get_nth_col_pos(index, i); + /* If we're in a secondary index, keep track + * of the original index position even if this + * is just a prefix index; we will use this + * later to avoid a cluster index lookup in + * some cases.*/ + + templ->rec_field_no = dict_index_get_nth_col_pos(index, i, + &templ->rec_prefix_field_no); } if (field->real_maybe_null()) { @@ -6446,6 +7124,13 @@ build_template_field( if (!dict_index_is_clust(index) && templ->rec_field_no == ULINT_UNDEFINED) { prebuilt->need_to_access_clustered = TRUE; + + if (templ->rec_prefix_field_no != ULINT_UNDEFINED) { + dict_field_t* field = dict_index_get_nth_field( + index, + templ->rec_prefix_field_no); + templ->rec_field_is_prefix = (field->prefix_len != 0); + } } if (prebuilt->mysql_prefix_len < templ->mysql_col_offset @@ -6607,7 +7292,8 @@ ha_innobase::build_template( } else { templ->icp_rec_field_no = dict_index_get_nth_col_pos( - prebuilt->index, i); + prebuilt->index, i, + NULL); } if (dict_index_is_clust(prebuilt->index)) { @@ -6637,7 +7323,7 @@ ha_innobase::build_template( templ->icp_rec_field_no = dict_index_get_nth_col_or_prefix_pos( - prebuilt->index, i, TRUE); + prebuilt->index, i, TRUE, NULL); ut_ad(templ->icp_rec_field_no != ULINT_UNDEFINED); @@ -6870,6 +7556,9 @@ ha_innobase::write_row( dberr_t error; int error_result= 0; ibool auto_inc_used= FALSE; +#ifdef WITH_WSREP + ibool auto_inc_inserted= FALSE; /* if NULL was inserted */ +#endif ulint sql_command; trx_t* trx = thd_to_trx(user_thd); @@ -6903,8 +7592,20 @@ ha_innobase::write_row( if ((sql_command == SQLCOM_ALTER_TABLE || sql_command == SQLCOM_OPTIMIZE || sql_command == SQLCOM_CREATE_INDEX +#ifdef WITH_WSREP + || (wsrep_on(user_thd) && wsrep_load_data_splitting && 
+ sql_command == SQLCOM_LOAD && + !thd_test_options( + user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) +#endif /* WITH_WSREP */ || sql_command == SQLCOM_DROP_INDEX) && num_write_row >= 10000) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) { + WSREP_DEBUG("forced trx split for LOAD: %s", + wsrep_thd_query(user_thd)); + } +#endif /* WITH_WSREP */ /* ALTER TABLE is COMMITted at every 10000 copied rows. The IX table lock for the original table has to be re-issued. As this method will be called on a temporary table where the @@ -6938,6 +7639,21 @@ no_commit: */ ; } else if (src_table == prebuilt->table) { +#ifdef WITH_WSREP + switch (wsrep_run_wsrep_commit(user_thd, 0, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } + + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); +#endif /* WITH_WSREP */ /* Source table is not in InnoDB format: no need to re-acquire locks on it. */ @@ -6948,6 +7664,21 @@ no_commit: /* We will need an IX lock on the destination table. */ prebuilt->sql_stat_start = TRUE; } else { +#ifdef WITH_WSREP + switch (wsrep_run_wsrep_commit(user_thd, 0, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } + + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); +#endif /* WITH_WSREP */ /* Ensure that there are no other table locks than LOCK_IX and LOCK_AUTO_INC on the destination table. */ @@ -6977,6 +7708,10 @@ no_commit: innobase_get_auto_increment(). */ prebuilt->autoinc_error = DB_SUCCESS; +#ifdef WITH_WSREP + auto_inc_inserted= (table->next_number_field->val_int() == 0); +#endif + if ((error_result = update_auto_increment())) { /* We don't want to mask autoinc overflow errors. 
*/ @@ -7055,6 +7790,33 @@ no_commit: case SQLCOM_REPLACE_SELECT: goto set_max_autoinc; +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: + if (wsrep_on(current_thd) && + auto_inc_inserted && + wsrep_drupal_282555_workaround && + wsrep_thd_retry_counter(current_thd) == 0 && + !thd_test_options(current_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + (*wsrep_thd_query(current_thd)) ? + wsrep_thd_query(current_thd) : + (char *)"void"); + error= DB_SUCCESS; + wsrep_thd_set_conflict_state( + current_thd, MUST_ABORT); + innobase_srv_conc_exit_innodb( + prebuilt->trx); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ + default: break; } @@ -7114,6 +7876,21 @@ report_error: prebuilt->table->flags, user_thd); +#ifdef WITH_WSREP + if (!error_result && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd) && !wsrep_consistency_check(user_thd) && + (sql_command != SQLCOM_LOAD || + thd_binlog_format(user_thd) == BINLOG_FORMAT_ROW)) { + + if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ + if (error_result == HA_FTS_INVALID_DOCID) { my_error(HA_FTS_INVALID_DOCID, MYF(0)); } @@ -7401,6 +8178,88 @@ calc_row_difference( return(DB_SUCCESS); } +#ifdef WITH_WSREP +static +int +wsrep_calc_row_hash( +/*================*/ + byte* digest, /*!< in/out: md5 sum */ + const uchar* row, /*!< in: row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ +{ + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint len; + const byte* ptr; + ulint col_type; + uint i; + + void *ctx = alloca(my_md5_context_size()); + 
my_md5_init(ctx); + + n_fields = table->s->fields; + + for (i = 0; i < n_fields; i++) { + byte null_byte=0; + byte true_byte=1; + + field = table->field[i]; + + ptr = (const byte*) row + get_field_offset(table, field); + len = field->pack_length(); + + field_mysql_type = field->type(); + + col_type = prebuilt->table->cols[i].mtype; + + switch (col_type) { + + case DATA_BLOB: + ptr = row_mysql_read_blob_ref(&len, ptr, len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + ptr = row_mysql_read_true_varchar( + &len, ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + + } + + break; + default: + ; + } + /* + if (field->null_ptr && + field_in_record_is_null(table, field, (char*) row)) { + */ + + if (field->is_null_in_record(row)) { + my_md5_input(ctx, &null_byte, 1); + } else { + my_md5_input(ctx, &true_byte, 1); + my_md5_input(ctx, ptr, len); + } + } + + my_md5_result(ctx, digest); + + return(0); +} +#endif /* WITH_WSREP */ /**********************************************************************//** Updates a row given as a parameter to a new value. 
Note that we are given whole rows, not just the fields which are updated: this incurs some @@ -7538,6 +8397,24 @@ func_exit: innobase_active_small(); +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd)) { + + DBUG_PRINT("wsrep", ("update row key")); + + if (wsrep_append_keys(user_thd, false, old_row, new_row)) { + WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); + DBUG_PRINT("wsrep", ("row key failed")); + err = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ + + DBUG_RETURN(err); } @@ -7585,6 +8462,19 @@ ha_innobase::delete_row( innobase_active_small(); +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd)) { + + if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("delete fail")); + error = (dberr_t)HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif DBUG_RETURN(convert_error_code_to_mysql( error, prebuilt->table->flags, user_thd)); } @@ -8781,6 +9671,396 @@ ha_innobase::ft_end() rnd_end(); } +#ifdef WITH_WSREP +extern dict_index_t* +wsrep_dict_foreign_find_index( + dict_table_t* table, + const char** col_names, + const char** columns, + ulint n_cols, + dict_index_t* types_idx, + ibool check_charsets, + ulint check_null); + + +extern dberr_t +wsrep_append_foreign_key( +/*===========================*/ + trx_t* trx, /*!< in: trx */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!<in: clustered index record */ + dict_index_t* index, /*!<in: clustered index */ + ibool referenced, /*!<in: is check for referenced table */ + ibool shared) /*!<in: is shared access */ +{ + ut_a(trx); + THD* thd = (THD*)trx->mysql_thd; + ulint rcode = DB_SUCCESS; + char cache_key[513] = {'\0'}; + int cache_key_len; + bool const copy = true; + + if (!wsrep_on(trx->mysql_thd) || + wsrep_thd_exec_mode(thd) != LOCAL_STATE) + return DB_SUCCESS; 
+ + if (!thd || !foreign || + (!foreign->referenced_table && !foreign->foreign_table)) + { + WSREP_INFO("FK: %s missing in: %s", + (!thd) ? "thread" : + ((!foreign) ? "constraint" : + ((!foreign->referenced_table) ? + "referenced table" : "foreign table")), + (thd && wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) + { + WSREP_DEBUG("pulling %s table into cache", + (referenced) ? "referenced" : "foreign"); + mutex_enter(&(dict_sys->mutex)); + if (referenced) + { + foreign->referenced_table = + dict_table_get_low( + foreign->referenced_table_name_lookup); + if (foreign->referenced_table) + { + foreign->referenced_index = + wsrep_dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, + foreign->foreign_index, + TRUE, FALSE); + } + } + else + { + foreign->foreign_table = + dict_table_get_low( + foreign->foreign_table_name_lookup); + if (foreign->foreign_table) + { + foreign->foreign_index = + wsrep_dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, + foreign->referenced_index, + TRUE, FALSE); + } + } + mutex_exit(&(dict_sys->mutex)); + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) + { + WSREP_WARN("FK: %s missing in query: %s", + (!foreign->referenced_table) ? + "referenced table" : "foreign table", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH; + + dict_index_t *idx_target = (referenced) ? + foreign->referenced_index : index; + dict_index_t *idx = (referenced) ? 
+ UT_LIST_GET_FIRST(foreign->referenced_table->indexes) : + UT_LIST_GET_FIRST(foreign->foreign_table->indexes); + int i = 0; + while (idx != NULL && idx != idx_target) { + if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) { + i++; + } + idx = UT_LIST_GET_NEXT(indexes, idx); + } + ut_a(idx); + key[0] = (char)i; + + rcode = wsrep_rec_get_foreign_key( + &key[1], &len, rec, index, idx, + wsrep_protocol_version > 1); + if (rcode != DB_SUCCESS) { + WSREP_ERROR( + "FK key set failed: %lu (%lu %lu), index: %s %s, %s", + rcode, referenced, shared, + (index && index->name) ? index->name : + "void index", + (index && index->table_name) ? index->table_name : + "void table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + strncpy(cache_key, + (wsrep_protocol_version > 1) ? + ((referenced) ? + foreign->referenced_table->name : + foreign->foreign_table->name) : + foreign->foreign_table->name, sizeof(cache_key) - 1); + cache_key_len = strlen(cache_key); +#ifdef WSREP_DEBUG_PRINT + ulint j; + fprintf(stderr, "FK parent key, table: %s %s len: %lu ", + cache_key, (shared) ? "shared" : "exclusive", len+1); + for (j=0; j<len+1; j++) { + fprintf(stderr, " %hhX, ", key[j]); + } + fprintf(stderr, "\n"); +#endif + char *p = strchr(cache_key, '/'); + if (p) { + *p = '\0'; + } else { + WSREP_WARN("unexpected foreign key table %s %s", + foreign->referenced_table->name, + foreign->foreign_table->name); + } + + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + if (!wsrep_prepare_key( + (const uchar*)cache_key, + cache_key_len + 1, + (const uchar*)key, len+1, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for cascaded FK: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + wsrep_t *wsrep= get_wsrep(); + rcode = (int)wsrep->append_key( + wsrep, + wsrep_ws_handle(thd, trx), + &wkey, + 1, + shared ? 
WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE, + copy); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %lu", rcode)); + WSREP_ERROR("Appending cascaded fk row key failed: %s, %lu", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void", rcode); + return DB_ERROR; + } + + return DB_SUCCESS; +} + +static int +wsrep_append_key( +/*==================*/ + THD *thd, + trx_t *trx, + TABLE_SHARE *table_share, + TABLE *table, + const char* key, + uint16_t key_len, + bool shared +) +{ + DBUG_ENTER("wsrep_append_key"); + bool const copy = true; +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %ld, trx %llu, keylen %d, table %s ", + (shared) ? "Shared" : "Exclusive", + thd_get_thread_id(thd), (long long)trx->id, key_len, + table_share->table_name.str); + for (int i=0; i<key_len; i++) { + fprintf(stderr, "%hhX, ", key[i]); + } + fprintf(stderr, "\n"); +#endif + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + if (!wsrep_prepare_key( + (const uchar*)table_share->table_cache_key.str, + table_share->table_cache_key.length, + (const uchar*)key, key_len, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + wsrep_t *wsrep= get_wsrep(); + int rcode = (int)wsrep->append_key( + wsrep, + wsrep_ws_handle(thd, trx), + &wkey, + 1, + shared ? WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE, + copy); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); + WSREP_WARN("Appending row key failed: %s, %d", + (wsrep_thd_query(thd)) ? 
+ wsrep_thd_query(thd) : "void", rcode); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + DBUG_RETURN(0); +} + +extern void compute_md5_hash(char *digest, const char *buf, int len); +#define MD5_HASH compute_md5_hash + +int +ha_innobase::wsrep_append_keys( +/*==================*/ + THD *thd, + bool shared, + const uchar* record0, /* in: row in MySQL format */ + const uchar* record1) /* in: row in MySQL format */ +{ + int rcode; + DBUG_ENTER("wsrep_append_keys"); + + bool key_appended = false; + trx_t *trx = thd_to_trx(thd); + + if (table_share && table_share->tmp_table != NO_TMP_TABLE) { + WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s", + thd_get_thread_id(thd), + table_share->tmp_table, + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(0); + } + + if (wsrep_protocol_version == 0) { + uint len; + char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char *key = &keyval[0]; + ibool is_null; + + len = wsrep_store_key_val_for_row( + thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, table, keyval, + len, shared); + if (rcode) DBUG_RETURN(rcode); + } + else + { + WSREP_DEBUG("NULL key skipped (proto 0): %s", + wsrep_thd_query(thd)); + } + } else { + ut_a(table->s->keys <= 256); + uint i; + bool hasPK= false; + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + if (key_info->flags & HA_NOSAME) { + hasPK = true; + } + } + + for (i=0; i<table->s->keys; ++i) { + uint len; + char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char* key0 = &keyval0[1]; + char* key1 = &keyval1[1]; + KEY* key_info = table->key_info + i; + ibool is_null; + + dict_index_t* idx = innobase_get_index(i); + dict_table_t* tab = (idx) ? 
idx->table : NULL; + + keyval0[0] = (char)i; + keyval1[0] = (char)i; + + if (!tab) { + WSREP_WARN("MySQL-InnoDB key mismatch %s %s", + table->s->table_name.str, + key_info->name); + } + /* !hasPK == table with no PK, must append all non-unique keys */ + if (!hasPK || key_info->flags & HA_NOSAME || + ((tab && + dict_table_get_referenced_constraint(tab, idx)) || + (!tab && referenced_by_foreign_key()))) { + + len = wsrep_store_key_val_for_row( + thd, table, i, key0, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, table, + keyval0, len+1, shared); + if (rcode) DBUG_RETURN(rcode); + + if (key_info->flags & HA_NOSAME || shared) + key_appended = true; + } + else + { + WSREP_DEBUG("NULL key skipped: %s", + wsrep_thd_query(thd)); + } + if (record1) { + len = wsrep_store_key_val_for_row( + thd, table, i, key1, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record1, &is_null); + if (!is_null && memcmp(key0, key1, len)) { + rcode = wsrep_append_key( + thd, trx, table_share, + table, + keyval1, len+1, shared); + if (rcode) DBUG_RETURN(rcode); + } + } + } + } + } + + /* if no PK, calculate hash of full row, to be the key value */ + if (!key_appended && wsrep_certify_nonPK) { + uchar digest[16]; + int rcode; + + wsrep_calc_row_hash(digest, record0, table, prebuilt, thd); + if ((rcode = wsrep_append_key(thd, trx, table_share, table, + (const char*) digest, 16, + shared))) { + DBUG_RETURN(rcode); + } + + if (record1) { + wsrep_calc_row_hash( + digest, record1, table, prebuilt, thd); + if ((rcode = wsrep_append_key(thd, trx, table_share, + table, + (const char*) digest, + 16, shared))) { + DBUG_RETURN(rcode); + } + } + DBUG_RETURN(0); + } + + DBUG_RETURN(0); +} +#endif /* WITH_WSREP */ /*********************************************************************//** Stores a reference to the current row to 'ref' field of the handle. 
Note @@ -9655,11 +10935,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -9713,6 +10998,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -9722,7 +11009,7 @@ index_bad: ulint kbsize; /* Key Block Size */ for (zssize = kbsize = 1; zssize <= ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX); + PAGE_ZIP_SSIZE_MAX); zssize++, kbsize <<= 1) { if (kbsize == create_info->key_block_size) { zip_ssize = zssize; @@ -9750,8 +11037,8 @@ index_bad: } if (!zip_allowed - || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX)) { + || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, @@ -9760,8 +11047,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -9798,7 +11083,6 @@ index_bad: case ROW_TYPE_REDUNDANT: innodb_row_format = REC_FORMAT_REDUNDANT; break; - case ROW_TYPE_COMPRESSED: case ROW_TYPE_DYNAMIC: if (!use_tablespace) { @@ -9816,10 +11100,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? 
REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -9846,7 +11138,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -9864,6 +11164,114 @@ index_bad: DBUG_RETURN(true); } + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. 
+@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (row_format == ROW_TYPE_REDUNDANT) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=REDUNDANT"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == 
false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -9895,6 +11303,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -9918,6 +11327,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -10488,6 +11903,71 @@ ha_innobase::delete_table( DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } +/*****************************************************************//** +Defragment table. 
+@return error number */ +UNIV_INTERN +int +ha_innobase::defragment_table( +/*==========================*/ + const char* name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + bool async) /*!< in: whether to wait until finish */ +{ + char norm_name[FN_REFLEN]; + dict_table_t* table; + dict_index_t* index; + ibool one_index = (index_name != 0); + int ret = 0; + if (!srv_defragment) { + return ER_FEATURE_DISABLED; + } + normalize_table_name(norm_name, name); + table = dict_table_open_on_name(norm_name, FALSE, + FALSE, DICT_ERR_IGNORE_NONE); + for (index = dict_table_get_first_index(table); index; + index = dict_table_get_next_index(index)) { + if (one_index && strcasecmp(index_name, index->name) != 0) + continue; + if (btr_defragment_find_index(index)) { + // We borrow this error code. When the same index is + // already in the defragmentation queue, issue another + // defragmentation only introduces overhead. We return + // an error here to let the user know this is not + // necessary. Note that this will fail a query that's + // trying to defragment a full table if one of the + // indicies in that table is already in defragmentation. + // We choose this behavior so user is aware of this + // rather than silently defragment other indicies of + // that table. + ret = ER_SP_ALREADY_EXISTS; + break; + } + os_event_t event = btr_defragment_add_index(index, async); + if (!async && event) { + while(os_event_wait_time(event, 1000000)) { + if (thd_killed(current_thd)) { + btr_defragment_remove_index(index); + ret = ER_QUERY_INTERRUPTED; + break; + } + } + os_event_free(event); + } + if (ret) { + break; + } + if (one_index) { + one_index = FALSE; + break; + } + } + dict_table_close(table, FALSE, FALSE); + if (ret == 0 && one_index) { + ret = ER_NO_SUCH_INDEX; + } + return ret; +} /*****************************************************************//** Removes all tables in the named database inside InnoDB. 
*/ @@ -11646,6 +13126,27 @@ ha_innobase::optimize( This works OK otherwise, but MySQL locks the entire table during calls to OPTIMIZE, which is undesirable. */ + if (srv_defragment) { + int err; + + err = defragment_table(prebuilt->table->name, NULL, false); + + if (err == 0) { + return (HA_ADMIN_OK); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + err, + "InnoDB: Cannot defragment table %s: returned error code %d\n", + prebuilt->table->name, err); + + if(err == ER_SP_ALREADY_EXISTS) { + return (HA_ADMIN_OK); + } else { + return (HA_ADMIN_TRY_ALTER); + } + } + } + if (innodb_optimize_fulltext_only) { if (prebuilt->table->fts && prebuilt->table->fts->cache && !dict_table_is_discarded(prebuilt->table)) { @@ -11745,7 +13246,7 @@ ha_innobase::check( CHECK TABLE. */ os_increment_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, + srv_fatal_semaphore_wait_threshold, SRV_SEMAPHORE_WAIT_EXTENSION); bool valid = btr_validate_index(index, prebuilt->trx); @@ -11753,7 +13254,7 @@ ha_innobase::check( CHECK TABLE. 
*/ os_decrement_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, + srv_fatal_semaphore_wait_threshold, SRV_SEMAPHORE_WAIT_EXTENSION); if (!valid) { @@ -12537,11 +14038,18 @@ ha_innobase::external_lock( /* used by test case */ DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); if (!skip) { +#ifdef WITH_WSREP + if (!wsrep_on(thd) || wsrep_thd_exec_mode(thd) == LOCAL_STATE) + { +#endif /* WITH_WSREP */ my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0), " InnoDB is limited to row-logging when " "transaction isolation level is " "READ COMMITTED or READ UNCOMMITTED."); DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); +#ifdef WITH_WSREP + } +#endif /* WITH_WSREP */ } } @@ -13995,6 +15503,9 @@ innobase_xa_prepare( to the session variable take effect only in the next transaction */ if (!trx->support_xa) { +#ifdef WITH_WSREP + thd_get_xid(thd, (MYSQL_XID*) &trx->xid); +#endif // WITH_WSREP return(0); } @@ -14182,6 +15693,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14208,6 +15725,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. 
*/ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -14347,6 +15871,13 @@ innodb_max_dirty_pages_pct_lwm_update( srv_max_dirty_pages_pct_lwm = in_val; } +UNIV_INTERN +void +ha_innobase::set_partition_owner_stats(ha_statistics *stats) +{ + ha_partition_stats= stats; +} + /************************************************************//** Validate the file format name and return its corresponding id. @return valid file format id */ @@ -15600,6 +17131,23 @@ innodb_reset_all_monitor_update( TRUE); } +static +void +innodb_defragment_frequency_update( +/*===============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_defragment_frequency = (*static_cast<const uint*>(save)); + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); +} + /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. User can list the monitor counters/groups to be enable by specifying @@ -16051,6 +17599,290 @@ static SHOW_VAR innodb_status_variables_export[]= { static struct st_mysql_storage_engine innobase_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; +#ifdef WITH_WSREP +void +wsrep_abort_slave_trx(wsrep_seqno_t bf_seqno, wsrep_seqno_t victim_seqno) +{ + WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be " + "caused by:\n\t" + "1) unsupported configuration options combination, please check documentation.\n\t" + "2) a bug in the code.\n\t" + "3) a database corruption.\n Node consistency compromized, " + "need to abort. 
Restart the node to resync with cluster.", + (long long)bf_seqno, (long long)victim_seqno); + abort(); +} +/*******************************************************************//** +This function is used to kill one transaction in BF. */ + +int +wsrep_innobase_kill_one_trx(void * const bf_thd_ptr, + const trx_t * const bf_trx, + trx_t *victim_trx, ibool signal) +{ + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); + ut_ad(bf_thd_ptr); + ut_ad(victim_trx); + + DBUG_ENTER("wsrep_innobase_kill_one_trx"); + THD *bf_thd = bf_thd_ptr ? (THD*) bf_thd_ptr : NULL; + THD *thd = (THD *) victim_trx->mysql_thd; + int64_t bf_seqno = (bf_thd) ? wsrep_thd_trx_seqno(bf_thd) : 0; + + if (!thd) { + DBUG_PRINT("wsrep", ("no thd for conflicting lock")); + WSREP_WARN("no THD for trx: %lu", victim_trx->id); + DBUG_RETURN(1); + } + if (!bf_thd) { + DBUG_PRINT("wsrep", ("no BF thd for conflicting lock")); + WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? bf_trx->id : 0); + DBUG_RETURN(1); + } + + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); + + WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %lu", + signal, (long long)bf_seqno, + thd_get_thread_id(thd), + victim_trx->id); + + WSREP_DEBUG("Aborting query: %s", + (thd && wsrep_thd_query(thd)) ? 
wsrep_thd_query(thd) : "void"); + + wsrep_thd_LOCK(thd); + + if (wsrep_thd_query_state(thd) == QUERY_EXITING) { + WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id); + wsrep_thd_UNLOCK(thd); + DBUG_RETURN(0); + } + if(wsrep_thd_exec_mode(thd) != LOCAL_STATE) { + WSREP_DEBUG("withdraw for BF trx: %lu, state: %d", + victim_trx->id, + wsrep_thd_get_conflict_state(thd)); + } + + switch (wsrep_thd_get_conflict_state(thd)) { + case NO_CONFLICT: + wsrep_thd_set_conflict_state(thd, MUST_ABORT); + break; + case MUST_ABORT: + WSREP_DEBUG("victim %lu in MUST ABORT state", + victim_trx->id); + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_RETURN(0); + break; + case ABORTED: + case ABORTING: // fall through + default: + WSREP_DEBUG("victim %lu in state %d", + victim_trx->id, wsrep_thd_get_conflict_state(thd)); + wsrep_thd_UNLOCK(thd); + DBUG_RETURN(0); + break; + } + + switch (wsrep_thd_query_state(thd)) { + case QUERY_COMMITTING: + enum wsrep_status rcode; + + WSREP_DEBUG("kill query for: %ld", + thd_get_thread_id(thd)); + WSREP_DEBUG("kill trx QUERY_COMMITTING for %lu", + victim_trx->id); + + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + } else { + wsrep_t *wsrep= get_wsrep(); + rcode = wsrep->abort_pre_commit( + wsrep, bf_seqno, + (wsrep_trx_id_t)victim_trx->id + ); + + switch (rcode) { + case WSREP_WARNING: + WSREP_DEBUG("cancel commit warning: %lu", + victim_trx->id); + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_RETURN(1); + break; + case WSREP_OK: + break; + default: + WSREP_ERROR( + "cancel commit bad exit: %d %lu", + rcode, + victim_trx->id); + /* unable to interrupt, must abort */ + /* note: kill_mysql() will block, if we cannot. + * kill the lock holder first. + */ + abort(); + break; + } + } + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + break; + case QUERY_EXEC: + /* it is possible that victim trx is itself waiting for some + * other lock. 
We need to cancel this waiting + */ + WSREP_DEBUG("kill trx QUERY_EXEC for %lu", victim_trx->id); + + victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; + if (victim_trx->lock.wait_lock) { + WSREP_DEBUG("victim has wait flag: %ld", + thd_get_thread_id(thd)); + lock_t* wait_lock = victim_trx->lock.wait_lock; + if (wait_lock) { + WSREP_DEBUG("canceling wait lock"); + victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; + lock_cancel_waiting_and_release(wait_lock); + } + + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + } else { + /* abort currently executing query */ + DBUG_PRINT("wsrep",("sending KILL_QUERY to: %ld", + thd_get_thread_id(thd))); + WSREP_DEBUG("kill query for: %ld", + thd_get_thread_id(thd)); + /* Note that innobase_kill_connection will take lock_mutex + and trx_mutex */ + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + + /* for BF thd, we need to prevent him from committing */ + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + } + } + break; + case QUERY_IDLE: + { + WSREP_DEBUG("kill IDLE for %lu", victim_trx->id); + + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + WSREP_DEBUG("kill BF IDLE, seqno: %lld", + (long long)wsrep_thd_trx_seqno(thd)); + wsrep_thd_UNLOCK(thd); + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + DBUG_RETURN(0); + } + /* This will lock thd from proceeding after net_read() */ + wsrep_thd_set_conflict_state(thd, ABORTING); + + wsrep_lock_rollback(); + + if (wsrep_aborting_thd_contains(thd)) { + WSREP_WARN("duplicate thd aborter %lu", + thd_get_thread_id(thd)); + } else { + wsrep_aborting_thd_enqueue(thd); + DBUG_PRINT("wsrep",("enqueuing trx abort for %lu", + thd_get_thread_id(thd))); + WSREP_DEBUG("enqueuing trx abort for (%lu)", + thd_get_thread_id(thd)); + } + + DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); + WSREP_DEBUG("signaling aborter"); + wsrep_unlock_rollback(); + wsrep_thd_UNLOCK(thd); + + break; + } + default: + 
WSREP_WARN("bad wsrep query state: %d", + wsrep_thd_query_state(thd)); + wsrep_thd_UNLOCK(thd); + break; + } + + DBUG_RETURN(0); +} + +static +int +wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, + my_bool signal) +{ + DBUG_ENTER("wsrep_innobase_abort_thd"); + trx_t* victim_trx = thd_to_trx(victim_thd); + trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; + WSREP_DEBUG("abort transaction: BF: %s victim: %s", + wsrep_thd_query(bf_thd), + wsrep_thd_query(victim_thd)); + + if (victim_trx) + { + lock_mutex_enter(); + trx_mutex_enter(victim_trx); + int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx, + victim_trx, signal); + trx_mutex_exit(victim_trx); + lock_mutex_exit(); + wsrep_srv_conc_cancel_wait(victim_trx); + + DBUG_RETURN(rcode); + } else { + WSREP_DEBUG("victim does not have transaction"); + wsrep_thd_LOCK(victim_thd); + wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); + wsrep_thd_UNLOCK(victim_thd); + wsrep_thd_awake(victim_thd, signal); + } + DBUG_RETURN(-1); +} + +static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + if (wsrep_is_wsrep_xid(xid)) { + mtr_t mtr; + mtr_start(&mtr); + trx_sysf_t* sys_header = trx_sysf_get(&mtr); + trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); + mtr_commit(&mtr); + innobase_flush_logs(hton); + return 0; + } else { + return 1; + } +} + +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + trx_sys_read_wsrep_checkpoint(xid); + return 0; +} + +static void +wsrep_fake_trx_id( +/*==================*/ + handlerton *hton, + THD *thd) /*!< in: user thread handle */ +{ + mutex_enter(&trx_sys->mutex); + trx_id_t trx_id = trx_sys_get_new_trx_id(); + mutex_exit(&trx_sys->mutex); + + (void *)wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), trx_id); +} + +#endif /* WITH_WSREP */ + /* plugin options */ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, @@ 
-16122,6 +17954,13 @@ static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); +static MYSQL_SYSVAR_ULONG(idle_flush_pct, + srv_idle_flush_pct, + PLUGIN_VAR_RQCMDARG, + "Up to what percentage of dirty pages should be flushed when innodb " + "finds it has spare resources to do so.", + NULL, NULL, 100, 0, 100, 0); + #ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now, PLUGIN_VAR_OPCMDARG, @@ -16381,7 +18220,7 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" + "Compression level used for zlib compression. 0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); @@ -16392,7 +18231,7 @@ static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, " the zlib compression algorithm changes." " When turned OFF, InnoDB will assume that the zlib" " compression algorithm doesn't change.", - NULL, NULL, TRUE); + NULL, NULL, FALSE); static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -16467,6 +18306,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st "Load the buffer pool from a file named @@innodb_buffer_pool_filename", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." 
+ "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." + "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. 
Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, PLUGIN_VAR_RQCMDARG, "How deep to scan LRU to keep it clean", @@ -16655,6 +18548,12 @@ static MYSQL_SYSVAR_ULONG( 1000000, 0); /* Maximum value */ #endif /* HAVE_ATOMIC_BUILTINS */ +static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization, + srv_prefix_index_cluster_optimization, + PLUGIN_VAR_OPCMDARG, + "Enable prefix optimization to sometimes avoid cluster index lookups.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay, PLUGIN_VAR_RQCMDARG, "Time of innodb thread sleeping before joining InnoDB queue (usec). " @@ -16793,6 +18692,40 @@ static MYSQL_SYSVAR_BOOL(disable_background_merge, NULL, NULL, FALSE); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +#ifdef WITH_INNODB_DISALLOW_WRITES +/******************************************************* + * innobase_disallow_writes variable definition * + *******************************************************/ + +/* Must always init to FALSE. */ +static my_bool innobase_disallow_writes = FALSE; + +/************************************************************************** +An "update" method for innobase_disallow_writes variable. 
*/ +static +void +innobase_disallow_writes_update( +/*============================*/ + THD* thd, /* in: thread handle */ + st_mysql_sys_var* var, /* in: pointer to system + variable */ + void* var_ptr, /* out: pointer to dynamic + variable */ + const void* save) /* in: temporary storage */ +{ + *(my_bool*)var_ptr = *(my_bool*)save; + ut_a(srv_allow_writes_event); + if (*(my_bool*)var_ptr) + os_event_reset(srv_allow_writes_event); + else + os_event_set(srv_allow_writes_event); +} + +static MYSQL_SYSVAR_BOOL(disallow_writes, innobase_disallow_writes, + PLUGIN_VAR_NOCMDOPT, + "Tell InnoDB to stop any writes to disk", + NULL, innobase_disallow_writes_update, FALSE); +#endif /* WITH_INNODB_DISALLOW_WRITES */ static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, PLUGIN_VAR_NOCMDARG, "Whether to use read ahead for random access within an extent.", @@ -16900,6 +18833,56 @@ static MYSQL_SYSVAR_UINT(simulate_comp_failures, srv_simulate_comp_failures, "Simulate compression failures.", NULL, NULL, 0, 0, 99, 0); +static MYSQL_SYSVAR_BOOL(force_primary_key, + srv_force_primary_key, + PLUGIN_VAR_OPCMDARG, + "Do not allow to create table without primary key (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim. Default FALSE.", + NULL, NULL, FALSE); + +static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", 0 }; +static TYPELIB page_compression_algorithms_typelib= +{ + array_elements(page_compression_algorithms) - 1, 0, + page_compression_algorithms, 0 +}; +static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, or bzip2", + innodb_compression_algorithm_validate, NULL, + /* We use here the largest number of supported compression method to + enable all those methods that are available. 
Availability of compression + method is verified on innodb_compression_algorithm_validate function. */ + PAGE_UNCOMPRESSED, + &page_compression_algorithms_typelib); + +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum number of seconds that semaphore times out in InnoDB.", + NULL, NULL, + DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */ + 1, /* Minimum setting */ + UINT_MAX32, /* Maximum setting */ + 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16916,6 +18899,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_load_now), MYSQL_SYSVAR(buffer_pool_load_abort), MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), MYSQL_SYSVAR(lru_scan_depth), MYSQL_SYSVAR(flush_neighbors), MYSQL_SYSVAR(checksum_algorithm), @@ -17009,6 +18998,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #ifdef HAVE_ATOMIC_BUILTINS MYSQL_SYSVAR(adaptive_max_sleep_delay), #endif /* HAVE_ATOMIC_BUILTINS */ + MYSQL_SYSVAR(prefix_index_cluster_optimization), MYSQL_SYSVAR(thread_sleep_delay), MYSQL_SYSVAR(autoinc_lock_mode), MYSQL_SYSVAR(version), @@ -17020,11 +19010,15 
@@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(change_buffering_debug), MYSQL_SYSVAR(disable_background_merge), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +#ifdef WITH_INNODB_DISALLOW_WRITES + MYSQL_SYSVAR(disallow_writes), +#endif /* WITH_INNODB_DISALLOW_WRITES */ MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(read_only), MYSQL_SYSVAR(io_capacity), MYSQL_SYSVAR(io_capacity_max), + MYSQL_SYSVAR(idle_flush_pct), MYSQL_SYSVAR(monitor_enable), MYSQL_SYSVAR(monitor_disable), MYSQL_SYSVAR(monitor_reset), @@ -17060,6 +19054,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(saved_page_number_debug), #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(simulate_comp_failures), + MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(compression_algorithm), + MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), + + MYSQL_SYSVAR(fatal_semaphore_wait_threshold), NULL }; @@ -17402,6 +19403,9 @@ ib_senderrf( case IB_LOG_LEVEL_FATAL: l = 0; break; + default: + l = 0; + break; } my_printv_error(code, format, MYF(l), args); @@ -17559,3 +19563,94 @@ innobase_convert_to_system_charset( static_cast<uint>(len), errors)); } +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. 
*/ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + long compression_algorithm; + DBUG_ENTER("innobase_compression_algorithm_validate"); + + if (value->value_type(value) == MYSQL_VALUE_TYPE_STRING) { + char buff[STRING_BUFFER_USUAL_SIZE]; + const char *str; + int length= sizeof(buff); + + if (!(str= value->val_str(value, buff, &length))) { + DBUG_RETURN(1); + } + + if ((compression_algorithm= (long)find_type(str, &page_compression_algorithms_typelib, 0) - 1) < 0) { + DBUG_RETURN(1); + } + } else { + long long tmp; + + if (value->val_int(value, &tmp)) { + DBUG_RETURN(1); + } + + if (tmp < 0 || tmp >= page_compression_algorithms_typelib.count) { + DBUG_RETURN(1); + } + + compression_algorithm= (long) tmp; + } + + *reinterpret_cast<ulong*>(save) = compression_algorithm; + +#ifndef HAVE_LZ4 + if (compression_algorithm == PAGE_LZ4_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblz4 is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_LZO + if (compression_algorithm == PAGE_LZO_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzo is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_LZMA + if (compression_algorithm == PAGE_LZMA_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzma is not installed. 
\n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_BZIP2 + if (compression_algorithm == PAGE_BZIP2_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: libbz2 is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + + DBUG_RETURN(0); +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 5cebc425769..6da31c8ecc6 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,6 +57,22 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. 
+ Atomic writes are not used if + value OFF.*/ +}; + + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -81,6 +98,8 @@ class ha_innobase: public handler or undefined */ uint num_write_row; /*!< number of write_row() calls */ + ha_statistics* ha_partition_stats; /*!< stats of the partition owner + handler (if there is one) */ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, const uchar* record); inline void update_thd(THD* thd); @@ -95,6 +114,10 @@ class ha_innobase: public handler void innobase_initialize_autoinc(); dict_index_t* innobase_get_index(uint keynr); +#ifdef WITH_WSREP + int wsrep_append_keys(THD *thd, bool shared, + const uchar* record0, const uchar* record1); +#endif /* Init values for the class: */ public: ha_innobase(handlerton *hton, TABLE_SHARE *table_arg); @@ -175,11 +198,15 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); int delete_table(const char *name); int rename_table(const char* from, const char* to); + int defragment_table(const char* name, const char* index_name, + bool async); int check(THD* thd, HA_CHECK_OPT* check_opt); char* update_table_comment(const char* comment); char* get_foreign_key_create_info(); @@ -283,6 +310,7 @@ class ha_innobase: public handler Alter_inplace_info* ha_alter_info, bool commit); /** @} */ + void set_partition_owner_stats(ha_statistics *stats); bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); private: @@ -440,7 +468,9 @@ __attribute__((nonnull)); */ extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file); -struct trx_t; +#ifdef WITH_WSREP +#include <mysql/service_wsrep.h> +#endif extern const struct _ft_vft 
ft_vft_result; @@ -478,6 +508,9 @@ innobase_index_name_is_reserved( __attribute__((nonnull, warn_unused_result)); /*****************************************************************//** +#ifdef WITH_WSREP +extern "C" int wsrep_trx_is_aborting(void *thd_ptr); +#endif Determines InnoDB table flags. @retval true if successful, false if error */ UNIV_INTERN diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index a04b34fe027..d08fe25d377 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -262,6 +263,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Change on engine specific table options require rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -1178,7 +1195,8 @@ innobase_rec_to_mysql( field->reset(); - ipos = dict_index_get_nth_col_or_prefix_pos(index, i, 
TRUE); + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE, + NULL); if (ipos == ULINT_UNDEFINED || rec_offs_nth_extern(offsets, ipos)) { @@ -1230,7 +1248,8 @@ innobase_fields_to_mysql( field->reset(); - ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE, + NULL); if (ipos == ULINT_UNDEFINED || dfield_is_ext(&fields[ipos]) @@ -3371,6 +3390,11 @@ ha_innobase::prepare_inplace_alter_table( DBUG_ASSERT(ha_alter_info->create_info); DBUG_ASSERT(!srv_read_only_mode); + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE); #ifdef UNIV_DEBUG @@ -3393,6 +3417,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, @@ -4011,6 +4046,11 @@ oom: ctx->thr, prebuilt->table, altered_table); } + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + DEBUG_SYNC_C("inplace_after_index_build"); DBUG_EXECUTE_IF("create_index_fail", diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index ca504acf64b..f6b3dbd2d5d 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -92,6 +92,7 @@ static buf_page_desc_t i_s_page_type[] = { {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB}, {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2}, {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF}, + {"PAGE 
COMPRESSED", FIL_PAGE_PAGE_COMPRESSED}, {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN} }; @@ -2885,7 +2886,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword = /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"), + STRUCT_FLD(descr, "Default stopword list for InnoDB Full Text Search"), /* the plugin license (PLUGIN_LICENSE_XXX) */ /* int */ diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 305acf7e322..b6f8a685ae9 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -671,6 +672,21 @@ btr_get_size( is s-latched */ __attribute__((nonnull, warn_unused_result)); /**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull)); + +/**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! 
@retval NULL if no page could be allocated @@ -717,6 +733,33 @@ btr_page_free_low( ulint level, /*!< in: page level */ mtr_t* mtr) /*!< in: mtr */ __attribute__((nonnull)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_block( +/*======================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ @@ -762,6 +805,60 @@ btr_validate_index( const trx_t* trx) /*!< in: transaction or 0 */ __attribute__((nonnull(1), warn_unused_result)); +#ifdef UNIV_SYNC_DEBUG +/*************************************************************//** +Removes a page from the level list of pages. 
+@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,index,mtr) +#else /* UNIV_SYNC_DEBUG */ +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,mtr) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Removes a page from the level list of pages. */ +UNIV_INTERN +void +btr_level_list_remove_func( +/*=======================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in/out: page to remove */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. 
+@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 00f50b5dcaf..40b468b200a 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -163,9 +163,10 @@ btr_page_get_next( /*!< in: mini-transaction handle */ { ut_ad(page && mtr); +#ifndef UNIV_INNOCHECKSUM ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); - +#endif /* UNIV_INNOCHECKSUM */ return(mach_read_from_4(page + FIL_PAGE_NEXT)); } diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000000..8fef3c6519a --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef btr0defragment_h +#define btr0defragment_h + +#include "univ.i" + +#ifndef UNIV_HOTBACKUP + +#include "btr0pcur.h" + +/* Max number of pages to consider at once during defragmentation. */ +#define BTR_DEFRAGMENT_MAX_N_PAGES 32 + +/** stats in btr_defragment */ +extern ulint btr_defragment_compression_failures; +extern ulint btr_defragment_failures; +extern ulint btr_defragment_count; + +/** Item in the work queue for btr_degrament_thread. */ +struct btr_defragment_item_t +{ + btr_pcur_t* pcur; /* persistent cursor where + btr_defragment_n_pages should start */ + os_event_t event; /* if not null, signal after work + is done */ + bool removed; /* Mark an item as removed */ + ulonglong last_processed; /* timestamp of last time this index + is processed by defragment thread */ + + btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event); + ~btr_defragment_item_t(); +}; + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init(void); +/******************************************************************//** +Shutdown defragmentation. */ +void +btr_defragment_shutdown(); +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. */ +bool +btr_defragment_find_index( + dict_index_t* index); /*!< Index to find. */ +/******************************************************************//** +Add an index to btr_defragment_wq. Return a pointer to os_event if this +is a synchronized defragmentation. 
*/ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async); /*!< whether this is an async defragmentation */ +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table); /*!< Index to be removed. */ +/******************************************************************//** +Mark an index as removed from btr_defragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index); /*!< Index to be removed. */ +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage.*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index); /*!< in: index */ +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + + +#endif /* !UNIV_HOTBACKUP */ +#endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 31ec6b9ef8b..7ea29169a48 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1198,7 +1199,9 @@ UNIV_INTERN bool buf_page_io_complete( /*=================*/ - buf_page_t* bpage); /*!< in: pointer to the block in question */ + buf_page_t* bpage, /*!< in: pointer to the block in question */ + bool evict = false);/*!< in: whether or not to evict + the page from LRU list. */ /********************************************************************//** Calculates a folded value of a file page address to use in the page hash table. @@ -1498,6 +1501,11 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or @@ -1756,6 +1764,133 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */ #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ +/** A "Hazard Pointer" class used to iterate over page lists +inside the buffer pool. A hazard pointer is a buf_page_t pointer +which we intend to iterate over next and we want it remain valid +even after we release the buffer pool mutex. */ +class HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. 
*/ + HazardPointer(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + m_buf_pool(buf_pool) +#ifdef UNIV_DEBUG + , m_mutex(mutex) +#endif /* UNIV_DEBUG */ + , m_hp() {} + + /** Destructor */ + virtual ~HazardPointer() {} + + /** Get current value */ + buf_page_t* get() + { + ut_ad(mutex_own(m_mutex)); + return(m_hp); + } + + /** Set current value + @param bpage buffer block to be set as hp */ + void set(buf_page_t* bpage); + + /** Checks if a bpage is the hp + @param bpage buffer block to be compared + @return true if it is hp */ + bool is_hp(const buf_page_t* bpage); + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. Must be implemented + by the derived classes. + @param bpage buffer block to be compared */ + virtual void adjust(const buf_page_t*) = 0; + +protected: + /** Disable copying */ + HazardPointer(const HazardPointer&); + HazardPointer& operator=(const HazardPointer&); + + /** Buffer pool instance */ + const buf_pool_t* m_buf_pool; + +#if UNIV_DEBUG + /** mutex that protects access to the m_hp. */ + const ib_mutex_t* m_mutex; +#endif /* UNIV_DEBUG */ + + /** hazard pointer. */ + buf_page_t* m_hp; +}; + +/** Class implementing buf_pool->flush_list hazard pointer */ +class FlushHp: public HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + FlushHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + HazardPointer(buf_pool, mutex) {} + + /** Destructor */ + virtual ~FlushHp() {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. 
+ @param bpage buffer block to be compared */ + void adjust(const buf_page_t* bpage); +}; + +/** Class implementing buf_pool->LRU hazard pointer */ +class LRUHp: public HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + LRUHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + HazardPointer(buf_pool, mutex) {} + + /** Destructor */ + virtual ~LRUHp() {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. + @param bpage buffer block to be compared */ + void adjust(const buf_page_t* bpage); +}; + +/** Special purpose iterators to be used when scanning the LRU list. +The idea is that when one thread finishes the scan it leaves the +itr in that position and the other thread can start scan from +there */ +class LRUItr: public LRUHp { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + LRUItr(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + LRUHp(buf_pool, mutex) {} + + /** Destructor */ + virtual ~LRUItr() {} + + /** Selects from where to start a scan. If we have scanned + too deep into the LRU list it resets the value to the tail + of the LRU list. + @return buf_page_t from where to start scan. */ + buf_page_t* start(); +}; + /** Struct that is embedded in the free zip blocks */ struct buf_buddy_free_t { union { @@ -1888,7 +2023,7 @@ struct buf_pool_t{ also protects writes to bpage::oldest_modification and flush_list_hp */ - const buf_page_t* flush_list_hp;/*!< "hazard pointer" + FlushHp flush_hp;/*!< "hazard pointer" used during scan of flush_list while doing flush list batch. Protected by flush_list_mutex */ @@ -1946,6 +2081,19 @@ struct buf_pool_t{ UT_LIST_BASE_NODE_T(buf_page_t) free; /*!< base node of the free block list */ + + /** "hazard pointer" used during scan of LRU while doing + LRU list batch. 
Protected by buf_pool::mutex */ + LRUHp lru_hp; + + /** Iterator used to scan the LRU list when searching for + replacable victim. Protected by buf_pool::mutex. */ + LRUItr lru_scan_itr; + + /** Iterator used to scan the LRU list when searching for + single page flushing victim. Protected by buf_pool::mutex. */ + LRUItr single_scan_itr; + UT_LIST_BASE_NODE_T(buf_page_t) LRU; /*!< base node of the LRU list */ buf_page_t* LRU_old; /*!< pointer to the about diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index f116720574b..3ab3f7c308a 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,6 +37,17 @@ Created 11/5/1995 Heikki Tuuri /** Flag indicating if the page_cleaner is in active state. */ extern ibool buf_page_cleaner_is_active; +/** Event to synchronise with the flushing. */ +extern os_event_t buf_flush_event; + +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted */ + ulint unzip_LRU_evicted;/*!< number of uncompressed page images + evicted */ +}; + /********************************************************************//** Remove a block from the flush list of modified blocks. */ UNIV_INTERN @@ -110,12 +122,12 @@ buf_flush_list( which were processed is passed back to caller. 
Ignored if NULL */ /******************************************************************//** -This function picks up a single dirty page from the tail of the LRU -list, flushes it, removes it from page_hash and LRU list and puts -it on the free list. It is called from user threads when they are -unable to find a replacable page at the tail of the LRU list i.e.: -when the background LRU flushing in the page_cleaner thread is not -fast enough to keep pace with the workload. +This function picks up a single page from the tail of the LRU +list, flushes it (if it is dirty), removes it from page_hash and LRU +list and puts it on the free list. It is called from user threads when +they are unable to find a replaceable page at the tail of the LRU +list i.e.: when the background LRU flushing in the page_cleaner thread +is not fast enough to keep pace with the workload. @return TRUE if success. */ UNIV_INTERN ibool @@ -279,6 +291,57 @@ buf_flush_get_dirty_pages_count( #endif /* !UNIV_HOTBACKUP */ +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +void +buf_flush_common( +/*=============*/ + buf_flush_t flush_type, /*!< in: type of flush */ + ulint page_count); /*!< in: number of pages flushed */ + +/*******************************************************************//** +This utility flushes dirty blocks from 
the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! */ +__attribute__((nonnull)) +void +buf_flush_batch( +/*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST; if BUF_FLUSH_LIST, + then the caller must not own any + latches on pages */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + flush_counters_t* n); /*!< out: flushed/evicted page + counts */ + + #ifndef UNIV_NONINL #include "buf0flu.ic" #endif diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index ecdaef685a1..f1f6abd2d68 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -117,7 +117,7 @@ buf_LRU_get_free_only( buf_pool_t* buf_pool); /*!< buffer pool instance */ /******************************************************************//** Returns a free block from the buf_pool. The block is taken off the -free list. If it is empty, blocks are moved from the end of the +free list. If free list is empty, blocks are moved from the end of the LRU list to the free list. This function is called from a user thread when it needs a clean block to read in a page. Note that we only ever get a block from @@ -125,8 +125,6 @@ the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. 
* iteration 0: * get a block from free list, success:done - * if there is an LRU flush batch in progress: - * wait for batch to end: retry free list * if buf_pool->try_LRU_scan is set * scan LRU up to srv_LRU_scan_depth to find a clean block * the above will put the block on free list @@ -139,7 +137,7 @@ we put it to free list to be used. * scan whole LRU list * scan LRU list even if buf_pool->try_LRU_scan is not set * iteration > 1: - * same as iteration 1 but sleep 100ms + * same as iteration 1 but sleep 10ms @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* @@ -231,6 +229,15 @@ buf_LRU_free_one_page( may or may not be a hash index to the page */ __attribute__((nonnull)); +/******************************************************************//** +Adjust LRU hazard pointers if needed. */ + +void +buf_LRU_adjust_hp( +/*==============*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_page_t* bpage); /*!< in: control block */ + #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** Validates the LRU list. diff --git a/storage/innobase/include/buf0mtflu.h b/storage/innobase/include/buf0mtflu.h new file mode 100644 index 00000000000..0475335bbf5 --- /dev/null +++ b/storage/innobase/include/buf0mtflu.h @@ -0,0 +1,95 @@ +/***************************************************************************** + +Copyright (C) 2014 SkySQL Ab. All Rights Reserved. +Copyright (C) 2014 Fusion-io. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/buf0mtflu.h +Multi-threadef flush method interface function prototypes + +Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com + Dhananjoy Das DDas@fusionio.com +***********************************************************************/ + +#ifndef buf0mtflu_h +#define buf0mtflu_h + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void); +/*===========================*/ + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt); /*!< in: Number of work items */ + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized, false if not */ +bool +buf_mtflu_init_done(void); +/*======================*/ + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void); +/*===========================*/ + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!<in: Number of threads to fill */ + void* ctx, /*!<in: thread context */ + os_thread_id_t* thread_ids); /*!<in: thread id array */ + +#endif diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 026187b2000..c7161987b78 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" #include "row0types.h" +#include "fsp0fsp.h" +#include "dict0pagecompress.h" extern bool innodb_table_stats_not_found; extern bool innodb_index_stats_not_found; @@ -120,7 +123,9 @@ enum dict_table_op_t { DICT_TABLE_OP_DROP_ORPHAN, /** Silently load the tablespace if it does not exist, and do not load the definitions of incomplete indexes. */ - DICT_TABLE_OP_LOAD_TABLESPACE + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED }; /**********************************************************************//** @@ -907,7 +912,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool remote_path) /*!< in: table uses DATA DIRECTORY */ + bool remote_path, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic + writes option value*/ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is @@ -935,6 +947,7 @@ dict_tf_get_zip_size( /*=================*/ ulint flags) /*!< in: flags */ __attribute__((const)); + /********************************************************************//** Check whether the table uses the compressed compact page format. 
@return compressed page size, or 0 if not compressed */ @@ -1146,8 +1159,9 @@ ulint dict_index_get_nth_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint n) /*!< in: column number */ - __attribute__((nonnull, warn_unused_result)); + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + __attribute__((nonnull(1), warn_unused_result)); /********************************************************************//** Looks for column n in an index. @return position in internal representation of the index; @@ -1158,9 +1172,11 @@ dict_index_get_nth_col_or_prefix_pos( /*=================================*/ const dict_index_t* index, /*!< in: index */ ulint n, /*!< in: column number */ - ibool inc_prefix) /*!< in: TRUE=consider + ibool inc_prefix, /*!< in: TRUE=consider column prefixes too */ - __attribute__((nonnull, warn_unused_result)); + ulint* prefix_col_pos) /*!< out: col num if prefix */ + + __attribute__((nonnull(1), warn_unused_result)); /********************************************************************//** Returns TRUE if the index contains a column or a prefix of that column. @return TRUE if contains the column or its prefix */ @@ -1510,6 +1526,16 @@ dict_table_get_index_on_name( const char* name) /*!< in: name of the index to find */ __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** +Looks for an index with the given id given a table instance. +@return index or NULL */ +UNIV_INTERN +dict_index_t* +dict_table_find_index_on_id( +/*========================*/ + const dict_table_t* table, /*!< in: table instance */ + index_id_t id) /*!< in: index id */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** In case there is more than one index with the same name return the index with the min(id). 
@return index, NULL if does not exist */ @@ -1837,6 +1863,7 @@ dict_table_get_index_on_first_col( #endif /* !UNIV_HOTBACKUP */ + #ifndef UNIV_NONINL #include "dict0dict.ic" #endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 066ffe47e4a..43bd42ae025 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -537,9 +538,25 @@ dict_tf_is_valid( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); ulint unused = DICT_TF_GET_UNUSED(flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); /* Make sure there are no bits that we do not know about. */ if (unused != 0) { + fprintf(stderr, + "InnoDB: Error: table unused flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + unused, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); @@ -550,12 +567,34 @@ dict_tf_is_valid( data stored off-page in the clustered index. 
*/ if (!compact) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + compact, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } else if (zip_ssize) { /* Antelope does not support COMPRESSED row format. */ + fprintf(stderr, + "InnoDB: Error: table flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } @@ -568,6 +607,58 @@ dict_tf_is_valid( || !atomic_blobs || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are 
corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); + } + } + + if (atomic_writes) { + + if(atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } @@ -594,6 +685,11 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + + ut_a(atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -604,12 +700,16 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. 
*/ if (unused) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -624,6 +724,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -633,11 +735,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -647,6 +753,27 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); + return(ULINT_UNDEFINED); + } + } + + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. 
*/ return(type); } @@ -719,8 +846,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -742,6 +877,19 @@ dict_tf_set( break; } + if (page_compressed) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); + if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } @@ -765,6 +913,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +934,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. 
*/ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +975,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +1011,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. 
*/ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } @@ -1048,7 +1221,8 @@ dict_index_get_sys_col_pos( } return(dict_index_get_nth_col_pos( - index, dict_table_get_sys_col_no(index->table, type))); + index, dict_table_get_sys_col_no(index->table, type), + NULL)); } /*********************************************************************//** @@ -1100,9 +1274,11 @@ ulint dict_index_get_nth_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint n) /*!< in: column number */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ { - return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE)); + return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE, + prefix_col_pos)); } #ifndef UNIV_HOTBACKUP diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index be0ef395ba8..1d59bc09f6d 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -124,11 +125,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. 
*/ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -144,9 +160,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -164,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. 
*/ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -181,6 +219,19 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_HAS_DATA_DIR(flags) \ ((flags & DICT_TF_MASK_DATA_DIR) \ >> DICT_TF_POS_DATA_DIR) +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) + /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) @@ -492,6 +543,9 @@ be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ /** Defines the maximum fixed length column size */ #define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN +#ifdef WITH_WSREP +#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500 +#endif /* WITH_WSREP */ /** Data structure for a field in an index */ struct dict_field_t{ @@ -562,6 +616,10 @@ struct zip_pad_info_t { rounds */ }; +/** Number of samples of data size kept when page 
compression fails for +a certain index.*/ +#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 + /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). */ struct dict_index_t{ @@ -653,6 +711,23 @@ struct dict_index_t{ /*!< has persistent statistics error printed for this index ? */ /* @} */ + /** Statistics for defragmentation, these numbers are estimations and + could be very inaccurate at certain times, e.g. right after restart, + during defragmentation, etc. */ + /* @{ */ + ulint stat_defrag_modified_counter; + ulint stat_defrag_n_pages_freed; + /* number of pages freed by defragmentation. */ + ulint stat_defrag_n_page_split; + /* number of page splits since last full index + defragmentation. */ + ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; + /* data size when compression failure happened + the most recent 10 times. */ + ulint stat_defrag_sample_next_slot; + /* in which slot the next sample should be + saved. */ + /* @} */ rw_lock_t lock; /*!< read-write lock protecting the upper levels of the index tree */ trx_id_t trx_id; /*!< id of the transaction that created this diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h new file mode 100644 index 00000000000..19a2a6c52f3 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic new file mode 100644 index 00000000000..811976434a8 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint 
fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + 
return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. 
+@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h index 9a3c8e22992..e034662aba0 100644 --- a/storage/innobase/include/dict0priv.h +++ b/storage/innobase/include/dict0priv.h @@ -53,8 +53,9 @@ dict_table_t* dict_table_open_on_id_low( /*=====================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err); /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache); #ifndef UNIV_NONINL #include "dict0priv.ic" diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic index 30ba8fb60aa..983218af78a 100644 --- a/storage/innobase/include/dict0priv.ic +++ b/storage/innobase/include/dict0priv.ic @@ -74,8 +74,9 @@ dict_table_t* dict_table_open_on_id_low( /*======================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err) /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache) { dict_table_t* table; ulint fold; @@ -88,7 +89,7 @@ dict_table_open_on_id_low( HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, dict_table_t*, table, ut_ad(table->cached), table->id == table_id); - if (table == NULL) { + if (table == NULL && !open_only_if_in_cache) { table = 
dict_load_table_on_id(table_id, ignore_err); } diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 186f90e3694..abf56b2f0c7 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -195,6 +195,39 @@ dict_stats_rename_table( is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Save defragmentation result. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index); /*!< in: index */ + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index); /*!< in: index */ + +/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + + #ifndef UNIV_NONINL #include "dict0stats.ic" #endif diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index e866ab419fe..32fac3015e8 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -56,6 +56,28 @@ dict_stats_recalc_pool_del( /*=======================*/ const dict_table_t* table); /*!< in: table to remove */ +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. 
Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index); /*!< in: index to remove */ + /** Yield the data dictionary latch when waiting for the background thread to stop accessing a table. @param trx transaction holding the data dictionary locks */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index d34b6f7eab3..35430e8ea62 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,6 +76,13 @@ enum ib_quiesce_t { QUIESCE_COMPLETE /*!< All done */ }; +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + /** Prefix for tmp tables, adopted from sql/table.h */ #define tmp_file_prefix "#sql" #define tmp_file_prefix_length 4 diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 798423eeddd..9c453d3f4ca 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -126,11 +127,33 @@ extern fil_addr_t fil_addr_null; data file (ibdata*, not *.ibd): the file has been flushed to disk at least up to this lsn */ +/** If page type is FIL_PAGE_COMPRESSED then the 8 bytes starting at +FIL_PAGE_FILE_FLUSH_LSN are broken down as follows: */ + +/** Control information version format (u8) */ +static const ulint FIL_PAGE_VERSION = FIL_PAGE_FILE_FLUSH_LSN; + +/** Compression algorithm (u8) */ +static const ulint FIL_PAGE_ALGORITHM_V1 = FIL_PAGE_VERSION + 1; + +/** Original page type (u16) */ +static const ulint FIL_PAGE_ORIGINAL_TYPE_V1 = FIL_PAGE_ALGORITHM_V1 + 1; + +/** Original data size in bytes (u16)*/ +static const ulint FIL_PAGE_ORIGINAL_SIZE_V1 = FIL_PAGE_ORIGINAL_TYPE_V1 + 2; + +/** Size after compression (u16)*/ +static const ulint FIL_PAGE_COMPRESS_SIZE_V1 = FIL_PAGE_ORIGINAL_SIZE_V1 + 2; + #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< 
starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -143,6 +166,7 @@ extern fil_addr_t fil_addr_null; #ifndef UNIV_INNOCHECKSUM /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -157,7 +181,8 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */ #define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */ #define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */ -#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2 +#define FIL_PAGE_TYPE_COMPRESSED 13 /*!< Compressed page */ +#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_COMPRESSED /*!< Last page type */ /* @} */ @@ -223,6 +248,7 @@ struct fil_node_t { ib_int64_t flush_counter;/*!< up to what modification_counter value we have flushed the modifications to disk */ + ulint file_block_size;/*!< file system block size */ UT_LIST_NODE_T(fil_node_t) chain; /*!< link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -396,6 +422,7 @@ ulint fil_space_get_type( /*===============*/ ulint id); /*!< in: space id */ + #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** Appends a new file to the chain of files of a space. File must be closed. 
@@ -575,8 +602,10 @@ fil_read_first_page( #endif /* UNIV_LOG_ARCHIVE */ lsn_t* min_flushed_lsn, /*!< out: min of flushed lsn values in data files */ - lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn_t* max_flushed_lsn, /*!< out: max of flushed lsn values in data files */ + ulint orig_space_id) /*!< in: file space id or + ULINT_UNDEFINED */ __attribute__((warn_unused_result)); /*******************************************************************//** Increments the count of pending operation, if space is not being deleted. @@ -939,8 +968,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after first successful trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ __attribute__((nonnull(8))); /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the @@ -1192,4 +1226,38 @@ fil_user_tablespace_restore_page( write buffer */ #endif /* !UNIV_INNOCHECKSUM */ + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ + +#ifndef UNIV_INNOCHECKSUM +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. 
*/ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ +#endif + +/*******************************************************************//** +Return page type name */ +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type); /*!< in: FIL_PAGE_TYPE */ + #endif /* fil0fil_h */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h new file mode 100644 index 00000000000..fb97af87460 --- /dev/null +++ b/storage/innobase/include/fil0pagecompress.h @@ -0,0 +1,138 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_get_page_compressed( +/*=========================*/ + fil_space_t* space); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. 
+@return atomic write table option value +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out whether the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!<in: compression algorithm number */ + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /*!< in: compression level */ + ulint block_size, /*!< in: block size */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem); /*!< in: temporary memory used by LZO */ + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. +@return uncompressed page */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size); /*!< in/out: Actual payload size of + the compressed data. 
*/ + +/****************************************************************//** +Get space id from fil node +@return space id*/ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node); /*!< in: Node where to get space id*/ + +/****************************************************************//** +Get block size from fil node +@return block size*/ +ulint +fil_node_get_block_size( + fil_node_t* node); /*!< in: Node where to get block + size */ +/*******************************************************************//** +Find out whether the page is page compressed +@return true if page is page compressed*/ +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf); /*!< in: page */ + +#endif diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index a587ccc9f20..87f1f5a636d 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,12 +54,21 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */ /** Width of the DATA_DIR flag. This flag indicates that the tablespace is found in a remote location, not the default data directory. 
*/ #define FSP_FLAGS_WIDTH_DATA_DIR 1 +/** Number of flag bits used to indicate the page compression and compression level */ +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL 4 +/** Number of flag bits used to indicate atomic writes for this tablespace */ +#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known tablespace flags */ #define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + FSP_FLAGS_WIDTH_PAGE_SSIZE \ - + FSP_FLAGS_WIDTH_DATA_DIR) + + FSP_FLAGS_WIDTH_DATA_DIR \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL \ + + FSP_FLAGS_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in tablespace flags */ #define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH)) @@ -71,9 +81,20 @@ is found in a remote location, not the default data directory. */ /** Zero relative shift position of the ATOMIC_BLOBS field */ #define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + FSP_FLAGS_WIDTH_ZIP_SSIZE) -/** Zero relative shift position of the PAGE_SSIZE field */ -#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ +/** Note that these need to be before the page size to be compatible with +dictionary */ +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_ATOMIC_BLOBS \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL (FSP_FLAGS_POS_PAGE_COMPRESSION \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define FSP_FLAGS_POS_ATOMIC_WRITES (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_WRITES \ + + 
FSP_FLAGS_WIDTH_ATOMIC_WRITES) /** Zero relative shift position of the start of the UNUSED bits */ #define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \ + FSP_FLAGS_WIDTH_PAGE_SSIZE) @@ -101,6 +122,18 @@ is found in a remote location, not the default data directory. */ #define FSP_FLAGS_MASK_DATA_DIR \ ((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \ << FSP_FLAGS_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION \ + ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define FSP_FLAGS_MASK_ATOMIC_WRITES \ + ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_WRITES)) \ + << FSP_FLAGS_POS_ATOMIC_WRITES) /** Return the value of the POST_ANTELOPE field */ #define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ @@ -126,11 +159,38 @@ is found in a remote location, not the default data directory. */ #define FSP_FLAGS_GET_UNUSED(flags) \ (flags >> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) + /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. 
*/ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 0d81e817cc9..3a3eb21a61a 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,12 +64,17 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. 
*/ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -76,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -87,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -98,12 +110,33 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. 
*/ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); + return(false); + } + } + + if (atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); + return (false); + } + #if UNIV_FORMAT_MAX != UNIV_FORMAT_B # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif @@ -312,3 +345,4 @@ xdes_calc_descriptor_page( } #endif /* !UNIV_INNOCHECKSUM */ + diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h new file mode 100644 index 00000000000..15212227829 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.h @@ -0,0 +1,83 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +/* Supported page compression methods */ + +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_ALGORITHM_LAST PAGE_BZIP2_ALGORITHM + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. 
+@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..1ba3b7835c9 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -0,0 +1,184 @@ +/***************************************************************************** + +Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. 
+@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +UNIV_INLINE +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +UNIV_INLINE +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. 
*/ +UNIV_INLINE +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +UNIV_INLINE +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!<in: compression algorithm number */ +{ + switch(comp_alg) { + case PAGE_UNCOMPRESSED: + return ("uncompressed"); + break; + case PAGE_ZLIB_ALGORITHM: + return ("ZLIB"); + break; + case PAGE_LZ4_ALGORITHM: + return ("LZ4"); + break; + case PAGE_LZO_ALGORITHM: + return ("LZO"); + break; + case PAGE_LZMA_ALGORITHM: + return ("LZMA"); + break; + default: + return("UNKNOWN"); + ut_error; + break; + } +} + +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. 
+@return atomic writes table option value */ +UNIV_INLINE +atomic_writes_t +fil_space_get_atomic_writes( +/*========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return((atomic_writes_t)fsp_flags_get_atomic_writes(flags)); + } + + return((atomic_writes_t)0); +} diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 94fd908ab0c..a6797cd66de 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -29,6 +29,7 @@ Created May 26, 2009 Vasil Dimov #include "univ.i" #include "fil0fil.h" /* for FIL_PAGE_DATA */ +#include "ut0byte.h" /** @name Flags for inserting records in order If records are inserted in order, there are the following @@ -41,14 +42,17 @@ fseg_alloc_free_page) */ #define FSP_NO_DIR ((byte)113) /*!< no order */ /* @} */ -/** File space extent size (one megabyte) in pages */ -#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE) +/** File space extent size (one megabyte if default two or four if not) in pages */ +#define FSP_EXTENT_SIZE ((UNIV_PAGE_SIZE <= (1 << 14) ? \ + (1048576U / UNIV_PAGE_SIZE) : \ + ((UNIV_PAGE_SIZE <= 1 << 15) ? 
\ + (2097152U / UNIV_PAGE_SIZE) : (4194304U / UNIV_PAGE_SIZE)))) -/** File space extent size (one megabyte) in pages for MAX page size */ -#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX) +/** File space extent size (four megabytes) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (4194304U / UNIV_PAGE_SIZE_MAX) /** File space extent size (one megabyte) in pages for MIN page size */ -#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN) +#define FSP_EXTENT_SIZE_MIN (1048576U / UNIV_PAGE_SIZE_MIN) /** On a page of any file segment, data may be put starting from this offset */ diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index a02b8f1893a..212df1a1283 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -286,6 +286,16 @@ innobase_casedn_str( /*================*/ char* a); /*!< in/out: string to put in lower case */ +#ifdef WITH_WSREP +UNIV_INTERN +int +wsrep_innobase_kill_one_trx(void *thd_ptr, + const trx_t *bf_trx, trx_t *victim_trx, ibool signal); +int wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, + unsigned char* str, unsigned int str_length, + unsigned int buf_length); +#endif /* WITH_WSREP */ + /**********************************************************************//** Determines the connection character set. @return connection character set */ diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h index 6f9a628df5d..9a4077befb1 100644 --- a/storage/innobase/include/hash0hash.h +++ b/storage/innobase/include/hash0hash.h @@ -144,6 +144,33 @@ do {\ }\ } while (0) +#ifdef WITH_WSREP +/*******************************************************************//** +Inserts a struct to the head of hash table. 
*/ + +#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \ +do { \ + hash_cell_t* cell3333; \ + TYPE* struct3333; \ + \ + HASH_ASSERT_OWN(TABLE, FOLD) \ + \ + (DATA)->NAME = NULL; \ + \ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ + \ + if (cell3333->node == NULL) { \ + cell3333->node = DATA; \ + DATA->NAME = NULL; \ + } else { \ + struct3333 = (TYPE*) cell3333->node; \ + \ + DATA->NAME = struct3333; \ + \ + cell3333->node = DATA; \ + } \ +} while (0) +#endif /*WITH_WSREP */ #ifdef UNIV_HASH_DEBUG # define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) # define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index bf4a4ae1c35..d96fdfa9d89 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -181,6 +181,16 @@ lock_update_merge_left( const buf_block_t* right_block); /*!< in: merged index page which will be discarded */ /*************************************************************//** +Updates the lock table when a page is splited and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block);/*!< in: right page from which merged */ +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. */ UNIV_INTERN @@ -972,6 +982,16 @@ extern lock_sys_t* lock_sys; mutex_exit(&lock_sys->wait_mutex); \ } while (0) +#ifdef WITH_WSREP +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. 
*/ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ +#endif /* WITH_WSREP */ #ifndef UNIV_NONINL #include "lock0lock.ic" #endif diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 74d3c6bbc7c..8f8aef4f45c 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -151,10 +152,9 @@ enum os_file_create_t { #define OS_FILE_INSUFFICIENT_RESOURCE 78 #define OS_FILE_AIO_INTERRUPTED 79 #define OS_FILE_OPERATION_ABORTED 80 - #define OS_FILE_ACCESS_VIOLATION 81 - -#define OS_FILE_ERROR_MAX 100 +#define OS_FILE_OPERATION_NOT_SUPPORTED 125 +#define OS_FILE_ERROR_MAX 200 /* @} */ /** Types for aio operations @{ */ @@ -295,33 +295,35 @@ os_file_write The wrapper functions have the prefix of "innodb_". 
*/ #ifdef UNIV_PFS_IO -# define os_file_create(key, name, create, purpose, type, success) \ +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ pfs_os_file_create_func(key, name, create, purpose, type, \ - success, __FILE__, __LINE__) + success, atomic_writes, __FILE__, __LINE__) # define os_file_create_simple(key, name, create, access, success) \ pfs_os_file_create_simple_func(key, name, create, access, \ success, __FILE__, __LINE__) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ + key, name, create_mode, access, success, atomic_writes) \ pfs_os_file_create_simple_no_error_handling_func( \ - key, name, create_mode, access, success, __FILE__, __LINE__) + key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__) # define os_file_close(file) \ pfs_os_file_close_func(file, __FILE__, __LINE__) # define os_aio(type, mode, name, file, buf, offset, \ - n, message1, message2) \ + n, message1, message2, write_size, \ + page_compression, page_compression_level) \ pfs_os_aio_func(type, mode, name, file, buf, offset, \ - n, message1, message2, __FILE__, __LINE__) + n, message1, message2, write_size, \ + page_compression, page_compression_level, __FILE__, __LINE__) -# define os_file_read(file, buf, offset, n) \ - pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__) +# define os_file_read(file, buf, offset, n, compressed) \ + pfs_os_file_read_func(file, buf, offset, n, compressed, __FILE__, __LINE__) -# define os_file_read_no_error_handling(file, buf, offset, n) \ +# define os_file_read_no_error_handling(file, buf, offset, n, compressed) \ pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \ - __FILE__, __LINE__) + compressed, __FILE__, __LINE__) # define os_file_write(name, file, buf, offset, n) \ pfs_os_file_write_func(name, file, buf, offset, \ @@ -342,28 +344,28 @@ The wrapper functions have the prefix of "innodb_". 
*/ /* If UNIV_PFS_IO is not defined, these I/O APIs point to original un-instrumented file I/O APIs */ -# define os_file_create(key, name, create, purpose, type, success) \ - os_file_create_func(name, create, purpose, type, success) +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ + os_file_create_func(name, create, purpose, type, success, atomic_writes) -# define os_file_create_simple(key, name, create_mode, access, success) \ +# define os_file_create_simple(key, name, create_mode, access, success) \ os_file_create_simple_func(name, create_mode, access, success) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ - os_file_create_simple_no_error_handling_func( \ - name, create_mode, access, success) + key, name, create_mode, access, success, atomic_writes) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success, atomic_writes) # define os_file_close(file) os_file_close_func(file) -# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \ +# define os_aio(type, mode, name, file, buf, offset, n, message1, message2, write_size, page_compression, page_compression_level) \ os_aio_func(type, mode, name, file, buf, offset, n, \ - message1, message2) + message1, message2, write_size, page_compression, page_compression_level) -# define os_file_read(file, buf, offset, n) \ - os_file_read_func(file, buf, offset, n) +# define os_file_read(file, buf, offset, n, compressed) \ + os_file_read_func(file, buf, offset, n, compressed) -# define os_file_read_no_error_handling(file, buf, offset, n) \ - os_file_read_no_error_handling_func(file, buf, offset, n) +# define os_file_read_no_error_handling(file, buf, offset, n, compressed) \ + os_file_read_no_error_handling_func(file, buf, offset, n, compressed) # define os_file_write(name, file, buf, offset, n) \ os_file_write_func(name, file, buf, offset, n) @@ -524,7 +526,9 @@ 
os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes)/*!< in: atomic writes table option + value */ __attribute__((nonnull, warn_unused_result)); /****************************************************************//** Tries to disable OS caching on an opened file descriptor. */ @@ -558,7 +562,9 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes)/*!< in: atomic writes table option + value */ __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Deletes a file. The file has to be closed before calling this. 
@@ -648,6 +654,8 @@ pfs_os_file_create_simple_no_error_handling_func( OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ __attribute__((nonnull, warn_unused_result)); @@ -676,6 +684,8 @@ pfs_os_file_create_func( function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value*/ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ __attribute__((nonnull, warn_unused_result)); @@ -706,6 +716,8 @@ pfs_os_file_read_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ @@ -724,6 +736,8 @@ pfs_os_file_read_no_error_handling_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ @@ -754,6 +768,15 @@ pfs_os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ /*******************************************************************//** @@ -910,7 +933,9 @@ os_file_read_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n); /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed); /*!< in: is this file space + compressed ? */ /*******************************************************************//** Rewind file to its start, read at most size - 1 bytes from it to str, and NUL-terminate str. All errors are silently ignored. This function is @@ -935,7 +960,9 @@ os_file_read_no_error_handling_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n); /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed); /*!< in: is this file space + compressed ? */ /*******************************************************************//** NOTE! Use the corresponding macro os_file_write(), not directly this @@ -952,6 +979,7 @@ os_file_write_func( const void* buf, /*!< in: buffer from which to write */ os_offset_t offset, /*!< in: file offset where to write */ ulint n); /*!< in: number of bytes to write */ + /*******************************************************************//** Check the existence and type of the given file. 
@return TRUE if call succeeded */ @@ -1114,10 +1142,20 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2);/*!< in: message for the aio handler + void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level); /*!< page compression + level to be used */ + /************************************************************************//** Wakes up all async i/o threads so that they know to exit themselves in shutdown. */ @@ -1291,8 +1329,20 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent);/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/***********************************************************************//** +Try to get number of bytes per sector from file system. 
+@return file block size */ +UNIV_INTERN +ulint +os_file_get_block_size( +/*===================*/ + os_file_t file, /*!< in: handle to a file */ + const char* name); /*!< in: file name */ #ifndef UNIV_NONINL #include "os0file.ic" diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic index defd8204ba3..8e1cea585e6 100644 --- a/storage/innobase/include/os0file.ic +++ b/storage/innobase/include/os0file.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -88,6 +89,8 @@ pfs_os_file_create_simple_no_error_handling_func( OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -103,7 +106,7 @@ pfs_os_file_create_simple_no_error_handling_func( name, src_file, src_line); file = os_file_create_simple_no_error_handling_func( - name, create_mode, access_type, success); + name, create_mode, access_type, success, atomic_writes); register_pfs_file_open_end(locker, file); @@ -134,6 +137,8 @@ pfs_os_file_create_func( function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes, /*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -148,7 +153,7 @@ pfs_os_file_create_func( : PSI_FILE_OPEN), name, src_file, 
src_line); - file = os_file_create_func(name, create_mode, purpose, type, success); + file = os_file_create_func(name, create_mode, purpose, type, success, atomic_writes); register_pfs_file_open_end(locker, file); @@ -210,6 +215,15 @@ pfs_os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -225,7 +239,8 @@ pfs_os_aio_func( src_file, src_line); result = os_aio_func(type, mode, name, file, buf, offset, - n, message1, message2); + n, message1, message2, write_size, + page_compression, page_compression_level); register_pfs_file_io_end(locker, n); @@ -246,6 +261,8 @@ pfs_os_file_read_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? 
*/ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -256,7 +273,7 @@ pfs_os_file_read_func( register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, src_file, src_line); - result = os_file_read_func(file, buf, offset, n); + result = os_file_read_func(file, buf, offset, n, compressed); register_pfs_file_io_end(locker, n); @@ -279,6 +296,8 @@ pfs_os_file_read_no_error_handling_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -289,7 +308,7 @@ pfs_os_file_read_no_error_handling_func( register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, src_file, src_line); - result = os_file_read_no_error_handling_func(file, buf, offset, n); + result = os_file_read_no_error_handling_func(file, buf, offset, n, compressed); register_pfs_file_io_end(locker, n); diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 8e7d5ff2d48..238cb04e1f8 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -981,6 +981,15 @@ are given in one byte (resp. two byte) format. 
*/ two upmost bits in a two byte offset for special purposes */ #define REC_MAX_DATA_SIZE (16 * 1024) +#ifdef WITH_WSREP +int wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index for foreign table */ + dict_index_t* index_ref, /* in: index for referenced table */ + ibool new_protocol); /* in: protocol > 1 */ +#endif /* WITH_WSREP */ #ifndef UNIV_NONINL #include "rem0rec.ic" #endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h index 62715fe8808..f105838eece 100644 --- a/storage/innobase/include/row0log.h +++ b/storage/innobase/include/row0log.h @@ -35,6 +35,10 @@ Created 2011-05-26 Marko Makela #include "trx0types.h" #include "que0types.h" +extern ulint onlineddl_rowlog_rows; +extern ulint onlineddl_rowlog_pct_used; +extern ulint onlineddl_pct_progress; + /******************************************************//** Allocate the row log for an index and flag the index for online creation. 
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index 390c0ce038b..31a9ac6f45e 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -40,6 +40,18 @@ Created 13/06/2005 Jan Lindstrom #include "lock0types.h" #include "srv0srv.h" +/* Cluster index read task is mandatory */ +#define COST_READ_CLUSTERED_INDEX 1.0 + +/* Basic fixed cost to build all type of index */ +#define COST_BUILD_INDEX_STATIC 0.5 +/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */ +#define COST_BUILD_INDEX_DYNAMIC 0.5 + +/* Sum of below two must be 1.0 */ +#define PCT_COST_MERGESORT_INDEX 0.4 +#define PCT_COST_INSERT_INDEX 0.6 + // Forward declaration struct ib_sequence_t; @@ -370,7 +382,10 @@ row_merge_sort( merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd) /*!< in/out: temporary file handle */ + int* tmpfd, /*!< in/out: temporary file handle */ + const bool update_progress, /*!< in: update progress status variable or not */ + const float pct_progress, /*!< in: total progress percent until now */ + const float pct_cost) /*!< in: current progress percent */ __attribute__((nonnull)); /*********************************************************************//** Allocate a sort buffer. diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 06c07002c2b..440001410f0 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -606,6 +606,12 @@ struct mysql_row_templ_t { Innobase record in the current index; not defined if template_type is ROW_MYSQL_WHOLE_ROW */ + ibool rec_field_is_prefix; /* is this field in a prefix index? 
*/ + ulint rec_prefix_field_no; /* record field, even if just a + prefix; same as rec_field_no when not a + prefix, otherwise rec_field_no is + ULINT_UNDEFINED but this is the true + field number*/ ulint clust_rec_field_no; /*!< field number of the column in an Innobase record in the clustered index; not defined if template_type is @@ -707,7 +713,9 @@ struct row_prebuilt_t { columns through a secondary index and at least one column is not in the secondary index, then this is - set to TRUE */ + set to TRUE; note that sometimes this + is set but we later optimize out the + clustered index lookup */ unsigned templ_contains_blob:1;/*!< TRUE if the template contains a column with DATA_BLOB == get_innobase_type_from_mysql_type(); diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 2d90f47eefe..0a47d514e1b 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -2,6 +2,7 @@ Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -163,7 +164,11 @@ enum monitor_id_t { MONITOR_OVLD_BUF_POOL_PAGES_FREE, MONITOR_OVLD_PAGE_CREATED, MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED, MONITOR_OVLD_BYTE_READ, MONITOR_OVLD_BYTE_WRITTEN, MONITOR_FLUSH_BATCH_SCANNED, @@ -194,9 +199,12 @@ enum monitor_id_t { MONITOR_LRU_BATCH_SCANNED, MONITOR_LRU_BATCH_SCANNED_NUM_CALL, MONITOR_LRU_BATCH_SCANNED_PER_CALL, - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, MONITOR_LRU_SINGLE_FLUSH_SCANNED, MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, @@ -304,6 +312,20 @@ enum monitor_id_t { MONITOR_PAGE_DECOMPRESS, MONITOR_PAD_INCREMENTS, MONITOR_PAD_DECREMENTS, + /* New monitor variables for page compression */ + MONITOR_OVLD_PAGE_COMPRESS_SAVED, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768, + MONITOR_OVLD_PAGES_PAGE_COMPRESSED, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED, + MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, /* Index related counters */ MONITOR_MODULE_INDEX, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index a3e6a17a6e2..52f2f22b372 100644 --- 
a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -103,6 +103,37 @@ struct srv_stats_t { a disk page */ ulint_ctr_1_t buf_pool_reads; + /** Number of bytes saved by page compression */ + ulint_ctr_64_t page_compression_saved; + /** Number of 512Byte TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect512; + /** Number of 1K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect1024; + /** Number of 2K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect2048; + /** Number of 4K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect4096; + /** Number of 8K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect8192; + /** Number of 16K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect16384; + /** Number of 32K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect32768; + /* Number of index pages written */ + ulint_ctr_64_t index_pages_written; + /* Number of non index pages written */ + ulint_ctr_64_t non_index_pages_written; + /* Number of pages compressed with page compression */ + ulint_ctr_64_t pages_page_compressed; + /* Number of TRIM operations induced by page compression */ + ulint_ctr_64_t page_compressed_trim_op; + /* Number of TRIM operations saved by using actual write size knowledge */ + ulint_ctr_64_t page_compressed_trim_op_saved; + /* Number of pages decompressed with page compression */ + ulint_ctr_64_t pages_page_decompressed; + /* Number of page compression errors */ + ulint_ctr_64_t pages_page_compression_error; + /** Number of data read in total (in bytes) */ ulint_ctr_1_t data_read; @@ -138,6 +169,12 @@ struct srv_stats_t { /** Number of system rows inserted */ ulint_ctr_64_t n_system_rows_inserted; + + /** Number of times secondary index lookup triggered cluster lookup */ + ulint_ctr_64_t n_sec_rec_cluster_reads; + + /** Number of times prefix optimization avoided triggering cluster 
lookup */ + ulint_ctr_64_t n_sec_rec_cluster_reads_avoided; }; extern const char* srv_main_thread_op_info; @@ -230,6 +267,31 @@ OS (provided we compiled Innobase with it in), otherwise we will use simulated aio we build below with threads. Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; + +/* Use trim operation */ +extern my_bool srv_use_trim; + +/* Use posix fallocate */ +#ifdef HAVE_POSIX_FALLOCATE +extern my_bool srv_use_posix_fallocate; +#endif + +/* Use atomic writes i.e disable doublewrite buffer */ +extern my_bool srv_use_atomic_writes; + +/* Compression algorithm*/ +extern ulong innodb_compression_algorithm; + +/* Number of flush threads */ +#define MTFLUSH_MAX_WORKER 64 +#define MTFLUSH_DEFAULT_WORKER 8 + +/* Number of threads used for multi-threaded flush */ +extern long srv_mtflush_threads; + +/* If this flag is TRUE, then we will use multi threaded flush. */ +extern my_bool srv_use_mtflush; + #ifdef __WIN__ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ @@ -260,6 +322,10 @@ extern ulong srv_auto_extend_increment; extern ibool srv_created_new_raw; +/* Optimize prefix index queries to skip cluster index lookup when possible */ +/* Enables or disables this prefix optimization. Disabled by default. */ +extern my_bool srv_prefix_index_cluster_optimization; + /** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ #define SRV_N_LOG_FILES_MAX 100 extern ulong srv_n_log_files; @@ -270,6 +336,10 @@ extern ulong srv_flush_log_at_trx_commit; extern uint srv_flush_log_at_timeout; extern char srv_adaptive_flushing; +#ifdef WITH_INNODB_DISALLOW_WRITES +/* When this event is reset we do not allow any file writes to take place. */ +extern os_event_t srv_allow_writes_event; +#endif /* WITH_INNODB_DISALLOW_WRITES */ /* If this flag is TRUE, then we will load the indexes' (and tables') metadata even if they are marked as "corrupted". 
Mostly it is for DBA to process corrupted index and table */ @@ -301,6 +371,17 @@ extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; +/* Defragmentation, Origianlly facebook default value is 100, but it's too high */ +#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 +extern my_bool srv_defragment; +extern uint srv_defragment_n_pages; +extern uint srv_defragment_stats_accuracy; +extern uint srv_defragment_fill_factor_n_recs; +extern double srv_defragment_fill_factor; +extern uint srv_defragment_frequency; +extern ulonglong srv_defragment_interval; + +extern ulong srv_idle_flush_pct; /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; @@ -363,10 +444,7 @@ extern ibool srv_use_doublewrite_buf; extern ulong srv_doublewrite_batch_size; extern ulong srv_checksum_algorithm; -extern ibool srv_use_atomic_writes; -#ifdef HAVE_POSIX_FALLOCATE -extern ibool srv_use_posix_fallocate; -#endif +extern my_bool srv_force_primary_key; extern double srv_max_buf_pool_modified_pct; extern ulong srv_max_purge_lag; @@ -428,7 +506,6 @@ extern my_bool srv_ibuf_disable_background_merge; extern my_bool srv_purge_view_update_only_debug; #endif /* UNIV_DEBUG */ -extern ulint srv_fatal_semaphore_wait_threshold; #define SRV_SEMAPHORE_WAIT_EXTENSION 7200 extern ulint srv_dml_needed_delay; @@ -467,6 +544,11 @@ extern srv_stats_t srv_stats; /** Simulate compression failures. 
*/ extern uint srv_simulate_comp_failures; +/** Fatal semaphore wait threshold = maximum number of seconds +that semaphore times out in InnoDB */ +#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600 +extern ulong srv_fatal_semaphore_wait_threshold; + # ifdef UNIV_PFS_THREAD /* Keys to register InnoDB threads with performance schema */ extern mysql_pfs_key_t buf_page_cleaner_thread_key; @@ -866,12 +948,62 @@ struct export_var_t{ ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/ ulint innodb_num_open_files; /*!< fil_n_file_opened */ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ - ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ + ulint innodb_available_undo_logs; /*!< srv_available_undo_logs + */ + ulint innodb_defragment_compression_failures; /*!< Number of + defragment re-compression + failures */ + + ulint innodb_defragment_failures; /*!< Number of defragment + failures*/ + ulint innodb_defragment_count; /*!< Number of defragment + operations*/ + + ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */ + ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage + of used row log buffer */ + ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */ + #ifdef UNIV_DEBUG ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id - purged view's min trx_id */ #endif /* UNIV_DEBUG */ + + ib_int64_t innodb_page_compression_saved;/*!< Number of bytes saved + by page compression */ + ib_int64_t innodb_page_compression_trim_sect512;/*!< Number of 512b TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect1024;/*!< Number of 1K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect2048;/*!< Number of 2K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect4096;/*!< Number of 4K byte TRIM + by page compression */ + ib_int64_t 
innodb_page_compression_trim_sect8192;/*!< Number of 8K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect16384;/*!< Number of 16K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect32768;/*!< Number of 32K TRIM + by page compression */ + ib_int64_t innodb_index_pages_written; /*!< Number of index pages + written */ + ib_int64_t innodb_non_index_pages_written; /*!< Number of non index pages + written */ + ib_int64_t innodb_pages_page_compressed;/*!< Number of pages + compressed by page compression */ + ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations + induced by page compression */ + ib_int64_t innodb_page_compressed_trim_op_saved;/*!< Number of TRIM operations + saved by page compression */ + ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages + decompressed by page + compression */ + ib_int64_t innodb_pages_page_compression_error;/*!< Number of page + compression errors */ + + ulint innodb_sec_rec_cluster_reads; /*!< srv_sec_rec_cluster_reads */ + ulint innodb_sec_rec_cluster_reads_avoided; /*!< srv_sec_rec_cluster_reads_avoided */ }; /** Thread slot in the thread table. */ @@ -911,5 +1043,13 @@ struct srv_slot_t{ # define srv_start_raw_disk_in_use 0 # define srv_file_per_table 1 #endif /* !UNIV_HOTBACKUP */ +#ifdef WITH_WSREP +UNIV_INTERN +void +wsrep_srv_conc_cancel_wait( +/*==================*/ + trx_t* trx); /*!< in: transaction object associated with the + thread */ +#endif /* WITH_WSREP */ #endif diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h index 40d502f4459..e1c19982ba5 100644 --- a/storage/innobase/include/srv0start.h +++ b/storage/innobase/include/srv0start.h @@ -37,7 +37,8 @@ Created 10/10/1995 Heikki Tuuri #endif /*********************************************************************//** -Normalizes a directory path for Windows: converts slashes to backslashes. 
*/ +Normalizes a directory path for Windows: converts slashes to backslashes. +*/ UNIV_INTERN void srv_normalize_path_for_win( diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 7b00e16476b..f26e66f1a87 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -687,6 +687,7 @@ or row lock! */ #define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 +#define SYNC_STATS_DEFRAG 390 /*------------------------------------- Change buffer headers */ #define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ /*------------------------------------- Change buffer tree */ diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic index 97ec63c0dd2..a5887b1fd6f 100644 --- a/storage/innobase/include/sync0sync.ic +++ b/storage/innobase/include/sync0sync.ic @@ -204,7 +204,10 @@ mutex_enter_func( ulint line) /*!< in: line where locked */ { ut_ad(mutex_validate(mutex)); +#ifndef WITH_WSREP + /* this cannot be be granted when BF trx kills a trx in lock wait state */ ut_ad(!mutex_own(mutex)); +#endif /* WITH_WSREP */ /* Note that we do not peek at the value of lock_word before trying the atomic test_and_set; we could peek, and possibly save time. 
*/ diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 70f214d1ac7..9ffc8d99a7f 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -42,6 +42,9 @@ Created 3/26/1996 Heikki Tuuri #include "read0types.h" #include "page0types.h" #include "ut0bh.h" +#ifdef WITH_WSREP +#include "trx0xa.h" +#endif /* WITH_WSREP */ typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t; @@ -293,6 +296,9 @@ trx_sys_update_mysql_binlog_offset( ib_int64_t offset, /*!< in: position in that log file */ ulint field, /*!< in: offset of the MySQL log info field in the trx sys header */ +#ifdef WITH_WSREP + trx_sysf_t* sys_header, /*!< in: trx sys header */ +#endif /* WITH_WSREP */ mtr_t* mtr); /*!< in: mtr */ /*****************************************************************//** Prints to stderr the MySQL binlog offset info in the trx system header if @@ -301,6 +307,19 @@ UNIV_INTERN void trx_sys_print_mysql_binlog_offset(void); /*===================================*/ +#ifdef WITH_WSREP +/** Update WSREP checkpoint XID in sys header. */ +void +trx_sys_update_wsrep_checkpoint( + const XID* xid, /*!< in: WSREP XID */ + trx_sysf_t* sys_header, /*!< in: sys_header */ + mtr_t* mtr); /*!< in: mtr */ + +void +/** Read WSREP checkpoint XID from sys header. */ +trx_sys_read_wsrep_checkpoint( + XID* xid); /*!< out: WSREP XID */ +#endif /* WITH_WSREP */ /*****************************************************************//** Prints to stderr the MySQL master log offset info in the trx system header if the magic number shows it valid. 
*/ @@ -529,6 +548,20 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ within that file */ #define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ +#ifdef WITH_WSREP +/* The offset to WSREP XID headers */ +#define TRX_SYS_WSREP_XID_INFO (UNIV_PAGE_SIZE - 3500) +#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 +#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 + +/* XID field: formatID, gtrid_len, bqual_len, xid_data */ +#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE) +#define TRX_SYS_WSREP_XID_FORMAT 4 +#define TRX_SYS_WSREP_XID_GTRID_LEN 8 +#define TRX_SYS_WSREP_XID_BQUAL_LEN 12 +#define TRX_SYS_WSREP_XID_DATA 16 +#endif /* WITH_WSREP*/ + /** Doublewrite buffer */ /* @{ */ /** The offset of the doublewrite buffer header on the trx system header page */ diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index e097e29b551..7265a97ae25 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -445,7 +445,10 @@ trx_id_t trx_sys_get_new_trx_id(void) /*========================*/ { +#ifndef WITH_WSREP + /* wsrep_fake_trx_id violates this assert */ ut_ad(mutex_own(&trx_sys->mutex)); +#endif /* WITH_WSREP */ /* VERY important: after the database is started, max_trx_id value is divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index fcc9ed05081..7c92445b796 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1009,6 +1009,9 @@ struct trx_t{ /*------------------------------*/ char detailed_error[256]; /*!< detailed error message for last error, or empty. 
*/ +#ifdef WITH_WSREP + os_event_t wsrep_event; /* event waited for in srv_conc_slot */ +#endif /* WITH_WSREP */ }; /* Transaction isolation levels (trx->isolation_level) */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index eeeaca166a8..a4c401134f9 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, 2014 SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -331,6 +332,30 @@ typedef enum innodb_file_formats_enum innodb_file_formats_t; /** The 2-logarithm of UNIV_PAGE_SIZE: */ #define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift +#ifdef HAVE_LZO +#define IF_LZO(A,B) A +#else +#define IF_LZO(A,B) B +#endif + +#ifdef HAVE_LZ4 +#define IF_LZ4(A,B) A +#else +#define IF_LZ4(A,B) B +#endif + +#ifdef HAVE_LZMA +#define IF_LZMA(A,B) A +#else +#define IF_LZMA(A,B) B +#endif + +#ifdef HAVE_BZIP2 +#define IF_BZIP2(A,B) A +#else +#define IF_BZIP2(A,B) B +#endif + /** The universal page size of the database */ #define UNIV_PAGE_SIZE ((ulint) srv_page_size) @@ -344,13 +369,15 @@ and 2 bits for flags. This limits the uncompressed page size to 16k. Even though a 16k uncompressed page can theoretically be compressed into a larger compressed page, it is not a useful feature so we will limit both with this same constant. */ -#define UNIV_ZIP_SIZE_SHIFT_MAX 14 +#define UNIV_ZIP_SIZE_SHIFT_MAX 15 /* Define the Min, Max, Default page sizes. */ /** Minimum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MIN 12 +/** log2 of largest page size (1<<16 == 64436 bytes). */ /** Maximum Page Size Shift (power of 2) */ -#define UNIV_PAGE_SIZE_SHIFT_MAX 14 +#define UNIV_PAGE_SIZE_SHIFT_MAX 16 +/** log2 of default page size (1<<14 == 16384 bytes). 
*/ /** Default Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_DEF 14 /** Original 16k InnoDB Page Size Shift, in case the default changes */ diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h index 29fc8669ce4..796a272db59 100644 --- a/storage/innobase/include/ut0list.h +++ b/storage/innobase/include/ut0list.h @@ -150,6 +150,15 @@ ib_list_is_empty( /* out: TRUE if empty else */ const ib_list_t* list); /* in: list */ +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*<! in: list */ + /* List. */ struct ib_list_t { ib_list_node_t* first; /*!< first node */ diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic index d9dcb2eac99..7a7f53adb2f 100644 --- a/storage/innobase/include/ut0list.ic +++ b/storage/innobase/include/ut0list.ic @@ -58,3 +58,23 @@ ib_list_is_empty( { return(!(list->first || list->last)); } + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*<! in: list */ +{ + ulint len = 0; + ib_list_node_t* node = list->first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0timer.h b/storage/innobase/include/ut0timer.h new file mode 100644 index 00000000000..f361ae79bf5 --- /dev/null +++ b/storage/innobase/include/ut0timer.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.h +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ +#ifndef ut0timer_h +#define ut0timer_h + +#include "univ.i" +#include "data0type.h" +#include <my_rdtsc.h> + +/* Current timer stats */ +extern struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +extern ulonglong (*ut_timer_now)(void); + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. +Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void ut_init_timer(void); + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then); /*!< in: time where to calculate */ +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ + +#ifndef UNIV_NONINL +#include "ut0timer.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0timer.ic b/storage/innobase/include/ut0timer.ic new file mode 100644 index 00000000000..027e89c6279 --- /dev/null +++ b/storage/innobase/include/ut0timer.ic @@ -0,0 +1,113 @@ 
+/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.ic +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then) /*!< in: time where to calculate */ +{ + return (ut_timer_now() - then) - ut_timer.overhead; +} + +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then) /*!< in: time where to calculate */ +{ + ulonglong now = ut_timer_now(); + ulonglong ret = (now - (*then)) - ut_timer.overhead; + *then = now; + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in 
microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = when; + ret *= (double)(ut_timer.frequency); + ret /= 1000000.0; + return (ulonglong)ret; +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index 33385ddf2d4..9906e299808 100644 --- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -95,6 +95,23 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*<! in: work queue */ + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq); /*<! in: work queue */ + + /* Work queue. 
*/ struct ib_wqueue_t { ib_mutex_t mutex; /*!< mutex protecting everything */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index eac18a02d16..42719fcc3cd 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -51,6 +51,8 @@ Created 5/7/1996 Heikki Tuuri #include <set> #include "mysql/plugin.h" +#include <mysql/service_wsrep.h> + /* Restricts the length of search we will do in the waits-for graph of transactions */ #define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000 @@ -959,6 +961,9 @@ UNIV_INLINE ibool lock_rec_has_to_wait( /*=================*/ +#ifdef WITH_WSREP + ibool for_locking, /*!< is caller locking or releasing */ +#endif /* WITH_WSREP */ const trx_t* trx, /*!< in: trx of new lock */ ulint type_mode,/*!< in: precise mode of the new lock to set: LOCK_S or LOCK_X, possibly @@ -1055,6 +1060,50 @@ lock_rec_has_to_wait( return (FALSE); } +#ifdef WITH_WSREP + /* if BF thread is locking and has conflict with another BF + thread, we need to look at trx ordering and lock types */ + if (for_locking && + wsrep_thd_is_BF(trx->mysql_thd, FALSE) && + wsrep_thd_is_BF(lock2->trx->mysql_thd, TRUE)) { + + if (wsrep_debug) { + fprintf(stderr, "\n BF-BF lock conflict \n"); + lock_rec_print(stderr, lock2); + } + + if (wsrep_trx_order_before(trx->mysql_thd, + lock2->trx->mysql_thd) && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X) + { + /* exclusive lock conflicts are not accepted */ + fprintf(stderr, "BF-BF X lock conflict," + "type_mode: %lu supremum: %lu\n", + type_mode, lock_is_on_supremum); + fprintf(stderr, "conflicts states: my %d locked %d\n", + wsrep_thd_conflict_state(trx->mysql_thd, FALSE), + wsrep_thd_conflict_state(lock2->trx->mysql_thd, FALSE) ); + lock_rec_print(stderr, lock2); + return FALSE; + //abort(); + } else { + /* if lock2->index->n_uniq <= + lock2->index->n_user_defined_cols + operation is on uniq index + */ + if (wsrep_debug) 
fprintf(stderr, + "BF conflict, modes: %lu %lu, " + "idx: %s-%s n_uniq %u n_user %u\n", + type_mode, lock2->type_mode, + lock2->index->name, + lock2->index->table_name, + lock2->index->n_uniq, + lock2->index->n_user_defined_cols); + return FALSE; + } + } +#endif /* WITH_WSREP */ return(TRUE); } @@ -1085,7 +1134,11 @@ lock_has_to_wait( /* If this lock request is for a supremum record then the second bit on the lock bitmap is set */ +#ifdef WITH_WSREP + return(lock_rec_has_to_wait(FALSE, lock1->trx, +#else return(lock_rec_has_to_wait(lock1->trx, +#endif /* WITH_WSREP */ lock1->type_mode, lock2, lock_rec_get_nth_bit( lock1, 1))); @@ -1554,6 +1607,11 @@ lock_rec_has_expl( return(NULL); } +#ifdef WITH_WSREP +static +void +lock_rec_discard(lock_t* in_lock); +#endif #ifdef UNIV_DEBUG /*********************************************************************//** Checks if some other transaction has a lock request in the queue. @@ -1602,6 +1660,69 @@ lock_rec_other_has_expl_req( } #endif /* UNIV_DEBUG */ +#ifdef WITH_WSREP +static +void +wsrep_kill_victim( + const trx_t * const trx, + const lock_t *lock) +{ + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(lock->trx)); + my_bool bf_this = wsrep_thd_is_BF(trx->mysql_thd, FALSE); + my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE); + + if ((bf_this && !bf_other) || + (bf_this && bf_other && wsrep_trx_order_before( + trx->mysql_thd, lock->trx->mysql_thd))) { + + if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + if (wsrep_debug) { + fprintf(stderr, "WSREP: BF victim waiting\n"); + } + /* cannot release lock, until our lock + is in the queue*/ + } else if (lock->trx != trx) { + if (wsrep_log_conflicts) { + mutex_enter(&trx_sys->mutex); + if (bf_this) { + fputs("\n*** Priority TRANSACTION:\n", + stderr); + } else { + fputs("\n*** Victim TRANSACTION:\n", + stderr); + } + + trx_print_latched(stderr, trx, 3000); + + if (bf_other) { + fputs("\n*** Priority TRANSACTION:\n", + stderr); + } else { + fputs("\n*** Victim 
TRANSACTION:\n", + stderr); + } + + trx_print_latched(stderr, lock->trx, 3000); + + mutex_exit(&trx_sys->mutex); + + fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n", + stderr); + + if (lock_get_type(lock) == LOCK_REC) { + lock_rec_print(stderr, lock); + } else { + lock_table_print(stderr, lock); + } + } + + wsrep_innobase_kill_one_trx(trx->mysql_thd, + (const trx_t*) trx, lock->trx, TRUE); + } + } +} +#endif /*********************************************************************//** Checks if some other transaction has a conflicting explicit lock request in the queue, so that we have to wait. @@ -1630,7 +1751,15 @@ lock_rec_other_has_conflicting( lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { - if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { +#ifdef WITH_WSREP + if (lock_rec_has_to_wait(TRUE, trx, mode, lock, is_supremum)) { + trx_mutex_enter(lock->trx); + wsrep_kill_victim(trx, lock); + trx_mutex_exit(lock->trx); +#else + if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { +#endif /* WITH_WSREP */ + return(lock); } } @@ -1811,6 +1940,28 @@ lock_number_of_rows_locked( /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ +#ifdef WITH_WSREP +static +void +wsrep_print_wait_locks( +/*============*/ + lock_t* c_lock) /* conflicting lock to print */ +{ + if (wsrep_debug && c_lock->trx->lock.wait_lock != c_lock) { + fprintf(stderr, "WSREP: c_lock != wait lock\n"); + if (lock_get_type_low(c_lock) & LOCK_TABLE) + lock_table_print(stderr, c_lock); + else + lock_rec_print(stderr, c_lock); + + if (lock_get_type_low(c_lock->trx->lock.wait_lock) & LOCK_TABLE) + lock_table_print(stderr, c_lock->trx->lock.wait_lock); + else + lock_rec_print(stderr, c_lock->trx->lock.wait_lock); + } +} +#endif /* WITH_WSREP */ + /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! 
@@ -1819,6 +1970,10 @@ static lock_t* lock_rec_create( /*============*/ +#ifdef WITH_WSREP + lock_t* const c_lock, /* conflicting lock */ + que_thr_t* thr, +#endif ulint type_mode,/*!< in: lock mode and wait flag, type is ignored and replaced by LOCK_REC */ @@ -1890,8 +2045,88 @@ lock_rec_create( ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted); +#ifdef WITH_WSREP + if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + lock_t *hash = (lock_t *)c_lock->hash; + lock_t *prev = NULL; + + while (hash && + wsrep_thd_is_BF(((lock_t *)hash)->trx->mysql_thd, TRUE) && + wsrep_trx_order_before( + ((lock_t *)hash)->trx->mysql_thd, + trx->mysql_thd)) { + prev = hash; + hash = (lock_t *)hash->hash; + } + lock->hash = hash; + if (prev) { + prev->hash = lock; + } else { + c_lock->hash = lock; + } + /* + * delayed conflict resolution '...kill_one_trx' was not called, + * if victim was waiting for some other lock + */ + trx_mutex_enter(c_lock->trx); + if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE; + + if (wsrep_debug) { + wsrep_print_wait_locks(c_lock); + } + + trx->lock.que_state = TRX_QUE_LOCK_WAIT; + lock_set_lock_and_trx_wait(lock, trx); + UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock); + + ut_ad(thr != NULL); + trx->lock.wait_thr = thr; + thr->state = QUE_THR_LOCK_WAIT; + + /* have to release trx mutex for the duration of + victim lock release. 
This will eventually call + lock_grant, which wants to grant trx mutex again + */ + if (caller_owns_trx_mutex) { + trx_mutex_exit(trx); + } + lock_cancel_waiting_and_release( + c_lock->trx->lock.wait_lock); + + if (caller_owns_trx_mutex) { + trx_mutex_enter(trx); + } + + /* trx might not wait for c_lock, but some other lock + does not matter if wait_lock was released above + */ + if (c_lock->trx->lock.wait_lock == c_lock) { + lock_reset_lock_and_trx_wait(lock); + } + + trx_mutex_exit(c_lock->trx); + + if (wsrep_debug) { + fprintf( + stderr, + "WSREP: c_lock canceled %llu\n", + (ulonglong) c_lock->trx->id); + } + + /* have to bail out here to avoid lock_set_lock... */ + return(lock); + } + trx_mutex_exit(c_lock->trx); + } else { + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + } +#else HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); +#endif /* WITH_WSREP */ if (!caller_owns_trx_mutex) { trx_mutex_enter(trx); @@ -1899,7 +2134,6 @@ lock_rec_create( ut_ad(trx_mutex_own(trx)); if (type_mode & LOCK_WAIT) { - lock_set_lock_and_trx_wait(lock, trx); } @@ -1911,7 +2145,6 @@ lock_rec_create( MONITOR_INC(MONITOR_RECLOCK_CREATED); MONITOR_INC(MONITOR_NUM_RECLOCK); - return(lock); } @@ -1926,6 +2159,9 @@ static dberr_t lock_rec_enqueue_waiting( /*=====================*/ +#ifdef WITH_WSREP + lock_t* c_lock, /* conflicting lock */ +#endif ulint type_mode,/*!< in: lock mode this transaction is requesting: LOCK_S or LOCK_X, possibly @@ -1983,6 +2219,9 @@ lock_rec_enqueue_waiting( /* Enqueue the lock request that will wait to be granted, note that we already own the trx mutex. */ lock = lock_rec_create( +#ifdef WITH_WSREP + c_lock, thr, +#endif /* WITH_WSREP */ type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE); /* Release the mutex to obey the latching order. 
@@ -2083,7 +2322,19 @@ lock_rec_add_to_queue( const lock_t* other_lock = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT, block, heap_no, trx); +#ifdef WITH_WSREP + /* this can potentionally assert with wsrep */ + if (wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (wsrep_debug && other_lock) { + fprintf(stderr, + "WSREP: InnoDB assert ignored\n"); + } + } else { + ut_a(!other_lock); + } +#else ut_a(!other_lock); +#endif /* WITH_WSREP */ } #endif /* UNIV_DEBUG */ @@ -2111,7 +2362,16 @@ lock_rec_add_to_queue( if (lock_get_wait(lock) && lock_rec_get_nth_bit(lock, heap_no)) { - +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + if (wsrep_debug) { + fprintf(stderr, + "BF skipping wait: %lu\n", + trx->id); + lock_rec_print(stderr, lock); + } + } else +#endif goto somebody_waits; } } @@ -2134,9 +2394,15 @@ lock_rec_add_to_queue( } somebody_waits: - return(lock_rec_create( +#ifdef WITH_WSREP + return(lock_rec_create(NULL, NULL, type_mode, block, heap_no, index, trx, caller_owns_trx_mutex)); +#else + return(lock_rec_create( + type_mode, block, heap_no, index, trx, + caller_owns_trx_mutex)); +#endif /* WITH_WSREP */ } /** Record locking request status */ @@ -2199,9 +2465,13 @@ lock_rec_lock_fast( if (lock == NULL) { if (!impl) { /* Note that we don't own the trx mutex. 
*/ +#ifdef WITH_WSREP + lock = lock_rec_create(NULL, thr, + mode, block, heap_no, index, trx, FALSE); +#else lock = lock_rec_create( mode, block, heap_no, index, trx, FALSE); - +#endif /* WITH_WSREP */ } status = LOCK_REC_SUCCESS_CREATED; } else { @@ -2254,6 +2524,9 @@ lock_rec_lock_slow( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; +#ifdef WITH_WSREP + lock_t* c_lock(NULL); +#endif dberr_t err = DB_SUCCESS; ut_ad(lock_mutex_own()); @@ -2277,18 +2550,31 @@ lock_rec_lock_slow( /* The trx already has a strong enough lock on rec: do nothing */ - +#ifdef WITH_WSREP + } else if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting( + static_cast<enum lock_mode>(mode), + block, heap_no, trx))) { +#else } else if (lock_rec_other_has_conflicting( static_cast<enum lock_mode>(mode), block, heap_no, trx)) { +#endif /* WITH_WSREP */ /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong enough already granted on the record, we have to wait. */ +#ifdef WITH_WSREP + /* c_lock is NULL here if jump to enqueue_waiting happened + but it's ok because lock is not NULL in that case and c_lock + is not used. */ + err = lock_rec_enqueue_waiting(c_lock, + mode, block, heap_no, index, thr); +#else err = lock_rec_enqueue_waiting( mode, block, heap_no, index, thr); +#endif /* WITH_WSREP */ } else if (!impl) { /* Set the requested lock on the record, note that @@ -2394,7 +2680,13 @@ lock_rec_has_to_wait_in_queue( if (heap_no < lock_rec_get_n_bits(lock) && (p[bit_offset] & bit_mask) && lock_has_to_wait(wait_lock, lock)) { - +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(wait_lock->trx->mysql_thd, FALSE) && + wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE)) { + /* don't wait for another BF lock */ + continue; + } +#endif return(lock); } } @@ -3308,6 +3600,47 @@ lock_update_merge_left( } /*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. 
*/ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + const rec_t* left_next_rec; + + ut_a(left_block && right_block); + ut_a(orig_pred); + + lock_mutex_enter(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap( + left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); + + lock_mutex_exit(); +} + +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. */ UNIV_INTERN @@ -3778,10 +4111,22 @@ lock_deadlock_select_victim( /* The joining transaction is 'smaller', choose it as the victim and roll it back. 
*/ - return(ctx->start); +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) { + return(ctx->wait_lock->trx); + } + else +#endif /* WITH_WSREP */ + return(ctx->start); } - return(ctx->wait_lock->trx); +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->wait_lock->trx->mysql_thd, TRUE)) { + return(ctx->start); + } + else +#endif /* WITH_WSREP */ + return(ctx->wait_lock->trx); } /********************************************************************//** @@ -3911,8 +4256,14 @@ lock_deadlock_search( ctx->too_deep = TRUE; +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) { + return(ctx->wait_lock->trx->id); + } + else +#endif /* WITH_WSREP */ /* Select the joining transaction as the victim. */ - return(ctx->start->id); + return(ctx->start->id); } else { /* We do not need to report autoinc locks to the upper @@ -3953,6 +4304,11 @@ lock_deadlock_search( size not big enough. */ ctx->too_deep = TRUE; +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) + return(lock->trx->id); + else +#endif /* WITH_WSREP */ return(ctx->start->id); } @@ -4137,9 +4493,18 @@ lock_deadlock_check_and_resolve( ut_a(trx == ctx.start); ut_a(victim_trx_id == trx->id); - if (!srv_read_only_mode) { - lock_deadlock_joining_trx_print(trx, lock); +#ifdef WITH_WSREP + if (!wsrep_thd_is_BF(ctx.start->mysql_thd, TRUE)) + { +#endif /* WITH_WSREP */ + if (!srv_read_only_mode) { + lock_deadlock_joining_trx_print(trx, lock); + } +#ifdef WITH_WSREP + } else { + /* BF processor */; } +#endif /* WITH_WSREP */ MONITOR_INC(MONITOR_DEADLOCK); @@ -4177,6 +4542,9 @@ UNIV_INLINE lock_t* lock_table_create( /*==============*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< in: conflicting lock */ +#endif dict_table_t* table, /*!< in/out: database table in dictionary cache */ ulint type_mode,/*!< in: lock mode possibly ORed with @@ -4220,7 +4588,59 @@ lock_table_create( ut_ad(table->n_ref_count > 0 || !table->can_be_evicted); UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, 
lock); + +#ifdef WITH_WSREP + if (wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + UT_LIST_INSERT_AFTER( + un_member.tab_lock.locks, table->locks, c_lock, lock); + } else { + UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); + } + + if (c_lock) { + trx_mutex_enter(c_lock->trx); + } + + if (c_lock && c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE; + + if (wsrep_debug) { + wsrep_print_wait_locks(c_lock); + wsrep_print_wait_locks(c_lock->trx->lock.wait_lock); + } + + /* have to release trx mutex for the duration of + victim lock release. This will eventually call + lock_grant, which wants to grant trx mutex again + */ + /* caller has trx_mutex, have to release for lock cancel */ + trx_mutex_exit(trx); + lock_cancel_waiting_and_release(c_lock->trx->lock.wait_lock); + trx_mutex_enter(trx); + + /* trx might not wait for c_lock, but some other lock + does not matter if wait_lock was released above + */ + if (c_lock->trx->lock.wait_lock == c_lock) { + lock_reset_lock_and_trx_wait(lock); + } + + if (wsrep_debug) { + fprintf(stderr, "WSREP: c_lock canceled %llu\n", + (ulonglong) c_lock->trx->id); + } + } + if (c_lock) { + trx_mutex_exit(c_lock->trx); + } + } else { + UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); + } +#else UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); +#endif /* WITH_WSREP */ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { @@ -4377,6 +4797,9 @@ static dberr_t lock_table_enqueue_waiting( /*=======================*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< in: conflicting lock */ +#endif ulint mode, /*!< in: lock mode this transaction is requesting */ dict_table_t* table, /*!< in/out: table */ @@ -4421,7 +4844,14 @@ lock_table_enqueue_waiting( /* Enqueue the lock request that will wait to be granted */ - lock = lock_table_create(table, mode | LOCK_WAIT, trx); +#ifdef WITH_WSREP + if 
(trx->lock.was_chosen_as_deadlock_victim) { + return(DB_DEADLOCK); + } + lock = lock_table_create(c_lock, table, mode | LOCK_WAIT, trx); +#else + lock = lock_table_create(table, mode | LOCK_WAIT, trx); +#endif /* WITH_WSREP */ /* Release the mutex to obey the latching order. This is safe, because lock_deadlock_check_and_resolve() @@ -4493,6 +4923,18 @@ lock_table_other_has_incompatible( && !lock_mode_compatible(lock_get_mode(lock), mode) && (wait || !lock_get_wait(lock))) { +#ifdef WITH_WSREP + if(wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (wsrep_debug) { + fprintf(stderr, "WSREP: trx %ld table lock abort\n", + trx->id); + } + trx_mutex_enter(lock->trx); + wsrep_kill_victim((trx_t *)trx, (lock_t *)lock); + trx_mutex_exit(lock->trx); + } +#endif + return(lock); } } @@ -4515,6 +4957,9 @@ lock_table( enum lock_mode mode, /*!< in: lock mode */ que_thr_t* thr) /*!< in: query thread */ { +#ifdef WITH_WSREP + lock_t *c_lock = NULL; +#endif trx_t* trx; dberr_t err; const lock_t* wait_for; @@ -4542,11 +4987,19 @@ lock_table( lock_mutex_enter(); + DBUG_EXECUTE_IF("fatal-semaphore-timeout", + { os_thread_sleep(3600000000); }); + /* We have to check if the new lock is compatible with any locks other transactions have in the table lock queue. 
*/ +#ifdef WITH_WSREP + wait_for = lock_table_other_has_incompatible( + trx, LOCK_WAIT, table, mode); +#else wait_for = lock_table_other_has_incompatible( trx, LOCK_WAIT, table, mode); +#endif trx_mutex_enter(trx); @@ -4554,9 +5007,17 @@ lock_table( mode: this trx may have to wait */ if (wait_for != NULL) { +#ifdef WITH_WSREP + err = lock_table_enqueue_waiting((ib_lock_t*)wait_for, mode | flags, table, thr); +#else err = lock_table_enqueue_waiting(mode | flags, table, thr); +#endif } else { +#ifdef WITH_WSREP + lock_table_create(c_lock, table, mode | flags, trx); +#else lock_table_create(table, mode | flags, trx); +#endif ut_a(!flags || mode == LOCK_S || mode == LOCK_X); @@ -4594,7 +5055,11 @@ lock_table_ix_resurrect( trx, LOCK_WAIT, table, LOCK_IX)); trx_mutex_enter(trx); +#ifdef WITH_WSREP + lock_table_create(NULL, table, LOCK_IX, trx); +#else lock_table_create(table, LOCK_IX, trx); +#endif lock_mutex_exit(); trx_mutex_exit(trx); } @@ -5725,6 +6190,7 @@ lock_rec_queue_validate( if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { +#ifndef WITH_WSREP enum lock_mode mode; if (lock_get_mode(lock) == LOCK_S) { @@ -5733,7 +6199,8 @@ lock_rec_queue_validate( mode = LOCK_S; } ut_a(!lock_rec_other_has_expl_req( - mode, 0, 0, block, heap_no, lock->trx)); + mode, 0, 0, block, heap_no, lock->trx)); +#endif /* WITH_WSREP */ } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { @@ -6038,6 +6505,9 @@ lock_rec_insert_check_and_lock( dberr_t err; ulint next_rec_heap_no; ibool inherit_in = *inherit; +#ifdef WITH_WSREP + lock_t* c_lock=NULL; +#endif ut_ad(block->frame == page_align(rec)); ut_ad(!dict_index_is_online_ddl(index) @@ -6094,17 +6564,30 @@ lock_rec_insert_check_and_lock( had to wait for their insert. Both had waiting gap type lock requests on the successor, which produced an unnecessary deadlock. 
*/ +#ifdef WITH_WSREP + if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting( + static_cast<enum lock_mode>( + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION), + block, next_rec_heap_no, trx))) { +#else if (lock_rec_other_has_conflicting( static_cast<enum lock_mode>( LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION), block, next_rec_heap_no, trx)) { +#endif /* WITH_WSREP */ /* Note that we may get DB_SUCCESS also here! */ trx_mutex_enter(trx); +#ifdef WITH_WSREP + err = lock_rec_enqueue_waiting(c_lock, + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, + block, next_rec_heap_no, index, thr); +#else err = lock_rec_enqueue_waiting( LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, block, next_rec_heap_no, index, thr); +#endif /* WITH_WSREP */ trx_mutex_exit(trx); } else { diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 53794a0d773..d1418dcaab5 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -1272,7 +1273,7 @@ log_group_file_header_flush( (ulint) (dest_offset / UNIV_PAGE_SIZE), (ulint) (dest_offset % UNIV_PAGE_SIZE), OS_FILE_LOG_BLOCK_SIZE, - buf, group); + buf, group, 0); srv_stats.os_log_pending_writes.dec(); } @@ -1400,7 +1401,7 @@ loop: fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1966,7 +1967,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2046,7 +2047,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2340,7 +2341,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, NULL); + len, buf, NULL, 0); start_lsn += len; buf += len; @@ -2405,7 +2406,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2441,7 +2442,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2569,7 +2570,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), 
ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 4fe9620ccaa..48a204ff327 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -347,7 +348,10 @@ DECLARE_THREAD(recv_writer_thread)( while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { - os_thread_sleep(100000); + /* Wait till we get a signal to clean the LRU list. + Bounded by max wait time of 100ms. */ + ib_int64_t sig_count = os_event_reset(buf_flush_event); + os_event_wait_time_low(buf_flush_event, 100000, sig_count); mutex_enter(&recv_sys->writer_mutex); @@ -2078,7 +2082,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2088,7 +2092,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2117,13 +2121,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3082,7 +3086,7 @@ 
recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3113,7 +3117,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } #ifdef UNIV_LOG_ARCHIVE @@ -3742,7 +3746,7 @@ ask_again: /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3815,7 +3819,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff index 7a388552c57..98e17f3c825 100644 --- a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff +++ b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff @@ -1,6 +1,6 @@ ---- suite/storage_engine/type_char_indexes.result 2012-07-12 19:27:42.191013570 +0400 -+++ suite/storage_engine/type_char_indexes.reject 2012-07-15 17:51:55.810034331 +0400 -@@ -135,7 +135,7 @@ +--- suite/storage_engine/type_char_indexes.result 2014-10-12 14:22:11.000000000 +0400 ++++ suite/storage_engine/type_char_indexes.reject 2014-10-12 14:23:28.000000000 +0400 +@@ -137,7 +137,7 @@ r3a EXPLAIN SELECT c,c20,v16,v128 FROM t1 WHERE v16 = 'varchar1a' OR v16 = 'varchar3a' ORDER BY v16; id select_type table type possible_keys key key_len 
ref rows Extra diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index f94d6353431..89c8bf373f7 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -42,8 +43,13 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -60,6 +66,21 @@ Created 10/21/1995 Heikki Tuuri #include <libaio.h> #endif +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include <sys/ioctl.h> +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) +#include <sys/statvfs.h> +#endif + +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -87,6 +108,12 @@ UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; /* In simulated aio, merge at most this many consecutive i/os */ #define OS_AIO_MERGE_N_CONSECUTIVE 64 +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + /********************************************************************** InnoDB AIO Implementation: @@ -175,6 +202,32 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated 
for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + /*!< TRUE if page compression was + successfull, false if not */ + + ulint file_block_size;/*!< file block size */ + #ifdef WIN_ASYNC_IO HANDLE handle; /*!< handle object we need in the OVERLAPPED struct */ @@ -185,6 +238,7 @@ struct os_aio_slot_t{ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ + byte *lzo_mem; /* Temporal memory used by LZO */ }; /** The asynchronous i/o array structure */ @@ -294,6 +348,88 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ +#endif + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name /*!< in: name of the file */ + __attribute__((unused)), + os_file_t file /*!< in: handle to the file */ + __attribute__((unused))) +{ +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET + int atomic_option = 1; + + if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__); + return(FALSE); + } + + return(TRUE); +#else + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + return(FALSE); +#endif +} + + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -439,6 +575,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -503,6 +652,19 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ 
-536,6 +698,9 @@ os_file_get_last_error_low( case ENOTDIR: case EISDIR: return(OS_FILE_PATH_ERROR); + case ECANCELED: + case ENOTTY: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EAGAIN: if (srv_use_native_aio) { return(OS_FILE_AIO_RESOURCES_RESERVED); @@ -582,9 +747,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -614,6 +781,9 @@ os_file_handle_error_cond_exit( " InnoDB: Disk is full. Try to clean the disk" " to free space.\n"); + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + os_has_said_disk_full = TRUE; fflush(stderr); @@ -649,6 +819,12 @@ os_file_handle_error_cond_exit( to the log. */ if (should_exit || !on_error_silent) { + fprintf(stderr, + " InnoDB: Operation %s to file %s and at line %ld\n", + operation, file, line); + } + + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? 
name : "(unknown)", operation, err, should_exit @@ -671,10 +847,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -685,12 +863,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -766,7 +946,9 @@ os_file_create_tmpfile(void) /*========================*/ { FILE* file = NULL; - int fd = innobase_mysql_tmpfile(); + int fd; + WAIT_ALLOW_WRITES(); + fd = innobase_mysql_tmpfile(); ut_ad(!srv_read_only_mode); @@ -830,7 +1012,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -841,7 +1023,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -863,7 +1045,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - 
os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -875,7 +1057,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -947,7 +1129,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1033,7 +1215,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1084,7 +1266,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1092,12 +1274,13 @@ os_file_create_directory( return(TRUE); #else int rcode; + WAIT_ALLOW_WRITES(); rcode = mkdir(pathname, 0770); if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1207,7 +1390,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1218,6 +1401,8 @@ os_file_create_simple_func( #else /* __WIN__ */ int create_flag; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); @@ -1275,7 +1460,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? 
"open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1317,9 +1502,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1380,11 +1568,30 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; ut_a(name); + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); @@ -1440,6 +1647,24 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + + #endif /* __WIN__ */ return(file); @@ -1524,12 +1749,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1662,9 +1890,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1673,9 +1901,27 @@ os_file_create_func( } while (retry); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } #else /* __WIN__ */ int create_flag; const char* mode_str = NULL; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? TRUE : FALSE; @@ -1747,9 +1993,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1801,6 +2047,22 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } #endif /* __WIN__ */ return(file); @@ -1855,11 +2117,12 @@ loop: goto loop; #else int ret; + WAIT_ALLOW_WRITES(); ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1919,11 +2182,12 @@ loop: goto loop; #else int ret; + WAIT_ALLOW_WRITES(); ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1967,16 +2231,17 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else int ret; + WAIT_ALLOW_WRITES(); ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2007,7 +2272,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2016,7 +2281,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2118,15 +2383,15 @@ os_file_set_size( fprintf(stderr, "InnoDB: Error: preallocating file " "space for file \'%s\' 
failed. Current size " "%lu, desired size %lu\n", - name, (long unsigned) current_size, (long unsigned) size); - os_file_handle_error_no_exit(name, "posix_fallocate", FALSE); + name, current_size, size); + os_file_handle_error_no_exit(name, "posix_fallocate", FALSE, __FILE__, __LINE__); + return(FALSE); } return(TRUE); } #endif - /* Write up to 1 megabyte at a time. */ buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) * UNIV_PAGE_SIZE; @@ -2153,6 +2418,7 @@ os_file_set_size( } ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { ut_free(buf2); goto error_handling; @@ -2200,6 +2466,7 @@ os_file_set_eof( HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); return(SetEndOfFile(h)); #else /* __WIN__ */ + WAIT_ALLOW_WRITES(); return(!ftruncate(fileno(file), ftell(file))); #endif /* __WIN__ */ } @@ -2285,7 +2552,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2294,6 +2561,7 @@ os_file_flush_func( return(FALSE); #else int ret; + WAIT_ALLOW_WRITES(); #if defined(HAVE_DARWIN_THREADS) # ifndef F_FULLFSYNC @@ -2339,7 +2607,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2577,7 +2845,9 @@ os_file_read_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n) /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? 
*/ { #ifdef __WIN__ BOOL ret; @@ -2646,6 +2916,14 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2658,6 +2936,13 @@ try_again: ret = os_file_pread(file, buf, n, offset); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } return(TRUE); } @@ -2669,7 +2954,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2704,7 +2989,9 @@ os_file_read_no_error_handling_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n) /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? */ { #ifdef __WIN__ BOOL ret; @@ -2773,6 +3060,15 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. 
*/ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2785,6 +3081,13 @@ try_again: ret = os_file_pread(file, buf, n, offset); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } return(TRUE); } @@ -2792,7 +3095,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -2864,6 +3167,7 @@ os_file_write_func( ut_ad(file); ut_ad(buf); ut_ad(n > 0); + retry: low = (DWORD) offset & 0xFFFFFFFF; high = (DWORD) (offset >> 32); @@ -2995,6 +3299,7 @@ retry: return(FALSE); #else ssize_t ret; + WAIT_ALLOW_WRITES(); ret = os_file_pwrite(file, buf, n, offset); @@ -3060,7 +3365,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3088,7 +3393,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3137,7 +3442,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3190,7 +3495,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + 
os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3756,7 +4061,8 @@ os_aio_array_create( array->slots = static_cast<os_aio_slot_t*>( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); + #ifdef __WIN__ array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE))); #endif /* __WIN__ */ @@ -3844,8 +4150,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { -#ifdef WIN_ASYNC_IO ulint i; +#ifdef WIN_ASYNC_IO for (i = 0; i < array->n_slots; i++) { os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); @@ -3867,6 +4173,19 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + + if (slot->lzo_mem) { + ut_free(slot->lzo_mem); + slot->lzo_mem = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4200,7 +4519,16 @@ os_aio_array_reserve_slot( void* buf, /*!< in: buffer where to read or from which to write */ os_offset_t offset, /*!< in: file offset */ - ulint len) /*!< in: length of the block to read or write */ + ulint len, /*!< in: length of the block to read or write */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4290,6 +4618,63 @@ found: slot->buf = static_cast<byte*>(buf); slot->offset = offset; slot->io_already_done = FALSE; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + if (message1) { + slot->file_block_size = fil_node_get_block_size(message1); + } + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + fil_node_get_block_size(slot->message1), + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back, not sure if this is really needed + below */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; @@ -4564,10 +4949,19 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2)/*!< in: message for the aio handler 
+ void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4618,7 +5012,8 @@ os_aio_func( and os_file_write_func() */ if (type == OS_FILE_READ) { - ret = os_file_read_func(file, buf, offset, n); + ret = os_file_read_func(file, buf, offset, n, + page_compression); } else { ut_ad(!srv_read_only_mode); @@ -4627,12 +5022,12 @@ os_aio_func( ret = os_file_write_func(name, file, buf, offset, n); } - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - os_has_said_disk_full = FALSE;); - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - ret = 0;); - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - errno = 28;); + if (type == OS_FILE_WRITE) { + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + os_has_said_disk_full = FALSE; + ret = 0; + errno = 28;); + } return ret; } @@ -4680,7 +5075,8 @@ try_again: } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n); + name, buf, offset, n, write_size, page_compression, page_compression_level); + if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4760,7 +5156,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? 
"aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4873,9 +5269,17 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (!ret || (len != slot->len)) { - retry = TRUE; + if (!ret) { + if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { + retry = TRUE; + } else { + ret_val = FALSE; + } + } else { + retry = TRUE; + } } else { ret_val = FALSE; @@ -4903,9 +5307,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret = WriteFile(slot->file, slot->buf, + if (slot->message1 && + slot->page_compression && + slot->page_buf) { + ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, &(slot->control)); + } else { + ret = WriteFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + } break; case OS_FILE_READ: @@ -4937,6 +5349,30 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + } + os_aio_array_free_slot(array, slot); return(ret_val); @@ -5026,6 +5462,36 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. 
*/ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5169,6 +5635,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5176,7 +5649,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. 
*/ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5456,7 +5929,8 @@ consecutive_loop: } else { ret = os_file_read( aio_slot->file, combined_buf, - aio_slot->offset, total_len); + aio_slot->offset, total_len, + aio_slot->page_compression); } if (aio_slot->type == OS_FILE_WRITE) { @@ -5853,4 +6327,282 @@ os_aio_all_slots_free(void) } #endif /* UNIV_DEBUG */ +#ifdef _WIN32 +#include <winioctl.h> +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. 
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + + size_t len = slot->len; + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + size_t bsize = slot->file_block_size; + + // len here should be alligned to sector size + ut_ad((trim_len % bsize) == 0); + ut_ad((len % bsize) == 0); + ut_ad(bsize != 0); + ut_ad((off % bsize) == 0); + +#ifdef UNIV_TRIM_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu off %lu bz %lu\n", + *slot->write_size, trim_len, len, off, bsize); +#endif + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. 
+ if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + slot->write_size ? *slot->write_size : (ulint)0, trim_len, len); +#endif + + if (slot->write_size && *slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + if (slot->write_size) { *slot->write_size = len; } + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(slot->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lu len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", errno, off, trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... 
 */ +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(slot->file, FSCTL_FILE_LEVEL_TRIM, + &flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim=FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lu len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", off, trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + switch(bsize) { + case 512: + srv_stats.page_compression_trim_sect512.add((trim_len / bsize)); + break; + case 1024: + srv_stats.page_compression_trim_sect1024.add((trim_len / bsize)); + break; + case 2048: + srv_stats.page_compression_trim_sect2048.add((trim_len / bsize)); + break; + case 4096: + srv_stats.page_compression_trim_sect4096.add((trim_len / bsize)); + break; + case 8192: + srv_stats.page_compression_trim_sect8192.add((trim_len / bsize)); + break; + case 16384: + srv_stats.page_compression_trim_sect16384.add((trim_len / bsize)); + break; + case 32768: + srv_stats.page_compression_trim_sect32768.add((trim_len / bsize)); + break; + default: + break; + } + + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} #endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + ut_a(slot != NULL); + /* We allocate extra to avoid memory overwrite on compression */ + cbuf2 = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast<byte *>(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast<byte *>(cbuf2); + slot->page_buf = static_cast<byte *>(cbuf); + ut_a(slot->page_buf != NULL); +} + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + ut_a(slot != NULL); + slot->lzo_mem = static_cast<byte *>(ut_malloc(LZO1X_1_15_MEM_COMPRESS)); + ut_a(slot->lzo_mem != NULL); +} +#endif + +/***********************************************************************//** +Try to get number of bytes per sector from file system. 
+@return file block size */ +UNIV_INTERN +ulint +os_file_get_block_size( +/*===================*/ + os_file_t file, /*!< in: handle to a file */ + const char* name) /*!< in: file name */ +{ + ulint fblock_size = 512; + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) + struct statvfs fstat; + int err; + + err = fstatvfs(file, &fstat); + + if (err != 0) { + fprintf(stderr, "InnoDB: Warning: fstatvfs() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "fstatvfs()", FALSE, __FILE__, __LINE__); + } else { + fblock_size = fstat.f_bsize; + } +#endif /* UNIV_LINUX */ +#ifdef __WIN__ + { + DWORD SectorsPerCluster = 0; + DWORD BytesPerSector = 0; + DWORD NumberOfFreeClusters = 0; + DWORD TotalNumberOfClusters = 0; + + /* + if (GetFreeSpace((LPCTSTR)name, &SectorsPerCluster, &BytesPerSector, &NumberOfFreeClusters, &TotalNumberOfClusters)) { + fblock_size = BytesPerSector; + } else { + fprintf(stderr, "InnoDB: Warning: GetFreeSpace() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "GetFreeSpace()", FALSE, __FILE__, __LINE__); + } + */ + } +#endif /* __WIN__*/ + + if (fblock_size > UNIV_PAGE_SIZE/2 || fblock_size < 512) { + fprintf(stderr, "InnoDB: Note: File system for file %s has " + "file block size %lu not supported for page_size %lu\n", + name, fblock_size, UNIV_PAGE_SIZE); + + if (fblock_size < 512) { + fblock_size = 512; + } else { + fblock_size = UNIV_PAGE_SIZE/2; + } + + fprintf(stderr, "InnoDB: Note: Using file block size %ld for file %s\n", + fblock_size, name); + } + + return fblock_size; +} diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index f5f7e1299ce..97405261392 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip( return(insert_rec); } + /* Page compress failed. If this happened on a + leaf page, put the data size into the sample + buffer. 
*/ + if (page_is_leaf(page)) { + ulint occupied = page_get_data_size(page) + + page_dir_calc_reserved_space( + page_get_n_recs(page)); + index->stat_defrag_data_size_sample[ + index->stat_defrag_sample_next_slot] = + occupied; + index->stat_defrag_sample_next_slot = + (index->stat_defrag_sample_next_slot + + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE; + } + ut_ad(cursor->rec == (pos > 1 ? page_rec_get_nth( diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index 6989953cb0c..b0cb9407e72 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -76,7 +76,7 @@ UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; /* Whether or not to log compressed page images to avoid possible compression algorithm changes in zlib. */ -UNIV_INTERN my_bool page_zip_log_pages = true; +UNIV_INTERN my_bool page_zip_log_pages = false; /* Please refer to ../include/page0zip.ic for a description of the compressed page format. */ @@ -658,7 +658,7 @@ page_zip_dir_encode( #if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) # error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" #endif -#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1 +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_DEF - 1 # error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1" #endif if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc index cbed2b39eeb..5a7e1861d74 100644 --- a/storage/innobase/pars/pars0opt.cc +++ b/storage/innobase/pars/pars0opt.cc @@ -948,12 +948,14 @@ opt_find_all_cols( /* Fill in the field_no fields in sym_node */ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos( - dict_table_get_first_index(index->table), sym_node->col_no); + dict_table_get_first_index(index->table), sym_node->col_no, + NULL); if (!dict_index_is_clust(index)) { ut_a(plan); - col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no); + col_pos = 
dict_index_get_nth_col_pos(index, sym_node->col_no, + NULL); if (col_pos == ULINT_UNDEFINED) { diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index 655e5ba1324..c87e1f8e247 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1232,7 +1232,8 @@ pars_process_assign_list( col_sym = assign_node->col; upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos( - clust_index, col_sym->col_no), + clust_index, col_sym->col_no, + NULL), clust_index, NULL); upd_field->exp = assign_node->val; diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index 0d7b7c16785..3ff71d5c59e 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -33,6 +33,9 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" #include "fts0fts.h" +#ifdef WITH_WSREP +#include <ha_prototypes.h> +#endif /* WITH_WSREP */ /* PHYSICAL RECORD (OLD STYLE) =========================== @@ -1961,3 +1964,134 @@ rec_get_trx_id( } # endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef WITH_WSREP +int +wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index in foreign table */ + dict_index_t* index_ref, /* in: index in referenced table */ + ibool new_protocol) /* in: protocol > 1 */ +{ + const byte* data; + ulint len; + ulint key_len = 0; + ulint i; + uint key_parts; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + + ut_ad(index_for); + ut_ad(index_ref); + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index_for, offsets_, + ULINT_UNDEFINED, &heap); + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + ut_ad(rec); + + key_parts = dict_index_get_n_unique_in_tree(index_for); + for (i = 0; + i < key_parts && + (index_for->type & DICT_CLUSTERED || i < key_parts - 1); + 
i++) { + dict_field_t* field_f = + dict_index_get_nth_field(index_for, i); + const dict_col_t* col_f = dict_field_get_col(field_f); + dict_field_t* field_r = + dict_index_get_nth_field(index_ref, i); + const dict_col_t* col_r = dict_field_get_col(field_r); + + data = rec_get_nth_field(rec, offsets, i, &len); + if (key_len + ((len != UNIV_SQL_NULL) ? len + 1 : 1) > + *buf_len) { + fprintf (stderr, + "WSREP: FK key len exceeded %lu %lu %lu\n", + key_len, len, *buf_len); + goto err_out; + } + + if (len == UNIV_SQL_NULL) { + ut_a(!(col_f->prtype & DATA_NOT_NULL)); + *buf++ = 1; + key_len++; + } else if (!new_protocol) { + if (!(col_r->prtype & DATA_NOT_NULL)) { + *buf++ = 0; + key_len++; + } + memcpy(buf, data, len); + *buf_len = wsrep_innobase_mysql_sort( + (int)(col_f->prtype & DATA_MYSQL_TYPE_MASK), + (uint)dtype_get_charset_coll(col_f->prtype), + buf, len, *buf_len); + } else { /* new protocol */ + if (!(col_r->prtype & DATA_NOT_NULL)) { + *buf++ = 0; + key_len++; + } + switch (col_f->mtype) { + case DATA_INT: { + byte* ptr = buf+len; + for (;;) { + ptr--; + *ptr = *data; + if (ptr == buf) { + break; + } + data++; + } + + if (!(col_f->prtype & DATA_UNSIGNED)) { + buf[len-1] = (byte) (buf[len-1] ^ 128); + } + + break; + } + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* Copy the actual data */ + ut_memcpy(buf, data, len); + len = wsrep_innobase_mysql_sort( + (int) + (col_f->prtype & DATA_MYSQL_TYPE_MASK), + (uint) + dtype_get_charset_coll(col_f->prtype), + buf, len, *buf_len); + break; + case DATA_BLOB: + case DATA_BINARY: + memcpy(buf, data, len); + break; + default: + break; + } + + key_len += len; + buf += len; + } + } + + rec_validate(rec, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + *buf_len = key_len; + return DB_SUCCESS; + + err_out: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return DB_ERROR; +} +#endif // WITH_WSREP diff --git a/storage/innobase/row/row0ftsort.cc 
b/storage/innobase/row/row0ftsort.cc index b11a9f0d85a..eb032246416 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -848,7 +848,7 @@ exit: error = row_merge_sort(psort_info->psort_common->trx, psort_info->psort_common->dup, - merge_file[i], block[i], &tmpfd[i]); + merge_file[i], block[i], &tmpfd[i], false, 0.0/* pct_progress */, 0.0/* pct_cost */); if (error != DB_SUCCESS) { close(tmpfd[i]); goto func_exit; @@ -1409,8 +1409,9 @@ row_fts_merge_insert( fd[i] = psort_info[i].merge_file[id]->fd; foffs[i] = 0; - buf[i] = static_cast<unsigned char (*)[16384]>( + buf[i] = static_cast<unsigned char (*)[65536]>( mem_heap_alloc(heap, sizeof *buf[i])); + count_diag += (int) psort_info[i].merge_file[id]->n_rec; } diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index ac9ca7b44eb..44c9ac32d16 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -920,6 +920,14 @@ row_ins_invalidate_query_cache( innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); mem_free(buf); } +#ifdef WITH_WSREP +dberr_t wsrep_append_foreign_key(trx_t *trx, + dict_foreign_t* foreign, + const rec_t* clust_rec, + dict_index_t* clust_index, + ibool referenced, + ibool shared); +#endif /* WITH_WSREP */ /*********************************************************************//** Perform referential actions or checks when a parent row is deleted or updated @@ -1271,7 +1279,19 @@ row_ins_foreign_check_on_constraint( cascade->state = UPD_NODE_UPDATE_CLUSTERED; - err = row_update_cascade_for_mysql(thr, cascade, +#ifdef WITH_WSREP + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + clust_rec, + clust_index, + FALSE, FALSE); + if (err != DB_SUCCESS) { + fprintf(stderr, + "WSREP: foreign key append failed: %d\n", err); + } else +#endif /* WITH_WSREP */ + err = row_update_cascade_for_mysql(thr, cascade, foreign->foreign_table); if (foreign->foreign_table->n_foreign_key_checks_running == 0) { @@ 
-1603,7 +1623,14 @@ run_again: if (check_ref) { err = DB_SUCCESS; - +#ifdef WITH_WSREP + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + rec, + check_index, + check_ref, TRUE); +#endif /* WITH_WSREP */ goto end_scan; } else if (foreign->type != 0) { /* There is an ON UPDATE or ON DELETE diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index fd0c54d889b..caed087b439 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -40,6 +40,10 @@ Created 2011-05-26 Marko Makela #include<map> +ulint onlineddl_rowlog_rows; +ulint onlineddl_rowlog_pct_used; +ulint onlineddl_pct_progress; + /** Table row modification operations during online table rebuild. Delete-marked records are not copied to the rebuilt table. */ enum row_tab_op { @@ -470,6 +474,10 @@ write_failed: log->tail.total += size; UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); mutex_exit(&log->mutex); + + os_atomic_increment_ulint(&onlineddl_rowlog_rows, 1); + /* 10000 means 100.00%, 4525 means 45.25% */ + onlineddl_rowlog_pct_used = (log->tail.total * 10000) / srv_online_max_size; } #ifdef UNIV_DEBUG @@ -2546,7 +2554,7 @@ all_done: success = os_file_read_no_error_handling( OS_FILE_FROM_FD(index->online_log->fd), index->online_log->head.block, ofs, - srv_sort_buf_size); + srv_sort_buf_size, FALSE); if (!success) { fprintf(stderr, "InnoDB: unable to read temporary file" @@ -3377,7 +3385,7 @@ all_done: success = os_file_read_no_error_handling( OS_FILE_FROM_FD(index->online_log->fd), index->online_log->head.block, ofs, - srv_sort_buf_size); + srv_sort_buf_size, FALSE); if (!success) { fprintf(stderr, "InnoDB: unable to read temporary file" diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index e9d8bd50d6a..c79bd6c62ec 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -23,6 +23,8 @@ New index creation routines using a merge sort Created 12/4/2005 Jan Lindstrom 
Completed by Sunny Bains and Marko Makela *******************************************************/ +#include <my_config.h> +#include <log.h> #include "row0merge.h" #include "row0ext.h" @@ -38,6 +40,13 @@ Completed by Sunny Bains and Marko Makela #include "row0import.h" #include "handler0alter.h" #include "ha_prototypes.h" +#include "math.h" /* log() */ + +float my_log2f(float n) +{ + /* log(n) / log(2) is log2. */ + return (float)(log((double)n) / log((double)2)); +} /* Ignore posix_fadvise() on those platforms where it does not exist */ #if defined __WIN__ @@ -777,7 +786,8 @@ row_merge_read( #endif /* UNIV_DEBUG */ success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf, - ofs, srv_sort_buf_size); + ofs, srv_sort_buf_size, FALSE); + #ifdef POSIX_FADV_DONTNEED /* Each block is read exactly once. Free up the file cache. */ posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); @@ -1188,7 +1198,8 @@ row_merge_read_clustered_index( AUTO_INCREMENT column, or ULINT_UNDEFINED if none is added */ ib_sequence_t& sequence,/*!< in/out: autoinc sequence */ - row_merge_block_t* block) /*!< in/out: file buffer */ + row_merge_block_t* block, /*!< in/out: file buffer */ + float pct_cost) /*!< in: percent of task weight out of total alter job */ { dict_index_t* clust_index; /* Clustered index */ mem_heap_t* row_heap; /* Heap memory to create @@ -1208,11 +1219,21 @@ row_merge_read_clustered_index( os_event_t fts_parallel_sort_event = NULL; ibool fts_pll_sort = FALSE; ib_int64_t sig_count = 0; + + float curr_progress; + ib_int64_t read_rows = 0; + ib_int64_t table_total_rows; DBUG_ENTER("row_merge_read_clustered_index"); ut_ad((old_table == new_table) == !col_map); ut_ad(!add_cols || col_map); + table_total_rows = dict_table_get_n_rows(old_table); + if(table_total_rows == 0) { + /* We don't know total row count */ + table_total_rows = 1; + } + trx->op_info = "reading clustered index"; #ifdef FTS_INTERNAL_DIAG_PRINT @@ -1710,6 +1731,17 @@ write_buffers: } 
mem_heap_empty(row_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + read_rows++; + if(read_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (read_rows >= table_total_rows) ? + pct_cost : + ((pct_cost * read_rows) / table_total_rows); + /* presenting 10.12% as 1012 integer */ + onlineddl_pct_progress = curr_progress * 100; + } } func_exit: @@ -2099,6 +2131,7 @@ row_merge( /* Copy the last blocks, if there are any. */ while (foffs0 < ihalf) { + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { return(DB_INTERRUPTED); } @@ -2115,6 +2148,7 @@ row_merge( ut_ad(foffs0 == ihalf); while (foffs1 < file->offset) { + if (trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } @@ -2170,17 +2204,37 @@ row_merge_sort( merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd) /*!< in/out: temporary file handle */ + int* tmpfd, /*!< in/out: temporary file handle + */ + const bool update_progress, + /*!< in: update progress + status variable or not */ + const float pct_progress, + /*!< in: total progress percent + until now */ + const float pct_cost) /*!< in: current progress percent */ { const ulint half = file->offset / 2; ulint num_runs; + ulint cur_run = 0; ulint* run_offset; dberr_t error = DB_SUCCESS; + ulint merge_count = 0; + ulint total_merge_sort_count; + float curr_progress = 0; + DBUG_ENTER("row_merge_sort"); /* Record the number of merge runs we need to perform */ num_runs = file->offset; + /* Find the number N which 2^N is greater or equal than num_runs */ + /* N is merge sort running count */ + total_merge_sort_count = ceil(my_log2f(num_runs)); + if(total_merge_sort_count <= 0) { + total_merge_sort_count=1; + } + /* If num_runs are less than 1, nothing to merge */ if (num_runs <= 1) { DBUG_RETURN(error); @@ -2197,11 +2251,30 @@ row_merge_sort( of file marker). Thus, it must be at least one block. 
*/ ut_ad(file->offset > 0); + thd_progress_init(trx->mysql_thd, num_runs); + sql_print_information("InnoDB: Online DDL : merge-sorting has estimated %lu runs", num_runs); + /* Merge the runs until we have one big run */ do { + cur_run++; + + /* Report progress of merge sort to MySQL for + show processlist progress field */ + thd_progress_report(trx->mysql_thd, cur_run, num_runs); + sql_print_information("InnoDB: Online DDL : merge-sorting current run %lu estimated %lu runs", cur_run, num_runs); + error = row_merge(trx, dup, file, block, tmpfd, &num_runs, run_offset); + if(update_progress) { + merge_count++; + curr_progress = (merge_count >= total_merge_sort_count) ? + pct_cost : + ((pct_cost * merge_count) / total_merge_sort_count); + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (pct_progress + curr_progress) * 100; + } + if (error != DB_SUCCESS) { break; } @@ -2211,6 +2284,8 @@ row_merge_sort( mem_free(run_offset); + thd_progress_end(trx->mysql_thd); + DBUG_RETURN(error); } @@ -2269,7 +2344,10 @@ row_merge_insert_index_tuples( dict_index_t* index, /*!< in: index */ const dict_table_t* old_table,/*!< in: old table */ int fd, /*!< in: file descriptor */ - row_merge_block_t* block) /*!< in/out: file buffer */ + row_merge_block_t* block, /*!< in/out: file buffer */ + const ib_int64_t table_total_rows, /*!< in: total rows of old table */ + const float pct_progress, /*!< in: total progress percent until now */ + const float pct_cost) /*!< in: current progress percent */ { const byte* b; mem_heap_t* heap; @@ -2279,6 +2357,8 @@ row_merge_insert_index_tuples( ulint foffs = 0; ulint* offsets; mrec_buf_t* buf; + ib_int64_t inserted_rows = 0; + float curr_progress; DBUG_ENTER("row_merge_insert_index_tuples"); ut_ad(!srv_read_only_mode); @@ -2455,6 +2535,19 @@ row_merge_insert_index_tuples( mem_heap_empty(tuple_heap); mem_heap_empty(ins_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + inserted_rows++; + if(inserted_rows % 1000 
== 0) { + /* Update progress for each 1000 rows */ + curr_progress = (inserted_rows >= table_total_rows || + table_total_rows <= 0) ? + pct_cost : + ((pct_cost * inserted_rows) / table_total_rows); + + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (pct_progress + curr_progress) * 100; + } } } @@ -3450,6 +3543,13 @@ row_merge_build_indexes( fts_psort_t* merge_info = NULL; ib_int64_t sig_count = 0; bool fts_psort_initiated = false; + + float total_static_cost = 0; + float total_dynamic_cost = 0; + uint total_index_blocks = 0; + float pct_cost=0; + float pct_progress=0; + DBUG_ENTER("row_merge_build_indexes"); ut_ad(!srv_read_only_mode); @@ -3480,6 +3580,9 @@ row_merge_build_indexes( merge_files[i].fd = -1; } + total_static_cost = COST_BUILD_INDEX_STATIC * n_indexes + COST_READ_CLUSTERED_INDEX; + total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC * n_indexes; + for (i = 0; i < n_indexes; i++) { if (row_merge_file_create(&merge_files[i]) < 0) { error = DB_OUT_OF_MEMORY; @@ -3524,6 +3627,12 @@ row_merge_build_indexes( duplicate keys. 
*/ innobase_rec_reset(table); + sql_print_information("InnoDB: Online DDL : Start"); + sql_print_information("InnoDB: Online DDL : Start reading clustered " + "index of the table and create temporary files"); + + pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost); + /* Read clustered index of the table and create files for secondary index entries for merge sort */ @@ -3531,10 +3640,18 @@ row_merge_build_indexes( trx, table, old_table, new_table, online, indexes, fts_sort_idx, psort_info, merge_files, key_numbers, n_indexes, add_cols, col_map, - add_autoinc, sequence, block); + add_autoinc, sequence, block, pct_cost); - if (error != DB_SUCCESS) { + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : End of reading " + "clustered index of the table and create temporary files"); + for (i = 0; i < n_indexes; i++) { + total_index_blocks += merge_files[i].offset; + } + + if (error != DB_SUCCESS) { goto func_exit; } @@ -3616,14 +3733,47 @@ wait_again: row_merge_dup_t dup = { sort_idx, table, col_map, 0}; + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost * merge_files[i].offset / + total_index_blocks)) / + (total_static_cost + total_dynamic_cost) + * PCT_COST_MERGESORT_INDEX * 100; + + sql_print_information("InnoDB: Online DDL : Start merge-sorting" + " index %s (%lu / %lu), estimated cost : %2.4f", + indexes[i]->name, (i+1), n_indexes, pct_cost); + error = row_merge_sort( trx, &dup, &merge_files[i], - block, &tmpfd); + block, &tmpfd, true, pct_progress, pct_cost); + + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : End of " + " merge-sorting index %s (%lu / %lu)", + indexes[i]->name, (i+1), n_indexes); if (error == DB_SUCCESS) { + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost * merge_files[i].offset / + total_index_blocks)) / + (total_static_cost + total_dynamic_cost) * + PCT_COST_INSERT_INDEX * 100; + + sql_print_information("InnoDB: Online DDL : Start " + 
"building index %s (%lu / %lu), estimated " + "cost : %2.4f", indexes[i]->name, (i+1), + n_indexes, pct_cost); + error = row_merge_insert_index_tuples( trx->id, sort_idx, old_table, - merge_files[i].fd, block); + merge_files[i].fd, block, + merge_files[i].n_rec, pct_progress, pct_cost); + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : " + "End of building index %s (%lu / %lu)", + indexes[i]->name, (i+1), n_indexes); } } @@ -3640,11 +3790,15 @@ wait_again: ut_ad(sort_idx->online_status == ONLINE_INDEX_COMPLETE); } else { + sql_print_information("InnoDB: Online DDL : Start applying row log"); DEBUG_SYNC_C("row_log_apply_before"); error = row_log_apply(trx, sort_idx, table); DEBUG_SYNC_C("row_log_apply_after"); + sql_print_information("InnoDB: Online DDL : End of applying row log"); } + sql_print_information("InnoDB: Online DDL : Completed"); + if (error != DB_SUCCESS) { trx->error_key_num = key_numbers[i]; goto func_exit; diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 1138aa410cc..86248b87c66 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -55,6 +55,7 @@ Created 9/17/2000 Heikki Tuuri #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "fil0fil.h" #include "ibuf0ibuf.h" #include "fts0fts.h" @@ -3931,6 +3932,8 @@ row_drop_table_for_mysql( if (!dict_table_is_temporary(table)) { dict_stats_recalc_pool_del(table); + dict_stats_defrag_pool_del(table, NULL); + btr_defragment_remove_table(table); /* Remove stats for this table and all of its indexes from the persistent storage if it exists and if there are stats for this @@ -5219,18 +5222,6 @@ end: trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; - } else { - if (old_is_tmp && !new_is_tmp) { - /* After ALTER TABLE the table statistics - needs to be rebuilt. 
Even if we close - table below there could be other - transactions using this table (e.g. - SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`), - thus we can't remove table from dictionary cache - here. Therefore, we initialize the - transient statistics here. */ - dict_stats_update_transient(table); - } } } diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 69c8498839e..b0e0c89b778 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -56,6 +56,7 @@ Created 12/19/1997 Heikki Tuuri #include "row0mysql.h" #include "read0read.h" #include "buf0lru.h" +#include "srv0srv.h" #include "ha_prototypes.h" #include "m_string.h" /* for my_sys.h */ #include "my_sys.h" /* DEBUG_SYNC_C */ @@ -2933,9 +2934,14 @@ row_sel_store_mysql_rec( : templ->rec_field_no; /* We should never deliver column prefixes to MySQL, except for evaluating innobase_index_cond(). */ + /* ...actually, we do want to do this in order to + support the prefix query optimization. + ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len == 0); + ...so we disable this assert. */ + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, rec, index, offsets, field_no, templ)) { @@ -3028,6 +3034,8 @@ row_sel_get_clust_rec_for_mysql( dberr_t err; trx_t* trx; + srv_stats.n_sec_rec_cluster_reads.inc(); + *out_rec = NULL; trx = thr_get_trx(thr); @@ -3683,6 +3691,7 @@ row_search_for_mysql( ulint* offsets = offsets_; ibool table_lock_waited = FALSE; byte* next_buf = 0; + ibool use_clustered_index = FALSE; rec_offs_init(offsets_); @@ -4706,10 +4715,68 @@ locks_ok: } /* Get the clustered index record if needed, if we did not do the - search using the clustered index. */ - - if (index != clust_index && prebuilt->need_to_access_clustered) { + search using the clustered index... 
*/ + + use_clustered_index = + (index != clust_index && prebuilt->need_to_access_clustered); + + if (use_clustered_index && srv_prefix_index_cluster_optimization + && prebuilt->n_template <= index->n_fields) { + /* ...but, perhaps avoid the clustered index lookup if + all of the following are true: + 1) all columns are in the secondary index + 2) all values for columns that are prefix-only + indexes are shorter than the prefix size + This optimization can avoid many IOs for certain schemas. + */ + ibool row_contains_all_values = TRUE; + int i; + for (i = 0; i < prebuilt->n_template; i++) { + /* Condition (1) from above: is the field in the + index (prefix or not)? */ + mysql_row_templ_t* templ = + prebuilt->mysql_template + i; + ulint secondary_index_field_no = + templ->rec_prefix_field_no; + if (secondary_index_field_no == ULINT_UNDEFINED) { + row_contains_all_values = FALSE; + break; + } + /* Condition (2) from above: if this is a + prefix, is this row's value size shorter + than the prefix? */ + if (templ->rec_field_is_prefix) { + ulint record_size = rec_offs_nth_size( + offsets, + secondary_index_field_no); + const dict_field_t *field = + dict_index_get_nth_field( + index, + secondary_index_field_no); + ut_a(field->prefix_len > 0); + if (record_size >= field->prefix_len) { + row_contains_all_values = FALSE; + break; + } + } + } + /* If (1) and (2) were true for all columns above, use + rec_prefix_field_no instead of rec_field_no, and skip + the clustered lookup below. 
*/ + if (row_contains_all_values) { + for (i = 0; i < prebuilt->n_template; i++) { + mysql_row_templ_t* templ = + prebuilt->mysql_template + i; + templ->rec_field_no = + templ->rec_prefix_field_no; + ut_a(templ->rec_field_no != ULINT_UNDEFINED); + } + use_clustered_index = FALSE; + srv_stats.n_sec_rec_cluster_reads_avoided.inc(); + } + } + if (use_clustered_index) { requires_clust_rec: ut_ad(index != clust_index); /* We use a 'goto' to the preceding label if a consistent diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index a8c2eaa6683..0ea4865d15f 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -53,6 +53,9 @@ Created 12/27/1996 Heikki Tuuri #include "buf0lru.h" #include <algorithm> +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> + /* What kind of latch and lock can we assume when the control comes to ------------------------------------------------------------------- an update node? @@ -162,6 +165,52 @@ row_upd_index_is_referenced( return(is_referenced); } +#ifdef WITH_WSREP +static +ibool +wsrep_row_upd_index_is_foreign( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t* table = index->table; + dict_foreign_t* foreign; + ibool froze_data_dict = FALSE; + ibool is_referenced = FALSE; + + if (table->foreign_set.empty()) { + + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + for (dict_foreign_set::iterator it= table->foreign_set.begin(); + it != table->foreign_set.end(); + ++ it) + { + foreign= *it; + + if (foreign->foreign_index == index) { + + is_referenced = TRUE; + goto func_exit; + } + + } + +func_exit: + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(is_referenced); +} +#endif /* WITH_WSREP */ + /*********************************************************************//** Checks if 
possible foreign key constraints hold after a delete of the record under pcur. @@ -281,7 +330,125 @@ run_again: } err = DB_SUCCESS; +func_exit: + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + return(err); +} +#ifdef WITH_WSREP +static +dberr_t +wsrep_row_upd_check_foreign_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + const rec_t* rec; + ulint n_ext; + dberr_t err; + ibool got_s_lock = FALSE; + ibool opened = FALSE; + + if (table->foreign_set.empty()) { + + return(DB_SUCCESS); + } + trx = thr_get_trx(thr); + + /* TODO: make native slave thread bail out here */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, + &n_ext, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + for (dict_foreign_set::iterator it= table->foreign_set.begin(); + it != table->foreign_set.end(); + ++ it) + { + foreign= *it; + + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
*/ + + if (foreign->foreign_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + if (foreign->referenced_table == NULL) { + foreign->referenced_table = + dict_table_open_on_name( + foreign->referenced_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); + opened = TRUE; + } + + if (foreign->referenced_table) { + os_inc_counter(dict_sys->mutex, + foreign->referenced_table + ->n_foreign_key_checks_running); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (foreign->referenced_table) { + os_dec_counter(dict_sys->mutex, + foreign->referenced_table + ->n_foreign_key_checks_running); + + if (opened == TRUE) { + dict_table_close(foreign->referenced_table, TRUE, FALSE); + opened = FALSE; + } + } + + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + } + + err = DB_SUCCESS; func_exit: if (got_s_lock) { row_mysql_unfreeze_data_dictionary(trx); @@ -293,6 +460,7 @@ func_exit: return(err); } +#endif /* WITH_WSREP */ /*********************************************************************//** Creates an update node for a query graph. 
@@ -1667,6 +1835,9 @@ row_upd_sec_index_entry( index = node->index; referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + ibool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif /* WITH_WSREP */ heap = mem_heap_create(1024); @@ -1794,6 +1965,9 @@ row_upd_sec_index_entry( row_ins_sec_index_entry() below */ if (!rec_get_deleted_flag( rec, dict_table_is_comp(index->table))) { +#ifdef WITH_WSREP + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ err = btr_cur_del_mark_set_sec_rec( 0, btr_cur, TRUE, thr, &mtr); @@ -1811,6 +1985,37 @@ row_upd_sec_index_entry( node, &pcur, index->table, index, offsets, thr, &mtr); } +#ifdef WITH_WSREP + if (err == DB_SUCCESS && !referenced && + !(parent && que_node_get_type(parent) == + QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + foreign + ) { + ulint* offsets = + rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, + &heap); + err = wsrep_row_upd_check_foreign_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: sec index FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: referenced FK check fail: %d", + (int)err); + break; + } + } +#endif /* WITH_WSREP */ } break; } @@ -1965,6 +2170,9 @@ row_upd_clust_rec_by_insert( que_thr_t* thr, /*!< in: query thread */ ibool referenced,/*!< in: TRUE if index may be referenced in a foreign key constraint */ +#ifdef WITH_WSREP + ibool foreign, /*!< in: TRUE if index is foreign key index */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in/out: mtr; gets committed here */ { mem_heap_t* heap; @@ -1978,6 +2186,9 @@ row_upd_clust_rec_by_insert( rec_t* rec; ulint* offsets = NULL; +#ifdef WITH_WSREP + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ 
-2060,6 +2271,34 @@ err_exit: goto err_exit; } } +#ifdef WITH_WSREP + if (!referenced && + !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + foreign + ) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, table, index, offsets, thr, mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: insert FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: referenced FK check fail: %d", + (int)err); + break; + } + if (err != DB_SUCCESS) { + goto err_exit; + } + } +#endif /* WITH_WSREP */ } mtr_commit(mtr); @@ -2252,11 +2491,18 @@ row_upd_del_mark_clust_rec( ibool referenced, /*!< in: TRUE if index may be referenced in a foreign key constraint */ +#ifdef WITH_WSREP + ibool foreign,/*!< in: TRUE if index is foreign key index */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in: mtr; gets committed here */ { btr_pcur_t* pcur; btr_cur_t* btr_cur; dberr_t err; +#ifdef WITH_WSREP + rec_t* rec; + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2273,8 +2519,16 @@ row_upd_del_mark_clust_rec( /* Mark the clustered index record deleted; we do not have to check locks, because we assume that we have an x-lock on the record */ +#ifdef WITH_WSREP + rec = btr_cur_get_rec(btr_cur); +#endif /* WITH_WSREP */ + err = btr_cur_del_mark_set_clust_rec( +#ifdef WITH_WSREP + btr_cur_get_block(btr_cur), rec, +#else btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), +#endif /* WITH_WSREP */ index, offsets, thr, mtr); if (err == DB_SUCCESS && referenced) { /* NOTE that the following call loses the position of pcur ! 
*/ @@ -2282,6 +2536,32 @@ row_upd_del_mark_clust_rec( err = row_upd_check_references_constraints( node, pcur, index->table, index, offsets, thr, mtr); } +#ifdef WITH_WSREP + if (err == DB_SUCCESS && !referenced && + !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + thr_get_trx(thr) && + foreign + ) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, index->table, index, offsets, thr, mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: clust rec FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: clust rec referenced FK check fail: %d", + (int)err); + break; + } + } +#endif /* WITH_WSREP */ mtr_commit(mtr); @@ -2314,6 +2594,10 @@ row_upd_clust_step( index = dict_table_get_first_index(node->table); referenced = row_upd_index_is_referenced(index, thr_get_trx(thr)); +#ifdef WITH_WSREP + ibool foreign = wsrep_row_upd_index_is_foreign( + index, thr_get_trx(thr)); +#endif /* WITH_WSREP */ pcur = node->pcur; @@ -2408,7 +2692,11 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec( +#ifdef WITH_WSREP + node, index, offsets, thr, referenced, foreign, &mtr); +#else node, index, offsets, thr, referenced, &mtr); +#endif /* WITH_WSREP */ if (err == DB_SUCCESS) { node->state = UPD_NODE_UPDATE_ALL_SEC; @@ -2453,7 +2741,11 @@ row_upd_clust_step( externally! 
*/ err = row_upd_clust_rec_by_insert( +#ifdef WITH_WSREP + node, index, thr, referenced, foreign, &mtr); +#else node, index, thr, referenced, &mtr); +#endif /* WITH_WSREP */ if (err != DB_SUCCESS) { diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc index dc3c0b1dd88..8942eb20080 100644 --- a/storage/innobase/srv/srv0conc.cc +++ b/storage/innobase/srv/srv0conc.cc @@ -41,7 +41,8 @@ Created 2011/04/18 Sunny Bains #include "sync0sync.h" #include "trx0trx.h" -#include "mysql/plugin.h" +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> /** Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket. */ @@ -86,6 +87,9 @@ struct srv_conc_slot_t{ reserved may still be TRUE at that point */ srv_conc_node_t srv_conc_queue; /*!< queue node */ +#ifdef WITH_WSREP + void *thd; /*!< to see priority */ +#endif }; /** Queue of threads waiting to get in */ @@ -145,6 +149,9 @@ srv_conc_init(void) conc_slot->event = os_event_create(); ut_a(conc_slot->event); +#ifdef WITH_WSREP + conc_slot->thd = NULL; +#endif /* WITH_WSREP */ } #endif /* !HAVE_ATOMIC_BUILTINS */ } @@ -202,6 +209,16 @@ srv_conc_enter_innodb_with_atomics( for (;;) { ulint sleep_in_us; +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_trx_is_aborting(trx->mysql_thd)) { + if (wsrep_debug) + fprintf(stderr, + "srv_conc_enter due to MUST_ABORT"); + srv_conc_force_enter_innodb(trx); + return; + } +#endif /* WITH_WSREP */ if (srv_conc.n_active < (lint) srv_thread_concurrency) { ulint n_active; @@ -319,6 +336,9 @@ srv_conc_exit_innodb_without_atomics( slot = NULL; if (srv_conc.n_active < (lint) srv_thread_concurrency) { +#ifdef WITH_WSREP + srv_conc_slot_t* wsrep_slot; +#endif /* Look for a slot where a thread is waiting and no other thread has yet released the thread */ @@ -329,6 +349,19 @@ srv_conc_exit_innodb_without_atomics( /* No op */ } +#ifdef WITH_WSREP + /* look for aborting trx, they must be released asap */ + 
wsrep_slot= slot; + while (wsrep_slot && (wsrep_slot->wait_ended == TRUE || + !wsrep_trx_is_aborting(wsrep_slot->thd))) { + wsrep_slot = UT_LIST_GET_NEXT(srv_conc_queue, wsrep_slot); + } + if (wsrep_slot) { + slot = wsrep_slot; + if (wsrep_debug) + fprintf(stderr, "WSREP: releasing aborting thd\n"); + } +#endif if (slot != NULL) { slot->wait_ended = TRUE; @@ -384,6 +417,13 @@ retry: return; } +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_brute_force(trx->mysql_thd)) { + srv_conc_force_enter_innodb(trx); + return; + } +#endif /* If the transaction is not holding resources, let it sleep for srv_thread_sleep_delay microseconds, and try again then */ @@ -450,6 +490,9 @@ retry: /* Add to the queue */ slot->reserved = TRUE; slot->wait_ended = FALSE; +#ifdef WITH_WSREP + slot->thd = trx->mysql_thd; +#endif UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); @@ -457,6 +500,18 @@ retry: srv_conc.n_waiting++; +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_trx_is_aborting(trx->mysql_thd)) { + os_fast_mutex_unlock(&srv_conc_mutex); + if (wsrep_debug) + fprintf(stderr, "srv_conc_enter due to MUST_ABORT"); + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter; + return; + } + trx->wsrep_event = slot->event; +#endif /* WITH_WSREP */ os_fast_mutex_unlock(&srv_conc_mutex); /* Go to wait for the event; when a thread leaves InnoDB it will @@ -472,6 +527,9 @@ retry: os_event_wait(slot->event); thd_wait_end(trx->mysql_thd); +#ifdef WITH_WSREP + trx->wsrep_event = NULL; +#endif /* WITH_WSREP */ trx->op_info = ""; @@ -483,6 +541,9 @@ retry: incremented the thread counter on behalf of this thread */ slot->reserved = FALSE; +#ifdef WITH_WSREP + slot->thd = NULL; +#endif UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); @@ -593,5 +654,32 @@ srv_conc_get_active_threads(void) /*==============================*/ { return(srv_conc.n_active); - } +} + +#ifdef WITH_WSREP +UNIV_INTERN +void 
+wsrep_srv_conc_cancel_wait( +/*==================*/ + trx_t* trx) /*!< in: transaction object associated with the + thread */ +{ +#ifdef HAVE_ATOMIC_BUILTINS + /* aborting transactions will enter innodb by force in + srv_conc_enter_innodb_with_atomics(). No need to cancel here, + thr will wake up after os_sleep and let to enter innodb + */ + if (wsrep_debug) + fprintf(stderr, "WSREP: conc slot cancel, no atomics\n"); +#else + os_fast_mutex_lock(&srv_conc_mutex); + if (trx->wsrep_event) { + if (wsrep_debug) + fprintf(stderr, "WSREP: conc slot cancel\n"); + os_event_set(trx->wsrep_event); + } + os_fast_mutex_unlock(&srv_conc_mutex); +#endif +} +#endif /* WITH_WSREP */ diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index f29621bc90a..24cf403c0af 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -2,6 +2,7 @@ Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -290,12 +291,36 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast<monitor_type_t>( MONITOR_EXISTING | 
MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ}, + {"buffer_index_sec_rec_cluster_reads", "buffer", + "Number of secondary record reads triggered cluster read", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS}, + + {"buffer_index_sec_rec_cluster_reads_avoided", "buffer", + "Number of secondary record reads avoided triggering cluster read", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED}, + {"buffer_data_reads", "buffer", "Amount of data read in bytes (innodb_data_reads)", static_cast<monitor_type_t>( @@ -457,20 +482,36 @@ static monitor_info_t innodb_counter_info[] = MONITOR_LRU_BATCH_SCANNED_PER_CALL}, /* Cumulative counter for LRU batch pages flushed */ - {"buffer_LRU_batch_total_pages", "buffer", + {"buffer_LRU_batch_flush_total_pages", "buffer", "Total pages flushed as part of LRU batches", - MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_TOTAL_PAGE}, + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE}, - {"buffer_LRU_batches", "buffer", + {"buffer_LRU_batches_flush", "buffer", "Number of LRU batches", - MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT}, + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT}, - {"buffer_LRU_batch_pages", "buffer", + {"buffer_LRU_batch_flush_pages", "buffer", "Pages queued as an LRU batch", - MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_PAGES}, + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_PAGES}, + + /* Cumulative counter for LRU batch pages flushed */ + {"buffer_LRU_batch_evict_total_pages", "buffer", + "Total pages evicted as part of LRU batches", + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE}, + + 
{"buffer_LRU_batches_evict", "buffer", + "Number of LRU batches", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT}, + + {"buffer_LRU_batch_evict_pages", "buffer", + "Pages queued as an LRU batch", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_PAGES}, /* Cumulative counter for single page LRU scans */ {"buffer_LRU_single_flush_scanned", "buffer", @@ -879,6 +920,71 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect1024", "compression", + "Number of sect-1024 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024}, + + {"compress_trim_sect2048", "compression", + "Number of sect-2048 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_trim_sect8192", "compression", + "Number of sect-8K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192}, + + {"compress_trim_sect16384", "compression", + "Number of sect-16K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384}, + + {"compress_trim_sect32768", "compression", + "Number of sect-32K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768}, + + 
{"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + {"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1572,12 +1678,32 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of index pages written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); value = stat.n_pages_read; break; + /* Number of times secondary index lookup triggered cluster lookup */ + case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS: + value = srv_stats.n_sec_rec_cluster_reads; + break; + /* Number of times prefix optimization avoided triggering cluster + lookup */ + case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED: + value = srv_stats.n_sec_rec_cluster_reads_avoided; + break; + /* 
innodb_data_reads, the total number of data reads */ case MONITOR_OVLD_BYTE_READ: value = srv_stats.data_read; @@ -1833,6 +1959,46 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024: + value = srv_stats.page_compression_trim_sect1024; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048: + value = srv_stats.page_compression_trim_sect2048; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192: + value = srv_stats.page_compression_trim_sect8192; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384: + value = srv_stats.page_compression_trim_sect16384; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768: + value = srv_stats.page_compression_trim_sect32768; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; + default: ut_error; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 14b2bdbe03c..bcbce3cd53c 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -63,17 +63,24 @@ Created 10/8/1995 Heikki Tuuri #include "dict0stats_bg.h" /* dict_stats_event */ #include "srv0start.h" #include "row0mysql.h" +#include "row0log.h" #include "ha_prototypes.h" 
#include "trx0i_s.h" #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ #include "srv0mon.h" #include "ut0crc32.h" +#include "btr0defragment.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" +#ifdef WITH_WSREP +extern int wsrep_debug; +extern int wsrep_trx_is_aborting(void *thd_ptr); +#endif /* The following is the maximum allowed duration of a lock wait. */ -UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; +UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT; /* How much data manipulation language (DML) statements need to be delayed, in microseconds, in order to reduce the lagging of the purge thread. */ @@ -146,6 +153,20 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use fallocate(PUNCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = FALSE; +/* If this flag is TRUE, then we will use posix fallocate for file extension */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag is TRUE, then we use this algorithm for page compressing the pages */ +UNIV_INTERN ulong innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; +/* Number of threads used for multi-threaded flush */ +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; +/* If this flag is TRUE, then we will use multi threaded flush. */ +UNIV_INTERN my_bool srv_use_mtflush = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -208,6 +229,10 @@ srv_printf_innodb_monitor() will request mutex acquisition with mutex_enter(), which will wait until it gets the mutex. 
*/ #define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT) +#ifdef WITH_INNODB_DISALLOW_WRITES +UNIV_INTERN os_event_t srv_allow_writes_event; +#endif /* WITH_INNODB_DISALLOW_WRITES */ + /** The sort order table of the MySQL latin1_swedish_ci character set collation */ UNIV_INTERN const byte* srv_latin1_ordering; @@ -232,6 +257,8 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0; UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; +UNIV_INTERN ulong srv_idle_flush_pct = 100; + /* This parameter is deprecated. Use srv_n_io_[read|write]_threads instead. */ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; @@ -329,6 +356,10 @@ UNIV_INTERN ulint srv_fast_shutdown = 0; /* Generate a innodb_status.<pid> file */ UNIV_INTERN ibool srv_innodb_status = FALSE; +/* Optimize prefix index queries to skip cluster index lookup when possible */ +/* Enables or disables this prefix optimization. Disabled by default. */ +UNIV_INTERN my_bool srv_prefix_index_cluster_optimization = 0; + /* When estimating number of different key values in an index, sample this many index pages, there are 2 ways to calculate statistics: * persistent stats that are calculated by ANALYZE TABLE and saved @@ -356,11 +387,6 @@ batch flushing i.e.: LRU flushing and flush_list flushing. The rest of the pages are used for single page flushing. 
*/ UNIV_INTERN ulong srv_doublewrite_batch_size = 120; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = TRUE; -#endif - UNIV_INTERN ulong srv_replication_delay = 0; /*-------------------------------------------*/ @@ -393,6 +419,26 @@ static ulint srv_n_system_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + +/* Defragmentation */ +UNIV_INTERN my_bool srv_defragment = FALSE; +UNIV_INTERN uint srv_defragment_n_pages = 7; +UNIV_INTERN uint srv_defragment_stats_accuracy = 0; +UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20; +UNIV_INTERN double srv_defragment_fill_factor = 0.9; +UNIV_INTERN uint srv_defragment_frequency = + SRV_DEFRAGMENT_FREQUENCY_DEFAULT; +UNIV_INTERN ulonglong srv_defragment_interval = 0; + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown. 
*/ UNIV_INTERN ibool srv_print_verbose_log = TRUE; @@ -401,6 +447,9 @@ UNIV_INTERN my_bool srv_print_innodb_lock_monitor = FALSE; UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE; UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE; +/** If this flag is set tables without primary key are not allowed */ +UNIV_INTERN my_bool srv_force_primary_key = FALSE; + /* Array of English strings describing the current state of an i/o handler thread */ @@ -1000,6 +1049,14 @@ srv_init(void) dict_ind_init(); srv_conc_init(); +#ifdef WITH_INNODB_DISALLOW_WRITES + /* Writes have to be enabled on init or else we hang. Thus, we + always set the event here regardless of innobase_disallow_writes. + That flag will always be 0 at this point because it isn't settable + via my.cnf or command line arg. */ + srv_allow_writes_event = os_event_create(); + os_event_set(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ /* Initialize some INFORMATION SCHEMA internal structures */ trx_i_s_cache_init(trx_i_s_cache); @@ -1518,6 +1575,24 @@ srv_export_innodb_status(void) srv_truncated_status_writes; export_vars.innodb_available_undo_logs = srv_available_undo_logs; + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + + export_vars.innodb_defragment_compression_failures = 
+ btr_defragment_compression_failures; + export_vars.innodb_defragment_failures = btr_defragment_failures; + export_vars.innodb_defragment_count = btr_defragment_count; + + export_vars.innodb_onlineddl_rowlog_rows = onlineddl_rowlog_rows; + export_vars.innodb_onlineddl_rowlog_pct_used = onlineddl_rowlog_pct_used; + export_vars.innodb_onlineddl_pct_progress = onlineddl_pct_progress; #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); @@ -1547,6 +1622,11 @@ srv_export_innodb_status(void) } #endif /* UNIV_DEBUG */ + export_vars.innodb_sec_rec_cluster_reads = + srv_stats.n_sec_rec_cluster_reads; + export_vars.innodb_sec_rec_cluster_reads_avoided = + srv_stats.n_sec_rec_cluster_reads_avoided; + mutex_exit(&srv_innodb_monitor_mutex); } @@ -1803,7 +1883,20 @@ loop: if (sync_array_print_long_waits(&waiter, &sema) && sema == old_sema && os_thread_eq(waiter, old_waiter)) { +#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES) + if (srv_allow_writes_event->is_set) { +#endif /* WITH_WSREP */ fatal_cnt++; +#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES) + } else { + fprintf(stderr, + "WSREP: avoiding InnoDB self crash due to long " + "semaphore wait of > %lu seconds\n" + "Server is processing SST donor operation, " + "fatal_cnt now: %lu", + (ulong) srv_fatal_semaphore_wait_threshold, fatal_cnt); + } +#endif /* WITH_WSREP */ if (fatal_cnt > 10) { fprintf(stderr, diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index d1be5be9238..2692636dcb5 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -42,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri #include "pars0pars.h" #include "row0ftsort.h" #include "ut0mem.h" +#include "ut0timer.h" #include "mem0mem.h" #include "data0data.h" #include "data0type.h" @@ -66,12 +68,15 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "btr0defragment.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" # include "sync0sync.h" # include "buf0flu.h" # include "buf0rea.h" +# include "buf0mtflu.h" # include "dict0boot.h" # include "dict0load.h" # include "dict0stats_bg.h" @@ -129,7 +134,11 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER]; +/* Thread contex data for multi-threaded flush */ +void *mtflush_ctx=NULL; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. 
*/ @@ -531,7 +540,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -738,7 +747,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -829,7 +838,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -872,7 +881,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -905,17 +914,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1000,7 +1009,8 @@ check_first_page: #ifdef UNIV_LOG_ARCHIVE min_arch_log_no, max_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - min_flushed_lsn, max_flushed_lsn); + min_flushed_lsn, max_flushed_lsn, + ULINT_UNDEFINED); if (check_msg) { @@ -1135,7 +1145,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? 
OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1222,7 +1232,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. */ @@ -1524,6 +1535,9 @@ innobase_start_or_create_for_mysql(void) size_t dirnamelen; bool sys_datafiles_created = false; + /* This should be initialized early */ + ut_init_timer(); + if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { srv_read_only_mode = true; } @@ -2715,6 +2729,25 @@ files_checked: } if (!srv_read_only_mode) { + + if (srv_use_mtflush) { + /* Start multi-threaded flush threads */ + mtflush_ctx = buf_mtflu_handler_init( + srv_mtflush_threads, + srv_buf_pool_instances); + + /* Set up the thread ids */ + buf_mtflu_set_thread_ids( + srv_mtflush_threads, + mtflush_ctx, + (thread_ids + 6 + 32)); + +#if UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); +#endif + } + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } @@ -2869,6 +2902,9 @@ files_checked: fts_optimize_init(); } + /* Initialize online defragmentation. */ + btr_defragment_init(); + srv_was_started = TRUE; return(DB_SUCCESS); @@ -2979,6 +3015,13 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ + + if (srv_use_mtflush) { + /* g. 
Exit the multi threaded flush threads */ + + buf_mtflu_io_thread_exit(); + } + os_mutex_enter(os_sync_mutex); if (os_thread_count == 0) { diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index fb559f26bd4..aa2b5fa29db 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -1172,6 +1172,7 @@ sync_thread_add_level( case SYNC_IBUF_MUTEX: case SYNC_INDEX_ONLINE_LOG: case SYNC_STATS_AUTO_RECALC: + case SYNC_STATS_DEFRAG: if (!sync_thread_levels_g(array, level, TRUE)) { fprintf(stderr, "InnoDB: sync_thread_levels_g(array, %lu)" diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index 11ad7fe4afd..fa3fe0904b8 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -781,7 +781,8 @@ trx_undo_page_report_modify( } pos = dict_index_get_nth_col_pos(index, - col_no); + col_no, + NULL); ptr += mach_write_compressed(ptr, pos); /* Save the old value of field */ diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 52830a77b12..2c31af9442c 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -44,6 +44,8 @@ Created 3/26/1996 Heikki Tuuri #include "os0file.h" #include "read0read.h" +#include <mysql/service_wsrep.h> + /** The file format tag structure with id and name. 
*/ struct file_format_t { ulint id; /*!< id of the file format */ @@ -174,7 +176,12 @@ trx_sys_flush_max_trx_id(void) mtr_t mtr; trx_sysf_t* sys_header; +#ifndef WITH_WSREP + /* wsrep_fake_trx_id violates this assert + * Copied from trx_sys_get_new_trx_id + */ ut_ad(mutex_own(&trx_sys->mutex)); +#endif /* WITH_WSREP */ if (!srv_read_only_mode) { mtr_start(&mtr); @@ -202,9 +209,14 @@ trx_sys_update_mysql_binlog_offset( ib_int64_t offset, /*!< in: position in that log file */ ulint field, /*!< in: offset of the MySQL log info field in the trx sys header */ +#ifdef WITH_WSREP + trx_sysf_t* sys_header, /*!< in: trx sys header */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in: mtr */ { +#ifndef WITH_WSREP trx_sysf_t* sys_header; +#endif /* !WITH_WSREP */ if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) { @@ -213,7 +225,9 @@ trx_sys_update_mysql_binlog_offset( return; } +#ifndef WITH_WSREP sys_header = trx_sysf_get(mtr); +#endif /* !WITH_WSREP */ if (mach_read_from_4(sys_header + field + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) @@ -300,6 +314,124 @@ trx_sys_print_mysql_binlog_offset(void) mtr_commit(&mtr); } +#ifdef WITH_WSREP + +#ifdef UNIV_DEBUG +static long long trx_sys_cur_xid_seqno = -1; +static unsigned char trx_sys_cur_xid_uuid[16]; + +long long read_wsrep_xid_seqno(const XID* xid) +{ + long long seqno; + memcpy(&seqno, xid->data + 24, sizeof(long long)); + return seqno; +} + +void read_wsrep_xid_uuid(const XID* xid, unsigned char* buf) +{ + memcpy(buf, xid->data + 8, 16); +} + +#endif /* UNIV_DEBUG */ + +void +trx_sys_update_wsrep_checkpoint( + const XID* xid, /*!< in: transaction XID */ + trx_sysf_t* sys_header, /*!< in: sys_header */ + mtr_t* mtr) /*!< in: mtr */ +{ +#ifdef UNIV_DEBUG + { + /* Check that seqno is monotonically increasing */ + unsigned char xid_uuid[16]; + long long xid_seqno = read_wsrep_xid_seqno(xid); + read_wsrep_xid_uuid(xid, xid_uuid); + if (!memcmp(xid_uuid, trx_sys_cur_xid_uuid, 8)) + { + ut_ad(xid_seqno > trx_sys_cur_xid_seqno); + 
trx_sys_cur_xid_seqno = xid_seqno; + } + else + { + memcpy(trx_sys_cur_xid_uuid, xid_uuid, 16); + } + trx_sys_cur_xid_seqno = xid_seqno; + } +#endif /* UNIV_DEBUG */ + + ut_ad(xid && mtr); + ut_a(xid->formatID == -1 || wsrep_is_wsrep_xid(xid)); + + if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD) + != TRX_SYS_WSREP_XID_MAGIC_N) { + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD, + TRX_SYS_WSREP_XID_MAGIC_N, + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_FORMAT, + (int)xid->formatID, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_GTRID_LEN, + (int)xid->gtrid_length, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_BQUAL_LEN, + (int)xid->bqual_length, + MLOG_4BYTES, mtr); + mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_DATA, + (const unsigned char*) xid->data, + XIDDATASIZE, mtr); + +} + +void +trx_sys_read_wsrep_checkpoint(XID* xid) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint magic; + + ut_ad(xid); + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD)) + != TRX_SYS_WSREP_XID_MAGIC_N) { + memset(xid, 0, sizeof(*xid)); + xid->formatID = -1; + trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); + mtr_commit(&mtr); + return; + } + + xid->formatID = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT); + xid->gtrid_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN); + xid->bqual_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN); + ut_memcpy(xid->data, + sys_header + TRX_SYS_WSREP_XID_INFO + 
TRX_SYS_WSREP_XID_DATA, + XIDDATASIZE); + + mtr_commit(&mtr); +} + +#endif /* WITH_WSREP */ + /*****************************************************************//** Prints to stderr the MySQL master log offset info in the trx system header if the magic number shows it valid. */ diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 405d4ef958f..5410bb98190 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -29,6 +29,8 @@ Created 3/26/1996 Heikki Tuuri #include "trx0trx.ic" #endif +#include <mysql/service_wsrep.h> + #include "trx0undo.h" #include "trx0rseg.h" #include "log0log.h" @@ -162,6 +164,9 @@ trx_create(void) trx->lock.table_locks = ib_vector_create( heap_alloc, sizeof(void**), 32); +#ifdef WITH_WSREP + trx->wsrep_event = NULL; +#endif /* WITH_WSREP */ return(trx); } @@ -857,6 +862,11 @@ trx_start_low( srv_undo_logs, srv_undo_tablespaces); } +#ifdef WITH_WSREP + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; +#endif /* WITH_WSREP */ + /* The initial value for trx->no: TRX_ID_MAX is used in read_view_open_now: */ @@ -971,6 +981,9 @@ trx_write_serialisation_history( trx_t* trx, /*!< in/out: transaction */ mtr_t* mtr) /*!< in/out: mini-transaction */ { +#ifdef WITH_WSREP + trx_sysf_t* sys_header; +#endif /* WITH_WSREP */ trx_rseg_t* rseg; rseg = trx->rseg; @@ -1017,6 +1030,15 @@ trx_write_serialisation_history( MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); +#ifdef WITH_WSREP + sys_header = trx_sysf_get(mtr); + /* Update latest MySQL wsrep XID in trx sys header. 
*/ + if (wsrep_is_wsrep_xid(&trx->xid)) + { + trx_sys_update_wsrep_checkpoint(&trx->xid, sys_header, mtr); + } +#endif /* WITH_WSREP */ + /* Update the latest MySQL binlog name and offset info in trx sys header if MySQL binlogging is on or the database server is a MySQL replication slave */ @@ -1027,7 +1049,11 @@ trx_write_serialisation_history( trx_sys_update_mysql_binlog_offset( trx->mysql_log_file_name, trx->mysql_log_offset, - TRX_SYS_MYSQL_LOG_INFO, mtr); + TRX_SYS_MYSQL_LOG_INFO, +#ifdef WITH_WSREP + sys_header, +#endif /* WITH_WSREP */ + mtr); trx->mysql_log_file_name = NULL; } @@ -1321,6 +1347,11 @@ trx_commit_in_memory( ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd)) { + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } +#endif trx->dict_operation = TRX_DICT_OP_NONE; trx->error_state = DB_SUCCESS; @@ -1505,6 +1536,10 @@ trx_commit_or_rollback_prepare( switch (trx->state) { case TRX_STATE_NOT_STARTED: +#ifdef WITH_WSREP + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); +#endif /* WITH_WSREP */ trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: diff --git a/storage/innobase/ut/ut0timer.cc b/storage/innobase/ut/ut0timer.cc new file mode 100644 index 00000000000..85292cce28c --- /dev/null +++ b/storage/innobase/ut/ut0timer.cc @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file ut/ut0timer.cc +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +#include "data0type.h" +#include <my_rdtsc.h> +#include <ut0timer.h> + +/**************************************************************//** +Initial timer definition +@return 0 */ +static +ulonglong +ut_timer_none(void) +/*===============*/ +{ + return 0; +} + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +ulonglong (*ut_timer_now)(void) = &ut_timer_none; + +struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. 
+Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void +ut_init_timer(void) +/*===============*/ +{ + MY_TIMER_INFO all_timer_info; + my_timer_init(&all_timer_info); + + if (all_timer_info.cycles.frequency > 1000000 && + all_timer_info.cycles.resolution == 1) { + ut_timer = all_timer_info.cycles; + ut_timer_now = &my_timer_cycles; + } else if (all_timer_info.nanoseconds.frequency > 1000000 && + all_timer_info.nanoseconds.resolution == 1) { + ut_timer = all_timer_info.nanoseconds; + ut_timer_now = &my_timer_nanoseconds; + } else if (all_timer_info.microseconds.frequency >= 1000000 && + all_timer_info.microseconds.resolution == 1) { + ut_timer = all_timer_info.microseconds; + ut_timer_now = &my_timer_microseconds; + + } else if (all_timer_info.milliseconds.frequency >= 1000 && + all_timer_info.milliseconds.resolution == 1) { + ut_timer = all_timer_info.milliseconds; + ut_timer_now = &my_timer_milliseconds; + } else if (all_timer_info.ticks.frequency >= 1000 && + /* Will probably be false */ + all_timer_info.ticks.resolution == 1) { + ut_timer = all_timer_info.ticks; + ut_timer_now = &my_timer_ticks; + } else { + /* None are acceptable, so leave it as "None", and fill in struct */ + ut_timer.frequency = 1; /* Avoid div-by-zero */ + ut_timer.overhead = 0; /* Since it doesn't do anything */ + ut_timer.resolution = 10; /* Another sign it's bad */ + ut_timer.routine = 0; /* None */ + } +} diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc index d1ba36b3b00..1607e535a94 100644 --- a/storage/innobase/ut/ut0wqueue.cc +++ b/storage/innobase/ut/ut0wqueue.cc @@ -162,6 +162,38 @@ ib_wqueue_timedwait( } /******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*<! 
in: work queue */ +{ + ib_list_node_t* node = NULL; + + mutex_enter(&wq->mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + +/******************************************************************** Check if queue is empty. */ ibool @@ -173,3 +205,20 @@ ib_wqueue_is_empty( { return(ib_list_is_empty(wq->items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*<! in: work queue */ +{ + ulint len = 0; + + mutex_enter(&wq->mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +} |