Diffstat (limited to 'storage/innobase')
-rw-r--r--  storage/innobase/CMakeLists.txt | 20
-rw-r--r--  storage/innobase/btr/btr0btr.cc | 222
-rw-r--r--  storage/innobase/btr/btr0cur.cc | 19
-rw-r--r--  storage/innobase/btr/btr0defragment.cc | 818
-rw-r--r--  storage/innobase/btr/btr0scrub.cc | 898
-rw-r--r--  storage/innobase/buf/buf0buf.cc | 533
-rw-r--r--  storage/innobase/buf/buf0checksum.cc | 8
-rw-r--r--  storage/innobase/buf/buf0dblwr.cc | 182
-rw-r--r--  storage/innobase/buf/buf0dump.cc | 20
-rw-r--r--  storage/innobase/buf/buf0flu.cc | 545
-rw-r--r--  storage/innobase/buf/buf0lru.cc | 108
-rw-r--r--  storage/innobase/buf/buf0mtflu.cc | 758
-rw-r--r--  storage/innobase/buf/buf0rea.cc | 15
-rw-r--r--  storage/innobase/dict/dict0dict.cc | 63
-rw-r--r--  storage/innobase/dict/dict0load.cc | 101
-rw-r--r--  storage/innobase/dict/dict0mem.cc | 2
-rw-r--r--  storage/innobase/dict/dict0stats.cc | 229
-rw-r--r--  storage/innobase/dict/dict0stats_bg.cc | 207
-rw-r--r--  storage/innobase/fil/fil0crypt.cc | 2515
-rw-r--r--  storage/innobase/fil/fil0fil.cc | 773
-rw-r--r--  storage/innobase/fil/fil0pagecompress.cc | 643
-rw-r--r--  storage/innobase/fsp/fsp0fsp.cc | 64
-rw-r--r--  storage/innobase/handler/ha_innodb.cc | 2594
-rw-r--r--  storage/innobase/handler/ha_innodb.h | 37
-rw-r--r--  storage/innobase/handler/handler0alter.cc | 51
-rw-r--r--  storage/innobase/handler/i_s.cc | 1213
-rw-r--r--  storage/innobase/handler/i_s.h | 93
-rw-r--r--  storage/innobase/include/btr0btr.h | 106
-rw-r--r--  storage/innobase/include/btr0btr.ic | 3
-rw-r--r--  storage/innobase/include/btr0defragment.h | 101
-rw-r--r--  storage/innobase/include/btr0scrub.h | 166
-rw-r--r--  storage/innobase/include/buf0buf.h | 257
-rw-r--r--  storage/innobase/include/buf0buf.ic | 22
-rw-r--r--  storage/innobase/include/buf0flu.h | 75
-rw-r--r--  storage/innobase/include/buf0lru.h | 15
-rw-r--r--  storage/innobase/include/buf0mtflu.h | 95
-rw-r--r--  storage/innobase/include/dict0dict.h | 56
-rw-r--r--  storage/innobase/include/dict0dict.ic | 193
-rw-r--r--  storage/innobase/include/dict0mem.h | 128
-rw-r--r--  storage/innobase/include/dict0pagecompress.h | 94
-rw-r--r--  storage/innobase/include/dict0pagecompress.ic | 191
-rw-r--r--  storage/innobase/include/dict0priv.h | 3
-rw-r--r--  storage/innobase/include/dict0priv.ic | 5
-rw-r--r--  storage/innobase/include/dict0stats.h | 33
-rw-r--r--  storage/innobase/include/dict0stats_bg.h | 22
-rw-r--r--  storage/innobase/include/dict0types.h | 8
-rw-r--r--  storage/innobase/include/fil0crypt.h | 385
-rw-r--r--  storage/innobase/include/fil0crypt.ic | 68
-rw-r--r--  storage/innobase/include/fil0fil.h | 141
-rw-r--r--  storage/innobase/include/fil0fil.ic | 177
-rw-r--r--  storage/innobase/include/fil0pagecompress.h | 166
-rw-r--r--  storage/innobase/include/fsp0fsp.h | 101
-rw-r--r--  storage/innobase/include/fsp0fsp.ic | 36
-rw-r--r--  storage/innobase/include/fsp0pagecompress.h | 84
-rw-r--r--  storage/innobase/include/fsp0pagecompress.ic | 211
-rw-r--r--  storage/innobase/include/fsp0types.h | 14
-rw-r--r--  storage/innobase/include/ha_prototypes.h | 10
-rw-r--r--  storage/innobase/include/hash0hash.h | 27
-rw-r--r--  storage/innobase/include/lock0lock.h | 20
-rw-r--r--  storage/innobase/include/log0crypt.h | 68
-rw-r--r--  storage/innobase/include/log0log.h | 29
-rw-r--r--  storage/innobase/include/mach0data.ic | 7
-rw-r--r--  storage/innobase/include/mtr0log.ic | 2
-rw-r--r--  storage/innobase/include/mtr0mtr.h | 20
-rw-r--r--  storage/innobase/include/os0file.h | 84
-rw-r--r--  storage/innobase/include/os0file.ic | 16
-rw-r--r--  storage/innobase/include/page0page.h | 2
-rw-r--r--  storage/innobase/include/page0page.ic | 7
-rw-r--r--  storage/innobase/include/rem0rec.h | 9
-rw-r--r--  storage/innobase/include/row0log.h | 4
-rw-r--r--  storage/innobase/include/row0merge.h | 17
-rw-r--r--  storage/innobase/include/row0mysql.h | 10
-rw-r--r--  storage/innobase/include/srv0mon.h | 32
-rw-r--r--  storage/innobase/include/srv0srv.h | 189
-rw-r--r--  storage/innobase/include/srv0start.h | 3
-rw-r--r--  storage/innobase/include/sync0arr.h | 12
-rw-r--r--  storage/innobase/include/sync0rw.h | 15
-rw-r--r--  storage/innobase/include/sync0rw.ic | 16
-rw-r--r--  storage/innobase/include/sync0sync.h | 24
-rw-r--r--  storage/innobase/include/sync0sync.ic | 16
-rw-r--r--  storage/innobase/include/trx0sys.h | 33
-rw-r--r--  storage/innobase/include/trx0sys.ic | 3
-rw-r--r--  storage/innobase/include/trx0trx.h | 4
-rw-r--r--  storage/innobase/include/univ.i | 40
-rw-r--r--  storage/innobase/include/ut0list.h | 9
-rw-r--r--  storage/innobase/include/ut0list.ic | 20
-rw-r--r--  storage/innobase/include/ut0timer.h | 104
-rw-r--r--  storage/innobase/include/ut0timer.ic | 113
-rw-r--r--  storage/innobase/include/ut0ut.h | 3
-rw-r--r--  storage/innobase/include/ut0wqueue.h | 17
-rw-r--r--  storage/innobase/lock/lock0lock.cc | 513
-rw-r--r--  storage/innobase/log/log0crypt.cc | 501
-rw-r--r--  storage/innobase/log/log0log.cc | 110
-rw-r--r--  storage/innobase/log/log0recv.cc | 58
-rw-r--r--  storage/innobase/mtr/mtr0log.cc | 19
-rw-r--r--  storage/innobase/mtr/mtr0mtr.cc | 33
-rw-r--r--  storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff | 6
-rw-r--r--  storage/innobase/os/os0file.cc | 654
-rw-r--r--  storage/innobase/page/page0cur.cc | 15
-rw-r--r--  storage/innobase/page/page0page.cc | 10
-rw-r--r--  storage/innobase/page/page0zip.cc | 8
-rw-r--r--  storage/innobase/pars/pars0opt.cc | 6
-rw-r--r--  storage/innobase/pars/pars0pars.cc | 3
-rw-r--r--  storage/innobase/rem/rem0rec.cc | 134
-rw-r--r--  storage/innobase/row/row0ftsort.cc | 5
-rw-r--r--  storage/innobase/row/row0import.cc | 3
-rw-r--r--  storage/innobase/row/row0ins.cc | 31
-rw-r--r--  storage/innobase/row/row0log.cc | 8
-rw-r--r--  storage/innobase/row/row0merge.cc | 172
-rw-r--r--  storage/innobase/row/row0mysql.cc | 54
-rw-r--r--  storage/innobase/row/row0sel.cc | 73
-rw-r--r--  storage/innobase/row/row0umod.cc | 11
-rw-r--r--  storage/innobase/row/row0upd.cc | 292
-rw-r--r--  storage/innobase/srv/srv0conc.cc | 92
-rw-r--r--  storage/innobase/srv/srv0mon.cc | 200
-rw-r--r--  storage/innobase/srv/srv0srv.cc | 156
-rw-r--r--  storage/innobase/srv/srv0start.cc | 165
-rw-r--r--  storage/innobase/sync/sync0arr.cc | 186
-rw-r--r--  storage/innobase/sync/sync0rw.cc | 22
-rw-r--r--  storage/innobase/sync/sync0sync.cc | 28
-rw-r--r--  storage/innobase/trx/trx0rec.cc | 3
-rw-r--r--  storage/innobase/trx/trx0sys.cc | 132
-rw-r--r--  storage/innobase/trx/trx0trx.cc | 37
-rw-r--r--  storage/innobase/ut/ut0timer.cc | 92
-rw-r--r--  storage/innobase/ut/ut0wqueue.cc | 49
125 files changed, 19421 insertions, 1132 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index c24f1cda59e..ee637f7ea3f 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -18,6 +18,17 @@
INCLUDE(CheckFunctionExists)
INCLUDE(CheckCSourceCompiles)
INCLUDE(CheckCSourceRuns)
+INCLUDE(lz4)
+INCLUDE(lzo)
+INCLUDE(lzma)
+INCLUDE(bzip2)
+INCLUDE(snappy)
+
+MYSQL_CHECK_LZ4()
+MYSQL_CHECK_LZO()
+MYSQL_CHECK_LZMA()
+MYSQL_CHECK_BZIP2()
+MYSQL_CHECK_SNAPPY()
# OS tests
IF(UNIX)
@@ -329,7 +340,9 @@ SET(INNOBASE_SOURCES
btr/btr0btr.cc
btr/btr0cur.cc
btr/btr0pcur.cc
+ btr/btr0scrub.cc
btr/btr0sea.cc
+ btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@@ -338,6 +351,7 @@ SET(INNOBASE_SOURCES
buf/buf0flu.cc
buf/buf0lru.cc
buf/buf0rea.cc
+ buf/buf0mtflu.cc
data/data0data.cc
data/data0type.cc
dict/dict0boot.cc
@@ -351,6 +365,8 @@ SET(INNOBASE_SOURCES
eval/eval0eval.cc
eval/eval0proc.cc
fil/fil0fil.cc
+ fil/fil0pagecompress.cc
+ fil/fil0crypt.cc
fsp/fsp0fsp.cc
fut/fut0fut.cc
fut/fut0lst.cc
@@ -375,6 +391,7 @@ SET(INNOBASE_SOURCES
lock/lock0wait.cc
log/log0log.cc
log/log0recv.cc
+ log/log0crypt.cc
mach/mach0data.cc
mem/mem0mem.cc
mem/mem0pool.cc
@@ -438,7 +455,8 @@ SET(INNOBASE_SOURCES
ut/ut0rnd.cc
ut/ut0ut.cc
ut/ut0vec.cc
- ut/ut0wqueue.cc)
+ ut/ut0wqueue.cc
+ ut/ut0timer.cc)
IF(WITH_INNODB)
# Legacy option
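
The CMake hunk above wires in the optional compression libraries (LZ4, LZO, LZMA, BZip2, Snappy) used by the new page-compression and encryption sources added in this patch. A minimal sketch (not part of the patch) of how code guarded by these checks is typically structured, assuming MYSQL_CHECK_LZ4() exports a HAVE_LZ4 define; the helper name below is hypothetical:

#ifdef HAVE_LZ4
#include <lz4.h>
#endif

/* Hypothetical helper: compress a page payload with LZ4 when the library
was detected at configure time, otherwise signal "not compressed". */
static int page_compress_lz4_sketch(const char* src, int src_len,
                                    char* dst, int dst_cap)
{
#ifdef HAVE_LZ4
	/* returns the compressed size, or 0 if dst_cap was too small */
	return LZ4_compress_default(src, dst, src_len, dst_cap);
#else
	(void) src; (void) src_len; (void) dst; (void) dst_cap;
	return 0;	/* caller keeps the page uncompressed */
#endif
}
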
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 79b533481b7..92539ce1524 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
+#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
@@ -1136,9 +1137,27 @@ btr_page_alloc_low(
reservation for free extents, and thus we know that a page can
be allocated: */
- return(fseg_alloc_free_page_general(
- seg_header, hint_page_no, file_direction,
- TRUE, mtr, init_mtr));
+ buf_block_t* block = fseg_alloc_free_page_general(
+ seg_header, hint_page_no, file_direction,
+ TRUE, mtr, init_mtr);
+
+#ifdef UNIV_DEBUG_SCRUBBING
+ if (block != NULL) {
+ fprintf(stderr,
+ "alloc %lu:%lu to index: %lu root: %lu\n",
+ buf_block_get_page_no(block),
+ buf_block_get_space(block),
+ index->id,
+ dict_index_get_page(index));
+ } else {
+ fprintf(stderr,
+ "failed alloc index: %lu root: %lu\n",
+ index->id,
+ dict_index_get_page(index));
+ }
+#endif /* UNIV_DEBUG_SCRUBBING */
+
+ return block;
}
/**************************************************************//**
@@ -1193,6 +1212,32 @@ btr_get_size(
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
+ ulint used;
+ if (flag == BTR_N_LEAF_PAGES) {
+ btr_get_size_and_reserved(index, flag, &used, mtr);
+ return used;
+ } else if (flag == BTR_TOTAL_SIZE) {
+ return btr_get_size_and_reserved(index, flag, &used, mtr);
+ } else {
+ ut_error;
+ }
+ return (ULINT_UNDEFINED);
+}
+
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
fseg_header_t* seg_header;
page_t* root;
ulint n;
@@ -1201,6 +1246,8 @@ btr_get_size(
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
+ ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
+
if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
|| *index->name == TEMP_INDEX_PREFIX) {
return(ULINT_UNDEFINED);
@@ -1208,21 +1255,16 @@ btr_get_size(
root = btr_root_get(index, mtr);
- if (flag == BTR_N_LEAF_PAGES) {
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
- fseg_n_reserved_pages(seg_header, &n, mtr);
+ n = fseg_n_reserved_pages(seg_header, used, mtr);
- } else if (flag == BTR_TOTAL_SIZE) {
+ if (flag == BTR_TOTAL_SIZE) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
- n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
-
- seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
-
n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
- } else {
- ut_error;
+ *used += dummy;
+
}
return(n);
@@ -1263,6 +1305,7 @@ btr_page_free_low(
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: block to be freed, x-latched */
ulint level, /*!< in: page level */
+ bool blob, /*!< in: blob page */
mtr_t* mtr) /*!< in: mtr */
{
fseg_header_t* seg_header;
@@ -1275,6 +1318,76 @@ btr_page_free_low(
buf_block_modify_clock_inc(block);
btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
+ if (blob) {
+ ut_a(level == 0);
+ }
+
+ bool scrub = srv_immediate_scrub_data_uncompressed;
+ /* scrub page */
+ if (scrub && blob) {
+ /* blob page: scrub entire page */
+ // TODO(jonaso): scrub only what is actually needed
+ page_t* page = buf_block_get_frame(block);
+ memset(page + PAGE_HEADER, 0,
+ UNIV_PAGE_SIZE - PAGE_HEADER);
+#ifdef UNIV_DEBUG_SCRUBBING
+ fprintf(stderr,
+ "btr_page_free_low: scrub blob page %lu/%lu\n",
+ buf_block_get_space(block),
+ buf_block_get_page_no(block));
+#endif /* UNIV_DEBUG_SCRUBBING */
+ } else if (scrub) {
+ /* scrub records on page */
+
+	/* TODO(jonaso): in theory we could clear the full page,
+	* but since the page still remains in the buffer pool and
+	* gets flushed etc., lots of routines validate its
+	* consistency. In order to remain structurally consistent,
+	* we clear each record on its own.
+	*
+	* NOTE: The TODO below mentions removing the page from the
+	* buffer pool and removing redo entries; once that is done,
+	* clearing full pages should be possible.
+ */
+ uint cnt = 0;
+ uint bytes = 0;
+ page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ ulint* offsets = NULL;
+ rec_t* rec = page_rec_get_next(page_get_infimum_rec(page));
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, index,
+ offsets, ULINT_UNDEFINED,
+ &heap);
+ uint size = rec_offs_data_size(offsets);
+ memset(rec, 0, size);
+ rec = page_rec_get_next(rec);
+ cnt++;
+ bytes += size;
+ }
+#ifdef UNIV_DEBUG_SCRUBBING
+ fprintf(stderr,
+ "btr_page_free_low: scrub %lu/%lu - "
+ "%u records %u bytes\n",
+ buf_block_get_space(block),
+ buf_block_get_page_no(block),
+ cnt, bytes);
+#endif /* UNIV_DEBUG_SCRUBBING */
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ }
+
+#ifdef UNIV_DEBUG_SCRUBBING
+ if (scrub == false) {
+ fprintf(stderr,
+ "btr_page_free_low %lu/%lu blob: %u\n",
+ buf_block_get_space(block),
+ buf_block_get_page_no(block),
+ blob);
+ }
+#endif /* UNIV_DEBUG_SCRUBBING */
+
if (dict_index_is_ibuf(index)) {
btr_page_free_for_ibuf(index, block, mtr);
@@ -1290,6 +1403,14 @@ btr_page_free_low(
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
}
+ if (scrub) {
+ /**
+ * Reset page type so that scrub thread won't try to scrub it
+ */
+ mlog_write_ulint(buf_block_get_frame(block) + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_ALLOCATED, MLOG_2BYTES, mtr);
+ }
+
fseg_free_page(seg_header,
buf_block_get_space(block),
buf_block_get_page_no(block), mtr);
@@ -1319,7 +1440,7 @@ btr_page_free(
ulint level = btr_page_get_level(page, mtr);
ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
- btr_page_free_low(index, block, level, mtr);
+ btr_page_free_low(index, block, level, false, mtr);
}
/**************************************************************//**
@@ -1971,7 +2092,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
-static __attribute__((nonnull))
+UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
@@ -2031,7 +2152,7 @@ btr_parse_page_reorganize(
buf_block_t* block, /*!< in: page to be reorganized, or NULL */
mtr_t* mtr) /*!< in: mtr or NULL */
{
- ulint level;
+ ulint level = page_zip_level;
ut_ad(ptr && end_ptr);
@@ -2260,9 +2381,14 @@ btr_root_raise_and_insert(
ibuf_reset_free_bits(new_block);
}
- /* Reposition the cursor to the child node */
- page_cur_search(new_block, index, tuple,
- PAGE_CUR_LE, page_cursor);
+ if (tuple != NULL) {
+ /* Reposition the cursor to the child node */
+ page_cur_search(new_block, index, tuple,
+ PAGE_CUR_LE, page_cursor);
+ } else {
+ /* Set cursor to first record on child node */
+ page_cur_set_before_first(new_block, page_cursor);
+ }
/* Split the child and insert tuple */
return(btr_page_split_and_insert(flags, cursor, offsets, heap,
@@ -2938,6 +3064,9 @@ function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.
+NOTE: jonaso added support for calling this function with tuple == NULL,
+which causes it to only split a page.
+
@return inserted record */
UNIV_INTERN
rec_t*
@@ -3015,7 +3144,7 @@ func_start:
half-page */
insert_left = FALSE;
- if (n_iterations > 0) {
+ if (tuple != NULL && n_iterations > 0) {
direction = FSP_UP;
hint_page_no = page_no + 1;
split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
@@ -3059,6 +3188,12 @@ func_start:
new_page_zip = buf_block_get_page_zip(new_block);
btr_page_create(new_block, new_page_zip, cursor->index,
btr_page_get_level(page, mtr), mtr);
+ /* Only record the leaf level page splits. */
+ if (btr_page_get_level(page, mtr) == 0) {
+ cursor->index->stat_defrag_n_page_split ++;
+ cursor->index->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index);
+ }
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@@ -3070,7 +3205,12 @@ func_start:
*offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
n_uniq, heap);
- insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
+ if (tuple != NULL) {
+ insert_left = cmp_dtuple_rec(
+ tuple, split_rec, *offsets) < 0;
+ } else {
+ insert_left = 1;
+ }
if (!insert_left && new_page_zip && n_iterations > 0) {
/* If a compressed page has already been split,
@@ -3104,8 +3244,10 @@ insert_empty:
on the appropriate half-page, we may release the tree x-latch.
We can then move the records after releasing the tree latch,
thus reducing the tree latch contention. */
-
- if (split_rec) {
+ if (tuple == NULL) {
+ insert_will_fit = 1;
+ }
+ else if (split_rec) {
insert_will_fit = !new_page_zip
&& btr_page_insert_fits(cursor, split_rec,
offsets, tuple, n_ext, heap);
@@ -3226,6 +3368,11 @@ insert_empty:
/* 6. The split and the tree modification is now completed. Decide the
page where the tuple should be inserted */
+ if (tuple == NULL) {
+ rec = NULL;
+ goto func_exit;
+ }
+
if (insert_left) {
insert_block = left_block;
} else {
@@ -3313,35 +3460,16 @@ func_exit:
ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+ if (tuple == NULL) {
+ ut_ad(rec == NULL);
+ }
ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
return(rec);
}
-#ifdef UNIV_SYNC_DEBUG
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,index,mtr)
-#else /* UNIV_SYNC_DEBUG */
-/*************************************************************//**
-Removes a page from the level list of pages.
-@param space in: space where removed
-@param zip_size in: compressed page size in bytes, or 0 for uncompressed
-@param page in/out: page to remove
-@param index in: index tree
-@param mtr in/out: mini-transaction */
-# define btr_level_list_remove(space,zip_size,page,index,mtr) \
- btr_level_list_remove_func(space,zip_size,page,mtr)
-#endif /* UNIV_SYNC_DEBUG */
-
/*************************************************************//**
Removes a page from the level list of pages. */
-static __attribute__((nonnull))
+UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
@@ -3513,7 +3641,7 @@ btr_node_ptr_delete(
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-static
+UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/
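
The btr_get_size() change above splits the old logic into btr_get_size_and_reserved(), which reports both the pages reserved for the tree's file segments and the pages actually used. A rough sketch of the intended calling pattern, assuming an index whose lock is already S-latched in mtr (variable names are illustrative, not from the patch):

	ulint	used = 0;
	ulint	reserved;

	/* reserved counts pages set aside for the tree's file segments,
	used counts pages that currently hold data, so used <= reserved. */
	reserved = btr_get_size_and_reserved(index, BTR_TOTAL_SIZE, &used, &mtr);

	if (reserved != ULINT_UNDEFINED && reserved > 0) {
		/* e.g. feed a fill-factor style metric into the
		defragmentation statistics */
		double	fill = (double) used / (double) reserved;
		(void) fill;
	}
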
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index c8dd4fae0a9..1a4eb347cd2 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -1873,9 +1873,13 @@ btr_cur_update_alloc_zip_func(
false=update-in-place */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
+
+ /* Have a local copy of the variables as these can change
+ dynamically. */
const page_t* page = page_cur_get_page(cursor);
ut_ad(page_zip == page_cur_get_page_zip(cursor));
+
ut_ad(page_zip);
ut_ad(!dict_index_is_ibuf(index));
ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
@@ -4734,11 +4738,11 @@ alloc_another:
change when B-tree nodes are split or
merged. */
mlog_write_ulint(page
- + FIL_PAGE_FILE_FLUSH_LSN,
+ + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
space_id,
MLOG_4BYTES, &mtr);
mlog_write_ulint(page
- + FIL_PAGE_FILE_FLUSH_LSN + 4,
+ + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
rec_page_no,
MLOG_4BYTES, &mtr);
@@ -4746,9 +4750,10 @@ alloc_another:
memset(page + page_zip_get_size(page_zip)
- c_stream.avail_out,
0, c_stream.avail_out);
- mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
+ mlog_log_string(page
+ + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
page_zip_get_size(page_zip)
- - FIL_PAGE_FILE_FLUSH_LSN,
+ - FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
&mtr);
/* Copy the page to compressed storage,
because it will be flushed to disk
@@ -4921,7 +4926,7 @@ func_exit:
ut_ad(btr_blob_op_is_update(op));
for (i = 0; i < n_freed_pages; i++) {
- btr_page_free_low(index, freed_pages[i], 0, alloc_mtr);
+ btr_page_free_low(index, freed_pages[i], 0, true, alloc_mtr);
}
DBUG_EXECUTE_IF("btr_store_big_rec_extern",
@@ -5159,7 +5164,7 @@ btr_free_externally_stored_field(
}
next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
- btr_page_free_low(index, ext_block, 0, &mtr);
+ btr_page_free_low(index, ext_block, 0, true, &mtr);
if (page_zip != NULL) {
mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
@@ -5190,7 +5195,7 @@ btr_free_externally_stored_field(
because we did not store it on the page (we save the
space overhead from an index page header. */
- btr_page_free_low(index, ext_block, 0, &mtr);
+ btr_page_free_low(index, ext_block, 0, true, &mtr);
mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
next_page_no,
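
Besides passing blob=true to btr_page_free_low() so that freed BLOB pages are wiped in full, these hunks switch to the renamed header field FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, which the encryption code added elsewhere in this patch reuses for a key version on data pages. A minimal sketch of reading that field, using only symbols that already appear in this diff (the semantics are hedged, as they depend on the page type):

	/* On an encrypted data page the first 4 bytes of the renamed field
	are believed to carry the key version; on the first system page they
	still hold part of the flush LSN. */
	ulint	key_version = mach_read_from_4(
		page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);

	if (key_version == 0) {
		/* page was written without encryption */
	}
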
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000000..dfb2cd8dffd
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,818 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "ut0timer.h"
+
+#include <list>
+
+/**************************************************//**
+Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+// #pragma once
+/*
+namespace std
+{
+ // based on SC22/WG21/N2431 = J16/07-0301
+ struct nullptr_t
+ {
+ template<typename any> operator any * () const
+ {
+ return 0;
+ }
+ template<class any, typename T> operator T any:: * () const
+ {
+ return 0;
+ }
+
+#ifdef _MSC_VER
+ struct pad {};
+ pad __[sizeof(void*)/sizeof(pad)];
+#else
+ char __[sizeof(void*)];
+#endif
+private:
+ // nullptr_t();// {}
+ // nullptr_t(const nullptr_t&);
+ // void operator = (const nullptr_t&);
+ void operator &() const;
+ template<typename any> void operator +(any) const
+ {
+ // I Love MSVC 2005!
+ }
+ template<typename any> void operator -(any) const
+ {
+ // I Love MSVC 2005!
+ }
+ };
+static const nullptr_t __nullptr = {};
+}
+
+#ifndef nullptr
+#define nullptr std::__nullptr
+#endif
+*/
+
+/**************************************************//**
+End of Custom nullptr implementation for under g++ 4.6
+*******************************************************/
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
+/* Reduce the target page size by this amount when a compression failure happens
+during defragmentation. 512 is chosen because it's a power of 2 and it is about
+3% of the page size. When there are compression failures in defragmentation,
+our goal is to get a decent defrag ratio with as few compression failures as
+possible. From experimentation it seems that reducing the target size by 512 each
+time will make sure the page is compressible within a couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue.*/
+ib_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+ulint btr_defragment_compression_failures = 0;
+/* Number of btr_defragment_n_pages calls that altered page but didn't
+manage to release any page. */
+ulint btr_defragment_failures = 0;
+/* Total number of btr_defragment_n_pages calls that altered page.
+The difference between btr_defragment_count and btr_defragment_failures shows
+the amount of effort wasted. */
+ulint btr_defragment_count = 0;
+
+/******************************************************************//**
+Constructor for btr_defragment_item_t. */
+btr_defragment_item_t::btr_defragment_item_t(
+ btr_pcur_t* pcur,
+ os_event_t event)
+{
+ this->pcur = pcur;
+ this->event = event;
+ this->removed = false;
+ this->last_processed = 0;
+}
+
+/******************************************************************//**
+Destructor for btr_defragment_item_t. */
+btr_defragment_item_t::~btr_defragment_item_t() {
+ if (this->pcur) {
+ btr_pcur_free_for_mysql(this->pcur);
+ }
+ if (this->event) {
+ os_event_set(this->event);
+ }
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+ mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex,
+ SYNC_ANY_LATCH);
+ os_thread_create(btr_defragment_thread, NULL, NULL);
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ while(iter != btr_defragment_wq.end()) {
+ btr_defragment_item_t* item = *iter;
+ iter = btr_defragment_wq.erase(iter);
+ delete item;
+ }
+ mutex_exit(&btr_defragment_mutex);
+ mutex_free(&btr_defragment_mutex);
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index) /*!< Index to find. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ mutex_exit(&btr_defragment_mutex);
+ return true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+ return false;
+}
+
+/******************************************************************//**
+Query thread uses this function to add an index to btr_defragment_wq.
+Return a pointer to os_event for the query thread to wait on if this is a
+synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async) /*!< whether this is an async defragmentation */
+{
+ mtr_t mtr;
+ ulint space = dict_index_get_space(index);
+ ulint zip_size = dict_table_zip_size(index->table);
+ ulint page_no = dict_index_get_page(index);
+ mtr_start(&mtr);
+	// Load the index root page.
+ page_t* page = btr_page_get(space, zip_size, page_no,
+ RW_NO_LATCH, index, &mtr);
+ if (btr_page_get_level(page, &mtr) == 0) {
+ // Index root is a leaf page, no need to defragment.
+ mtr_commit(&mtr);
+ return NULL;
+ }
+ btr_pcur_t* pcur = btr_pcur_create_for_mysql();
+ os_event_t event = NULL;
+ if (!async) {
+ event = os_event_create();
+ }
+ btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
+ true, 0, &mtr);
+ btr_pcur_move_to_next(pcur, &mtr);
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ dict_stats_empty_defrag_summary(index);
+ btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
+ mutex_enter(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(item);
+ mutex_exit(&btr_defragment_mutex);
+ return event;
+}
+
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is that this one does not NULL the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table)	/*!< Table to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (table->id == idx->table->id) {
+ item->removed = true;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Query thread uses this function to mark an index as removed in
+btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index) /*!< Index to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ btr_defragment_item_t* item = *iter;
+ btr_pcur_t* pcur = item->pcur;
+ btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+ dict_index_t* idx = btr_cur_get_index(cursor);
+ if (index->id == idx->id) {
+ item->removed = true;
+ item->event = NULL;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Functions used by defragmentation thread: btr_defragment_xxx_item.
+Defragmentation thread operates on the work *item*. It gets/removes
+item from the work queue. */
+/******************************************************************//**
+Defragment thread uses this to remove an item from btr_defragment_wq.
+When an item is removed from the work queue, all resources associated with it
+are freed as well. */
+void
+btr_defragment_remove_item(
+ btr_defragment_item_t* item) /*!< Item to be removed. */
+{
+ mutex_enter(&btr_defragment_mutex);
+ for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ iter != btr_defragment_wq.end();
+ ++iter) {
+ if (item == *iter) {
+ btr_defragment_wq.erase(iter);
+ delete item;
+ break;
+ }
+ }
+ mutex_exit(&btr_defragment_mutex);
+}
+
+/******************************************************************//**
+Defragment thread uses this to get an item from btr_defragment_wq to work on.
+The item is not removed from the work queue so query threads can still access
+this item. We keep it this way so query threads can find and kill a
+defragmentation even if that index is being worked on. Be aware that while you
+work on this item you have no lock protection on it whatsoever. This is OK as
+long as the query threads and defragment thread won't modify the same fields
+without lock protection.
+*/
+btr_defragment_item_t*
+btr_defragment_get_item()
+{
+ if (btr_defragment_wq.empty()) {
+ return NULL;
+ //return nullptr;
+ }
+ mutex_enter(&btr_defragment_mutex);
+ list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+ if (iter == btr_defragment_wq.end()) {
+ iter = btr_defragment_wq.begin();
+ }
+ btr_defragment_item_t* item = *iter;
+ iter++;
+ mutex_exit(&btr_defragment_mutex);
+ return item;
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently we save the stats to persistent storage every 100 updates. */
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index) /*!< in: index */
+{
+ if (srv_defragment_stats_accuracy != 0 // stats tracking disabled
+ && dict_index_get_space(index) != 0 // do not track system tables
+ && index->stat_defrag_modified_counter
+ >= srv_defragment_stats_accuracy) {
+ dict_stats_defrag_pool_add(index);
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
+Main defragment functionalities used by defragment thread.*/
+/*************************************************************//**
+Calculate number of records from beginning of block that can
+fit into size_limit
+@return number of records */
+UNIV_INTERN
+ulint
+btr_defragment_calc_n_recs_for_size(
+ buf_block_t* block, /*!< in: B-tree page */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint size_limit, /*!< in: size limit to fit records in */
+ ulint* n_recs_size) /*!< out: actual size of the records that fit
+ in size_limit. */
+{
+ page_t* page = buf_block_get_frame(block);
+ ulint n_recs = 0;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+ mem_heap_t* heap = NULL;
+ ulint size = 0;
+ page_cur_t cur;
+
+ page_cur_set_before_first(block, &cur);
+ page_cur_move_to_next(&cur);
+ while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
+ rec_t* cur_rec = page_cur_get_rec(&cur);
+ offsets = rec_get_offsets(cur_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ulint rec_size = rec_offs_size(offsets);
+ size += rec_size;
+ if (size > size_limit) {
+ size = size - rec_size;
+ break;
+ }
+ n_recs ++;
+ page_cur_move_to_next(&cur);
+ }
+ *n_recs_size = size;
+ return n_recs;
+}
+
+/*************************************************************//**
+Merge as many records from the from_block to the to_block. Delete
+the from_block if all records are successfully merged to to_block.
+@return the to_block to target for next merge operation. */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_merge_pages(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* from_block, /*!< in: origin of merge */
+ buf_block_t* to_block, /*!< in: destination of merge */
+ ulint zip_size, /*!< in: zip size of the block */
+ ulint reserved_space, /*!< in: space reserved for future
+ insert to avoid immediate page split */
+ ulint* max_data_size, /*!< in/out: max data size to
+ fit in a single compressed page. */
+ mem_heap_t* heap, /*!< in/out: pointer to memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* from_page = buf_block_get_frame(from_block);
+ page_t* to_page = buf_block_get_frame(to_block);
+ ulint space = dict_index_get_space(index);
+ ulint level = btr_page_get_level(from_page, mtr);
+ ulint n_recs = page_get_n_recs(from_page);
+ ulint new_data_size = page_get_data_size(to_page);
+ ulint max_ins_size =
+ page_get_max_insert_size(to_page, n_recs);
+ ulint max_ins_size_reorg =
+ page_get_max_insert_size_after_reorganize(
+ to_page, n_recs);
+ ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+ ? max_ins_size_reorg - reserved_space : 0;
+ ulint move_size = 0;
+ ulint n_recs_to_move = 0;
+ rec_t* rec = NULL;
+ ulint target_n_recs = 0;
+ rec_t* orig_pred;
+
+ // Estimate how many records can be moved from the from_page to
+ // the to_page.
+ if (zip_size) {
+ ulint page_diff = UNIV_PAGE_SIZE - *max_data_size;
+ max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+ ? max_ins_size_to_use - page_diff : 0;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+
+ // If max_ins_size >= move_size, we can move the records without
+ // reorganizing the page, otherwise we need to reorganize the page
+ // first to release more space.
+ if (move_size > max_ins_size) {
+ if (!btr_page_reorganize_block(false, page_zip_level,
+ to_block, index,
+ mtr)) {
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ ibuf_reset_free_bits(to_block);
+ }
+			// If reorganization fails, that means the page is
+			// not compressible. There's no point in trying to
+			// merge into this page. Continue to the
+			// next page.
+ return from_block;
+ }
+ ut_ad(page_validate(to_page, index));
+ max_ins_size = page_get_max_insert_size(to_page, n_recs);
+ ut_a(max_ins_size >= move_size);
+ }
+
+	// Move records to pack to_page as full as possible.
+ orig_pred = NULL;
+ target_n_recs = n_recs_to_move;
+ while (n_recs_to_move > 0) {
+ rec = page_rec_get_nth(from_page,
+ n_recs_to_move + 1);
+ orig_pred = page_copy_rec_list_start(
+ to_block, from_block, rec, index, mtr);
+ if (orig_pred)
+ break;
+ // If we reach here, that means compression failed after packing
+ // n_recs_to_move number of records to to_page. We try to reduce
+ // the targeted data size on the to_page by
+ // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+ os_atomic_increment_ulint(
+ &btr_defragment_compression_failures, 1);
+ max_ins_size_to_use =
+ move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+ : 0;
+ if (max_ins_size_to_use == 0) {
+ n_recs_to_move = 0;
+ move_size = 0;
+ break;
+ }
+ n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+ from_block, index, max_ins_size_to_use, &move_size);
+ }
+ // If less than target_n_recs are moved, it means there are
+ // compression failures during page_copy_rec_list_start. Adjust
+ // the max_data_size estimation to reduce compression failures
+ // in the following runs.
+ if (target_n_recs > n_recs_to_move
+ && *max_data_size > new_data_size + move_size) {
+ *max_data_size = new_data_size + move_size;
+ }
+ // Set ibuf free bits if necessary.
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(to_page)) {
+ if (zip_size) {
+ ibuf_reset_free_bits(to_block);
+ } else {
+ ibuf_update_free_bits_if_full(
+ to_block,
+ UNIV_PAGE_SIZE,
+ ULINT_UNDEFINED);
+ }
+ }
+ if (n_recs_to_move == n_recs) {
+ /* The whole page is merged with the previous page,
+ free it. */
+ lock_update_merge_left(to_block, orig_pred,
+ from_block);
+ btr_search_drop_page_hash_index(from_block);
+ btr_level_list_remove(space, zip_size, from_page,
+ index, mtr);
+ btr_node_ptr_delete(index, from_block, mtr);
+ btr_blob_dbg_remove(from_page, index,
+ "btr_defragment_n_pages");
+ btr_page_free(index, from_block, mtr);
+ } else {
+ // There are still records left on the page, so
+ // increment n_defragmented. Node pointer will be changed
+ // so remove the old node pointer.
+ if (n_recs_to_move > 0) {
+ // Part of the page is merged to left, remove
+ // the merged records, update record locks and
+ // node pointer.
+ dtuple_t* node_ptr;
+ page_delete_rec_list_start(rec, from_block,
+ index, mtr);
+ lock_update_split_and_merge(to_block,
+ orig_pred,
+ from_block);
+ btr_node_ptr_delete(index, from_block, mtr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(from_page));
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, page_get_page_no(from_page),
+ heap, level + 1);
+ btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr);
+ }
+ to_block = from_block;
+ }
+ return to_block;
+}
+
+/*************************************************************//**
+Tries to merge N consecutive pages, starting from the page pointed by the
+cursor. Skip space 0. Only consider leaf pages.
+This function first loads all N pages into memory, then for each of
+the pages other than the first page, it tries to move as many records
+as possible to the left sibling to keep the left sibling full. During
+the process, if any page becomes empty, that page will be removed from
+the level list. Record locks, hash, and node pointers are updated after
+page reorganization.
+@return pointer to the last block processed, or NULL if reaching end of index */
+UNIV_INTERN
+buf_block_t*
+btr_defragment_n_pages(
+ buf_block_t* block, /*!< in: starting block for defragmentation */
+ dict_index_t* index, /*!< in: index tree */
+ uint n_pages,/*!< in: number of pages to defragment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint space;
+ ulint zip_size;
+	/* We will need to load the n+1 block, because if the last page is freed
+	we need to modify the prev_page_no of that block. */
+ buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
+ page_t* first_page;
+ buf_block_t* current_block;
+ ulint total_data_size = 0;
+ ulint total_n_recs = 0;
+ ulint data_size_per_rec;
+ ulint optimal_page_size;
+ ulint reserved_space;
+ ulint level;
+ ulint max_data_size = 0;
+ uint n_defragmented = 0;
+ uint n_new_slots;
+ mem_heap_t* heap;
+ ibool end_of_index = FALSE;
+
+ /* It doesn't make sense to call this function with n_pages = 1. */
+ ut_ad(n_pages > 1);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ space = dict_index_get_space(index);
+ if (space == 0) {
+ /* Ignore space 0. */
+ return NULL;
+ }
+
+ if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+ n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+ }
+
+ zip_size = dict_table_zip_size(index->table);
+ first_page = buf_block_get_frame(block);
+ level = btr_page_get_level(first_page, mtr);
+
+ if (level != 0) {
+ return NULL;
+ }
+
+ /* 1. Load the pages and calculate the total data size. */
+ blocks[0] = block;
+ for (uint i = 1; i <= n_pages; i++) {
+ page_t* page = buf_block_get_frame(blocks[i-1]);
+ ulint page_no = btr_page_get_next(page, mtr);
+ total_data_size += page_get_data_size(page);
+ total_n_recs += page_get_n_recs(page);
+ if (page_no == FIL_NULL) {
+ n_pages = i;
+ end_of_index = TRUE;
+ break;
+ }
+ blocks[i] = btr_block_get(space, zip_size, page_no,
+ RW_X_LATCH, index, mtr);
+ }
+
+ if (n_pages == 1) {
+ if (btr_page_get_prev(first_page, mtr) == FIL_NULL) {
+ /* last page in the index */
+ if (dict_index_get_page(index)
+ == page_get_page_no(first_page))
+ return NULL;
+ /* given page is the last page.
+ Lift the records to father. */
+ btr_lift_page_up(index, block, mtr);
+ }
+ return NULL;
+ }
+
+	/* 2. Calculate how many pages the data can fit in. If not compressible,
+	return early. */
+ ut_a(total_n_recs != 0);
+ data_size_per_rec = total_data_size / total_n_recs;
+	// For uncompressed pages, the optimal data size is the free space of an
+	// empty page.
+ optimal_page_size = page_get_free_space_of_empty(
+ page_is_comp(first_page));
+ // For compressed pages, we take compression failures into account.
+ if (zip_size) {
+ ulint size = 0;
+ int i = 0;
+		// We estimate the optimal data size of the index using samples of
+ // data size. These samples are taken when pages failed to
+ // compress due to insertion on the page. We use the average
+ // of all samples we have as the estimation. Different pages of
+ // the same index vary in compressibility. Average gives a good
+ // enough estimation.
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+ if (index->stat_defrag_data_size_sample[i] == 0) {
+ break;
+ }
+ size += index->stat_defrag_data_size_sample[i];
+ }
+ if (i != 0) {
+ size = size / i;
+ optimal_page_size = min(optimal_page_size, size);
+ }
+ max_data_size = optimal_page_size;
+ }
+
+ reserved_space = min((ulint)(optimal_page_size
+ * (1 - srv_defragment_fill_factor)),
+ (data_size_per_rec
+ * srv_defragment_fill_factor_n_recs));
+ optimal_page_size -= reserved_space;
+ n_new_slots = (total_data_size + optimal_page_size - 1)
+ / optimal_page_size;
+ if (n_new_slots >= n_pages) {
+ /* Can't defragment. */
+ if (end_of_index)
+ return NULL;
+ return blocks[n_pages-1];
+ }
+
+ /* 3. Defragment pages. */
+ heap = mem_heap_create(256);
+ // First defragmented page will be the first page.
+ current_block = blocks[0];
+ // Start from the second page.
+ for (uint i = 1; i < n_pages; i ++) {
+ buf_block_t* new_block = btr_defragment_merge_pages(
+ index, blocks[i], current_block, zip_size,
+ reserved_space, &max_data_size, heap, mtr);
+ if (new_block != current_block) {
+ n_defragmented ++;
+ current_block = new_block;
+ }
+ }
+ mem_heap_free(heap);
+ n_defragmented ++;
+ os_atomic_increment_ulint(
+ &btr_defragment_count, 1);
+ if (n_pages == n_defragmented) {
+ os_atomic_increment_ulint(
+ &btr_defragment_failures, 1);
+ } else {
+ index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+ }
+ if (end_of_index)
+ return NULL;
+ return current_block;
+}
+
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg) /*!< in: work queue */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* cursor;
+ dict_index_t* index;
+ mtr_t mtr;
+ buf_block_t* first_block;
+ buf_block_t* last_block;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+ /* If defragmentation is disabled, sleep before
+ checking whether it's enabled. */
+ if (!srv_defragment) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+		/* The following call won't remove the item from the work queue.
+		We only get a pointer to it to work on. This makes sure that
+		when a user issues a kill command, all indices are in the work
+		queue to be searched. This also means that the user thread
+		cannot directly remove the item from the queue (since we might be
+		using it). So the user thread only marks the index as removed. */
+ btr_defragment_item_t* item = btr_defragment_get_item();
+ /* If work queue is empty, sleep and check later. */
+ if (!item) {
+ os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
+ continue;
+ }
+ /* If an index is marked as removed, we remove it from the work
+ queue. No other thread could be using this item at this point so
+ it's safe to remove now. */
+ if (item->removed) {
+ btr_defragment_remove_item(item);
+ continue;
+ }
+
+ pcur = item->pcur;
+ ulonglong now = ut_timer_now();
+ ulonglong elapsed = now - item->last_processed;
+
+ if (elapsed < srv_defragment_interval) {
+			/* If we see an index again before the interval
+			determined by the configured frequency is reached,
+			we just sleep until the interval passes. Since
+			defragmentation of all indices queues up on a single
+			thread, it's likely that the indices that follow this
+			one won't need to sleep again. */
+ os_thread_sleep(((ulint)ut_timer_to_microseconds(
+ srv_defragment_interval - elapsed)));
+ }
+
+ now = ut_timer_now();
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
+ cursor = btr_pcur_get_btr_cur(pcur);
+ index = btr_cur_get_index(cursor);
+ first_block = btr_cur_get_block(cursor);
+ last_block = btr_defragment_n_pages(first_block, index,
+ srv_defragment_n_pages,
+ &mtr);
+ if (last_block) {
+ /* If we haven't reached the end of the index,
+ place the cursor on the last record of last page,
+ store the cursor position, and put back in queue. */
+ page_t* last_page = buf_block_get_frame(last_block);
+ rec_t* rec = page_rec_get_prev(
+ page_get_supremum_rec(last_page));
+ ut_a(page_rec_is_user_rec(rec));
+ page_cur_position(rec, last_block,
+ btr_cur_get_page_cur(cursor));
+ btr_pcur_store_position(pcur, &mtr);
+ mtr_commit(&mtr);
+ /* Update the last_processed time of this index. */
+ item->last_processed = now;
+ } else {
+ mtr_commit(&mtr);
+ /* Reaching the end of the index. */
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_save_defrag_stats(index);
+ dict_stats_save_defrag_summary(index);
+ btr_defragment_remove_item(item);
+ }
+ }
+ btr_defragment_shutdown();
+ os_thread_exit(NULL);
+ OS_THREAD_DUMMY_RETURN;
+}
+
+#endif /* !UNIV_HOTBACKUP */
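
For context, a rough sketch of how a caller (such as the OPTIMIZE TABLE path in ha_innodb.cc, which is part of this patch but outside this section) might queue an index for synchronous defragmentation through the API added above. The caller and error handling are assumptions; the event/NULL behaviour matches btr_defragment_add_index() as shown in this file:

	/* async == false requests an event to block on; NULL is returned
	when the index root is already a leaf page (nothing to defragment). */
	os_event_t	event = btr_defragment_add_index(index, false);

	if (event) {
		/* The work item's destructor calls os_event_set() once the
		defragment thread has finished with (or discarded) the index. */
		os_event_wait(event);
		os_event_free(event);
	}
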
diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc
new file mode 100644
index 00000000000..d53b478e429
--- /dev/null
+++ b/storage/innobase/btr/btr0scrub.cc
@@ -0,0 +1,898 @@
+// Copyright (c) 2014, Google Inc.
+
+/**************************************************//**
+@file btr/btr0scrub.cc
+Scrubbing of btree pages
+
+*******************************************************/
+
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0scrub.h"
+#include "ibuf0ibuf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+#include "mtr0mtr.h"
+
+/* used when trying to acquire dict-lock */
+UNIV_INTERN bool fil_crypt_is_closing(ulint space);
+
+/**
+* scrub data at delete time (e.g purge thread)
+*/
+my_bool srv_immediate_scrub_data_uncompressed = false;
+
+/**
+* background scrub uncompressed data
+*
+* if srv_immediate_scrub_data_uncompressed is enabled
+* this is only needed to handle "old" data
+*/
+my_bool srv_background_scrub_data_uncompressed = false;
+
+/**
+* background scrub compressed data
+*
+* reorganize compressed page for scrubbing
+* (only way to scrub compressed data)
+*/
+my_bool srv_background_scrub_data_compressed = false;
+
+/* check spaces once per hour */
+UNIV_INTERN uint srv_background_scrub_data_check_interval = (60 * 60);
+
+/* default to scrub spaces that haven't been scrubbed in a week */
+UNIV_INTERN uint srv_background_scrub_data_interval = (7 * 24 * 60 * 60);
+
+/**
+* statistics for scrubbing by background threads
+*/
+static btr_scrub_stat_t scrub_stat;
+static ib_mutex_t scrub_stat_mutex;
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t scrub_stat_mutex_key;
+#endif
+
+#ifdef UNIV_DEBUG
+/**
+* srv_scrub_force_testing
+*
+* - force scrubbing using background threads even for uncompressed tables
+* - force pessimistic scrubbing (page split) even if not needed
+* (see test_pessimistic_scrub_pct)
+*/
+my_bool srv_scrub_force_testing = true;
+
+/**
+* Force pessimistic scrubbing in 50% of the cases (UNIV_DEBUG only)
+*/
+static int test_pessimistic_scrub_pct = 50;
+
+#endif
+static uint scrub_compression_level = page_zip_level;
+
+/**************************************************************//**
+Log a scrubbing failure */
+static
+void
+log_scrub_failure(
+/*===============*/
+ btr_scrub_t* scrub_data, /*!< in: data to store statistics on */
+ buf_block_t* block, /*!< in: block */
+ dberr_t err) /*!< in: error */
+{
+ const char* reason = "unknown";
+ switch(err) {
+ case DB_UNDERFLOW:
+ reason = "too few records on page";
+ scrub_data->scrub_stat.page_split_failures_underflow++;
+ break;
+ case DB_INDEX_CORRUPT:
+ reason = "unable to find index!";
+ scrub_data->scrub_stat.page_split_failures_missing_index++;
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ reason = "out of filespace";
+ scrub_data->scrub_stat.page_split_failures_out_of_filespace++;
+ break;
+ default:
+ ut_ad(0);
+ reason = "unknown";
+ scrub_data->scrub_stat.page_split_failures_unknown++;
+ }
+ fprintf(stderr,
+ "InnoDB: Warning: Failed to scrub page %lu in space %lu : %s\n",
+ buf_block_get_page_no(block),
+ buf_block_get_space(block),
+ reason);
+}
+
+/****************************************************************
+Lock dict mutexes */
+static
+bool
+btr_scrub_lock_dict_func(ulint space, bool lock_to_close_table,
+ const char * file, uint line)
+{
+ uint start = time(0);
+ uint last = start;
+
+ while (mutex_enter_nowait_func(&(dict_sys->mutex), file, line)) {
+		/* if we lock to close a table, we wait forever.
+		* if we don't lock to close a table, we check if the space
+		* is closing, and in that case give up instead
+		*/
+ if (lock_to_close_table == false) {
+ if (fil_crypt_is_closing(space)) {
+ return false;
+ }
+ }
+ os_thread_sleep(250000);
+
+ uint now = time(0);
+ if (now >= last + 30) {
+ fprintf(stderr,
+ "WARNING: %s:%u waited %u seconds for"
+ " dict_sys lock, space: %lu"
+ " lock_to_close_table: %u\n",
+ file, line, now - start, space,
+ lock_to_close_table);
+
+ last = now;
+ }
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+ return true;
+}
+
+#define btr_scrub_lock_dict(space, lock_to_close_table) \
+ btr_scrub_lock_dict_func(space, lock_to_close_table, __FILE__, __LINE__)
+
+/****************************************************************
+Unlock dict mutexes */
+static
+void
+btr_scrub_unlock_dict()
+{
+ dict_mutex_exit_for_mysql();
+}
+
+/****************************************************************
+Release reference to table
+*/
+static
+void
+btr_scrub_table_close(
+/*==================*/
+ dict_table_t* table) /*!< in: table */
+{
+ bool dict_locked = true;
+ bool try_drop = false;
+ table->stats_bg_flag &= ~BG_SCRUB_IN_PROGRESS;
+ dict_table_close(table, dict_locked, try_drop);
+}
+
+/****************************************************************
+Release reference to table
+*/
+static
+void
+btr_scrub_table_close_for_thread(
+ btr_scrub_t *scrub_data)
+{
+ if (scrub_data->current_table == NULL)
+ return;
+
+ bool lock_for_close = true;
+ btr_scrub_lock_dict(scrub_data->space, lock_for_close);
+
+ /* perform the actual closing */
+ btr_scrub_table_close(scrub_data->current_table);
+
+ btr_scrub_unlock_dict();
+
+ scrub_data->current_table = NULL;
+ scrub_data->current_index = NULL;
+}
+
+/**************************************************************//**
+Check if scrubbing is turned ON or OFF */
+static
+bool
+check_scrub_setting(
+/*=====================*/
+ btr_scrub_t* scrub_data) /*!< in: scrub data */
+{
+ if (scrub_data->compressed)
+ return srv_background_scrub_data_compressed;
+ else
+ return srv_background_scrub_data_uncompressed;
+}
+
+#define IBUF_INDEX_ID (DICT_IBUF_ID_MIN + IBUF_SPACE_ID)
+
+/**************************************************************//**
+Check if a page needs scrubbing */
+UNIV_INTERN
+int
+btr_page_needs_scrubbing(
+/*=====================*/
+ btr_scrub_t* scrub_data, /*!< in: scrub data */
+ buf_block_t* block, /*!< in: block to check, latched */
+ btr_scrub_page_allocation_status_t allocated) /*!< in: is block known
+ to be allocated */
+{
+ /**
+ * Check if scrubbing has been turned OFF.
+ *
+ * at start of space, we check if scrubbing is ON or OFF
+ * here we only check if scrubbing is turned OFF.
+ *
+	 * Motivation is that it's only valuable to have a full table (space)
+ * scrubbed.
+ */
+ if (!check_scrub_setting(scrub_data)) {
+ bool before_value = scrub_data->scrubbing;
+ scrub_data->scrubbing = false;
+
+ if (before_value == true) {
+ /* we toggle scrubbing from on to off */
+ return BTR_SCRUB_TURNED_OFF;
+ }
+ }
+
+ if (scrub_data->scrubbing == false) {
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ uint type = fil_page_get_type(page);
+
+ if (allocated == BTR_SCRUB_PAGE_ALLOCATED) {
+ if (type != FIL_PAGE_INDEX) {
+ /* this function is called from fil-crypt-threads.
+ * these threads iterate all pages of all tablespaces
+ * and don't know about fil_page_type.
+ * But scrubbing is only needed for index-pages. */
+
+ /**
+ * NOTE: scrubbing is also needed for UNDO pages,
+ * but they are scrubbed at purge-time, since they are
+ * uncompressed
+ */
+
+ /* if encountering page type not needing scrubbing
+ release reference to table object */
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ if (page_has_garbage(page) == false) {
+ /* no garbage (from deleted/shrunken records) */
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ } else if (allocated == BTR_SCRUB_PAGE_FREE ||
+ allocated == BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN) {
+
+ if (! (type == FIL_PAGE_INDEX ||
+ type == FIL_PAGE_TYPE_BLOB ||
+ type == FIL_PAGE_TYPE_ZBLOB ||
+ type == FIL_PAGE_TYPE_ZBLOB2)) {
+
+ /**
+ * If this is a dropped page, we also need to scrub
+ * BLOB pages
+ */
+
+ /* if encountering page type not needing scrubbing
+ release reference to table object */
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+ }
+
+ if (btr_page_get_index_id(page) == IBUF_INDEX_ID) {
+ /* skip ibuf */
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ return BTR_SCRUB_PAGE;
+}
+
+/****************************************************************
+Handle a skipped page
+*/
+UNIV_INTERN
+void
+btr_scrub_skip_page(
+/*==================*/
+ btr_scrub_t* scrub_data, /*!< in: data with scrub state */
+ int needs_scrubbing) /*!< in: return code from
+ btr_page_needs_scrubbing */
+{
+ switch(needs_scrubbing) {
+ case BTR_SCRUB_SKIP_PAGE:
+ /* nothing to do */
+ return;
+ case BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE:
+ btr_scrub_table_close_for_thread(scrub_data);
+ return;
+ case BTR_SCRUB_TURNED_OFF:
+ case BTR_SCRUB_SKIP_PAGE_AND_COMPLETE_SPACE:
+ btr_scrub_complete_space(scrub_data);
+ return;
+ }
+
+ /* unknown value. should not happen */
+ ut_a(0);
+}
+
+/****************************************************************
+Try to scrub a page using btr_page_reorganize_low.
+Return DB_SUCCESS on success or DB_OVERFLOW on failure */
+static
+dberr_t
+btr_optimistic_scrub(
+/*==================*/
+ btr_scrub_t* scrub_data, /*!< in: data with scrub state */
+ buf_block_t* block, /*!< in: block to scrub */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+ if (srv_scrub_force_testing &&
+ page_get_n_recs(buf_block_get_frame(block)) > 2 &&
+ (rand() % 100) < test_pessimistic_scrub_pct) {
+
+ fprintf(stderr,
+ "scrub: simulate btr_page_reorganize failed %lu:%lu "
+ " table: %llu:%s index: %llu:%s get_n_recs(): %lu\n",
+ buf_block_get_space(block),
+ buf_block_get_page_no(block),
+ (ulonglong)scrub_data->current_table->id,
+ scrub_data->current_table->name,
+ (ulonglong)scrub_data->current_index->id,
+ scrub_data->current_index->name,
+ page_get_n_recs(buf_block_get_frame(block)));
+ return DB_OVERFLOW;
+ }
+#endif
+
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+ bool recovery = false;
+ if (!btr_page_reorganize_low(recovery, scrub_compression_level,
+ &cur, index, mtr)) {
+ return DB_OVERFLOW;
+ }
+
+ /* We play it safe and reset the free bits */
+ if (!dict_index_is_clust(index) &&
+ page_is_leaf(buf_block_get_frame(block))) {
+
+ ibuf_reset_free_bits(block);
+ }
+
+ scrub_data->scrub_stat.page_reorganizations++;
+ return DB_SUCCESS;
+}
+
+/****************************************************************
+Try to scrub a page by splitting it.
+Return DB_SUCCESS on success,
+DB_UNDERFLOW if the page has too few records, or
+DB_OUT_OF_FILE_SPACE if space for the split cannot be reserved */
+static
+dberr_t
+btr_pessimistic_scrub(
+/*==================*/
+ btr_scrub_t* scrub_data, /*!< in: data with scrub state */
+ buf_block_t* block, /*!< in: block to scrub */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+ if (page_get_n_recs(page) < 2) {
+ /**
+ * There is no way we can split a page with < 2 records
+ */
+ log_scrub_failure(scrub_data, block, DB_UNDERFLOW);
+ return DB_UNDERFLOW;
+ }
+
+ /**
+ * Splitting a page needs new space; reserve it here
+ * so that the split will not fail for lack of space */
+ ulint n_extents = 3;
+ ulint n_reserved = 0;
+ if (!fsp_reserve_free_extents(&n_reserved, index->space,
+ n_extents, FSP_NORMAL, mtr)) {
+ log_scrub_failure(scrub_data, block,
+ DB_OUT_OF_FILE_SPACE);
+ return DB_OUT_OF_FILE_SPACE;
+ }
+
+ /* read block variables */
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+ ulint zip_size = buf_block_get_zip_size(block);
+ ulint left_page_no = btr_page_get_prev(page, mtr);
+ ulint right_page_no = btr_page_get_next(page, mtr);
+
+ /**
+ * When splitting a page we need X-latches on the left/right siblings;
+ * see e.g. btr_cur_latch_leaves
+ */
+
+ if (left_page_no != FIL_NULL) {
+ /**
+ * Pages need to be latched left-to-right, so release the block
+ * and re-latch it. We still hold an X-lock on the index,
+ * so this should be safe
+ */
+ mtr_release_buf_page_at_savepoint(mtr, scrub_data->savepoint,
+ block);
+
+ buf_block_t* get_block = btr_block_get(
+ space, zip_size, left_page_no,
+ RW_X_LATCH, index, mtr);
+ get_block->check_index_page_at_flush = TRUE;
+
+ /**
+ * Refetch block and re-initialize page
+ */
+ block = btr_block_get(
+ space, zip_size, page_no,
+ RW_X_LATCH, index, mtr);
+
+ page = buf_block_get_frame(block);
+
+ /**
+ * structure should be unchanged
+ */
+ ut_a(left_page_no == btr_page_get_prev(page, mtr));
+ ut_a(right_page_no == btr_page_get_next(page, mtr));
+ }
+
+ if (right_page_no != FIL_NULL) {
+ buf_block_t* get_block = btr_block_get(
+ space, zip_size, right_page_no,
+ RW_X_LATCH, index, mtr);
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ /* arguments to btr_page_split_and_insert */
+ mem_heap_t* heap = NULL;
+ dtuple_t* entry = NULL;
+ ulint* offsets = NULL;
+ ulint n_ext = 0;
+ ulint flags = BTR_MODIFY_TREE;
+
+ /**
+ * position a cursor on first record on page
+ */
+ rec_t* rec = page_rec_get_next(page_get_infimum_rec(page));
+ btr_cur_t cursor;
+ btr_cur_position(index, rec, block, &cursor);
+
+ /**
+ * call split page with NULL as argument for entry to insert
+ */
+ if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
+ /* The page is the root page
+ * NOTE: ibuf_reset_free_bits is called inside
+ * btr_root_raise_and_insert */
+ rec = btr_root_raise_and_insert(
+ flags, &cursor, &offsets, &heap, entry, n_ext, mtr);
+ } else {
+ /* We play it safe and reset the free bits.
+ * NOTE: this needs to be called prior to btr_page_split_and_insert */
+ if (!dict_index_is_clust(index) &&
+ page_is_leaf(buf_block_get_frame(block))) {
+
+ ibuf_reset_free_bits(block);
+ }
+
+ rec = btr_page_split_and_insert(
+ flags, &cursor, &offsets, &heap, entry, n_ext, mtr);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ scrub_data->scrub_stat.page_splits++;
+ return DB_SUCCESS;
+}
+
+/****************************************************************
+Locate an index by id within a table.
+Return the index or NULL if not found */
+static
+dict_index_t*
+find_index(
+/*========*/
+ dict_table_t* table, /*!< in: table */
+ index_id_t index_id) /*!< in: index id */
+{
+ if (table != NULL) {
+ dict_index_t* index = dict_table_get_first_index(table);
+ while (index != NULL) {
+ if (index->id == index_id)
+ return index;
+ index = dict_table_get_next_index(index);
+ }
+ }
+
+ return NULL;
+}
+
+/****************************************************************
+Check if table should be scrubbed
+*/
+static
+bool
+btr_scrub_table_needs_scrubbing(
+/*============================*/
+ dict_table_t* table) /*!< in: table */
+{
+ if (table == NULL)
+ return false;
+
+ if (table->stats_bg_flag & BG_STAT_SHOULD_QUIT) {
+ return false;
+ }
+
+ if (table->to_be_dropped) {
+ return false;
+ }
+
+ if (table->corrupted) {
+ return false;
+ }
+
+ return true;
+}
+
+/****************************************************************
+Check if index should be scrubbed
+*/
+static
+bool
+btr_scrub_index_needs_scrubbing(
+/*============================*/
+ dict_index_t* index) /*!< in: index */
+{
+ if (index == NULL)
+ return false;
+
+ if (dict_index_is_ibuf(index)) {
+ return false;
+ }
+
+ if (dict_index_is_online_ddl(index)) {
+ return false;
+ }
+
+ return true;
+}
+
+/****************************************************************
+Get the table and index and store them in scrub_data
+*/
+static
+void
+btr_scrub_get_table_and_index(
+/*=========================*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ index_id_t index_id) /*!< in: index id */
+{
+ /* first check if it's an index to current table */
+ scrub_data->current_index = find_index(scrub_data->current_table,
+ index_id);
+
+ if (scrub_data->current_index != NULL) {
+ /* yes it was */
+ return;
+ }
+
+ if (!btr_scrub_lock_dict(scrub_data->space, false)) {
+ btr_scrub_complete_space(scrub_data);
+ return;
+ }
+
+ /* close current table (if any) */
+ if (scrub_data->current_table != NULL) {
+ btr_scrub_table_close(scrub_data->current_table);
+ scrub_data->current_table = NULL;
+ }
+
+ /* argument to dict_table_open_on_index_id */
+ bool dict_locked = true;
+
+ /* open table based on index_id */
+ dict_table_t* table = dict_table_open_on_index_id(
+ index_id,
+ dict_locked);
+
+ if (table != NULL) {
+ /* mark table as being scrubbed */
+ table->stats_bg_flag |= BG_SCRUB_IN_PROGRESS;
+
+ if (!btr_scrub_table_needs_scrubbing(table)) {
+ btr_scrub_table_close(table);
+ btr_scrub_unlock_dict();
+ return;
+ }
+ }
+
+ btr_scrub_unlock_dict();
+ scrub_data->current_table = table;
+ scrub_data->current_index = find_index(table, index_id);
+}
+
+/****************************************************************
+Handle free page */
+UNIV_INTERN
+int
+btr_scrub_free_page(
+/*====================*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ buf_block_t* block, /*!< in: block to scrub */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ // TODO(jonaso): scrub only what is actually needed
+
+ {
+ /* Note: perform both the memset and the setting of FIL_PAGE_TYPE
+ * without logging, so that if we crash before the page is flushed
+ * it will be found by the scrubbing thread again
+ */
+ memset(buf_block_get_frame(block) + PAGE_HEADER, 0,
+ UNIV_PAGE_SIZE - PAGE_HEADER);
+
+ mach_write_to_2(buf_block_get_frame(block) + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_ALLOCATED);
+ }
+
+ ulint compact = 1;
+ page_create(block, mtr, compact);
+
+ mtr_commit(mtr);
+
+ /* page doesn't need further processing => SKIP
+ * and close table/index so that we don't keep references too long */
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+}
+
+/****************************************************************
+Recheck if a page needs scrubbing, and if it does load appropriate
+table and index */
+UNIV_INTERN
+int
+btr_scrub_recheck_page(
+/*====================*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ buf_block_t* block, /*!< in: block */
+ btr_scrub_page_allocation_status_t allocated, /*!< in: is block
+ allocated or free */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* recheck if page needs scrubbing (knowing allocation status) */
+ int needs_scrubbing = btr_page_needs_scrubbing(
+ scrub_data, block, allocated);
+
+ if (needs_scrubbing != BTR_SCRUB_PAGE) {
+ mtr_commit(mtr);
+ return needs_scrubbing;
+ }
+
+ if (allocated == BTR_SCRUB_PAGE_FREE) {
+ /** we don't need to load table/index for free pages
+ * so scrub directly here */
+ /* mtr is committed inside btr_scrub_free_page */
+ return btr_scrub_free_page(scrub_data,
+ block,
+ mtr);
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (scrub_data->current_index == NULL ||
+ scrub_data->current_index->id != index_id) {
+
+ /**
+ * commit mtr (i.e. release locks on block)
+ * and try to get the table and index, potentially
+ * loading them from disk
+ */
+ mtr_commit(mtr);
+ btr_scrub_get_table_and_index(scrub_data, index_id);
+ } else {
+ /* we already have correct index
+ * commit mtr so that we can lock index before fetching page
+ */
+ mtr_commit(mtr);
+ }
+
+ /* check if table is about to be dropped */
+ if (!btr_scrub_table_needs_scrubbing(scrub_data->current_table)) {
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ /* check if index is scrubbable */
+ if (!btr_scrub_index_needs_scrubbing(scrub_data->current_index)) {
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ mtr_start(mtr);
+ mtr_x_lock(dict_index_get_lock(scrub_data->current_index), mtr);
+ /** set savepoint for X-latch of block */
+ scrub_data->savepoint = mtr_set_savepoint(mtr);
+ return BTR_SCRUB_PAGE;
+}
+
+/****************************************************************
+Perform actual scrubbing of page */
+UNIV_INTERN
+int
+btr_scrub_page(
+/*============*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ buf_block_t* block, /*!< in: block */
+ btr_scrub_page_allocation_status_t allocated, /*!< in: is block
+ allocated or free */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* recheck if page needs scrubbing (knowing allocation status) */
+ int needs_scrubbing = btr_page_needs_scrubbing(
+ scrub_data, block, allocated);
+ if (needs_scrubbing != BTR_SCRUB_PAGE) {
+ mtr_commit(mtr);
+ return needs_scrubbing;
+ }
+
+ if (allocated == BTR_SCRUB_PAGE_FREE) {
+ /* mtr is committed inside btr_scrub_free_page */
+ return btr_scrub_free_page(scrub_data,
+ block,
+ mtr);
+ }
+
+ /* check that table/index still match now that they are loaded */
+
+ if (scrub_data->current_table->space != scrub_data->space) {
+ /* this is truncate table */
+ mtr_commit(mtr);
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ if (scrub_data->current_index->space != scrub_data->space) {
+ /* this is truncate table */
+ mtr_commit(mtr);
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ if (scrub_data->current_index->page == FIL_NULL) {
+ /* this is truncate table */
+ mtr_commit(mtr);
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ if (btr_page_get_index_id(buf_block_get_frame(block)) !=
+ scrub_data->current_index->id) {
+ /* page has been reallocated to new index */
+ mtr_commit(mtr);
+ return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
+ }
+
+ /* check if we can scrub (reorganize) the page without overflow */
+ if (btr_optimistic_scrub(scrub_data,
+ block,
+ scrub_data->current_index,
+ mtr) != DB_SUCCESS) {
+
+ /**
+ * Can't reorganize page...need to split it
+ */
+ btr_pessimistic_scrub(scrub_data,
+ block,
+ scrub_data->current_index,
+ mtr);
+ }
+ mtr_commit(mtr);
+
+ return BTR_SCRUB_SKIP_PAGE; // no further action needed
+}
+
+/**************************************************************//**
+Start iterating a space */
+UNIV_INTERN
+bool
+btr_scrub_start_space(
+/*===================*/
+ ulint space, /*!< in: space */
+ btr_scrub_t* scrub_data) /*!< in/out: scrub data */
+{
+ scrub_data->space = space;
+ scrub_data->current_table = NULL;
+ scrub_data->current_index = NULL;
+
+ scrub_data->compressed = fil_space_get_zip_size(space) > 0;
+ scrub_data->scrubbing = check_scrub_setting(scrub_data);
+ return scrub_data->scrubbing;
+}
+
+/***********************************************************************
+Update global statistics with thread statistics */
+static
+void
+btr_scrub_update_total_stat(btr_scrub_t *scrub_data)
+{
+ mutex_enter(&scrub_stat_mutex);
+ scrub_stat.page_reorganizations +=
+ scrub_data->scrub_stat.page_reorganizations;
+ scrub_stat.page_splits +=
+ scrub_data->scrub_stat.page_splits;
+ scrub_stat.page_split_failures_underflow +=
+ scrub_data->scrub_stat.page_split_failures_underflow;
+ scrub_stat.page_split_failures_out_of_filespace +=
+ scrub_data->scrub_stat.page_split_failures_out_of_filespace;
+ scrub_stat.page_split_failures_missing_index +=
+ scrub_data->scrub_stat.page_split_failures_missing_index;
+ scrub_stat.page_split_failures_unknown +=
+ scrub_data->scrub_stat.page_split_failures_unknown;
+ mutex_exit(&scrub_stat_mutex);
+
+ // clear stat
+ memset(&scrub_data->scrub_stat, 0, sizeof(scrub_data->scrub_stat));
+}
+
+/**************************************************************//**
+Complete iterating a space */
+UNIV_INTERN
+bool
+btr_scrub_complete_space(
+/*=====================*/
+ btr_scrub_t* scrub_data) /*!< in/out: scrub data */
+{
+ btr_scrub_table_close_for_thread(scrub_data);
+ btr_scrub_update_total_stat(scrub_data);
+ return scrub_data->scrubbing;
+}
+
+/*********************************************************************
+Return scrub statistics */
+void
+btr_scrub_total_stat(btr_scrub_stat_t *stat)
+{
+ mutex_enter(&scrub_stat_mutex);
+ *stat = scrub_stat;
+ mutex_exit(&scrub_stat_mutex);
+}
+
+/*********************************************************************
+Init global variables */
+UNIV_INTERN
+void
+btr_scrub_init()
+{
+ mutex_create(scrub_stat_mutex_key,
+ &scrub_stat_mutex, SYNC_NO_ORDER_CHECK);
+
+ memset(&scrub_stat, 0, sizeof(scrub_stat));
+}
+
+/*********************************************************************
+Cleanup globals */
+UNIV_INTERN
+void
+btr_scrub_cleanup()
+{
+ mutex_free(&scrub_stat_mutex);
+}
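
The entry points above are meant to be driven page by page from the background fil-crypt threads: btr_scrub_start_space() once per tablespace, btr_page_needs_scrubbing() as a cheap filter while the block latch is held, btr_scrub_recheck_page() and btr_scrub_page() for the actual work, btr_scrub_skip_page() for every skip code, and btr_scrub_complete_space() at the end. What follows is only a minimal sketch of such a caller, not code from this patch: the scrub_fetch_page() helper and the page-count loop are invented for illustration.

#include "btr0scrub.h"
#include "mtr0mtr.h"

/* Assumed helper (not part of the patch): fetch page (space, offset)
X-latched into mtr, the way the fil-crypt threads latch each page. */
buf_block_t* scrub_fetch_page(ulint space, ulint offset, mtr_t* mtr);

static
void
scrub_one_space(ulint space, ulint n_pages, btr_scrub_t* scrub_data)
{
	if (!btr_scrub_start_space(space, scrub_data)) {
		return;	/* scrubbing is OFF for this kind of space */
	}

	for (ulint offset = 0; offset < n_pages; offset++) {
		mtr_t	mtr;
		mtr_start(&mtr);

		buf_block_t* block = scrub_fetch_page(space, offset, &mtr);

		/* cheap check while holding only the block latch */
		int ret = btr_page_needs_scrubbing(
			scrub_data, block,
			BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN);

		if (ret == BTR_SCRUB_PAGE) {
			/* commits mtr, loads table/index and, if still
			needed, restarts mtr with the index X-locked */
			ret = btr_scrub_recheck_page(
				scrub_data, block,
				BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN, &mtr);

			if (ret == BTR_SCRUB_PAGE) {
				block = scrub_fetch_page(space, offset, &mtr);
				ret = btr_scrub_page(
					scrub_data, block,
					BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN,
					&mtr);
			}
		} else {
			mtr_commit(&mtr);
		}

		/* closes the table / completes the space as requested */
		btr_scrub_skip_page(scrub_data, ret);
	}

	btr_scrub_complete_space(scrub_data);
}
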
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index e5800ef30c0..12115fde7f4 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -39,6 +40,7 @@ Created 11/5/1995 Heikki Tuuri
#include "mem0mem.h"
#include "btr0btr.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#ifndef UNIV_HOTBACKUP
#include "buf0buddy.h"
#include "lock0lock.h"
@@ -53,6 +55,13 @@ Created 11/5/1995 Heikki Tuuri
#include "page0zip.h"
#include "srv0mon.h"
#include "buf0checksum.h"
+#include "fil0pagecompress.h"
+#include "ut0byte.h"
+#include <new>
+
+#ifdef HAVE_LZO
+#include "lzo/lzo1x.h"
+#endif
/*
IMPLEMENTATION OF THE BUFFER POOL
@@ -572,10 +581,11 @@ buf_page_is_corrupted(
ulint zip_size) /*!< in: size of compressed page;
0 for uncompressed pages */
{
+ ulint page_encrypted = fil_page_is_encrypted(read_buf);
ulint checksum_field1;
ulint checksum_field2;
- if (!zip_size
+ if (!page_encrypted && !zip_size
&& memcmp(read_buf + FIL_PAGE_LSN + 4,
read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
@@ -628,6 +638,9 @@ buf_page_is_corrupted(
if (zip_size) {
return(!page_zip_verify_checksum(read_buf, zip_size));
}
+ if (page_encrypted) {
+ return (FALSE);
+ }
checksum_field1 = mach_read_from_4(
read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
@@ -878,6 +891,11 @@ buf_page_print(
mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
mach_read_from_4(read_buf
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+
+ ulint page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);
+
+ fprintf(stderr, "InnoDB: page type %lu meaning %s\n", page_type,
+ fil_get_page_type_name(page_type));
}
#ifndef UNIV_HOTBACKUP
@@ -1031,8 +1049,11 @@ buf_block_init(
block->page.state = BUF_BLOCK_NOT_USED;
block->page.buf_fix_count = 0;
block->page.io_fix = BUF_IO_NONE;
-
+ block->page.key_version = 0;
+ block->page.real_size = 0;
+ block->page.write_size = 0;
block->modify_clock = 0;
+ block->page.slot = NULL;
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
block->page.file_page_was_freed = FALSE;
@@ -1372,6 +1393,25 @@ buf_pool_init_instance(
buf_pool->try_LRU_scan = TRUE;
+ /* Initialize the hazard pointer for flush_list batches */
+ new(&buf_pool->flush_hp)
+ FlushHp(buf_pool, &buf_pool->flush_list_mutex);
+
+ /* Initialize the hazard pointer for LRU batches */
+ new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex);
+
+ /* Initialize the iterator for LRU scan search */
+ new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
+
+ /* Initialize the iterator for single page scan search */
+ new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);
+
+ /* Initialize the temporary memory array and slots */
+ buf_pool->tmp_arr = (buf_tmp_array_t *)mem_zalloc(sizeof(buf_tmp_array_t));
+ ulint n_slots = srv_n_read_io_threads * srv_n_write_io_threads * (8 * OS_AIO_N_PENDING_IOS_PER_THREAD);
+ buf_pool->tmp_arr->n_slots = n_slots;
+ buf_pool->tmp_arr->slots = (buf_tmp_buffer_t*)mem_zalloc(sizeof(buf_tmp_buffer_t) * n_slots);
+
buf_pool_mutex_exit(buf_pool);
return(DB_SUCCESS);
@@ -1423,6 +1463,32 @@ buf_pool_free_instance(
ha_clear(buf_pool->page_hash);
hash_table_free(buf_pool->page_hash);
hash_table_free(buf_pool->zip_hash);
+
+ /* Free all used temporary slots */
+ if (buf_pool->tmp_arr) {
+ for(ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
+ buf_tmp_buffer_t* slot = &(buf_pool->tmp_arr->slots[i]);
+#ifdef HAVE_LZO
+ if (slot && slot->lzo_mem) {
+ ut_free(slot->lzo_mem);
+ slot->lzo_mem = NULL;
+ }
+#endif
+ if (slot && slot->crypt_buf_free) {
+ ut_free(slot->crypt_buf_free);
+ slot->crypt_buf_free = NULL;
+ }
+
+ if (slot && slot->comp_buf_free) {
+ ut_free(slot->comp_buf_free);
+ slot->comp_buf_free = NULL;
+ }
+ }
+
+ mem_free(buf_pool->tmp_arr->slots);
+ mem_free(buf_pool->tmp_arr);
+ buf_pool->tmp_arr = NULL;
+ }
}
/********************************************************************//**
@@ -1462,6 +1528,8 @@ buf_pool_init(
btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+ buf_flush_event = os_event_create();
+
return(DB_SUCCESS);
}
@@ -1578,6 +1646,10 @@ buf_relocate(
memcpy(dpage, bpage, sizeof *dpage);
+ /* Important that we adjust the hazard pointer before
+ removing bpage from LRU list. */
+ buf_LRU_adjust_hp(buf_pool, bpage);
+
ut_d(bpage->in_LRU_list = FALSE);
ut_d(bpage->in_page_hash = FALSE);
@@ -1616,6 +1688,84 @@ buf_relocate(
HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
}
+/** Hazard Pointer implementation. */
+
+/** Set current value
+@param bpage buffer block to be set as hp */
+void
+HazardPointer::set(buf_page_t* bpage)
+{
+ ut_ad(mutex_own(m_mutex));
+ ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
+ ut_ad(!bpage || buf_page_in_file(bpage));
+
+ m_hp = bpage;
+}
+
+/** Checks if a bpage is the hp
+@param bpage buffer block to be compared
+@return true if it is hp */
+
+bool
+HazardPointer::is_hp(const buf_page_t* bpage)
+{
+ ut_ad(mutex_own(m_mutex));
+ ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool);
+ ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
+
+ return(bpage == m_hp);
+}
+
+/** Adjust the value of hp. This happens when some other thread working
+on the same list attempts to remove the hp from the list.
+@param bpage buffer block to be compared */
+
+void
+FlushHp::adjust(const buf_page_t* bpage)
+{
+ ut_ad(bpage != NULL);
+
+ /** We only support reverse traversal for now. */
+ if (is_hp(bpage)) {
+ m_hp = UT_LIST_GET_PREV(list, m_hp);
+ }
+
+ ut_ad(!m_hp || m_hp->in_flush_list);
+}
+
+/** Adjust the value of hp. This happens when some other thread working
+on the same list attempts to remove the hp from the list.
+@param bpage buffer block to be compared */
+
+void
+LRUHp::adjust(const buf_page_t* bpage)
+{
+ ut_ad(bpage);
+
+ /** We only support reverse traversal for now. */
+ if (is_hp(bpage)) {
+ m_hp = UT_LIST_GET_PREV(LRU, m_hp);
+ }
+
+ ut_ad(!m_hp || m_hp->in_LRU_list);
+}
+
+/** Selects from where to start a scan. If we have scanned too deep into
+the LRU list it resets the value to the tail of the LRU list.
+@return buf_page_t from where to start scan. */
+
+buf_page_t*
+LRUItr::start()
+{
+ ut_ad(mutex_own(m_mutex));
+
+ if (!m_hp || m_hp->old) {
+ m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU);
+ }
+
+ return(m_hp);
+}
+
/********************************************************************//**
Determine if a block is a sentinel for a buffer pool watch.
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
@@ -2354,13 +2504,26 @@ buf_block_align_instance(
ut_ad(page_get_page_no(page_align(ptr))
== 0xffffffff);
break;
- case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_FILE_PAGE: {
+ ulint space = page_get_space_id(page_align(ptr));
+ ulint offset = page_get_page_no(page_align(ptr));
+
+ if (block->page.space != space ||
+ block->page.offset != offset) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Block space_id %lu != page space_id %lu or "
+ "Block offset %lu != page offset %lu",
+ (ulint)block->page.space, space,
+ (ulint)block->page.offset, offset);
+ }
+
ut_ad(block->page.space
- == page_get_space_id(page_align(ptr)));
+ == page_get_space_id(page_align(ptr)));
ut_ad(block->page.offset
== page_get_page_no(page_align(ptr)));
break;
}
+ }
mutex_exit(&block->mutex);
#endif /* UNIV_DEBUG */
@@ -3316,11 +3479,13 @@ page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for using when holding the lock_sys_t::mutex.
@return pointer to a page or NULL */
UNIV_INTERN
-const buf_block_t*
+buf_block_t*
buf_page_try_get_func(
/*==================*/
ulint space_id,/*!< in: tablespace id */
ulint page_no,/*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ bool possibly_freed,/*!< in: if true, do not assert that the page was not freed */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mini-transaction */
@@ -3358,8 +3523,12 @@ buf_page_try_get_func(
buf_block_buf_fix_inc(block, file, line);
mutex_exit(&block->mutex);
- fix_type = MTR_MEMO_PAGE_S_FIX;
- success = rw_lock_s_lock_nowait(&block->lock, file, line);
+ if (rw_latch == RW_S_LATCH) {
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+ } else {
+ success = false;
+ }
if (!success) {
/* Let us try to get an X-latch. If the current thread
@@ -3384,9 +3553,11 @@ buf_page_try_get_func(
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
- mutex_enter(&block->mutex);
- ut_a(!block->page.file_page_was_freed);
- mutex_exit(&block->mutex);
+ if (!possibly_freed) {
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+ }
#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
@@ -3415,6 +3586,11 @@ buf_page_init_low(
bpage->access_time = 0;
bpage->newest_modification = 0;
bpage->oldest_modification = 0;
+ bpage->write_size = 0;
+ bpage->key_version = 0;
+ bpage->real_size = 0;
+ bpage->slot = NULL;
+
HASH_INVALIDATE(bpage, hash);
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
bpage->file_page_was_freed = FALSE;
@@ -3713,6 +3889,8 @@ err_exit:
page_zip_set_size(&bpage->zip, zip_size);
bpage->zip.data = (page_zip_t*) data;
+ bpage->slot = NULL;
+
mutex_enter(&buf_pool->zip_mutex);
UNIV_MEM_DESC(bpage->zip.data,
page_zip_get_size(&bpage->zip));
@@ -3928,7 +4106,7 @@ buf_page_create(
Then InnoDB could in a crash recovery print a big, false, corruption
warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
- memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(++buf_dbg_counter % 5771 || buf_validate());
@@ -4102,12 +4280,16 @@ UNIV_INTERN
bool
buf_page_io_complete(
/*=================*/
- buf_page_t* bpage) /*!< in: pointer to the block in question */
+ buf_page_t* bpage, /*!< in: pointer to the block in question */
+ bool evict) /*!< in: whether or not to evict the page
+ from LRU list. */
+
{
enum buf_io_fix io_type;
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
const ibool uncompressed = (buf_page_get_state(bpage)
== BUF_BLOCK_FILE_PAGE);
+ fil_space_t* space = NULL;
ut_a(buf_page_in_file(bpage));
@@ -4125,6 +4307,16 @@ buf_page_io_complete(
ulint read_space_id;
byte* frame;
+ if (!buf_page_decrypt_after_read(bpage)) {
+ /* encryption error! */
+ if (buf_page_get_zip_size(bpage)) {
+ frame = bpage->zip.data;
+ } else {
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+ goto corrupt;
+ }
+
if (buf_page_get_zip_size(bpage)) {
frame = bpage->zip.data;
buf_pool->n_pend_unzip++;
@@ -4196,12 +4388,18 @@ buf_page_io_complete(
goto page_not_corrupt;
;);
corrupt:
+ fil_system_enter();
+ space = fil_space_get_by_id(bpage->space);
+ fil_system_exit();
+
fprintf(stderr,
"InnoDB: Database page corruption on disk"
" or a failed\n"
- "InnoDB: file read of page %lu.\n"
+ "InnoDB: space %lu file %s read of page %lu.\n"
"InnoDB: You may have to recover"
" from a backup.\n",
+ (ulint)bpage->space,
+ space ? space->name : "NULL",
(ulong) bpage->offset);
buf_page_print(frame, buf_page_get_zip_size(bpage),
BUF_PAGE_PRINT_NO_CRASH);
@@ -4265,6 +4463,13 @@ corrupt:
bpage->offset, buf_page_get_zip_size(bpage),
TRUE);
}
+ } else {
+ /* io_type == BUF_IO_WRITE */
+ if (bpage->slot) {
+ /* Mark slot free */
+ bpage->slot->reserved = false;
+ bpage->slot = NULL;
+ }
}
buf_pool_mutex_enter(buf_pool);
@@ -4284,6 +4489,7 @@ corrupt:
id. */
buf_page_set_io_fix(bpage, BUF_IO_NONE);
+ buf_page_monitor(bpage, io_type);
switch (io_type) {
case BUF_IO_READ:
@@ -4300,6 +4506,8 @@ corrupt:
BUF_IO_READ);
}
+ mutex_exit(buf_page_get_mutex(bpage));
+
break;
case BUF_IO_WRITE:
@@ -4315,14 +4523,30 @@ corrupt:
buf_pool->stat.n_pages_written++;
+ /* In case of flush batches, i.e. BUF_FLUSH_LIST and
+ BUF_FLUSH_LRU, this function is always called from an IO
+ helper thread. In that case we decide whether or not
+ to evict the page based on the flush type; the evict
+ value passed in is then just the default from the
+ function definition, which is false.
+ We always evict in case of an LRU batch and never evict
+ in case of a flush list batch. For a single page flush
+ the caller sets the appropriate value. */
+ if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) {
+ evict = true;
+ }
+
+ mutex_exit(buf_page_get_mutex(bpage));
+ if (evict) {
+ buf_LRU_free_page(bpage, true);
+ }
+
break;
default:
ut_error;
}
- buf_page_monitor(bpage, io_type);
-
#ifdef UNIV_DEBUG
if (buf_debug_prints) {
fprintf(stderr, "Has %s page space %lu page no %lu\n",
@@ -4332,7 +4556,6 @@ corrupt:
}
#endif /* UNIV_DEBUG */
- mutex_exit(buf_page_get_mutex(bpage));
buf_pool_mutex_exit(buf_pool);
return(true);
@@ -5481,3 +5704,281 @@ buf_page_init_for_backup_restore(
}
}
#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Reserve unused slot from temporary memory array and allocate necessary
+temporary memory if not yet allocated.
+@return reserved slot */
+buf_tmp_buffer_t*
+buf_pool_reserve_tmp_slot(
+/*======================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool where to
+ reserve */
+ bool compressed) /*!< in: is file space compressed */
+{
+ buf_tmp_buffer_t *free_slot=NULL;
+
+ /* Array is protected by buf_pool mutex */
+ buf_pool_mutex_enter(buf_pool);
+
+ for(ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
+ buf_tmp_buffer_t *slot = &buf_pool->tmp_arr->slots[i];
+
+ if(slot->reserved == false) {
+ free_slot = slot;
+ break;
+ }
+ }
+
+ /* We assume that free slot is found */
+ ut_a(free_slot != NULL);
+ free_slot->reserved = true;
+ /* Now that we have reserved this slot we can release
+ buf_pool mutex */
+ buf_pool_mutex_exit(buf_pool);
+
+ /* Allocate temporary memory for encryption/decryption */
+ if (free_slot->crypt_buf_free == NULL) {
+ free_slot->crypt_buf_free = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2));
+ free_slot->crypt_buf = static_cast<byte *>(ut_align(free_slot->crypt_buf_free, UNIV_PAGE_SIZE));
+ memset(free_slot->crypt_buf_free, 0, UNIV_PAGE_SIZE *2);
+ }
+
+ /* For page compressed tables allocate temporary memory for
+ compression/decompression */
+ if (compressed && free_slot->comp_buf_free == NULL) {
+ free_slot->comp_buf_free = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2));
+ free_slot->comp_buf = static_cast<byte *>(ut_align(free_slot->comp_buf_free, UNIV_PAGE_SIZE));
+ memset(free_slot->comp_buf_free, 0, UNIV_PAGE_SIZE *2);
+#ifdef HAVE_LZO
+ free_slot->lzo_mem = static_cast<byte *>(ut_malloc(LZO1X_1_15_MEM_COMPRESS));
+ memset(free_slot->lzo_mem, 0, LZO1X_1_15_MEM_COMPRESS);
+#endif
+ }
+
+ return (free_slot);
+}
+
+/********************************************************************//**
+Encrypts a buffer page right before it's flushed to disk
+*/
+byte*
+buf_page_encrypt_before_write(
+/*==========================*/
+ buf_page_t* bpage, /*!< in/out: buffer page to be flushed */
+ byte* src_frame, /*!< in: src frame */
+ ulint space_id) /*!< in: space id */
+{
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ ulint page_size = (zip_size) ? zip_size : UNIV_PAGE_SIZE;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ bool page_compressed = fil_space_is_page_compressed(bpage->space);
+ bool encrypted = true;
+
+ bpage->real_size = UNIV_PAGE_SIZE;
+
+ fil_page_type_validate(src_frame);
+
+ if (bpage->offset == 0) {
+ /* Page 0 of a tablespace is not encrypted/compressed */
+ ut_ad(bpage->key_version == 0);
+ return src_frame;
+ }
+
+ if (bpage->space == TRX_SYS_SPACE && bpage->offset == TRX_SYS_PAGE_NO) {
+ /* don't encrypt/compress page as it contains address to dblwr buffer */
+ bpage->key_version = 0;
+ return src_frame;
+ }
+
+ if (crypt_data != NULL && crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+ /* Encryption is disabled */
+ encrypted = false;
+ }
+
+ if (!srv_encrypt_tables && (crypt_data == NULL || crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+ /* Encryption is disabled */
+ encrypted = false;
+ }
+
+ /* Is encryption needed? */
+ if (crypt_data == NULL || crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
+ /* An unencrypted table */
+ bpage->key_version = 0;
+ encrypted = false;
+ }
+
+ if (!encrypted && !page_compressed) {
+ /* No need to encrypt or page compress the page */
+ return src_frame;
+ }
+
+ /* Find free slot from temporary memory array */
+ buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed);
+ slot->out_buf = NULL;
+ bpage->slot = slot;
+
+ byte *dst_frame = slot->crypt_buf;
+
+ if (!page_compressed) {
+ /* Encrypt page content */
+ byte* tmp = fil_space_encrypt(bpage->space,
+ bpage->offset,
+ bpage->newest_modification,
+ src_frame,
+ zip_size,
+ dst_frame);
+
+ unsigned key_version =
+ mach_read_from_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ ut_ad(key_version == 0 || key_version >= bpage->key_version);
+ bpage->key_version = key_version;
+ bpage->real_size = page_size;
+ slot->out_buf = dst_frame = tmp;
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(tmp);
+#endif
+
+ } else {
+ /* First we compress the page content */
+ ulint out_len = 0;
+ ulint block_size = fil_space_get_block_size(bpage->space, bpage->offset, page_size);
+
+ byte *tmp = fil_compress_page(bpage->space,
+ (byte *)src_frame,
+ slot->comp_buf,
+ page_size,
+ fil_space_get_page_compression_level(bpage->space),
+ block_size,
+ encrypted,
+ &out_len,
+ IF_LZO(slot->lzo_mem, NULL)
+ );
+
+ bpage->real_size = out_len;
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(tmp);
+#endif
+
+ if(encrypted) {
+
+ /* And then we encrypt the page content */
+ tmp = fil_space_encrypt(bpage->space,
+ bpage->offset,
+ bpage->newest_modification,
+ tmp,
+ zip_size,
+ dst_frame);
+ }
+
+ slot->out_buf = dst_frame = tmp;
+ }
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+
+ // return dst_frame which will be written
+ return dst_frame;
+}
+
+/********************************************************************//**
+Decrypt page after it has been read from disk
+*/
+ibool
+buf_page_decrypt_after_read(
+/*========================*/
+ buf_page_t* bpage) /*!< in/out: buffer page read from disk */
+{
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ ulint size = (zip_size) ? zip_size : UNIV_PAGE_SIZE;
+
+ byte* dst_frame = (zip_size) ? bpage->zip.data :
+ ((buf_block_t*) bpage)->frame;
+ unsigned key_version =
+ mach_read_from_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ bool page_compressed = fil_page_is_compressed(dst_frame);
+ bool page_compressed_encrypted = fil_page_is_compressed_encrypted(dst_frame);
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(bpage->key_version == 0);
+
+ if (bpage->offset == 0) {
+ /* File header pages are not encrypted/compressed */
+ return (TRUE);
+ }
+
+ if (page_compressed) {
+ /* the page we read is unencrypted */
+ /* Find free slot from temporary memory array */
+ buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed);
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+
+ /* decompress using comp_buf to dst_frame */
+ fil_decompress_page(slot->comp_buf,
+ dst_frame,
+ size,
+ &bpage->write_size);
+
+ /* Mark this slot as free */
+ slot->reserved = false;
+ key_version = 0;
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+ } else {
+ buf_tmp_buffer_t* slot = NULL;
+
+ if (key_version) {
+ /* Find free slot from temporary memory array */
+ slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed);
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+ /* decrypt using crypt_buf to dst_frame */
+ fil_space_decrypt(bpage->space,
+ slot->crypt_buf,
+ size,
+ dst_frame);
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+ }
+
+ if (page_compressed_encrypted) {
+ if (!slot) {
+ slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed);
+ }
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+ /* decompress using comp_buf to dst_frame */
+ fil_decompress_page(slot->comp_buf,
+ dst_frame,
+ size,
+ &bpage->write_size);
+ }
+
+#ifdef UNIV_DEBUG
+ fil_page_type_validate(dst_frame);
+#endif
+
+ /* Mark this slot as free */
+ if (slot) {
+ slot->reserved = false;
+ }
+ }
+
+ bpage->key_version = key_version;
+
+ return (TRUE);
+}
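
buf_pool_reserve_tmp_slot() above hands out a per-I/O scratch buffer: the write path remembers the slot in bpage->slot and buf_page_io_complete() clears the reservation when the write finishes, while the read path releases the slot inside buf_page_decrypt_after_read() itself. A minimal sketch of that reserve/use/release contract follows; the function name and the plain memcpy standing in for fil_space_encrypt()/fil_compress_page() are illustrative assumptions, not part of the patch.

/* Sketch only: assumes an initialized buf_pool instance. */
static
void
tmp_slot_contract_example(buf_pool_t* buf_pool, const byte* src_frame)
{
	/* Reserve a slot under the buf_pool mutex; crypt_buf is
	allocated lazily on first use and is an UNIV_PAGE_SIZE-aligned
	buffer of UNIV_PAGE_SIZE bytes. */
	buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(
		buf_pool, /* compressed = */ false);

	/* Stand-in for the real encrypt/compress step: produce the
	output frame in the scratch buffer. */
	memcpy(slot->crypt_buf, src_frame, UNIV_PAGE_SIZE);

	/* ... the scratch frame would now be handed to fil_io() ... */

	/* The reservation must be cleared once the I/O that used the
	buffer has completed; compare buf_page_io_complete() for writes
	and buf_page_decrypt_after_read() for reads. */
	slot->reserved = false;
}
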
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
index f95eba39ab4..4101d117896 100644
--- a/storage/innobase/buf/buf0checksum.cc
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -64,7 +64,8 @@ buf_calc_page_crc32(
there we store the old formula checksum. */
checksum = ut_crc32(page + FIL_PAGE_OFFSET,
- FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
^ ut_crc32(page + FIL_PAGE_DATA,
UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN_OLD_CHKSUM);
@@ -94,7 +95,8 @@ buf_calc_page_new_checksum(
there we store the old formula checksum. */
checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
- FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ - FIL_PAGE_OFFSET)
+ ut_fold_binary(page + FIL_PAGE_DATA,
UNIV_PAGE_SIZE - FIL_PAGE_DATA
- FIL_PAGE_END_LSN_OLD_CHKSUM);
@@ -119,7 +121,7 @@ buf_calc_page_old_checksum(
{
ulint checksum;
- checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
checksum = checksum & 0xFFFFFFFFUL;
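
The renamed constant does not change what either checksum covers: the 8 bytes at FIL_PAGE_FILE_FLUSH_LSN (byte offset 26 of the page header) and the 8-byte trailer were never included. With the standard header offsets (FIL_PAGE_OFFSET = 4, FIL_PAGE_DATA = 38), the crc32 above is in effect ut_crc32(page + 4, 22) ^ ut_crc32(page + 38, UNIV_PAGE_SIZE - 46). The rename to FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION simply records that this excluded field now carries the encryption key version; encrypted pages are instead validated with fil_space_verify_crypt_checksum(), as used in buf0dblwr.cc below.
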
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index 62222993622..35188b8ba16 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,6 +36,7 @@ Created 2011/12/19
#include "srv0srv.h"
#include "page0zip.h"
#include "trx0sys.h"
+#include "fil0crypt.h"
#ifndef UNIV_HOTBACKUP
@@ -49,6 +51,8 @@ UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL;
/** Set to TRUE when the doublewrite buffer is being created */
UNIV_INTERN ibool buf_dblwr_being_created = FALSE;
+#define TRX_SYS_DOUBLEWRITE_BLOCKS 2
+
/****************************************************************//**
Determines if a page number is located inside the doublewrite buffer.
@return TRUE if the location is inside the two blocks of the
@@ -135,7 +139,7 @@ buf_dblwr_init(
/* There are two blocks of same size in the doublewrite
buffer. */
- buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ buf_size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
/* There must be atleast one buffer for single page writes
and one buffer for batch writes. */
@@ -215,7 +219,7 @@ start_again:
"Doublewrite buffer not found: creating new");
if (buf_pool_get_curr_size()
- < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ < ((TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ FSP_EXTENT_SIZE / 2 + 100)
* UNIV_PAGE_SIZE)) {
@@ -251,7 +255,7 @@ start_again:
fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
prev_page_no = 0;
- for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ FSP_EXTENT_SIZE / 2; i++) {
new_block = fseg_alloc_free_page(
fseg_header, prev_page_no + 1, FSP_UP, &mtr);
@@ -374,7 +378,7 @@ buf_dblwr_init_or_load_pages(
/* We do the file i/o past the buffer pool */
- unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+ unaligned_read_buf = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
read_buf = static_cast<byte*>(
ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
@@ -386,6 +390,14 @@ buf_dblwr_init_or_load_pages(
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+ if (mach_read_from_4(read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0) {
+ byte* tmp = fil_space_decrypt((ulint)TRX_SYS_SPACE,
+ read_buf + UNIV_PAGE_SIZE,
+ UNIV_PAGE_SIZE, /* page size */
+ read_buf);
+ doublewrite = tmp + TRX_SYS_DOUBLEWRITE;
+ }
+
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
/* The doublewrite buffer has been created */
@@ -428,7 +440,7 @@ buf_dblwr_init_or_load_pages(
page = buf;
- for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * TRX_SYS_DOUBLEWRITE_BLOCKS; i++) {
ulint source_page_no;
@@ -451,7 +463,6 @@ buf_dblwr_init_or_load_pages(
os_file_write(path, file, page,
source_page_no * UNIV_PAGE_SIZE,
UNIV_PAGE_SIZE);
-
} else if (load_corrupt_pages) {
recv_dblwr.add(page);
@@ -511,14 +522,20 @@ buf_dblwr_process()
ulint zip_size = fil_space_get_zip_size(space_id);
/* Read in the actual page from the file */
- fil_io(OS_FILE_READ, true, space_id, zip_size,
- page_no, 0,
- zip_size ? zip_size : UNIV_PAGE_SIZE,
- read_buf, NULL);
-
- /* Check if the page is corrupt */
-
- if (buf_page_is_corrupted(true, read_buf, zip_size)) {
+ fil_io(OS_FILE_READ,
+ true,
+ space_id,
+ zip_size,
+ page_no,
+ 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ read_buf,
+ NULL,
+ 0);
+
+ if (fil_space_verify_crypt_checksum(read_buf, zip_size)) {
+ /* page is encrypted and checksum is OK */
+ } else if (buf_page_is_corrupted(true, read_buf, zip_size)) {
fprintf(stderr,
"InnoDB: Warning: database page"
@@ -529,8 +546,11 @@ buf_dblwr_process()
" the doublewrite buffer.\n",
(ulong) space_id, (ulong) page_no);
- if (buf_page_is_corrupted(true,
- page, zip_size)) {
+ if (fil_space_verify_crypt_checksum(page, zip_size)) {
+ /* the doublewrite buffer page is encrypted and OK */
+ } else if (buf_page_is_corrupted(true,
+ page,
+ zip_size)) {
fprintf(stderr,
"InnoDB: Dump of the page:\n");
buf_page_print(
@@ -563,10 +583,16 @@ buf_dblwr_process()
doublewrite buffer to the intended
position */
- fil_io(OS_FILE_WRITE, true, space_id,
- zip_size, page_no, 0,
- zip_size ? zip_size : UNIV_PAGE_SIZE,
- page, NULL);
+ fil_io(OS_FILE_WRITE,
+ true,
+ space_id,
+ zip_size,
+ page_no,
+ 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page,
+ NULL,
+ 0);
ib_logf(IB_LOG_LEVEL_INFO,
"Recovered the page from"
@@ -582,18 +608,39 @@ buf_dblwr_process()
zeroes, while a valid copy is
available in dblwr buffer. */
- fil_io(OS_FILE_WRITE, true, space_id,
- zip_size, page_no, 0,
- zip_size ? zip_size
- : UNIV_PAGE_SIZE,
- page, NULL);
+ fil_io(OS_FILE_WRITE,
+ true,
+ space_id,
+ zip_size,
+ page_no,
+ 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page,
+ NULL,
+ 0);
}
}
}
}
fil_flush_file_spaces(FIL_TABLESPACE);
- ut_free(unaligned_read_buf);
+
+ {
+ size_t bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+ byte *unaligned_buf = static_cast<byte*>(
+ ut_malloc(bytes + UNIV_PAGE_SIZE - 1));
+
+ byte *buf = static_cast<byte*>(
+ ut_align(unaligned_buf, UNIV_PAGE_SIZE));
+ memset(buf, 0, bytes);
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block1, 0, bytes, buf, NULL, NULL);
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block2, 0, bytes, buf, NULL, NULL);
+
+ ut_free(unaligned_buf);
+ }
}
/****************************************************************//**
@@ -665,7 +712,7 @@ buf_dblwr_update(
break;
case BUF_FLUSH_SINGLE_PAGE:
{
- const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ const ulint size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
ulint i;
mutex_enter(&buf_dblwr->mutex);
for (i = srv_doublewrite_batch_size; i < size; ++i) {
@@ -697,6 +744,14 @@ buf_dblwr_check_page_lsn(
/*=====================*/
const page_t* page) /*!< in: page to check */
{
+ ibool page_compressed = (mach_read_from_2(page+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED);
+ uint key_version = mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ /* Ignore page compressed or encrypted pages */
+ if (page_compressed || key_version) {
+ return;
+ }
+
if (memcmp(page + (FIL_PAGE_LSN + 4),
page + (UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
@@ -792,13 +847,19 @@ buf_dblwr_write_block_to_datafile(
? OS_FILE_WRITE
: OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
+ void * frame = buf_page_get_frame(bpage);
+
if (bpage->zip.data) {
- fil_io(flags, sync, buf_page_get_space(bpage),
- buf_page_get_zip_size(bpage),
- buf_page_get_page_no(bpage), 0,
- buf_page_get_zip_size(bpage),
- (void*) bpage->zip.data,
- (void*) bpage);
+ fil_io(flags,
+ sync,
+ buf_page_get_space(bpage),
+ buf_page_get_zip_size(bpage),
+ buf_page_get_page_no(bpage),
+ 0,
+ buf_page_get_zip_size(bpage),
+ frame,
+ (void*) bpage,
+ 0);
return;
}
@@ -808,10 +869,16 @@ buf_dblwr_write_block_to_datafile(
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
buf_dblwr_check_page_lsn(block->frame);
- fil_io(flags, sync, buf_block_get_space(block), 0,
- buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
- (void*) block->frame, (void*) block);
-
+ fil_io(flags,
+ sync,
+ buf_block_get_space(block),
+ 0,
+ buf_block_get_page_no(block),
+ 0,
+ bpage->real_size,
+ frame,
+ (void*) block,
+ (ulint *)&bpage->write_size);
}
/********************************************************************//**
@@ -905,7 +972,7 @@ try_again:
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
buf_dblwr->block1, 0, len,
- (void*) write_buf, NULL);
+ (void*) write_buf, NULL, 0);
if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
/* No unwritten pages in the second block. */
@@ -921,7 +988,7 @@ try_again:
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
buf_dblwr->block2, 0, len,
- (void*) write_buf, NULL);
+ (void*) write_buf, NULL, 0);
flush:
/* increment the doublewrite flushed pages counter */
@@ -1002,13 +1069,14 @@ try_again:
}
zip_size = buf_page_get_zip_size(bpage);
+ void * frame = buf_page_get_frame(bpage);
if (zip_size) {
UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
/* Copy the compressed page and clear the rest. */
memcpy(buf_dblwr->write_buf
+ UNIV_PAGE_SIZE * buf_dblwr->first_free,
- bpage->zip.data, zip_size);
+ frame, zip_size);
memset(buf_dblwr->write_buf
+ UNIV_PAGE_SIZE * buf_dblwr->first_free
+ zip_size, 0, UNIV_PAGE_SIZE - zip_size);
@@ -1019,7 +1087,7 @@ try_again:
memcpy(buf_dblwr->write_buf
+ UNIV_PAGE_SIZE * buf_dblwr->first_free,
- ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+ frame, UNIV_PAGE_SIZE);
}
buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
@@ -1070,7 +1138,7 @@ buf_dblwr_write_single_page(
/* total number of slots available for single page flushes
starts from srv_doublewrite_batch_size to the end of the
buffer. */
- size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
ut_a(size > srv_doublewrite_batch_size);
n_slots = size - srv_doublewrite_batch_size;
@@ -1141,23 +1209,35 @@ retry:
bytes in the doublewrite page with zeros. */
zip_size = buf_page_get_zip_size(bpage);
+ void * frame = buf_page_get_frame(bpage);
+
if (zip_size) {
memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
- bpage->zip.data, zip_size);
+ frame, zip_size);
memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+ zip_size, 0, UNIV_PAGE_SIZE - zip_size);
- fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
- offset, 0, UNIV_PAGE_SIZE,
- (void*) (buf_dblwr->write_buf
- + UNIV_PAGE_SIZE * i), NULL);
+ fil_io(OS_FILE_WRITE,
+ true,
+ TRX_SYS_SPACE, 0,
+ offset,
+ 0,
+ UNIV_PAGE_SIZE,
+ (void*) (buf_dblwr->write_buf + UNIV_PAGE_SIZE * i),
+ NULL,
+ 0);
} else {
/* It is a regular page. Write it directly to the
doublewrite buffer */
- fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
- offset, 0, UNIV_PAGE_SIZE,
- (void*) ((buf_block_t*) bpage)->frame,
- NULL);
+ fil_io(OS_FILE_WRITE,
+ true,
+ TRX_SYS_SPACE, 0,
+ offset,
+ 0,
+ bpage->real_size,
+ frame,
+ NULL,
+ 0);
}
/* Now flush the doublewrite buffer data to disk */
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index 467f817a2d1..3e3f96280f8 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -123,11 +123,7 @@ buf_dump_status(
sizeof(export_vars.innodb_buffer_pool_dump_status),
fmt, ap);
- if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: %s\n",
- export_vars.innodb_buffer_pool_dump_status);
- }
+ ib_logf((ib_log_level_t) severity, "%s", export_vars.innodb_buffer_pool_dump_status);
va_end(ap);
}
@@ -215,6 +211,8 @@ buf_dump(
buf_dump_t* dump;
ulint n_pages;
ulint j;
+ ulint limit;
+ ulint counter;
buf_pool = buf_pool_from_array(i);
@@ -258,6 +256,9 @@ buf_dump(
buf_pool_mutex_exit(buf_pool);
+ limit = (ulint)((double)n_pages * ((double)srv_buf_dump_status_frequency / (double)100));
+ counter = 0;
+
for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
ret = fprintf(f, ULINTPF "," ULINTPF "\n",
BUF_DUMP_SPACE(dump[j]),
@@ -272,7 +273,14 @@ buf_dump(
return;
}
- if (j % 128 == 0) {
+ counter++;
+
+ /* Print the buffer pool dump status only if
+ srv_buf_dump_status_frequency is > 0 and we have
+ processed 'limit' pages since the last status message. */
+ if (srv_buf_dump_status_frequency &&
+ counter == limit) {
+ counter = 0;
buf_dump_status(
STATUS_INFO,
"Dumping buffer pool "
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index f5145297b3f..d893d424b02 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -1,6 +1,8 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri
#endif
#include "buf0buf.h"
+#include "buf0mtflu.h"
#include "buf0checksum.h"
#include "srv0start.h"
#include "srv0srv.h"
@@ -44,10 +47,12 @@ Created 11/11/1995 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
+#include "os0sync.h"
#include "trx0sys.h"
#include "srv0mon.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
+#include "fil0pagecompress.h"
/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;
@@ -59,14 +64,13 @@ need to protect it by a mutex. It is only ever read by the thread
doing the shutdown */
UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
-/** LRU flush batch is further divided into this chunk size to
-reduce the wait time for the threads waiting for a clean block */
-#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100
-
#ifdef UNIV_PFS_THREAD
UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
#endif /* UNIV_PFS_THREAD */
+/** Event to synchronise with the flushing. */
+ os_event_t buf_flush_event;
+
/** If LRU list of a buf_pool is less than this size then LRU eviction
should not happen. This is because when we do LRU flushing we also put
the blocks on free list. If LRU list is very small then we can end up
@@ -75,15 +79,6 @@ in thrashing. */
/* @} */
-/** Handled page counters for a single flush */
-struct flush_counters_t {
- ulint flushed; /*!< number of dirty pages flushed */
- ulint evicted; /*!< number of clean pages evicted, including
- evicted uncompressed page images */
- ulint unzip_LRU_evicted;/*!< number of uncompressed page images
- evicted */
-};
-
/******************************************************************//**
Increases flush_list size in bytes with zip_size for compressed page,
UNIV_PAGE_SIZE for uncompressed page in inline function */
@@ -139,60 +134,6 @@ buf_flush_validate_skip(
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
-/*******************************************************************//**
-Sets hazard pointer during flush_list iteration. */
-UNIV_INLINE
-void
-buf_flush_set_hp(
-/*=============*/
- buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */
- const buf_page_t* bpage) /*!< in: buffer control block */
-{
- ut_ad(buf_flush_list_mutex_own(buf_pool));
- ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
- ut_ad(!bpage || buf_page_in_file(bpage));
- ut_ad(!bpage || bpage->in_flush_list);
- ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
-
- buf_pool->flush_list_hp = bpage;
-}
-
-/*******************************************************************//**
-Checks if the given block is a hazard pointer
-@return true if bpage is hazard pointer */
-UNIV_INLINE
-bool
-buf_flush_is_hp(
-/*============*/
- buf_pool_t* buf_pool,/*!< in: buffer pool instance */
- const buf_page_t* bpage) /*!< in: buffer control block */
-{
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
- return(buf_pool->flush_list_hp == bpage);
-}
-
-/*******************************************************************//**
-Whenever we move a block in flush_list (either to remove it or to
-relocate it) we check the hazard pointer set by some other thread
-doing the flush list scan. If the hazard pointer is the same as the
-one we are about going to move then we set it to NULL to force a rescan
-in the thread doing the batch. */
-UNIV_INLINE
-void
-buf_flush_update_hp(
-/*================*/
- buf_pool_t* buf_pool, /*!< in: buffer pool instance */
- buf_page_t* bpage) /*!< in: buffer control block */
-{
- ut_ad(buf_flush_list_mutex_own(buf_pool));
-
- if (buf_flush_is_hp(buf_pool, bpage)) {
- buf_flush_set_hp(buf_pool, NULL);
- MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
- }
-}
-
/******************************************************************//**
Insert a block in the flush_rbt and returns a pointer to its
predecessor or NULL if no predecessor. The ordering is maintained
@@ -591,6 +532,10 @@ buf_flush_remove(
buf_flush_list_mutex_enter(buf_pool);
+ /* Important that we adjust the hazard pointer before removing
+ the bpage from flush list. */
+ buf_pool->flush_hp.adjust(bpage);
+
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE:
@@ -631,7 +576,6 @@ buf_flush_remove(
ut_a(buf_flush_validate_skip(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
- buf_flush_update_hp(buf_pool, bpage);
buf_flush_list_mutex_exit(buf_pool);
}
@@ -682,6 +626,10 @@ buf_flush_relocate_on_flush_list(
prev_b = buf_flush_insert_in_flush_rbt(dpage);
}
+ /* Important that we adjust the hazard pointer before removing
+ the bpage from the flush list. */
+ buf_pool->flush_hp.adjust(bpage);
+
/* Must be done after we have removed it from the flush_rbt
because we assert on in_flush_list in comparison function. */
ut_d(bpage->in_flush_list = FALSE);
@@ -710,7 +658,6 @@ buf_flush_relocate_on_flush_list(
ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
- buf_flush_update_hp(buf_pool, bpage);
buf_flush_list_mutex_exit(buf_pool);
}
@@ -732,8 +679,10 @@ buf_flush_write_complete(
flush_type = buf_page_get_flush_type(bpage);
buf_pool->n_flush[flush_type]--;
+#ifdef UNIV_DEBUG
/* fprintf(stderr, "n pending flush %lu\n",
buf_pool->n_flush[flush_type]); */
+#endif
if (buf_pool->n_flush[flush_type] == 0
&& buf_pool->init_flush[flush_type] == FALSE) {
@@ -766,7 +715,7 @@ buf_flush_update_zip_checksum(
srv_checksum_algorithm)));
mach_write_to_8(page + FIL_PAGE_LSN, lsn);
- memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ memset(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
}
@@ -887,6 +836,8 @@ buf_flush_write_block_low(
{
ulint zip_size = buf_page_get_zip_size(bpage);
page_t* frame = NULL;
+ ulint space_id = buf_page_get_space(bpage);
+ atomic_writes_t awrites = fil_space_get_atomic_writes(space_id);
#ifdef UNIV_DEBUG
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
@@ -943,7 +894,7 @@ buf_flush_write_block_low(
mach_write_to_8(frame + FIL_PAGE_LSN,
bpage->newest_modification);
- memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
break;
case BUF_BLOCK_FILE_PAGE:
frame = bpage->zip.data;
@@ -958,17 +909,45 @@ buf_flush_write_block_low(
break;
}
+ frame = buf_page_encrypt_before_write(bpage, frame, space_id);
+
if (!srv_use_doublewrite_buf || !buf_dblwr) {
fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
- sync, buf_page_get_space(bpage), zip_size,
- buf_page_get_page_no(bpage), 0,
- zip_size ? zip_size : UNIV_PAGE_SIZE,
- frame, bpage);
- } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
- buf_dblwr_write_single_page(bpage, sync);
+ sync,
+ buf_page_get_space(bpage),
+ zip_size,
+ buf_page_get_page_no(bpage),
+ 0,
+ zip_size ? zip_size : bpage->real_size,
+ frame,
+ bpage,
+ &bpage->write_size);
} else {
- ut_ad(!sync);
- buf_dblwr_add_to_batch(bpage);
+
+ /* InnoDB uses the doublewrite buffer and it has been
+ initialized. The user can define per file space (table)
+ whether atomic writes are used or not. If atomic writes
+ are not used we go through the doublewrite buffer; if
+ atomic writes are used, the doublewrite buffer is
+ bypassed. */
+
+ if (awrites == ATOMIC_WRITES_ON) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE,
+ buf_page_get_space(bpage),
+ zip_size,
+ buf_page_get_page_no(bpage),
+ 0,
+ zip_size ? zip_size : bpage->real_size,
+ frame,
+ bpage,
+ &bpage->write_size);
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+ buf_dblwr_write_single_page(bpage, sync);
+ } else {
+ ut_ad(!sync);
+ buf_dblwr_add_to_batch(bpage);
+ }
}
/* When doing single page flushing the IO is done synchronously
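The decision the comment above describes reduces to three write paths. A
compact sketch under the same assumptions, using illustrative enum and
function names rather than the real InnoDB symbols:

enum class AtomicWrites { OFF, ON };
enum class WritePath { DIRECT_IO, DOUBLEWRITE_SINGLE, DOUBLEWRITE_BATCH };

WritePath choose_write_path(bool use_doublewrite, bool dblwr_ready,
			    AtomicWrites awrites, bool single_page_flush)
{
	if (!use_doublewrite || !dblwr_ready) {
		/* No doublewrite buffer available: write directly. */
		return WritePath::DIRECT_IO;
	}
	if (awrites == AtomicWrites::ON) {
		/* The tablespace uses atomic writes, so torn pages cannot
		occur and the doublewrite buffer is skipped. */
		return WritePath::DIRECT_IO;
	}
	return single_page_flush
		? WritePath::DOUBLEWRITE_SINGLE
		: WritePath::DOUBLEWRITE_BATCH;
}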
@@ -977,7 +956,10 @@ buf_flush_write_block_low(
if (sync) {
ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
fil_flush(buf_page_get_space(bpage));
- buf_page_io_complete(bpage);
+
+ /* true means we want to evict this page from the
+ LRU list as well. */
+ buf_page_io_complete(bpage, true);
}
/* Increment the counter of I/O operations used
@@ -1071,10 +1053,10 @@ buf_flush_page(
rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE);
}
- /* Even though bpage is not protected by any mutex at this
- point, it is safe to access bpage, because it is io_fixed and
- oldest_modification != 0. Thus, it cannot be relocated in the
- buffer pool or removed from flush_list or LRU_list. */
+ /* Even though bpage is not protected by any mutex at this
+ point, it is safe to access bpage, because it is io_fixed and
+ oldest_modification != 0. Thus, it cannot be relocated in the
+ buffer pool or removed from flush_list or LRU_list. */
buf_flush_write_block_low(bpage, flush_type, sync);
}
@@ -1228,7 +1210,9 @@ buf_flush_try_neighbors(
}
}
+#ifdef UNIV_DEBUG
/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+#endif
if (high > fil_space_get_size(space)) {
high = fil_space_get_size(space);
@@ -1436,9 +1420,8 @@ This utility flushes dirty blocks from the end of the LRU list.
The calling thread is not allowed to own any latches on pages!
It attempts to make 'max' blocks available in the free list. Note that
it is a best effort attempt and it is not guaranteed that after a call
-to this function there will be 'max' blocks in the free list.
-@return number of blocks for which the write request was queued. */
-static
+to this function there will be 'max' blocks in the free list. */
+__attribute__((nonnull))
void
buf_flush_LRU_list_batch(
/*=====================*/
@@ -1449,96 +1432,54 @@ buf_flush_LRU_list_batch(
counts */
{
buf_page_t* bpage;
- ulint count = 0;
ulint scanned = 0;
ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
- ut_ad(buf_pool_mutex_own(buf_pool));
-
n->flushed = 0;
n->evicted = 0;
n->unzip_LRU_evicted = 0;
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- while (bpage != NULL && count < max
- && (n->flushed + n->evicted) < max
- && free_len < srv_LRU_scan_depth
- && lru_len > BUF_LRU_MIN_LEN) {
+ ut_ad(buf_pool_mutex_own(buf_pool));
- ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
- ibool evict;
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage != NULL && (n->evicted + n->flushed) < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > BUF_LRU_MIN_LEN;
+ ++scanned,
+ bpage = buf_pool->lru_hp.get()) {
+
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool->lru_hp.set(prev);
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
mutex_enter(block_mutex);
- evict = buf_flush_ready_for_replace(bpage);
+ bool evict = buf_flush_ready_for_replace(bpage);
mutex_exit(block_mutex);
- ++scanned;
-
- /* If the block is ready to be replaced we try to
- free it i.e.: put it on the free list.
- Otherwise we try to flush the block and its
- neighbors. In this case we'll put it on the
- free list in the next pass. We do this extra work
- of putting blocks to the free list instead of
- just flushing them because after every flush
- we have to restart the scan from the tail of
- the LRU list and if we don't clear the tail
- of the flushed pages then the scan becomes
- O(n*n). */
if (evict) {
+			/* Block is ready for eviction, i.e., it is
+			clean and not IO-fixed or buffer-fixed. */
if (buf_LRU_free_page(bpage, true)) {
- /* buf_pool->mutex was potentially
- released and reacquired. */
n->evicted++;
- bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- } else {
- bpage = UT_LIST_GET_PREV(LRU, bpage);
}
} else {
- ulint space;
- ulint offset;
- buf_page_t* prev_bpage;
-
- prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
-
- /* Save the previous bpage */
-
- if (prev_bpage != NULL) {
- space = prev_bpage->space;
- offset = prev_bpage->offset;
- } else {
- space = ULINT_UNDEFINED;
- offset = ULINT_UNDEFINED;
- }
-
- if (!buf_flush_page_and_try_neighbors(
- bpage, BUF_FLUSH_LRU, max, &n->flushed)) {
-
- bpage = prev_bpage;
- } else {
- /* buf_pool->mutex was released.
- reposition the iterator. Note: the
- prev block could have been repositioned
- too but that should be rare. */
-
- if (prev_bpage != NULL) {
-
- ut_ad(space != ULINT_UNDEFINED);
- ut_ad(offset != ULINT_UNDEFINED);
-
- prev_bpage = buf_page_hash_get(
- buf_pool, space, offset);
- }
-
- bpage = prev_bpage;
- }
+			/* Block is ready for flush. Dispatch an IO
+			request. The IO helper thread will put it on
+			the free list in the IO completion routine. */
+ buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LRU, max, &n->flushed);
}
+ ut_ad(!mutex_own(block_mutex));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
free_len = UT_LIST_GET_LEN(buf_pool->free);
lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
}
+ buf_pool->lru_hp.set(NULL);
+
/* We keep track of all flushes happening as part of LRU
flush. When estimating the desired rate at which flush_list
should be flushed, we factor in this value. */
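The rewritten loop above is a hazard-pointer-guided scan: the predecessor is
remembered before a block is processed, and the scan resumes from whatever the
pointer holds afterwards, because processing may release the buffer pool mutex
and concurrent removals will adjust the pointer. A stripped-down sketch of the
iteration pattern, with illustrative types (Node, Hp) rather than the InnoDB
ones:

struct Node { Node* prev; };

struct Hp {
	Node* ptr = nullptr;
	void set(Node* n) { ptr = n; }
	Node* get() const { return ptr; }
};

template <typename Process>
void scan_from_tail(Node* tail, Hp& hp, unsigned max, Process process)
{
	unsigned done = 0;

	for (Node* n = tail; n != nullptr && done < max; n = hp.get()) {
		/* Where to resume if 'n' disappears while we work on it. */
		hp.set(n->prev);

		if (process(n)) {	/* may drop and retake the mutex */
			++done;
		}
	}

	hp.set(nullptr);		/* scan finished; stop tracking */
}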
@@ -1557,10 +1498,8 @@ buf_flush_LRU_list_batch(
/*******************************************************************//**
Flush and move pages from LRU or unzip_LRU list to the free list.
-Whether LRU or unzip_LRU is used depends on the state of the system.
-@return number of blocks for which either the write request was queued
-or in case of unzip_LRU the number of blocks actually moved to the
-free list */
+Whether LRU or unzip_LRU is used depends on the state of the system. */
+__attribute__((nonnull))
static
void
buf_do_LRU_batch(
@@ -1571,7 +1510,6 @@ buf_do_LRU_batch(
flush_counters_t* n) /*!< out: flushed/evicted page
counts */
{
-
if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
} else {
@@ -1584,6 +1522,10 @@ buf_do_LRU_batch(
n->evicted = 0;
n->flushed = 0;
}
+
+ /* Add evicted pages from unzip_LRU to the evicted pages from
+ the simple LRU. */
+ n->evicted += n->unzip_LRU_evicted;
}
/*******************************************************************//**
@@ -1625,6 +1567,7 @@ buf_do_flush_list_batch(
for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
count < min_n && bpage != NULL && len > 0
&& bpage->oldest_modification < lsn_limit;
+ bpage = buf_pool->flush_hp.get(),
++scanned) {
buf_page_t* prev;
@@ -1633,8 +1576,7 @@ buf_do_flush_list_batch(
ut_ad(bpage->in_flush_list);
prev = UT_LIST_GET_PREV(list, bpage);
- buf_flush_set_hp(buf_pool, prev);
-
+ buf_pool->flush_hp.set(prev);
buf_flush_list_mutex_exit(buf_pool);
#ifdef UNIV_DEBUG
@@ -1645,23 +1587,12 @@ buf_do_flush_list_batch(
buf_flush_list_mutex_enter(buf_pool);
- ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
+ ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
- if (!buf_flush_is_hp(buf_pool, prev)) {
- /* The hazard pointer was reset by some other
- thread. Restart the scan. */
- ut_ad(buf_flush_is_hp(buf_pool, NULL));
- bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
- len = UT_LIST_GET_LEN(buf_pool->flush_list);
- } else {
- bpage = prev;
- --len;
- buf_flush_set_hp(buf_pool, NULL);
- }
-
- ut_ad(!bpage || bpage->in_flush_list);
+ --len;
}
+ buf_pool->flush_hp.set(NULL);
buf_flush_list_mutex_exit(buf_pool);
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
@@ -1679,9 +1610,8 @@ This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
-the calling thread is not allowed to own any latches on pages!
-@return number of blocks for which the write request was queued */
-static
+the calling thread is not allowed to own any latches on pages! */
+__attribute__((nonnull))
void
buf_flush_batch(
/*============*/
@@ -1701,7 +1631,6 @@ buf_flush_batch(
flush_counters_t* n) /*!< out: flushed/evicted page
counts */
{
-
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
ut_ad((flush_type != BUF_FLUSH_LIST)
@@ -1738,7 +1667,6 @@ buf_flush_batch(
/******************************************************************//**
Gather the aggregated stats for both flush list and LRU list flushing */
-static
void
buf_flush_common(
/*=============*/
@@ -1763,7 +1691,6 @@ buf_flush_common(
/******************************************************************//**
Start a buffer flush batch for LRU or flush list */
-static
ibool
buf_flush_start(
/*============*/
@@ -1792,7 +1719,6 @@ buf_flush_start(
/******************************************************************//**
End a buffer flush batch for LRU or flush list */
-static
void
buf_flush_end(
/*==========*/
@@ -1848,40 +1774,6 @@ buf_flush_wait_batch_end(
}
/*******************************************************************//**
-This utility flushes dirty blocks from the end of the LRU list and also
-puts replaceable clean pages from the end of the LRU list to the free
-list.
-NOTE: The calling thread is not allowed to own any latches on pages!
-@return true if a batch was queued successfully. false if another batch
-of same type was already running. */
-static
-bool
-buf_flush_LRU(
-/*==========*/
- buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
- ulint min_n, /*!< in: wished minimum mumber of blocks
- flushed (it is not guaranteed that the
- actual number is that big, though) */
- flush_counters_t *n) /*!< out: flushed/evicted page
- counts */
-{
- if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
- n->flushed = 0;
- n->evicted = 0;
- n->unzip_LRU_evicted = 0;
- return(false);
- }
-
- buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, n);
-
- buf_flush_end(buf_pool, BUF_FLUSH_LRU);
-
- buf_flush_common(BUF_FLUSH_LRU, n->flushed);
-
- return(true);
-}
-
-/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush list of
all buffer pool instances.
NOTE: The calling thread is not allowed to own any latches on pages!
@@ -1908,6 +1800,10 @@ buf_flush_list(
ulint i;
bool success = true;
+ if (buf_mtflu_init_done()) {
+ return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed));
+ }
+
if (n_processed) {
*n_processed = 0;
}
@@ -1923,8 +1819,8 @@ buf_flush_list(
/* Flush to lsn_limit in all buffer pool instances */
for (i = 0; i < srv_buf_pool_instances; i++) {
- buf_pool_t* buf_pool;
- flush_counters_t n;
+ buf_pool_t* buf_pool;
+ flush_counters_t n;
buf_pool = buf_pool_from_array(i);
@@ -1968,12 +1864,12 @@ buf_flush_list(
}
/******************************************************************//**
-This function picks up a single dirty page from the tail of the LRU
-list, flushes it, removes it from page_hash and LRU list and puts
-it on the free list. It is called from user threads when they are
-unable to find a replaceable page at the tail of the LRU list i.e.:
-when the background LRU flushing in the page_cleaner thread is not
-fast enough to keep pace with the workload.
+This function picks up a single page from the tail of the LRU
+list, flushes it (if it is dirty), removes it from page_hash and LRU
+list and puts it on the free list. It is called from user threads when
+they are unable to find a replaceable page at the tail of the LRU
+list i.e.: when the background LRU flushing in the page_cleaner thread
+is not fast enough to keep pace with the workload.
@return TRUE if success. */
UNIV_INTERN
ibool
@@ -1983,84 +1879,67 @@ buf_flush_single_page_from_LRU(
{
ulint scanned;
buf_page_t* bpage;
+ ibool freed;
buf_pool_mutex_enter(buf_pool);
- for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+ for (bpage = buf_pool->single_scan_itr.start(),
+ scanned = 0, freed = FALSE;
bpage != NULL;
- bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
+ ++scanned, bpage = buf_pool->single_scan_itr.get()) {
- ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
- mutex_enter(block_mutex);
-
- if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
-
- /* The following call will release the buffer pool
- and block mutex. */
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool->single_scan_itr.set(prev);
- ibool flushed = buf_flush_page(
- buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
- if (flushed) {
- /* buf_flush_page() will release the
- block mutex */
+ if (buf_flush_ready_for_replace(bpage)) {
+			/* Block is ready for eviction, i.e., it is
+			clean and not IO-fixed or buffer-fixed. */
+ mutex_exit(block_mutex);
+ if (buf_LRU_free_page(bpage, true)) {
+ buf_pool_mutex_exit(buf_pool);
+ freed = TRUE;
+ break;
+ }
+ } else if (buf_flush_ready_for_flush(
+ bpage, BUF_FLUSH_SINGLE_PAGE)) {
+			/* Block is ready for flush. Dispatch an IO
+			request. We'll put it on the free list in the
+			IO completion routine. The following call, if
+			successful, will release the buffer pool and
+			block mutexes. */
+ freed = buf_flush_page(buf_pool, bpage,
+ BUF_FLUSH_SINGLE_PAGE, true);
+ if (freed) {
+				/* The block and buffer pool mutexes
+				have already been released. */
break;
}
+ mutex_exit(block_mutex);
+ } else {
+ mutex_exit(block_mutex);
}
-
- mutex_exit(block_mutex);
}
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_SINGLE_FLUSH_SCANNED,
- MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
- MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
- scanned);
-
- if (bpage == NULL) {
+ if (!freed) {
/* Can't find a single flushable page. */
+ ut_ad(!bpage);
buf_pool_mutex_exit(buf_pool);
- return(FALSE);
}
-
- ibool freed = FALSE;
-
- /* At this point the page has been written to the disk.
- As we are not holding buffer pool or block mutex therefore
- we cannot use the bpage safely. It may have been plucked out
- of the LRU list by some other thread or it may even have
- relocated in case of a compressed page. We need to start
- the scan of LRU list again to remove the block from the LRU
- list and put it on the free list. */
- buf_pool_mutex_enter(buf_pool);
-
- for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
- bpage != NULL;
- bpage = UT_LIST_GET_PREV(LRU, bpage)) {
-
- ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
-
- mutex_enter(block_mutex);
-
- ibool ready = buf_flush_ready_for_replace(bpage);
-
- mutex_exit(block_mutex);
-
- if (ready) {
- bool evict_zip;
-
- evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);;
-
- freed = buf_LRU_free_page(bpage, evict_zip);
-
- break;
- }
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+ scanned);
}
- buf_pool_mutex_exit(buf_pool);
-
+ ut_ad(!buf_pool_mutex_own(buf_pool));
return(freed);
}
@@ -2078,10 +1957,16 @@ buf_flush_LRU_tail(void)
{
ulint total_flushed = 0;
+	if (buf_mtflu_init_done()) {
+		return(buf_mtflu_flush_LRU_tail());
+	}
+
for (ulint i = 0; i < srv_buf_pool_instances; i++) {
buf_pool_t* buf_pool = buf_pool_from_array(i);
ulint scan_depth;
+ flush_counters_t n;
/* srv_LRU_scan_depth can be arbitrarily large value.
We cap it with current LRU size. */
@@ -2091,44 +1976,37 @@ buf_flush_LRU_tail(void)
scan_depth = ut_min(srv_LRU_scan_depth, scan_depth);
- /* We divide LRU flush into smaller chunks because
- there may be user threads waiting for the flush to
- end in buf_LRU_get_free_block(). */
- for (ulint j = 0;
- j < scan_depth;
- j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) {
-
- flush_counters_t n;
-
- /* Currently page_cleaner is the only thread
- that can trigger an LRU flush. It is possible
- that a batch triggered during last iteration is
- still running, */
- if (buf_flush_LRU(buf_pool,
- PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE,
- &n)) {
-
- /* Allowed only one batch per
- buffer pool instance. */
- buf_flush_wait_batch_end(
- buf_pool, BUF_FLUSH_LRU);
- }
+		/* Currently page_cleaner is the only thread
+		that can trigger an LRU flush. It is possible
+		that a batch triggered during the last iteration
+		is still running. */
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+ continue;
+ }
- if (n.flushed) {
- total_flushed += n.flushed;
- } else {
- /* Nothing to flush */
- break;
- }
+ buf_flush_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, 0, &n);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+ buf_flush_common(BUF_FLUSH_LRU, n.flushed);
+
+ if (n.flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ n.flushed);
}
- }
- if (total_flushed) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_BATCH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_COUNT,
- MONITOR_LRU_BATCH_PAGES,
- total_flushed);
+ if (n.evicted) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ n.evicted);
+ }
+
+ total_flushed += (n.flushed + n.evicted);
}
return(total_flushed);
@@ -2386,14 +2264,19 @@ page_cleaner_sleep_if_needed(
if (next_loop_time > cur_time) {
/* Get sleep interval in micro seconds. We use
- ut_min() to avoid long sleep in case of
- wrap around. */
- os_thread_sleep(ut_min(1000000,
- (next_loop_time - cur_time)
- * 1000));
+ ut_min() to avoid long sleep in case of wrap around. */
+ ulint sleep_us;
+
+ sleep_us = ut_min(1000000, (next_loop_time - cur_time) * 1000);
+
+ ib_int64_t sig_count = os_event_reset(buf_flush_event);
+
+ os_event_wait_time_low(buf_flush_event, sleep_us, sig_count);
}
}
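The sleep above is now a resettable event wait instead of an unconditional
os_thread_sleep(), so a user thread that runs out of free pages can wake the
page_cleaner early (see the os_event_set() call added to
buf_LRU_get_free_block() later in this patch). A sketch of the same
reset/wait/set pattern with standard C++ primitives; the signal count of the
real os_event API protects the window between reset and wait, which the mutex
covers here:

#include <chrono>
#include <condition_variable>
#include <mutex>

class FlushEvent {
public:
	/* Cleaner side: mirrors os_event_reset() followed by
	os_event_wait_time_low(); sleeps up to 'us' microseconds but
	returns early when signalled. */
	void sleep_or_wake(std::chrono::microseconds us)
	{
		std::unique_lock<std::mutex> lk(m_);
		signalled_ = false;
		cv_.wait_for(lk, us, [this] { return signalled_; });
	}

	/* User-thread side: mirrors os_event_set(buf_flush_event). */
	void set()
	{
		{
			std::lock_guard<std::mutex> lk(m_);
			signalled_ = true;
		}
		cv_.notify_one();
	}

private:
	std::mutex m_;
	std::condition_variable cv_;
	bool signalled_ = false;
};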
+
+
/******************************************************************//**
page_cleaner thread tasked with flushing dirty pages from the buffer
pools. As of now we'll have only one instance of this thread.
@@ -2420,7 +2303,6 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
#endif /* UNIV_DEBUG_THREAD_CREATION */
-
buf_page_cleaner_is_active = TRUE;
while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
@@ -2433,12 +2315,12 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
last_activity = srv_get_activity_count();
/* Flush pages from flush_list if required */
- page_cleaner_flush_pages_if_needed();
- n_flushed = 0;
- } else {
+ n_flushed += page_cleaner_flush_pages_if_needed();
+
+ } else if (srv_idle_flush_pct) {
n_flushed = page_cleaner_do_flush_batch(
- PCT_IO(100),
- LSN_MAX);
+ PCT_IO(100),
+ LSN_MAX);
if (n_flushed) {
MONITOR_INC_VALUE_CUMULATIVE(
@@ -2450,10 +2332,11 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
}
/* Flush pages from end of LRU if required */
- n_flushed = buf_flush_LRU_tail();
+ buf_flush_LRU_tail();
}
ut_ad(srv_shutdown_state > 0);
+
if (srv_fast_shutdown == 2) {
/* In very fast shutdown we simulate a crash of
buffer pool. We are not required to do any flushing */
@@ -2518,6 +2401,8 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
thread_exit:
buf_page_cleaner_is_active = FALSE;
+ os_event_free(buf_flush_event);
+
/* We count the number of threads in os_thread_exit(). A created
thread should always use that to exit and not use return() to exit. */
os_thread_exit(NULL);
@@ -2619,9 +2504,11 @@ buf_flush_validate(
return(ret);
}
+
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */
+
#ifdef UNIV_DEBUG
/******************************************************************//**
Check if there are any dirty pages that belong to a space id in the flush
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 36eae54c17f..952f0fc3083 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -81,6 +81,10 @@ are not blocked for extended period of time when using very large
buffer pools. */
#define BUF_LRU_DROP_SEARCH_SIZE 1024
+/** We scan this many blocks when looking for a clean page to evict
+during LRU eviction. */
+#define BUF_LRU_SEARCH_SCAN_THRESHOLD 100
+
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
static ibool buf_lru_switched_on_innodb_mon = FALSE;
@@ -961,7 +965,7 @@ buf_LRU_free_from_unzip_LRU_list(
}
for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU),
- scanned = 1, freed = FALSE;
+ scanned = 0, freed = FALSE;
block != NULL && !freed
&& (scan_all || scanned < srv_LRU_scan_depth);
++scanned) {
@@ -978,11 +982,13 @@ buf_LRU_free_from_unzip_LRU_list(
block = prev_block;
}
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_UNZIP_SEARCH_SCANNED,
- MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
- MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
- scanned);
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
return(freed);
}
@@ -1004,21 +1010,30 @@ buf_LRU_free_from_common_LRU_list(
ut_ad(buf_pool_mutex_own(buf_pool));
- for (bpage = UT_LIST_GET_LAST(buf_pool->LRU),
- scanned = 1, freed = FALSE;
+ for (bpage = buf_pool->lru_scan_itr.start(),
+ scanned = 0, freed = false;
bpage != NULL && !freed
- && (scan_all || scanned < srv_LRU_scan_depth);
- ++scanned) {
+ && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD);
+ ++scanned, bpage = buf_pool->lru_scan_itr.get()) {
- unsigned accessed;
- buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU,
- bpage);
+ buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ buf_pool->lru_scan_itr.set(prev);
+
+ ib_mutex_t* mutex = buf_page_get_mutex(bpage);
+ mutex_enter(mutex);
ut_ad(buf_page_in_file(bpage));
ut_ad(bpage->in_LRU_list);
- accessed = buf_page_is_accessed(bpage);
- freed = buf_LRU_free_page(bpage, true);
+ unsigned accessed = buf_page_is_accessed(bpage);
+
+ if (buf_flush_ready_for_replace(bpage)) {
+ mutex_exit(mutex);
+ freed = buf_LRU_free_page(bpage, true);
+ } else {
+ mutex_exit(mutex);
+ }
+
if (freed && !accessed) {
/* Keep track of pages that are evicted without
ever being accessed. This gives us a measure of
@@ -1026,14 +1041,17 @@ buf_LRU_free_from_common_LRU_list(
++buf_pool->stat.n_ra_pages_evicted;
}
- bpage = prev_bpage;
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(mutex));
}
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_SEARCH_SCANNED,
- MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
- MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
- scanned);
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ }
return(freed);
}
@@ -1217,8 +1235,6 @@ the free list. Even when we flush a page or find a page in LRU scan
we put it to free list to be used.
* iteration 0:
* get a block from free list, success:done
- * if there is an LRU flush batch in progress:
- * wait for batch to end: retry free list
* if buf_pool->try_LRU_scan is set
* scan LRU up to srv_LRU_scan_depth to find a clean block
* the above will put the block on free list
@@ -1231,7 +1247,7 @@ we put it to free list to be used.
* scan whole LRU list
* scan LRU list even if buf_pool->try_LRU_scan is not set
* iteration > 1:
- * same as iteration 1 but sleep 100ms
+ * same as iteration 1 but sleep 10ms
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
UNIV_INTERN
buf_block_t*
@@ -1269,20 +1285,6 @@ loop:
return(block);
}
- if (buf_pool->init_flush[BUF_FLUSH_LRU]
- && srv_use_doublewrite_buf
- && buf_dblwr != NULL) {
-
- /* If there is an LRU flush happening in the background
- then we wait for it to end instead of trying a single
- page flush. If, however, we are not using doublewrite
- buffer then it is better to do our own single page
- flush instead of waiting for LRU flush to end. */
- buf_pool_mutex_exit(buf_pool);
- buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
- goto loop;
- }
-
freed = FALSE;
if (buf_pool->try_LRU_scan || n_iterations > 0) {
/* If no block was in the free list, search from the
@@ -1299,6 +1301,10 @@ loop:
TRUE again when we flush a batch from this
buffer pool. */
buf_pool->try_LRU_scan = FALSE;
+
+ /* Also tell the page_cleaner thread that
+ there is work for it to do. */
+ os_event_set(buf_flush_event);
}
}
@@ -1347,12 +1353,10 @@ loop:
/* If we have scanned the whole LRU and still are unable to
find a free block then we should sleep here to let the
- page_cleaner do an LRU batch for us.
- TODO: It'd be better if we can signal the page_cleaner. Perhaps
- we should use timed wait for page_cleaner. */
- if (n_iterations > 1) {
+ page_cleaner do an LRU batch for us. */
- os_thread_sleep(100000);
+ if (n_iterations > 1) {
+ os_thread_sleep(10000);
}
/* No free block was found: try to flush the LRU list.
@@ -1503,6 +1507,20 @@ buf_unzip_LRU_remove_block_if_needed(
}
/******************************************************************//**
+Adjust LRU hazard pointers if needed. */
+
+void
+buf_LRU_adjust_hp(
+/*==============*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ buf_pool->lru_hp.adjust(bpage);
+ buf_pool->lru_scan_itr.adjust(bpage);
+ buf_pool->single_scan_itr.adjust(bpage);
+}
+
+/******************************************************************//**
Removes a block from the LRU list. */
UNIV_INLINE
void
@@ -1521,6 +1539,10 @@ buf_LRU_remove_block(
ut_ad(bpage->in_LRU_list);
+ /* Important that we adjust the hazard pointers before removing
+ bpage from the LRU list. */
+ buf_LRU_adjust_hp(buf_pool, bpage);
+
/* If the LRU_old pointer is defined and points to just this block,
move it backward one step */
diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc
new file mode 100644
index 00000000000..e990ba785e7
--- /dev/null
+++ b/storage/innobase/buf/buf0mtflu.cc
@@ -0,0 +1,758 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved.
+Copyright (C) 2013, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file buf/buf0mtflu.cc
+Multi-threaded flush method implementation
+
+Created 06/11/2013 Dhananjoy Das DDas@fusionio.com
+Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com
+Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com
+Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0mtflu.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "os0sync.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+#include "fil0pagecompress.h"
+
+#define MT_COMP_WATER_MARK 50
+/** Time to wait for a message. */
+#define MT_WAIT_IN_USECS 5000000
+
+/* Work item status */
+typedef enum wrk_status {
+ WRK_ITEM_UNSET=0, /*!< Work item is not set */
+ WRK_ITEM_START=1, /*!< Processing of work item has started */
+	WRK_ITEM_DONE=2,	/*!< Processing is done; usually set to
+				SUCCESS/FAILED */
+ WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */
+ WRK_ITEM_FAILED=3, /*!< Work item process failed */
+ WRK_ITEM_EXIT=4, /*!< Exiting */
+ WRK_ITEM_SET=5, /*!< Work item is set */
+ WRK_ITEM_STATUS_UNDEFINED
+} wrk_status_t;
+
+/* Work item task type */
+typedef enum mt_wrk_tsk {
+ MT_WRK_NONE=0, /*!< Exit queue-wait */
+ MT_WRK_WRITE=1, /*!< Flush operation */
+ MT_WRK_READ=2, /*!< Read operation */
+ MT_WRK_UNDEFINED
+} mt_wrk_tsk_t;
+
+/* Work thread status */
+typedef enum wthr_status {
+ WTHR_NOT_INIT=0, /*!< Work thread not initialized */
+ WTHR_INITIALIZED=1, /*!< Work thread initialized */
+	WTHR_SIG_WAITING=2,	/*!< Work thread waiting for a signal */
+ WTHR_RUNNING=3, /*!< Work thread running */
+ WTHR_NO_WORK=4, /*!< Work thread has no work */
+ WTHR_KILL_IT=5, /*!< Work thread should exit */
+ WTHR_STATUS_UNDEFINED
+} wthr_status_t;
+
+/* Write work task */
+typedef struct wr_tsk {
+ buf_pool_t *buf_pool; /*!< buffer-pool instance */
+ buf_flush_t flush_type; /*!< flush-type for buffer-pool
+ flush operation */
+ ulint min; /*!< minimum number of pages
+ requested to be flushed */
+ lsn_t lsn_limit; /*!< lsn limit for the buffer-pool
+ flush operation */
+} wr_tsk_t;
+
+/* Read work task */
+typedef struct rd_tsk {
+	buf_pool_t *page_pool;	/*!< list of pages to decompress */
+} rd_tsk_t;
+
+/* Work item */
+typedef struct wrk_itm
+{
+ mt_wrk_tsk_t tsk; /*!< Task type. Based on task-type
+ one of the entries wr_tsk/rd_tsk
+ will be used */
+ wr_tsk_t wr; /*!< Flush page list */
+ rd_tsk_t rd; /*!< Decompress page list */
+ ulint n_flushed; /*!< Number of flushed pages */
+ ulint n_evicted; /*!< Number of evicted pages */
+ os_thread_id_t id_usr; /*!< Thread-id currently working */
+ wrk_status_t wi_status; /*!< Work item status */
+	mem_heap_t *wheap;		/*!< Heap where memory for the
+					work queue nodes is allocated */
+	mem_heap_t *rheap;		/*!< Heap where memory for the
+					reply queue nodes is allocated */
+} wrk_t;
+
+typedef struct thread_data
+{
+	os_thread_id_t	wthread_id;	/*!< Thread id */
+	os_thread_t	wthread;	/*!< Thread handle */
+ wthr_status_t wt_status; /*!< Worker thread status */
+} thread_data_t;
+
+/* Thread synchronization data */
+typedef struct thread_sync
+{
+ /* Global variables used by all threads */
+	os_fast_mutex_t thread_global_mtx; /*!< Mutex protecting the
+					variables below */
+ ulint n_threads; /*!< Number of threads */
+ ib_wqueue_t *wq; /*!< Work Queue */
+ ib_wqueue_t *wr_cq; /*!< Write Completion Queue */
+ ib_wqueue_t *rd_cq; /*!< Read Completion Queue */
+	mem_heap_t*	wheap;		/*!< Heap where work queue
+					memory is allocated */
+	mem_heap_t*	rheap;		/*!< Heap where reply queue
+					memory is allocated */
+ wthr_status_t gwt_status; /*!< Global thread status */
+
+ /* Variables used by only one thread at a time */
+ thread_data_t* thread_data; /*!< Thread specific data */
+
+} thread_sync_t;
+
+static int mtflush_work_initialized = -1;
+static thread_sync_t* mtflush_ctx=NULL;
+static os_fast_mutex_t mtflush_mtx;
+
+/******************************************************************//**
+Set multi-threaded flush work initialized. */
+static inline
+void
+buf_mtflu_work_init(void)
+/*=====================*/
+{
+ mtflush_work_initialized = 1;
+}
+
+/******************************************************************//**
+Return true if multi-threaded flush is initialized
+@return true if initialized */
+bool
+buf_mtflu_init_done(void)
+/*=====================*/
+{
+ return(mtflush_work_initialized == 1);
+}
+
+/******************************************************************//**
+Flush a buffer pool instance.
+@return number of flushed pages, or 0 if an error happened */
+static
+ulint
+buf_mtflu_flush_pool_instance(
+/*==========================*/
+ wrk_t *work_item) /*!< inout: work item to be flushed */
+{
+ flush_counters_t n;
+ ut_a(work_item != NULL);
+ ut_a(work_item->wr.buf_pool != NULL);
+
+ if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) {
+		/* We have two choices here. If lsn_limit was
+		specified, then skipping a buffer pool instance
+		means that we cannot guarantee that all pages up
+		to lsn_limit have been flushed. We can return
+		right now with a failure, or we can try to flush
+		the remaining buffer pools up to lsn_limit. We
+		attempt to flush the other buffer pools on the
+		assumption that it will help the retry which
+		follows the failure. */
+#ifdef UNIV_MTFLUSH_DEBUG
+		fprintf(stderr, "InnoDB: Note: buf flush start failed, there is already an active flush for this buffer pool.\n");
+#endif
+ return 0;
+ }
+
+ memset(&n, 0, sizeof(flush_counters_t));
+
+ if (work_item->wr.flush_type == BUF_FLUSH_LRU) {
+		/* srv_LRU_scan_depth can be an arbitrarily large value.
+		We cap it with the current LRU size. */
+ buf_pool_mutex_enter(work_item->wr.buf_pool);
+ work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU);
+ buf_pool_mutex_exit(work_item->wr.buf_pool);
+ work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min);
+ }
+
+ buf_flush_batch(work_item->wr.buf_pool,
+ work_item->wr.flush_type,
+ work_item->wr.min,
+ work_item->wr.lsn_limit,
+ &n);
+
+ buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type);
+ buf_flush_common(work_item->wr.flush_type, n.flushed);
+ work_item->n_flushed = n.flushed;
+ work_item->n_evicted = n.evicted;
+
+ return work_item->n_flushed;
+}
+
+/******************************************************************//**
+Worker function that waits for work items, processes them and
+sends the reply back. */
+static
+void
+mtflush_service_io(
+/*===============*/
+	thread_sync_t*	mtflush_io,	/*!< in/out: multi-threaded flush
+					synchronization data */
+	thread_data_t*	thread_data)	/*!< in/out: thread status data */
+{
+ wrk_t *work_item = NULL;
+ ulint n_flushed=0;
+
+ ut_a(mtflush_io != NULL);
+ ut_a(thread_data != NULL);
+
+ thread_data->wt_status = WTHR_SIG_WAITING;
+
+ work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq);
+
+ if (work_item == NULL) {
+ work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq);
+ }
+
+ if (work_item) {
+ thread_data->wt_status = WTHR_RUNNING;
+ } else {
+ /* Thread did not get any work */
+ thread_data->wt_status = WTHR_NO_WORK;
+ return;
+ }
+
+ if (work_item->wi_status != WRK_ITEM_EXIT) {
+ work_item->wi_status = WRK_ITEM_SET;
+ }
+
+#ifdef UNIV_MTFLUSH_DEBUG
+ ut_a(work_item->id_usr == 0);
+#endif
+ work_item->id_usr = os_thread_get_curr_id();
+
+	/* This works as a producer/consumer model, where tasks are
+	inserted into the work queue (wq) and completions are posted
+	according to the type of operation performed: WRITE/
+	compression/flush completions go to wr_cq, and READ/decompress
+	completions go to rd_cq. In the future we may have others. */
+
+ switch(work_item->tsk) {
+ case MT_WRK_NONE:
+ ut_a(work_item->wi_status == WRK_ITEM_EXIT);
+ work_item->wi_status = WRK_ITEM_EXIT;
+ ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap);
+ thread_data->wt_status = WTHR_KILL_IT;
+ break;
+
+ case MT_WRK_WRITE:
+ ut_a(work_item->wi_status == WRK_ITEM_SET);
+ work_item->wi_status = WRK_ITEM_START;
+ /* Process work item */
+		if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) {
+			work_item->wi_status = WRK_ITEM_FAILED;
+		} else {
+			work_item->wi_status = WRK_ITEM_SUCCESS;
+		}
+ ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap);
+ break;
+
+ case MT_WRK_READ:
+ ut_a(0);
+ break;
+
+ default:
+ /* None other than Write/Read handling planned */
+ ut_a(0);
+ break;
+ }
+}
+
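The comment inside mtflush_service_io() describes a producer/consumer scheme:
work items are pushed to the work queue (wq) and every finished item is posted
to a completion queue (wr_cq or rd_cq). A self-contained sketch of that scheme
in standard C++, with one worker thread and a shutdown message standing in for
MT_WRK_NONE/WRK_ITEM_EXIT; the queue class is illustrative, not ib_wqueue_t:

#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

template <typename T>
class WorkQueue {
public:
	void push(T item)
	{
		{
			std::lock_guard<std::mutex> lk(m_);
			q_.push(std::move(item));
		}
		cv_.notify_one();
	}

	T wait_pop()
	{
		std::unique_lock<std::mutex> lk(m_);
		cv_.wait(lk, [this] { return !q_.empty(); });
		T item = std::move(q_.front());
		q_.pop();
		return item;
	}

private:
	std::mutex m_;
	std::condition_variable cv_;
	std::queue<T> q_;
};

struct WorkItem {
	bool exit;	/* shutdown message, like MT_WRK_NONE */
	int flushed;	/* result, like wrk_t::n_flushed */
};

int main()
{
	WorkQueue<WorkItem> wq;		/* work queue */
	WorkQueue<WorkItem> wr_cq;	/* write completion queue */

	std::thread worker([&] {
		for (;;) {
			WorkItem wi = wq.wait_pop();	/* ib_wqueue_wait(wq) */
			if (!wi.exit) {
				wi.flushed = 42;	/* "flush one instance" */
			}
			wr_cq.push(wi);			/* post the completion */
			if (wi.exit) {
				break;
			}
		}
	});

	wq.push(WorkItem{false, 0});		/* producer: one flush request */
	WorkItem done = wr_cq.wait_pop();	/* wait for its completion */

	wq.push(WorkItem{true, 0});		/* shutdown message */
	wr_cq.wait_pop();
	worker.join();

	return done.flushed == 42 ? 0 : 1;
}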
+/******************************************************************//**
+Thread used to flush dirty pages when multi-threaded flush is
+used.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(mtflush_io_thread)(
+/*==============================*/
+ void * arg)
+{
+ thread_sync_t *mtflush_io = ((thread_sync_t *)arg);
+ thread_data_t *this_thread_data = NULL;
+ ulint i;
+
+ /* Find correct slot for this thread */
+ os_fast_mutex_lock(&(mtflush_io->thread_global_mtx));
+ for(i=0; i < mtflush_io->n_threads; i ++) {
+ if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) {
+ break;
+ }
+ }
+
+	ut_a(i < mtflush_io->n_threads);
+ this_thread_data = &mtflush_io->thread_data[i];
+ os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx));
+
+ while (TRUE) {
+
+#ifdef UNIV_MTFLUSH_DEBUG
+ fprintf(stderr, "InnoDB: Note. Thread %lu work queue len %lu return queue len %lu\n",
+ os_thread_get_curr_id(),
+ ib_wqueue_len(mtflush_io->wq),
+ ib_wqueue_len(mtflush_io->wr_cq));
+#endif /* UNIV_MTFLUSH_DEBUG */
+
+ mtflush_service_io(mtflush_io, this_thread_data);
+
+
+ if (this_thread_data->wt_status == WTHR_KILL_IT) {
+ break;
+ }
+ }
+
+ os_thread_exit(NULL);
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/******************************************************************//**
+Add an exit work item to the work queue to signal the multi-threaded
+flush threads that they should exit.
+*/
+void
+buf_mtflu_io_thread_exit(void)
+/*==========================*/
+{
+ ulint i;
+ thread_sync_t* mtflush_io = mtflush_ctx;
+ wrk_t* work_item = NULL;
+
+ ut_a(mtflush_io != NULL);
+
+ /* Allocate work items for shutdown message */
+ work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads);
+
+	/* If an io-thread KILL is already in progress, bail out */
+ if (mtflush_io->gwt_status == WTHR_KILL_IT) {
+ return;
+ }
+
+ mtflush_io->gwt_status = WTHR_KILL_IT;
+
+	/* This lock is to safeguard against a timing bug: flush requests
+	take this mutex before sending work items to be processed by the
+	flush threads. Inside a flush thread we assume that the work queue
+	contains only a constant number of items. Thus, we may not install
+	new work items below before all previous ones are processed. This
+	mutex is released by the flush request after all work items sent
+	to the flush threads have been processed. Thus, we can get this
+	mutex if and only if the work queue is empty. */
+
+ os_fast_mutex_lock(&mtflush_mtx);
+
+ /* Make sure the work queue is empty */
+ ut_a(ib_wqueue_is_empty(mtflush_io->wq));
+
+	/* Send one exit work item per thread */
+ for (i=0; i < (ulint)srv_mtflush_threads; i++) {
+ work_item[i].tsk = MT_WRK_NONE;
+ work_item[i].wi_status = WRK_ITEM_EXIT;
+ work_item[i].wheap = mtflush_io->wheap;
+ work_item[i].rheap = mtflush_io->rheap;
+ work_item[i].id_usr = 0;
+
+ ib_wqueue_add(mtflush_io->wq,
+ (void *)&(work_item[i]),
+ mtflush_io->wheap);
+ }
+
+ /* Requests sent */
+ os_fast_mutex_unlock(&mtflush_mtx);
+
+ /* Wait until all work items on a work queue are processed */
+ while(!ib_wqueue_is_empty(mtflush_io->wq)) {
+ /* Wait */
+ os_thread_sleep(MT_WAIT_IN_USECS);
+ }
+
+ ut_a(ib_wqueue_is_empty(mtflush_io->wq));
+
+ /* Collect all work done items */
+ for (i=0; i < (ulint)srv_mtflush_threads;) {
+ wrk_t* work_item = NULL;
+
+ work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS);
+
+		/* If we receive a reply to a work item and its status
+		is exit, the thread has processed this message and
+		exited. */
+ if (work_item && work_item->wi_status == WRK_ITEM_EXIT) {
+ i++;
+ }
+ }
+
+	/* Wait a moment (MT_WAIT_IN_USECS) to allow the threads to really exit */
+ os_thread_sleep(MT_WAIT_IN_USECS);
+
+ /* Make sure that work queue is empty */
+ while(!ib_wqueue_is_empty(mtflush_io->wq))
+ {
+ ib_wqueue_nowait(mtflush_io->wq);
+ }
+
+ os_fast_mutex_lock(&mtflush_mtx);
+
+ ut_a(ib_wqueue_is_empty(mtflush_io->wq));
+ ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq));
+ ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq));
+
+ /* Free all queues */
+ ib_wqueue_free(mtflush_io->wq);
+ ib_wqueue_free(mtflush_io->wr_cq);
+ ib_wqueue_free(mtflush_io->rd_cq);
+
+ mtflush_io->wq = NULL;
+ mtflush_io->wr_cq = NULL;
+ mtflush_io->rd_cq = NULL;
+ mtflush_work_initialized = 0;
+
+ /* Free heap */
+ mem_heap_free(mtflush_io->wheap);
+ mem_heap_free(mtflush_io->rheap);
+
+ os_fast_mutex_unlock(&mtflush_mtx);
+ os_fast_mutex_free(&mtflush_mtx);
+ os_fast_mutex_free(&mtflush_io->thread_global_mtx);
+}
+
+/******************************************************************//**
+Initialize multi-threaded flush thread synchronization data.
+@return Initialized multi-threaded flush thread synchronization data. */
+void*
+buf_mtflu_handler_init(
+/*===================*/
+ ulint n_threads, /*!< in: Number of threads to create */
+ ulint wrk_cnt) /*!< in: Number of work items */
+{
+ ulint i;
+ mem_heap_t* mtflush_heap;
+ mem_heap_t* mtflush_heap2;
+
+ /* Create heap, work queue, write completion queue, read
+ completion queue for multi-threaded flush, and init
+ handler. */
+ mtflush_heap = mem_heap_create(0);
+ ut_a(mtflush_heap != NULL);
+ mtflush_heap2 = mem_heap_create(0);
+ ut_a(mtflush_heap2 != NULL);
+
+ mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap,
+ sizeof(thread_sync_t));
+	ut_a(mtflush_ctx != NULL);
+	memset(mtflush_ctx, 0, sizeof(thread_sync_t));
+ mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc(
+ mtflush_heap, sizeof(thread_data_t) * n_threads);
+ ut_a(mtflush_ctx->thread_data);
+ memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads);
+
+ mtflush_ctx->n_threads = n_threads;
+ mtflush_ctx->wq = ib_wqueue_create();
+ ut_a(mtflush_ctx->wq);
+ mtflush_ctx->wr_cq = ib_wqueue_create();
+ ut_a(mtflush_ctx->wr_cq);
+ mtflush_ctx->rd_cq = ib_wqueue_create();
+ ut_a(mtflush_ctx->rd_cq);
+ mtflush_ctx->wheap = mtflush_heap;
+ mtflush_ctx->rheap = mtflush_heap2;
+
+ os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx);
+ os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx);
+
+ /* Create threads for page-compression-flush */
+ for(i=0; i < n_threads; i++) {
+ os_thread_id_t new_thread_id;
+
+ mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED;
+
+ mtflush_ctx->thread_data[i].wthread = os_thread_create(
+ mtflush_io_thread,
+ ((void *) mtflush_ctx),
+ &new_thread_id);
+
+ mtflush_ctx->thread_data[i].wthread_id = new_thread_id;
+ }
+
+ buf_mtflu_work_init();
+
+ return((void *)mtflush_ctx);
+}
+
+/******************************************************************//**
+Flush buffer pool instances.
+@return number of pages flushed. */
+ulint
+buf_mtflu_flush_work_items(
+/*=======================*/
+ ulint buf_pool_inst, /*!< in: Number of buffer pool instances */
+	flush_counters_t *per_pool_cnt,	/*!< out: Number of pages
+					flushed or evicted per instance */
+ buf_flush_t flush_type, /*!< in: Type of flush */
+ ulint min_n, /*!< in: Wished minimum number of
+ blocks to be flushed */
+ lsn_t lsn_limit) /*!< in: All blocks whose
+ oldest_modification is smaller than
+ this should be flushed (if their
+ number does not exceed min_n) */
+{
+ ulint n_flushed=0, i;
+ mem_heap_t* work_heap;
+ mem_heap_t* reply_heap;
+ wrk_t work_item[MTFLUSH_MAX_WORKER];
+
+ if (mtflush_ctx->gwt_status == WTHR_KILL_IT) {
+ return 0;
+ }
+
+	/* Allocate the heaps where all work items and queue
+	node items are allocated */
+ work_heap = mem_heap_create(0);
+ reply_heap = mem_heap_create(0);
+
+
+	for (i = 0; i < buf_pool_inst; i++) {
+ work_item[i].tsk = MT_WRK_WRITE;
+ work_item[i].wr.buf_pool = buf_pool_from_array(i);
+ work_item[i].wr.flush_type = flush_type;
+ work_item[i].wr.min = min_n;
+ work_item[i].wr.lsn_limit = lsn_limit;
+ work_item[i].wi_status = WRK_ITEM_UNSET;
+ work_item[i].wheap = work_heap;
+ work_item[i].rheap = reply_heap;
+ work_item[i].n_flushed = 0;
+ work_item[i].n_evicted = 0;
+ work_item[i].id_usr = 0;
+
+ ib_wqueue_add(mtflush_ctx->wq,
+ (void *)(work_item + i),
+ work_heap);
+ }
+
+	/* Wait for the completions to arrive */
+	for (i = 0; i < buf_pool_inst;) {
+ wrk_t *done_wi = NULL;
+ done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq);
+
+ if (done_wi != NULL) {
+ per_pool_cnt[i].flushed = done_wi->n_flushed;
+ per_pool_cnt[i].evicted = done_wi->n_evicted;
+
+#ifdef UNIV_MTFLUSH_DEBUG
+ if((int)done_wi->id_usr == 0 &&
+ (done_wi->wi_status == WRK_ITEM_SET ||
+ done_wi->wi_status == WRK_ITEM_UNSET)) {
+ fprintf(stderr,
+ "**Set/Unused work_item[%lu] flush_type=%d\n",
+ i,
+ done_wi->wr.flush_type);
+ ut_a(0);
+ }
+#endif
+
+ n_flushed+= done_wi->n_flushed+done_wi->n_evicted;
+ i++;
+ }
+ }
+
+ /* Release used work_items and queue nodes */
+ mem_heap_free(work_heap);
+ mem_heap_free(reply_heap);
+
+ return(n_flushed);
+}
+
+/*******************************************************************//**
+Multi-threaded version of buf_flush_list
+*/
+bool
+buf_mtflu_flush_list(
+/*=================*/
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+	lsn_t		lsn_limit,	/*!< in: in the case BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ ulint* n_processed) /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
+
+{
+ ulint i;
+ bool success = true;
+ flush_counters_t cnt[MTFLUSH_MAX_WORKER];
+
+ if (n_processed) {
+ *n_processed = 0;
+ }
+
+ if (min_n != ULINT_MAX) {
+ /* Ensure that flushing is spread evenly amongst the
+ buffer pool instances. When min_n is ULINT_MAX
+ we need to flush everything up to the lsn limit
+ so no limit here. */
+ min_n = (min_n + srv_buf_pool_instances - 1)
+ / srv_buf_pool_instances;
+ }
+
+	/* This lock is to safeguard against re-entry, if any. */
+ os_fast_mutex_lock(&mtflush_mtx);
+ buf_mtflu_flush_work_items(srv_buf_pool_instances,
+ cnt, BUF_FLUSH_LIST,
+ min_n, lsn_limit);
+ os_fast_mutex_unlock(&mtflush_mtx);
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ if (n_processed) {
+ *n_processed += cnt[i].flushed+cnt[i].evicted;
+ }
+
+ if (cnt[i].flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ cnt[i].flushed);
+ }
+
+ if(cnt[i].evicted) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ cnt[i].evicted);
+ }
+ }
+#ifdef UNIV_MTFLUSH_DEBUG
+ fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n",
+ __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed);
+#endif
+ return(success);
+}
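The min_n adjustment above is a plain ceiling division, so the requested page
count is spread evenly over the buffer pool instances. A tiny worked example
of that arithmetic:

#include <cassert>

static unsigned long
per_instance_quota(unsigned long min_n, unsigned long instances)
{
	return (min_n + instances - 1) / instances;	/* ceil(min_n / instances) */
}

int main()
{
	assert(per_instance_quota(1000, 8) == 125);	/* divides evenly */
	assert(per_instance_quota(1001, 8) == 126);	/* remainder rounds up */
	return 0;
}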
+
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_mtflu_flush_LRU_tail(void)
+/*==========================*/
+{
+ ulint total_flushed=0, i;
+ flush_counters_t cnt[MTFLUSH_MAX_WORKER];
+
+ ut_a(buf_mtflu_init_done());
+
+ /* At shutdown do not send requests anymore */
+ if (!mtflush_ctx || mtflush_ctx->gwt_status == WTHR_KILL_IT) {
+ return (total_flushed);
+ }
+
+ /* This lock is to safeguard against re-entry if any */
+ os_fast_mutex_lock(&mtflush_mtx);
+ buf_mtflu_flush_work_items(srv_buf_pool_instances,
+ cnt, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0);
+ os_fast_mutex_unlock(&mtflush_mtx);
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ total_flushed += cnt[i].flushed+cnt[i].evicted;
+
+ if (cnt[i].flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ cnt[i].flushed);
+ }
+
+ if(cnt[i].evicted) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
+ cnt[i].evicted);
+ }
+ }
+
+#ifdef UNIV_MTFLUSH_DEBUG
+ fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", (
+ srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed);
+#endif
+
+ return(total_flushed);
+}
+
+/*********************************************************************//**
+Set the correct thread identifiers in the io thread array based on
+the information we have. */
+void
+buf_mtflu_set_thread_ids(
+/*=====================*/
+ ulint n_threads, /*!<in: Number of threads to fill */
+ void* ctx, /*!<in: thread context */
+ os_thread_id_t* thread_ids) /*!<in: thread id array */
+{
+ thread_sync_t *mtflush_io = ((thread_sync_t *)ctx);
+ ulint i;
+ ut_a(mtflush_io != NULL);
+ ut_a(thread_ids != NULL);
+
+ for(i = 0; i < n_threads; i++) {
+ thread_ids[i] = mtflush_io->thread_data[i].wthread_id;
+ }
+}
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 7c8369c0c09..b4b474c547f 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -176,22 +176,25 @@ buf_read_page_low(
ut_ad(buf_page_in_file(bpage));
+ byte* frame = zip_size ? bpage->zip.data : ((buf_block_t*) bpage)->frame;
+
if (sync) {
thd_wait_begin(NULL, THD_WAIT_DISKIO);
}
if (zip_size) {
*err = fil_io(OS_FILE_READ | wake_later
- | ignore_nonexistent_pages,
- sync, space, zip_size, offset, 0, zip_size,
- bpage->zip.data, bpage);
+ | ignore_nonexistent_pages,
+ sync, space, zip_size, offset, 0, zip_size,
+ frame, bpage, &bpage->write_size);
} else {
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
*err = fil_io(OS_FILE_READ | wake_later
- | ignore_nonexistent_pages,
- sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
- ((buf_block_t*) bpage)->frame, bpage);
+ | ignore_nonexistent_pages,
+ sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
+ frame, bpage,
+ &bpage->write_size);
}
if (sync) {
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index b866f44cc54..48053a954cf 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -320,10 +321,10 @@ dict_get_db_name_len(
Reserves the dictionary system mutex for MySQL. */
UNIV_INTERN
void
-dict_mutex_enter_for_mysql(void)
+dict_mutex_enter_for_mysql_func(const char * file, ulint line)
/*============================*/
{
- mutex_enter(&(dict_sys->mutex));
+ mutex_enter_func(&(dict_sys->mutex), file, line);
}
/********************************************************************//**
@@ -508,7 +509,7 @@ dict_table_try_drop_aborted(
if (table == NULL) {
table = dict_table_open_on_id_low(
- table_id, DICT_ERR_IGNORE_NONE);
+ table_id, DICT_ERR_IGNORE_NONE, FALSE);
} else {
ut_ad(table->id == table_id);
}
@@ -843,17 +844,24 @@ dict_index_get_nth_col_or_prefix_pos(
/*=================================*/
const dict_index_t* index, /*!< in: index */
ulint n, /*!< in: column number */
- ibool inc_prefix) /*!< in: TRUE=consider
+ ibool inc_prefix, /*!< in: TRUE=consider
column prefixes too */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
{
const dict_field_t* field;
const dict_col_t* col;
ulint pos;
ulint n_fields;
+ ulint prefixed_pos_dummy;
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ if (!prefix_col_pos) {
+ prefix_col_pos = &prefixed_pos_dummy;
+ }
+ *prefix_col_pos = ULINT_UNDEFINED;
+
col = dict_table_get_nth_col(index->table, n);
if (dict_index_is_clust(index)) {
@@ -866,10 +874,11 @@ dict_index_get_nth_col_or_prefix_pos(
for (pos = 0; pos < n_fields; pos++) {
field = dict_index_get_nth_field(index, pos);
- if (col == field->col
- && (inc_prefix || field->prefix_len == 0)) {
-
- return(pos);
+ if (col == field->col) {
+ *prefix_col_pos = pos;
+ if (inc_prefix || field->prefix_len == 0) {
+ return(pos);
+ }
}
}
@@ -981,7 +990,8 @@ dict_table_open_on_id(
table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
- : DICT_ERR_IGNORE_NONE);
+ : DICT_ERR_IGNORE_NONE,
+ table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
if (table != NULL) {
@@ -1013,7 +1023,7 @@ dict_table_get_nth_col_pos(
ulint n) /*!< in: column number */
{
return(dict_index_get_nth_col_pos(dict_table_get_first_index(table),
- n));
+ n, NULL));
}
/********************************************************************//**
@@ -1499,7 +1509,7 @@ dict_table_move_from_non_lru_to_lru(
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
-static
+UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
@@ -2631,6 +2641,13 @@ undo_size_ok:
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
+ new_index->stat_defrag_n_pages_freed = 0;
+ new_index->stat_defrag_n_page_split = 0;
+
+ new_index->stat_defrag_sample_next_slot = 0;
+ memset(&new_index->stat_defrag_data_size_sample,
+ 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
+
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
@@ -3440,7 +3457,29 @@ dict_foreign_find_index(
return(NULL);
}
-
+#ifdef WITH_WSREP
+dict_index_t*
+wsrep_dict_foreign_find_index(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ dict_index_t* types_idx, /*!< in: NULL or an index to whose types the
+ column types must match */
+ ibool check_charsets,
+ /*!< in: whether to check charsets.
+ only has an effect if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of the columns must
+ be declared NOT NULL */
+{
+ return dict_foreign_find_index(
+ table, col_names, columns, n_cols, types_idx, check_charsets,
+ check_null);
+}
+#endif /* WITH_WSREP */
/**********************************************************************//**
Report an error in a foreign key definition. */
static
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
index 16e64da6619..149811dab60 100644
--- a/storage/innobase/dict/dict0load.cc
+++ b/storage/innobase/dict/dict0load.cc
@@ -1153,6 +1153,12 @@ loop:
space_id, name);
}
+			/* We need to read page 0 to get the (optional) IV
+			regardless of whether encryption is turned on or not,
+			since even when it is off we may have to decrypt a
+			potentially already encrypted table */
+ bool read_page_0 = true;
+
/* We set the 2nd param (fix_dict = true)
here because we already have an x-lock on
dict_operation_lock and dict_sys->mutex. Besides,
@@ -1160,7 +1166,7 @@ loop:
If the filepath is not known, it will need to
be discovered. */
dberr_t err = fil_open_single_table_tablespace(
- false, srv_read_only_mode ? false : true,
+ read_page_0, srv_read_only_mode ? false : true,
space_id, dict_tf_to_fsp_flags(flags),
name, filepath);
@@ -2640,6 +2646,99 @@ check_rec:
return(table);
}
+/***********************************************************************//**
+Loads a table id based on the index id.
+@return true if found */
+static
+bool
+dict_load_table_id_on_index_id(
+/*==================*/
+ index_id_t index_id, /*!< in: index id */
+ table_id_t* table_id) /*!< out: table id */
+{
+	/* Check the hard-coded system index ids */
+ switch(index_id) {
+ case DICT_TABLES_ID:
+ case DICT_COLUMNS_ID:
+ case DICT_INDEXES_ID:
+ case DICT_FIELDS_ID:
+ *table_id = index_id;
+ return true;
+ case DICT_TABLE_IDS_ID:
+ /* The following is a secondary index on SYS_TABLES */
+ *table_id = DICT_TABLES_ID;
+ return true;
+ }
+
+ bool found = false;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* NOTE that the operation of this function is protected by
+ the dictionary mutex, and therefore no deadlocks can occur
+ with other dictionary operations. */
+
+ mtr_start(&mtr);
+
+ btr_pcur_t pcur;
+ const rec_t* rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
+
+ while (rec) {
+ ulint len;
+ const byte* field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__ID, &len);
+ ut_ad(len == 8);
+
+ /* Check if the index id is the one searched for */
+ if (index_id == mach_read_from_8(field)) {
+ found = true;
+ /* Now we get the table id */
+ const byte* field = rec_get_nth_field_old(
+ rec,
+ DICT_FLD__SYS_INDEXES__TABLE_ID,
+ &len);
+ *table_id = mach_read_from_8(field);
+ break;
+ }
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(found);
+}
+
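+/********************************************************************//**
+Open a table based on the id of one of its indexes; the index id is
+first resolved to a table id via SYS_INDEXES.
+@return table, NULL if not found */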
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_index_id(
+/*==================*/
+ index_id_t index_id, /*!< in: index id */
+ bool dict_locked) /*!< in: dict locked */
+{
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+ table_id_t table_id;
+ dict_table_t * table = NULL;
+ if (dict_load_table_id_on_index_id(index_id, &table_id)) {
+ bool local_dict_locked = true;
+ table = dict_table_open_on_id(table_id,
+ local_dict_locked,
+ DICT_TABLE_OP_LOAD_TABLESPACE);
+ }
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys->mutex);
+ }
+ return table;
+}
+
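For reference, a minimal sketch of how a caller might use the new dict_table_open_on_index_id() together with dict_table_find_index_on_id(), mirroring the background defrag-stats code later in this patch. The caller is assumed not to hold dict_sys->mutex, and the error handling shown is illustrative only, not part of the patch.

    /* Sketch: resolve an index id back to its table and index. */
    dict_table_t* table = dict_table_open_on_index_id(index_id, false);

    if (table != NULL) {
            dict_index_t* index = dict_table_find_index_on_id(table, index_id);

            if (index != NULL) {
                    /* ... use the index, e.g. persist its defrag stats ... */
            }

            /* Not holding dict_sys->mutex, so dict_locked=FALSE on close. */
            dict_table_close(table, FALSE, FALSE);
    }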
/********************************************************************//**
This function is called when the database is booted. Loads system table
index definitions except for the clustered index which is added to the
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
index e830a8430b7..9c186304d27 100644
--- a/storage/innobase/dict/dict0mem.cc
+++ b/storage/innobase/dict/dict0mem.cc
@@ -284,7 +284,7 @@ dict_mem_table_add_col(
if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
heap = table->heap;
}
- if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
+ if (UNIV_LIKELY(i != 0) && UNIV_UNLIKELY(table->col_names == NULL)) {
/* All preceding column names are empty. */
char* s = static_cast<char*>(
mem_heap_zalloc(heap, table->n_def));
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index 1eac9e0df51..001623a49bc 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -194,7 +194,7 @@ dict_stats_persistent_storage_check(
{"table_name", DATA_VARMYSQL,
DATA_NOT_NULL, 192},
- {"last_update", DATA_INT,
+ {"last_update", DATA_FIXBINARY,
DATA_NOT_NULL, 4},
{"n_rows", DATA_INT,
@@ -225,7 +225,7 @@ dict_stats_persistent_storage_check(
{"index_name", DATA_VARMYSQL,
DATA_NOT_NULL, 192},
- {"last_update", DATA_INT,
+ {"last_update", DATA_FIXBINARY,
DATA_NOT_NULL, 4},
{"stat_name", DATA_VARMYSQL,
@@ -496,6 +496,9 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@@ -525,7 +528,9 @@ static
void
dict_stats_empty_index(
/*===================*/
- dict_index_t* index) /*!< in/out: index */
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_univ(index));
@@ -540,6 +545,34 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_n_pages_freed = 0;
+}
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index) /*!< in: index to clear defragmentation stats */
+{
+ index->stat_defrag_modified_counter = 0;
+ index->stat_defrag_n_page_split = 0;
}
/*********************************************************************//**
@@ -549,7 +582,9 @@ static
void
dict_stats_empty_table(
/*===================*/
- dict_table_t* table) /*!< in/out: table */
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
{
/* Zero the stats members */
@@ -574,7 +609,7 @@ dict_stats_empty_table(
ut_ad(!dict_index_is_univ(index));
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, empty_defrag_stats);
}
table->stat_initialized = TRUE;
@@ -709,7 +744,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
continue;
}
@@ -720,7 +755,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
- dict_stats_empty_index(dst_idx);
+ dict_stats_empty_index(dst_idx, true);
} else {
n_copy_el = dst_idx->n_uniq;
}
@@ -740,6 +775,13 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+
+ dst_idx->stat_defrag_modified_counter =
+ src_idx->stat_defrag_modified_counter;
+ dst_idx->stat_defrag_n_pages_freed =
+ src_idx->stat_defrag_n_pages_freed;
+ dst_idx->stat_defrag_n_page_split =
+ src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@@ -763,6 +805,9 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
+dict_index_t::stat_defrag_modified_counter
+dict_index_t::stat_defrag_n_pages_freed
+dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@return incomplete table object */
@@ -812,7 +857,9 @@ dict_stats_snapshot_free(
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
-before the Persistent Statistics feature was introduced. */
+before the Persistent Statistics feature was introduced.
+This function does not update the defragmentation-related stats.
+Only persistent statistics support defragmentation stats.
static
void
dict_stats_update_transient_for_index(
@@ -828,10 +875,10 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else {
mtr_t mtr;
@@ -852,7 +899,7 @@ dict_stats_update_transient_for_index(
switch (size) {
case ULINT_UNDEFINED:
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
return;
case 0:
/* The root node of the tree is a leaf */
@@ -887,7 +934,7 @@ dict_stats_update_transient(
if (dict_table_is_discarded(table)) {
/* Nothing to do. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
} else if (index == NULL) {
/* Table definition is corrupt */
@@ -897,7 +944,7 @@ dict_stats_update_transient(
fprintf(stderr, " InnoDB: table %s has no indexes. "
"Cannot calculate statistics.\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return;
}
@@ -909,7 +956,7 @@ dict_stats_update_transient(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -1903,7 +1950,7 @@ dict_stats_analyze_index(
DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
mtr_start(&mtr);
@@ -2201,7 +2248,7 @@ dict_stats_update_persistent(
/* Table definition is corrupt */
dict_table_stats_unlock(table, RW_X_LATCH);
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_CORRUPTION);
}
@@ -2230,7 +2277,7 @@ dict_stats_update_persistent(
continue;
}
- dict_stats_empty_index(index);
+ dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@@ -2803,6 +2850,16 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_page_split") */
+ && strncasecmp("n_page_split", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_page_split = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 13 /* strlen("n_pages_freed") */
+ && strncasecmp("n_pages_freed", stat_name, stat_name_len)
+ == 0) {
+ index->stat_defrag_n_pages_freed = (ulint) stat_value;
+ arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@@ -2922,7 +2979,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
trx = trx_allocate_for_background();
@@ -3024,6 +3081,22 @@ dict_stats_fetch_from_ps(
}
/*********************************************************************//**
+Clear defragmentation stats modified counter for all indices in table. */
+static
+void
+dict_stats_empty_defrag_modified_counter(
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ ut_a(table);
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ index->stat_defrag_modified_counter = 0;
+ }
+}
+
+/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
UNIV_INTERN
void
@@ -3099,13 +3172,13 @@ dict_stats_update(
"because the .ibd file is missing. For help, please "
"refer to " REFMAN "innodb-troubleshooting.html\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
return(DB_TABLESPACE_DELETED);
} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, false);
return(DB_SUCCESS);
}
@@ -3168,7 +3241,7 @@ dict_stats_update(
case DICT_STATS_EMPTY_TABLE:
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
/* If table is using persistent stats,
then save the stats on disk */
@@ -3231,6 +3304,7 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
+ dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
@@ -3241,7 +3315,7 @@ dict_stats_update(
copying because dict_stats_table_clone_create() does
skip corrupted indexes so our dummy object 't' may
have less indexes than the real object 'table'. */
- dict_stats_empty_table(table);
+ dict_stats_empty_table(table, true);
dict_stats_copy(table, t);
@@ -3811,6 +3885,117 @@ dict_stats_rename_table(
return(ret);
}
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+ lint now = (lint) ut_time();
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_pages_freed",
+ index->stat_defrag_n_pages_freed,
+ NULL,
+ "Number of pages freed during"
+ " last defragmentation run.",
+ NULL);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ return (ret);
+}
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index) /*!< in: index */
+{
+ dberr_t ret;
+
+ if (index->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ ".ibd file is missing.\n");
+ return (DB_TABLESPACE_DELETED);
+ }
+ if (dict_index_is_corrupted(index)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save defragment stats because "
+ "index is corrupted.\n");
+ return(DB_CORRUPTION);
+ }
+
+ if (dict_index_is_univ(index)) {
+ return DB_SUCCESS;
+ }
+
+ lint now = (lint) ut_time();
+ mtr_t mtr;
+ ulint n_leaf_pages;
+ ulint n_leaf_reserved;
+ mtr_start(&mtr);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
+ &n_leaf_pages, &mtr);
+ mtr_commit(&mtr);
+
+ if (n_leaf_reserved == ULINT_UNDEFINED) {
+ // The index name is different during fast index creation,
+ // so the stats won't be associated with the right index
+ // for later use. We just return without saving.
+ return DB_SUCCESS;
+ }
+
+ rw_lock_x_lock(&dict_operation_lock);
+
+ mutex_enter(&dict_sys->mutex);
+ ret = dict_stats_save_index_stat(index, now, "n_page_split",
+ index->stat_defrag_n_page_split,
+ NULL,
+ "Number of new page splits on leaves"
+ " since last defragmentation.",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_defrag",
+ n_leaf_pages,
+ NULL,
+ "Number of leaf pages when this stat is saved to disk",
+ NULL);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(
+ index, now, "n_leaf_pages_reserved",
+ n_leaf_reserved,
+ NULL,
+ "Number of pages reserved for this index leaves when this stat "
+ "is saved to disk",
+ NULL);
+
+end:
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ return (ret);
+}
+
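As a rough illustration, a defragmentation pass elsewhere in this patch would be expected to bump the in-memory counters on the index and then call the two new save functions above. A minimal sketch follows; the pages_freed variable and the counter update are placeholders, not code from the patch.

    /* Sketch: persist defrag results for `index` after a defrag run. */
    index->stat_defrag_n_pages_freed += pages_freed;  /* hypothetical counter update */

    dberr_t err = dict_stats_save_defrag_summary(index);

    if (err == DB_SUCCESS) {
            /* Also persist n_page_split and the leaf-page counts. */
            err = dict_stats_save_defrag_stats(index);
    }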
/* tests @{ */
#ifdef UNIV_COMPILE_TEST_FUNCS
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index ecd723ca39a..076ceb79613 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
+#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL;
/** This mutex protects the "recalc_pool" variable. */
static ib_mutex_t recalc_pool_mutex;
+static ib_mutex_t defrag_pool_mutex;
#ifdef HAVE_PSI_INTERFACE
static mysql_pfs_key_t recalc_pool_mutex_key;
+static mysql_pfs_key_t defrag_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
/** The number of tables that can be added to "recalc_pool" before
@@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool;
typedef recalc_pool_t::iterator recalc_pool_iterator_t;
+/** Indices whose defrag stats need to be saved to persistent storage.*/
+struct defrag_pool_item_t {
+ table_id_t table_id;
+ index_id_t index_id;
+};
+typedef std::vector<defrag_pool_item_t> defrag_pool_t;
+static defrag_pool_t defrag_pool;
+typedef defrag_pool_t::iterator defrag_pool_iterator_t;
+
/*****************************************************************//**
Initialize the recalc pool, called once during thread initialization. */
static
void
-dict_stats_recalc_pool_init()
+dict_stats_pool_init()
/*=========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
+ defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
}
/*****************************************************************//**
@@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
static
void
-dict_stats_recalc_pool_deinit()
-/*===========================*/
+dict_stats_pool_deinit()
+/*====================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
+ defrag_pool.clear();
/*
recalc_pool may still have its buffer allocated. It will free it when
its destructor is called.
@@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit()
memory. To avoid that, we force recalc_pool to surrender its buffer
to empty_pool object, which will free it when leaving this function:
*/
- recalc_pool_t empty_pool;
- recalc_pool.swap(empty_pool);
+ recalc_pool_t recalc_empty_pool;
+ defrag_pool_t defrag_empty_pool;
+ memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t));
+ memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t));
+ recalc_pool.swap(recalc_empty_pool);
+ defrag_pool.swap(defrag_empty_pool);
}
/*****************************************************************//**
@@ -188,6 +206,111 @@ dict_stats_recalc_pool_del(
}
/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mutex_exit(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+
+ mutex_exit(&defrag_pool_mutex);
+
+ os_event_set(dict_stats_event);
+}
+
+/*****************************************************************//**
+Get an index from the auto defrag pool. The returned index id is removed
+from the pool.
+@return true if the pool was non-empty and "table_id" and "index_id"
+were set, false otherwise */
+static
+bool
+dict_stats_defrag_pool_get(
+/*=======================*/
+ table_id_t* table_id, /*!< out: table id, or unmodified if
+ list is empty */
+ index_id_t* index_id) /*!< out: index id, or unmodified if
+ list is empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&defrag_pool_mutex);
+
+ if (defrag_pool.empty()) {
+ mutex_exit(&defrag_pool_mutex);
+ return(false);
+ }
+
+ defrag_pool_item_t& item = defrag_pool.back();
+ *table_id = item.table_id;
+ *index_id = item.index_id;
+
+ defrag_pool.pop_back();
+
+ mutex_exit(&defrag_pool_mutex);
+
+ return(true);
+}
+
+/*****************************************************************//**
+Delete a given index, or all entries for a given table, from the
+auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!< in: if given, remove
+ all entries for the table */
+ const dict_index_t* index) /*!< in: if given, remove this index */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ mutex_enter(&defrag_pool_mutex);
+
+ defrag_pool_iterator_t iter = defrag_pool.begin();
+ while (iter != defrag_pool.end()) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mutex_exit(&defrag_pool_mutex);
+}
+
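The assertion at the top of dict_stats_defrag_pool_del() encodes an either/or contract: exactly one of the two arguments must be non-NULL. A short sketch of the two intended call patterns, assuming the caller holds dict_sys->mutex as the ut_ad above requires:

    /* Drop all queued entries for a table that is going away. */
    dict_stats_defrag_pool_del(table, NULL);

    /* Or drop the single entry for one index being removed. */
    dict_stats_defrag_pool_del(NULL, index);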
+/*****************************************************************//**
Wait until background stats thread has stopped using the specified table.
The caller must have locked the data dictionary using
row_mysql_lock_data_dictionary() and this function may unlock it temporarily
@@ -237,7 +360,10 @@ dict_stats_thread_init()
mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
SYNC_STATS_AUTO_RECALC);
- dict_stats_recalc_pool_init();
+ /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
+ mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex,
+ SYNC_STATS_DEFRAG);
+ dict_stats_pool_init();
}
/*****************************************************************//**
@@ -251,11 +377,14 @@ dict_stats_thread_deinit()
ut_a(!srv_read_only_mode);
ut_ad(!srv_dict_stats_thread_active);
- dict_stats_recalc_pool_deinit();
+ dict_stats_pool_deinit();
mutex_free(&recalc_pool_mutex);
memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
+ mutex_free(&defrag_pool_mutex);
+ memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex));
+
os_event_free(dict_stats_event);
dict_stats_event = NULL;
}
@@ -298,7 +427,7 @@ dict_stats_process_entry_from_recalc_pool()
return;
}
- table->stats_bg_flag = BG_STAT_IN_PROGRESS;
+ table->stats_bg_flag |= BG_STAT_IN_PROGRESS;
mutex_exit(&dict_sys->mutex);
@@ -325,7 +454,7 @@ dict_stats_process_entry_from_recalc_pool()
mutex_enter(&dict_sys->mutex);
- table->stats_bg_flag = BG_STAT_NONE;
+ table->stats_bg_flag &= ~BG_STAT_IN_PROGRESS;
dict_table_close(table, TRUE, FALSE);
@@ -333,6 +462,63 @@ dict_stats_process_entry_from_recalc_pool()
}
/*****************************************************************//**
+Get one index that has been added for updating persistent defrag
+stats, and save its stats if the table and index still exist. */
+static
+void
+dict_stats_process_entry_from_defrag_pool()
+/*=======================================*/
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
+ /* no index in defrag pool */
+ return;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ table = dict_table_open_on_id(table_id, TRUE,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+
+ if (table == NULL) {
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+
+ /* Check whether table is corrupted */
+ if (table->corrupted) {
+ dict_table_close(table, TRUE, FALSE);
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+ mutex_exit(&dict_sys->mutex);
+
+ dict_index_t* index = dict_table_find_index_on_id(table, index_id);
+
+ if (index == NULL) {
+ return;
+ }
+
+ /* Check whether index is corrupted */
+ if (dict_index_is_corrupted(index)) {
+ dict_table_close(table, FALSE, FALSE);
+ return;
+ }
+
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, FALSE, FALSE);
+}
+
+/*****************************************************************//**
This is the thread for background stats gathering. It pops tables from
the auto recalc list and processes them, eventually recalculating their
statistics.
@@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)(
dict_stats_process_entry_from_recalc_pool();
+ while (defrag_pool.size())
+ dict_stats_process_entry_from_defrag_pool();
+
os_event_reset(dict_stats_event);
}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000000..1cf6c0401bb
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2515 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file fil0crypt.cc
+Innodb file space encrypt/decrypt
+
+Created by Jonas Oreland, Google
+Modified by Jan Lindström, jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mach0data.h"
+#include "log0recv.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+#include "ut0ut.h"
+#include "btr0scrub.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
+#include "ha_prototypes.h" // IB_LOG_
+
+#include <my_crypt.h>
+
+/** Mutex for keys */
+UNIV_INTERN ib_mutex_t fil_crypt_key_mutex;
+
+static bool fil_crypt_threads_inited = false;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t fil_crypt_key_mutex_key;
+#endif
+
+/** Is encryption enabled/disabled */
+UNIV_INTERN ulong srv_encrypt_tables = 0;
+
+/** No of key rotation threads requested */
+UNIV_INTERN uint srv_n_fil_crypt_threads = 0;
+
+/** No of key rotation threads started */
+static uint srv_n_fil_crypt_threads_started = 0;
+
+/** At this age or older a space/page will be rotated */
+UNIV_INTERN uint srv_fil_crypt_rotate_key_age = 1;
+
+/** Event to signal FROM the key rotation threads. */
+UNIV_INTERN os_event_t fil_crypt_event;
+
+/** Event to signal TO the key rotation threads. */
+UNIV_INTERN os_event_t fil_crypt_threads_event;
+
+/** Event for waking up threads waiting on the throttle */
+UNIV_INTERN os_event_t fil_crypt_throttle_sleep_event;
+
+/** Mutex for key rotation threads */
+UNIV_INTERN ib_mutex_t fil_crypt_threads_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t fil_crypt_threads_mutex_key;
+#endif
+
+/** Variable ensuring only 1 thread at time does initial conversion */
+static bool fil_crypt_start_converting = false;
+
+/** Variables for throttling */
+UNIV_INTERN uint srv_n_fil_crypt_iops = 100; // 10ms per iop
+static uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated = 0;
+
+/** Variables for scrubbing */
+extern uint srv_background_scrub_data_interval;
+extern uint srv_background_scrub_data_check_interval;
+
+#define DEBUG_KEYROTATION_THROTTLING 0
+
+/** Statistics variables */
+static fil_crypt_stat_t crypt_stat;
+static ib_mutex_t crypt_stat_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t fil_crypt_stat_mutex_key;
+#endif
+
+/** Key for the crypt data mutex */
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t fil_crypt_data_mutex_key;
+#endif
+
+static bool
+fil_crypt_needs_rotation(
+/*=====================*/
+ fil_encryption_t encrypt_mode, /*!< in: Encryption
+ mode */
+ uint key_version, /*!< in: Key version */
+ uint latest_key_version, /*!< in: Latest key version */
+ uint rotate_key_age); /*!< in: When to rotate */
+
+/**
+* Magic pattern at the start of the crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+ 's', 0xE, 0xC, 'R', 'E', 't' };
+
+static const unsigned char EMPTY_PATTERN[MAGIC_SZ] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
+
+/*********************************************************************
+Init space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_init()
+/*==================*/
+{
+ mutex_create(fil_crypt_key_mutex_key,
+ &fil_crypt_key_mutex, SYNC_NO_ORDER_CHECK);
+
+ fil_crypt_throttle_sleep_event = os_event_create();
+
+ mutex_create(fil_crypt_stat_mutex_key,
+ &crypt_stat_mutex, SYNC_NO_ORDER_CHECK);
+ memset(&crypt_stat, 0, sizeof(crypt_stat));
+}
+
+/*********************************************************************
+Cleanup space crypt */
+UNIV_INTERN
+void
+fil_space_crypt_cleanup()
+/*=====================*/
+{
+ os_event_free(fil_crypt_throttle_sleep_event);
+}
+
+/******************************************************************
+Get the latest key version, waking the encrypt thread if needed */
+static inline
+uint
+fil_crypt_get_latest_key_version(
+/*=============================*/
+ fil_space_crypt_t* crypt_data) /*!< in: crypt data */
+{
+ uint rc = encryption_key_get_latest_version(crypt_data->key_id);
+
+ if (fil_crypt_needs_rotation(crypt_data->encryption,
+ crypt_data->min_key_version,
+ rc, srv_fil_crypt_rotate_key_age)) {
+ os_event_set(fil_crypt_threads_event);
+ }
+
+ return rc;
+}
+
+/******************************************************************
+Mutex helper for crypt_data->scheme */
+static
+void
+crypt_data_scheme_locker(
+/*=====================*/
+ st_encryption_scheme* scheme,
+ int exit)
+{
+ fil_space_crypt_t* crypt_data =
+ static_cast<fil_space_crypt_t*>(scheme);
+
+ if (exit) {
+ mutex_exit(&crypt_data->mutex);
+ } else {
+ mutex_enter(&crypt_data->mutex);
+ }
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@return crypt object */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+/*========================*/
+ fil_encryption_t encrypt_mode, /*!< in: encryption mode */
+ uint key_id) /*!< in: encryption key id */
+{
+ const uint sz = sizeof(fil_space_crypt_t);
+ fil_space_crypt_t* crypt_data =
+ static_cast<fil_space_crypt_t*>(malloc(sz));
+
+ memset(crypt_data, 0, sz);
+
+ if (encrypt_mode == FIL_SPACE_ENCRYPTION_OFF ||
+ (!srv_encrypt_tables && encrypt_mode == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+ crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
+ crypt_data->min_key_version = 0;
+ } else {
+ crypt_data->type = CRYPT_SCHEME_1;
+ crypt_data->min_key_version = encryption_key_get_latest_version(key_id);
+ }
+
+ mutex_create(fil_crypt_data_mutex_key,
+ &crypt_data->mutex, SYNC_NO_ORDER_CHECK);
+ crypt_data->locker = crypt_data_scheme_locker;
+ my_random_bytes(crypt_data->iv, sizeof(crypt_data->iv));
+ crypt_data->encryption = encrypt_mode;
+ crypt_data->key_id = key_id;
+ return crypt_data;
+}
+
+/******************************************************************
+Merge fil_space_crypt_t object */
+UNIV_INTERN
+void
+fil_space_merge_crypt_data(
+/*=======================*/
+ fil_space_crypt_t* dst,/*!< out: Crypt data */
+ const fil_space_crypt_t* src)/*!< in: Crypt data */
+{
+ mutex_enter(&dst->mutex);
+
+ /* validate that they are mergeable */
+ ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED ||
+ src->type == CRYPT_SCHEME_1);
+
+ ut_a(dst->type == CRYPT_SCHEME_UNENCRYPTED ||
+ dst->type == CRYPT_SCHEME_1);
+
+ /* no support for changing iv (yet?) */
+ ut_a(memcmp(src->iv, dst->iv, sizeof(src->iv)) == 0);
+
+ dst->encryption = src->encryption;
+ dst->type = src->type;
+ dst->min_key_version = src->min_key_version;
+ dst->keyserver_requests += src->keyserver_requests;
+
+ mutex_exit(&dst->mutex);
+}
+
+/******************************************************************
+Read crypt data from a page (0)
+@return crypt data from page 0. */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_read_crypt_data(
+/*======================*/
+ ulint space, /*!< in: file space id*/
+ const byte* page, /*!< in: page 0 */
+ ulint offset) /*!< in: offset */
+{
+ if (memcmp(page + offset, EMPTY_PATTERN, MAGIC_SZ) == 0) {
+ /* Crypt data is not stored. */
+ return NULL;
+ }
+
+ if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
+#ifdef UNIV_DEBUG
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Found potentially bogus bytes on "
+ "page 0 offset %lu for space %lu : "
+ "[ %.2x %.2x %.2x %.2x %.2x %.2x ]. "
+ "Assuming space is not encrypted!",
+ offset, space,
+ page[offset + 0],
+ page[offset + 1],
+ page[offset + 2],
+ page[offset + 3],
+ page[offset + 4],
+ page[offset + 5]);
+#endif
+ /* Crypt data is not stored. */
+ return NULL;
+ }
+
+ ulint type = mach_read_from_1(page + offset + MAGIC_SZ + 0);
+
+ if (! (type == CRYPT_SCHEME_UNENCRYPTED ||
+ type == CRYPT_SCHEME_1)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Found non sensible crypt scheme: %lu for space %lu "
+ " offset: %lu bytes: "
+ "[ %.2x %.2x %.2x %.2x %.2x %.2x ].",
+ type, space, offset,
+ page[offset + 0 + MAGIC_SZ],
+ page[offset + 1 + MAGIC_SZ],
+ page[offset + 2 + MAGIC_SZ],
+ page[offset + 3 + MAGIC_SZ],
+ page[offset + 4 + MAGIC_SZ],
+ page[offset + 5 + MAGIC_SZ]);
+ ut_error;
+ }
+
+ fil_space_crypt_t* crypt_data;
+ ulint iv_length = mach_read_from_1(page + offset + MAGIC_SZ + 1);
+
+ if (! (iv_length == sizeof(crypt_data->iv))) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Found non sensible iv length: %lu for space %lu "
+ " offset: %lu type: %lu bytes: "
+ "[ %.2x %.2x %.2x %.2x %.2x %.2x ].",
+ iv_length, space, offset, type,
+ page[offset + 0 + MAGIC_SZ],
+ page[offset + 1 + MAGIC_SZ],
+ page[offset + 2 + MAGIC_SZ],
+ page[offset + 3 + MAGIC_SZ],
+ page[offset + 4 + MAGIC_SZ],
+ page[offset + 5 + MAGIC_SZ]);
+ ut_error;
+ }
+
+ uint min_key_version = mach_read_from_4
+ (page + offset + MAGIC_SZ + 2 + iv_length);
+
+ uint key_id = mach_read_from_4
+ (page + offset + MAGIC_SZ + 2 + iv_length + 4);
+
+ fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
+ page + offset + MAGIC_SZ + 2 + iv_length + 8);
+
+ const uint sz = sizeof(fil_space_crypt_t) + iv_length;
+ crypt_data = static_cast<fil_space_crypt_t*>(malloc(sz));
+ memset(crypt_data, 0, sz);
+
+ crypt_data->type = type;
+ crypt_data->min_key_version = min_key_version;
+ crypt_data->key_id = key_id;
+ crypt_data->page0_offset = offset;
+ crypt_data->encryption = encryption;
+ mutex_create(fil_crypt_data_mutex_key,
+ &crypt_data->mutex, SYNC_NO_ORDER_CHECK);
+ crypt_data->locker = crypt_data_scheme_locker;
+ memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
+
+ return crypt_data;
+}
+
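To make the parsing above easier to follow, the crypt data block that fil_space_read_crypt_data() expects on page 0 can be summarized as the following byte layout; offsets are relative to the `offset` argument, and the field widths are taken directly from the reads above.

    /* [0]               magic            MAGIC_SZ (6) bytes, CRYPT_MAGIC
       [6]               type             1 byte: CRYPT_SCHEME_UNENCRYPTED or CRYPT_SCHEME_1
       [7]               iv_length        1 byte
       [8]               iv               iv_length bytes
       [8 + iv_length]   min_key_version  4 bytes
       [12 + iv_length]  key_id           4 bytes
       [16 + iv_length]  encryption       1 byte (fil_encryption_t) */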
+/******************************************************************
+Free a crypt data object */
+UNIV_INTERN
+void
+fil_space_destroy_crypt_data(
+/*=========================*/
+ fil_space_crypt_t **crypt_data) /*!< out: crypt data */
+{
+ if (crypt_data != NULL && (*crypt_data) != NULL) {
+ mutex_free(& (*crypt_data)->mutex);
+ free(*crypt_data);
+ (*crypt_data) = NULL;
+ }
+}
+
+/******************************************************************
+Write crypt data to a page (0) */
+static
+void
+fil_space_write_crypt_data_low(
+/*===========================*/
+ fil_space_crypt_t* crypt_data, /*!< out: crypt data */
+ ulint type, /*!< in: crypt scheme */
+ byte* page, /*!< in: page 0 */
+ ulint offset, /*!< in: offset */
+ ulint maxsize, /*!< in: size of crypt data */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_a(offset > 0 && offset < UNIV_PAGE_SIZE);
+ ulint space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ const uint len = sizeof(crypt_data->iv);
+ const uint min_key_version = crypt_data->min_key_version;
+ const uint key_id = crypt_data->key_id;
+ const fil_encryption_t encryption = crypt_data->encryption;
+ crypt_data->page0_offset = offset;
+ ut_a(2 + len + 4 + 1 + 4 + MAGIC_SZ < maxsize);
+
+ /*
+ redo log this as bytewise updates to page 0
+ followed by an MLOG_FILE_WRITE_CRYPT_DATA
+ (that will during recovery update fil_space_t)
+ */
+ mlog_write_string(page + offset, CRYPT_MAGIC, MAGIC_SZ, mtr);
+ mlog_write_ulint(page + offset + MAGIC_SZ + 0, type, MLOG_1BYTE, mtr);
+ mlog_write_ulint(page + offset + MAGIC_SZ + 1, len, MLOG_1BYTE, mtr);
+ mlog_write_string(page + offset + MAGIC_SZ + 2, crypt_data->iv, len,
+ mtr);
+ mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len, min_key_version,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len + 4, key_id,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(page + offset + MAGIC_SZ + 2 + len + 8, encryption,
+ MLOG_1BYTE, mtr);
+
+ byte* log_ptr = mlog_open(mtr, 11 + 17 + len);
+
+ if (log_ptr != NULL) {
+ log_ptr = mlog_write_initial_log_record_fast(
+ page,
+ MLOG_FILE_WRITE_CRYPT_DATA,
+ log_ptr, mtr);
+ mach_write_to_4(log_ptr, space_id);
+ log_ptr += 4;
+ mach_write_to_2(log_ptr, offset);
+ log_ptr += 2;
+ mach_write_to_1(log_ptr, type);
+ log_ptr += 1;
+ mach_write_to_1(log_ptr, len);
+ log_ptr += 1;
+ mach_write_to_4(log_ptr, min_key_version);
+ log_ptr += 4;
+ mach_write_to_4(log_ptr, key_id);
+ log_ptr += 4;
+ mach_write_to_1(log_ptr, encryption);
+ log_ptr += 1;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, crypt_data->iv, len);
+ }
+}
+
+/******************************************************************
+Write crypt data to a page (0) */
+UNIV_INTERN
+void
+fil_space_write_crypt_data(
+/*=======================*/
+ ulint space, /*!< in: file space */
+ byte* page, /*!< in: page 0 */
+ ulint offset, /*!< in: offset */
+ ulint maxsize, /*!< in: size of crypt data */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+
+ /* If no crypt data is stored in the memory cache for this space,
+ then do not continue writing crypt data to page 0. */
+ if (crypt_data == NULL) {
+ return;
+ }
+
+ fil_space_write_crypt_data_low(crypt_data, crypt_data->type,
+ page, offset, maxsize, mtr);
+}
+
+/******************************************************************
+Parse a MLOG_FILE_WRITE_CRYPT_DATA log entry
+@return position on log buffer */
+UNIV_INTERN
+byte*
+fil_parse_write_crypt_data(
+/*=======================*/
+ byte* ptr, /*!< in: Log entry start */
+ byte* end_ptr,/*!< in: Log entry end */
+ buf_block_t* block) /*!< in: buffer block */
+{
+ /* check that redo log entry is complete */
+ uint entry_size =
+ 4 + // size of space_id
+ 2 + // size of offset
+ 1 + // size of type
+ 1 + // size of iv-len
+ 4 + // size of min_key_version
+ 4 + // size of key_id
+ 1; // fil_encryption_t
+
+ if (end_ptr - ptr < entry_size){
+ return NULL;
+ }
+
+ ulint space_id = mach_read_from_4(ptr);
+ ptr += 4;
+ uint offset = mach_read_from_2(ptr);
+ ptr += 2;
+ uint type = mach_read_from_1(ptr);
+ ptr += 1;
+ uint len = mach_read_from_1(ptr);
+ ptr += 1;
+
+ ut_a(type == CRYPT_SCHEME_UNENCRYPTED ||
+ type == CRYPT_SCHEME_1); // only supported
+
+ ut_a(len == CRYPT_SCHEME_1_IV_LEN); // only supported
+ uint min_key_version = mach_read_from_4(ptr);
+ ptr += 4;
+
+ uint key_id = mach_read_from_4(ptr);
+ ptr += 4;
+
+ fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(ptr);
+ ptr +=1;
+
+ if (end_ptr - ptr < len) {
+ return NULL;
+ }
+
+ fil_space_crypt_t* crypt_data = fil_space_create_crypt_data(encryption, key_id);
+ crypt_data->page0_offset = offset;
+ crypt_data->min_key_version = min_key_version;
+ crypt_data->encryption = encryption;
+ memcpy(crypt_data->iv, ptr, len);
+ ptr += len;
+
+ /* update fil_space memory cache with crypt_data */
+ fil_space_set_crypt_data(space_id, crypt_data);
+
+ return ptr;
+}
+
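Likewise, the MLOG_FILE_WRITE_CRYPT_DATA record consumed by fil_parse_write_crypt_data() has a fixed 17-byte body followed by the IV, matching the entry_size computation above:

    /* [0]   space_id         4 bytes
       [4]   offset           2 bytes (position of the crypt data on page 0)
       [6]   type             1 byte
       [7]   iv_len           1 byte (must equal CRYPT_SCHEME_1_IV_LEN)
       [8]   min_key_version  4 bytes
       [12]  key_id           4 bytes
       [16]  encryption       1 byte (fil_encryption_t)
       [17]  iv               iv_len bytes */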
+/******************************************************************
+Clear crypt data from a page (0) */
+UNIV_INTERN
+void
+fil_space_clear_crypt_data(
+/*=======================*/
+ byte* page, /*!< in/out: Page 0 */
+ ulint offset) /*!< in: Offset */
+{
+ //TODO(jonaso): pass crypt-data and read len from there
+ ulint len = CRYPT_SCHEME_1_IV_LEN;
+ ulint size =
+ sizeof(CRYPT_MAGIC) +
+ 1 + // type
+ 1 + // len
+ len + // iv
+ 4 + // min key version
+ 4 + // key id
+ 1; // fil_encryption_t
+ memset(page + offset, 0, size);
+}
+
+/******************************************************************
+Encrypt a page */
+UNIV_INTERN
+byte*
+fil_space_encrypt(
+/*==============*/
+ ulint space, /*!< in: Space id */
+ ulint offset, /*!< in: Page offset */
+ lsn_t lsn, /*!< in: lsn */
+ byte* src_frame, /*!< in: Source page to be encrypted */
+ ulint zip_size, /*!< in: compressed size if
+ row_format compressed */
+ byte* dst_frame) /*!< out: output buffer */
+{
+ fil_space_crypt_t* crypt_data = NULL;
+ ulint page_size = (zip_size) ? zip_size : UNIV_PAGE_SIZE;
+ uint key_version;
+
+ ulint orig_page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE);
+
+ if (orig_page_type==FIL_PAGE_TYPE_FSP_HDR
+ || orig_page_type==FIL_PAGE_TYPE_XDES) {
+ /* File space header or extent descriptor do not need to be
+ encrypted. */
+ return src_frame;
+ }
+
+ /* Get crypt data from file space */
+ crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ return src_frame;
+ }
+
+ ut_ad(crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+
+ key_version = fil_crypt_get_latest_key_version(crypt_data);
+
+ if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Unknown key id %u. Can't continue!\n",
+ crypt_data->key_id);
+ ut_error;
+ }
+
+ ibool page_compressed = (orig_page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+ ulint header_len = FIL_PAGE_DATA;
+
+ if (page_compressed) {
+ header_len += (FIL_PAGE_COMPRESSED_SIZE + FIL_PAGE_COMPRESSION_METHOD_SIZE);
+ }
+
+ /* FIL page header is not encrypted */
+ memcpy(dst_frame, src_frame, header_len);
+
+ /* Store key version */
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, key_version);
+
+ /* Calculate the start offset in a page */
+ ulint unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
+ ulint srclen = page_size - unencrypted_bytes;
+ const byte* src = src_frame + header_len;
+ byte* dst = dst_frame + header_len;
+ uint32 dstlen = 0;
+
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ space, offset, lsn);
+
+ if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Unable to encrypt data-block "
+ " src: %p srclen: %ld buf: %p buflen: %d."
+ " return-code: %d. Can't continue!\n",
+ src, (long)srclen,
+ dst, dstlen, rc);
+ ut_error;
+ }
+
+ /* For page-compressed tables we do not copy the FIL trailer because
+ the whole page is not stored to disk. For those tables only
+ the FIL header plus the compressed (and now encrypted) payload,
+ aligned to a sector boundary, is written. */
+ if (!page_compressed) {
+ /* FIL page trailer is also not encrypted */
+ memcpy(dst_frame + page_size - FIL_PAGE_DATA_END,
+ src_frame + page_size - FIL_PAGE_DATA_END,
+ FIL_PAGE_DATA_END);
+ }
+
+ /* handle post encryption checksum */
+ ib_uint32_t checksum = 0;
+ srv_checksum_algorithm_t algorithm =
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+ if (zip_size == 0) {
+ switch (algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ checksum = buf_calc_page_crc32(dst_frame);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ checksum = (ib_uint32_t) buf_calc_page_new_checksum(
+ dst_frame);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ checksum = BUF_NO_CHECKSUM_MAGIC;
+ break;
+ /* no default so the compiler will emit a warning
+ * if new enum is added and not handled here */
+ }
+ } else {
+ checksum = page_zip_calc_checksum(dst_frame, zip_size,
+ algorithm);
+ }
+
+ // store the post-encryption checksum after the key-version
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4, checksum);
+
+ srv_stats.pages_encrypted.inc();
+
+ return dst_frame;
+}
+
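A compact summary of which byte ranges fil_space_encrypt() leaves in cleartext on an uncompressed page of page_size bytes, derived from the copies above; page-compressed pages extend the header by FIL_PAGE_COMPRESSED_SIZE + FIL_PAGE_COMPRESSION_METHOD_SIZE and skip the trailer copy.

    /* [0, FIL_PAGE_DATA)                              FIL header, copied as-is;
                                                       the key version and the
                                                       post-encryption checksum live at
                                                       FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
                                                       and the following 4 bytes.
       [FIL_PAGE_DATA, page_size - FIL_PAGE_DATA_END)  encrypted payload
       [page_size - FIL_PAGE_DATA_END, page_size)      FIL trailer, copied as-is. */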
+/*********************************************************************
+Check if extra buffer shall be allocated for decrypting after read
+@return true if fil space has encryption data. */
+UNIV_INTERN
+bool
+fil_space_check_encryption_read(
+/*=============================*/
+ ulint space) /*!< in: tablespace id */
+{
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ return false;
+ }
+
+ if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
+ return false;
+ }
+
+ if (crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+ return false;
+ }
+
+ return true;
+}
+
+/******************************************************************
+Decrypt a page
+@return true if page decrypted, false if not.*/
+UNIV_INTERN
+bool
+fil_space_decrypt(
+/*==============*/
+ fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+ byte* tmp_frame, /*!< in: temporary buffer */
+ ulint page_size, /*!< in: page size */
+ byte* src_frame) /*!< in:out: page buffer */
+{
+ ulint page_type = mach_read_from_2(src_frame+FIL_PAGE_TYPE);
+ uint key_version = mach_read_from_4(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ bool page_compressed = (page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+
+ if (key_version == ENCRYPTION_KEY_NOT_ENCRYPTED) {
+ return false;
+ }
+
+ ut_ad(crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+
+ /* read space & offset & lsn */
+ ulint space = mach_read_from_4(
+ src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ ulint offset = mach_read_from_4(
+ src_frame + FIL_PAGE_OFFSET);
+ ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+ ulint header_len = FIL_PAGE_DATA;
+
+ if (page_compressed) {
+ header_len += (FIL_PAGE_COMPRESSED_SIZE + FIL_PAGE_COMPRESSION_METHOD_SIZE);
+ }
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint32 dstlen = 0;
+ ulint srclen = page_size - (header_len + FIL_PAGE_DATA_END);
+
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ space, offset, lsn);
+
+ if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Unable to decrypt data-block "
+ " src: %p srclen: %ld buf: %p buflen: %d."
+ " return-code: %d. Can't continue!\n",
+ src, (long)srclen,
+ dst, dstlen, rc);
+ ut_error;
+ }
+
+ /* For page-compressed tables the FIL trailer is not copied because
+ the whole page is not stored to disk. For those tables only
+ the FIL header plus the compressed (and now encrypted) payload,
+ aligned to a sector boundary, is written. */
+ if (!page_compressed) {
+ /* Copy FIL trailer */
+ memcpy(tmp_frame + page_size - FIL_PAGE_DATA_END,
+ src_frame + page_size - FIL_PAGE_DATA_END,
+ FIL_PAGE_DATA_END);
+
+ // clear key-version & crypt-checksum from dst
+ memset(tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ }
+
+ srv_stats.pages_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
+
+/******************************************************************
+Decrypt a page
+@return decrypted page, or the original page unchanged if it was not
+encrypted. */
+UNIV_INTERN
+byte*
+fil_space_decrypt(
+/*==============*/
+ ulint space, /*!< in: Fil space id */
+ byte* tmp_frame, /*!< in: temporary buffer */
+ ulint page_size, /*!< in: page size */
+ byte* src_frame) /*!< in/out: page buffer */
+{
+ bool encrypted = fil_space_decrypt(
+ fil_space_get_crypt_data(space),
+ tmp_frame,
+ page_size,
+ src_frame);
+
+ if (encrypted) {
+ /* Copy the decrypted page back to page buffer, not
+ really any other options. */
+ memcpy(src_frame, tmp_frame, page_size);
+ }
+
+ return src_frame;
+}
+
+/*********************************************************************
+Verify checksum for a page (iff it's encrypted)
+NOTE: currently this function can only be run in single threaded mode
+as it modifies srv_checksum_algorithm (temporarily)
+@return true if page is encrypted AND OK, false otherwise */
+UNIV_INTERN
+bool
+fil_space_verify_crypt_checksum(
+/*============================*/
+ const byte* src_frame, /*!< in: page to verify */
+ ulint zip_size) /*!< in: compressed size if
+ row_format compressed */
+{
+ // key version
+ uint key_version = mach_read_from_4(
+ src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ if (key_version == 0) {
+ return false; // unencrypted page
+ }
+
+ /* "trick" the normal checksum routines by storing the post-encryption
+ * checksum into the normal checksum field allowing for reuse of
+ * the normal routines */
+
+ // post encryption checksum
+ ib_uint32_t stored_post_encryption = mach_read_from_4(
+ src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4);
+
+ // save pre encryption checksum for restore in end of this function
+ ib_uint32_t stored_pre_encryption = mach_read_from_4(
+ src_frame + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ ib_uint32_t checksum_field2 = mach_read_from_4(
+ src_frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ /** prepare frame for usage of normal checksum routines */
+ mach_write_to_4(const_cast<byte*>(src_frame) + FIL_PAGE_SPACE_OR_CHKSUM,
+ stored_post_encryption);
+
+ /* NOTE: this function is (currently) only run when restoring
+ * dblwr-buffer, server is single threaded so it's safe to modify
+ * srv_checksum_algorithm */
+ srv_checksum_algorithm_t save_checksum_algorithm =
+ (srv_checksum_algorithm_t)srv_checksum_algorithm;
+
+ if (zip_size == 0 &&
+ (save_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB ||
+ save_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB)) {
+ /* handle ALGORITHM_INNODB / ALGORITHM_STRICT_INNODB specially:
+ * "downgrade" to plain ALGORITHM_INNODB and store BUF_NO_CHECKSUM_MAGIC
+ * in checksum_field2, which is sort of pointless anyway...
+ */
+ srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+ mach_write_to_4(const_cast<byte*>(src_frame) +
+ UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ BUF_NO_CHECKSUM_MAGIC);
+ }
+
+ /* verify checksums */
+ ibool corrupted = buf_page_is_corrupted(false, src_frame, zip_size);
+
+ /** restore frame & algorithm */
+ srv_checksum_algorithm = save_checksum_algorithm;
+
+ mach_write_to_4(const_cast<byte*>(src_frame) +
+ FIL_PAGE_SPACE_OR_CHKSUM,
+ stored_pre_encryption);
+
+ mach_write_to_4(const_cast<byte*>(src_frame) +
+ UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum_field2);
+
+ if (!corrupted) {
+ return true; // page was encrypted and checksum matched
+ } else {
+ return false; // page was encrypted but checksum didn't match
+ }
+}
+
+/***********************************************************************/
+
+/** A copy of global key state */
+struct key_state_t {
+ key_state_t() : key_id(0), key_version(0),
+ rotate_key_age(srv_fil_crypt_rotate_key_age) {}
+ bool operator==(const key_state_t& other) const {
+ return key_version == other.key_version &&
+ rotate_key_age == other.rotate_key_age;
+ }
+ uint key_id;
+ uint key_version;
+ uint rotate_key_age;
+};
+
+/***********************************************************************
+Copy global key state */
+static void
+fil_crypt_get_key_state(
+/*====================*/
+ key_state_t *new_state) /*!< out: key state */
+{
+ if (srv_encrypt_tables) {
+ new_state->key_version =
+ encryption_key_get_latest_version(new_state->key_id);
+ new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
+ ut_a(new_state->key_version != ENCRYPTION_KEY_VERSION_INVALID);
+ ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+ } else {
+ new_state->key_version = 0;
+ new_state->rotate_key_age = 0;
+ }
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+/*=====================*/
+ fil_encryption_t encrypt_mode, /*!< in: Encryption
+ mode */
+ uint key_version, /*!< in: Key version */
+ uint latest_key_version, /*!< in: Latest key version */
+ uint rotate_key_age) /*!< in: When to rotate */
+{
+ if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+ return false;
+ }
+
+ if (key_version == 0 && latest_key_version != 0) {
+ /* this is rotation unencrypted => encrypted
+ * ignore rotate_key_age */
+ return true;
+ }
+
+ if (latest_key_version == 0 && key_version != 0) {
+ if (encrypt_mode == FIL_SPACE_ENCRYPTION_DEFAULT) {
+ /* this is rotation encrypted => unencrypted */
+ return true;
+ }
+ return false;
+ }
+
+ /* this is rotation encrypted => encrypted,
+ * only reencrypt if key is sufficiently old */
+ if (key_version + rotate_key_age < latest_key_version) {
+ return true;
+ }
+
+ return false;
+}
+
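A few worked cases for fil_crypt_needs_rotation(), following the branches above with rotate_key_age assumed to be 1; the values are illustrative only.

    /* key_version == 0, latest_key_version == 5  -> true  (unencrypted -> encrypted)
       key_version == 5, latest_key_version == 0  -> true  only when encrypt_mode is
                                                    FIL_SPACE_ENCRYPTION_DEFAULT
                                                    (encrypted -> unencrypted), else false
       key_version == 4, latest_key_version == 5  -> false (4 + 1 is not < 5)
       key_version == 3, latest_key_version == 5  -> true  (key older than rotate_key_age)
       key_version == ENCRYPTION_KEY_VERSION_INVALID -> false */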
+/***********************************************************************
+Check if a space is closing (i.e. just before being dropped)
+@return true if space is closing, false if not. */
+UNIV_INTERN
+bool
+fil_crypt_is_closing(
+/*=================*/
+ ulint space) /*!< in: FIL space id */
+{
+ bool closing=true;
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data) {
+ mutex_enter(&crypt_data->mutex);
+ closing = crypt_data->closing;
+ mutex_exit(&crypt_data->mutex);
+ }
+
+ return closing;
+}
+
+/***********************************************************************
+Start encrypting a space
+@return true if a pending op (fil_inc_pending_ops/fil_decr_pending_ops) is held
+*/
+static
+bool
+fil_crypt_start_encrypting_space(
+/*=============================*/
+ ulint space, /*!< in: FIL space id */
+ bool* recheck)/*!< out: true if recheck needed */
+{
+
+ /* we have a pending op when entering function */
+ bool pending_op = true;
+
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+ ibool page_encrypted = (crypt_data != NULL);
+
+ /* If the space is not encrypted and encryption is not enabled, then
+ do not continue encrypting the space. */
+ if (!page_encrypted && !srv_encrypt_tables) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return pending_op;
+ }
+
+ if (crypt_data != NULL || fil_crypt_start_converting) {
+ /* someone beat us to it */
+ if (fil_crypt_start_converting) {
+ *recheck = true;
+ }
+
+ mutex_exit(&fil_crypt_threads_mutex);
+ return pending_op;
+ }
+
+ /* NOTE: we need to write and flush page 0 before publishing
+ * the crypt data. This is so that after restart there is no
+ * risk of finding encrypted pages without having
+ * crypt data in page 0 */
+
+ /* 1 - create crypt data */
+ crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+ if (crypt_data == NULL) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return pending_op;
+ }
+
+ crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
+ crypt_data->min_key_version = 0; // all pages are unencrypted
+ crypt_data->rotate_state.start_time = time(0);
+ crypt_data->rotate_state.starting = true;
+ crypt_data->rotate_state.active_threads = 1;
+
+ mutex_enter(&crypt_data->mutex);
+ crypt_data = fil_space_set_crypt_data(space, crypt_data);
+ mutex_exit(&crypt_data->mutex);
+
+ fil_crypt_start_converting = true;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ do
+ {
+ if (fil_crypt_is_closing(space) ||
+ fil_space_found_by_id(space) == NULL) {
+ break;
+ }
+
+ mtr_t mtr;
+ mtr_start(&mtr);
+
+ /* 2 - get page 0 */
+ ulint offset = 0;
+ ulint zip_size = fil_space_get_zip_size(space);
+ buf_block_t* block = buf_page_get_gen(space, zip_size, offset,
+ RW_X_LATCH,
+ NULL,
+ BUF_GET,
+ __FILE__, __LINE__,
+ &mtr);
+
+ if (fil_crypt_is_closing(space) ||
+ fil_space_found_by_id(space) == NULL) {
+ mtr_commit(&mtr);
+ break;
+ }
+
+ /* 3 - compute location to store crypt data */
+ byte* frame = buf_block_get_frame(block);
+ ulint maxsize;
+ ut_ad(crypt_data);
+ crypt_data->page0_offset =
+ fsp_header_get_crypt_offset(zip_size, &maxsize);
+
+ /* 4 - write crypt data to page 0 */
+ fil_space_write_crypt_data_low(crypt_data,
+ CRYPT_SCHEME_1,
+ frame,
+ crypt_data->page0_offset,
+ maxsize, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (fil_crypt_is_closing(space) ||
+ fil_space_found_by_id(space) == NULL) {
+ break;
+ }
+
+ /* record lsn of update */
+ lsn_t end_lsn = mtr.end_lsn;
+
+ /* 5 - sync tablespace before publishing crypt data */
+
+ /* release "lock" while syncing */
+ fil_decr_pending_ops(space);
+ pending_op = false;
+
+ bool success = false;
+ ulint n_pages = 0;
+ ulint sum_pages = 0;
+ do {
+ success = buf_flush_list(ULINT_MAX, end_lsn, &n_pages);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ sum_pages += n_pages;
+ } while (!success &&
+ !fil_crypt_is_closing(space) &&
+ !fil_space_found_by_id(space));
+
+ /* try to reacquire pending op */
+ if (fil_inc_pending_ops(space, true)) {
+ break;
+ }
+
+ /* pending op reacquired! */
+ pending_op = true;
+
+ if (fil_crypt_is_closing(space) ||
+ fil_space_found_by_id(space) == NULL) {
+ break;
+ }
+
+ /* 6 - publish crypt data */
+ mutex_enter(&fil_crypt_threads_mutex);
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->type = CRYPT_SCHEME_1;
+ ut_a(crypt_data->rotate_state.active_threads == 1);
+ crypt_data->rotate_state.active_threads = 0;
+ crypt_data->rotate_state.starting = false;
+
+ fil_crypt_start_converting = false;
+ mutex_exit(&crypt_data->mutex);
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ return pending_op;
+ } while (0);
+
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ ut_a(crypt_data->rotate_state.active_threads == 1);
+ crypt_data->rotate_state.active_threads = 0;
+ mutex_exit(&crypt_data->mutex);
+
+ mutex_enter(&fil_crypt_threads_mutex);
+ fil_crypt_start_converting = false;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ return pending_op;
+}
+
+/** State of a rotation thread */
+struct rotate_thread_t {
+ explicit rotate_thread_t(uint no) {
+ memset(this, 0, sizeof(* this));
+ thread_no = no;
+ first = true;
+ estimated_max_iops = 20;
+ }
+
+ uint thread_no;
+ bool first; /*!< is position before first space */
+ ulint space; /*!< current space */
+ ulint offset; /*!< current offset */
+ ulint batch; /*!< #pages to rotate */
+ uint min_key_version_found;/*!< min key version found but not rotated */
+ lsn_t end_lsn; /*!< max lsn when rotating this space */
+
+ uint estimated_max_iops; /*!< estimation of max iops */
+ uint allocated_iops; /*!< allocated iops */
+ uint cnt_waited; /*!< #times waited during this slot */
+ uint sum_waited_us; /*!< wait time during this slot */
+
+ fil_crypt_stat_t crypt_stat; // statistics
+
+ btr_scrub_t scrub_data; /* thread local data used by btr_scrub-functions
+ * when iterating pages of tablespace */
+
+ /* check if this thread should shut down: either the server is
+ * shutting down or srv_n_fil_crypt_threads was lowered below this
+ * thread's number */
+ bool should_shutdown() const {
+ return ! (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
+ thread_no < srv_n_fil_crypt_threads);
+ }
+};
+
+/***********************************************************************
+Check if space needs rotation given a key_state
+@return true if space needs key rotation */
+static
+bool
+fil_crypt_space_needs_rotation(
+/*===========================*/
+ rotate_thread_t* state, /*!< in: Key rotation state */
+ key_state_t* key_state, /*!< in: Key state */
+ bool* recheck) /*!< out: needs recheck ? */
+{
+ ulint space = state->space;
+
+ /* Make sure that the tablespace is found and that it is a normal tablespace */
+ if (fil_space_found_by_id(space) == NULL ||
+ fil_space_get_type(space) != FIL_TABLESPACE) {
+ return false;
+ }
+
+ if (fil_inc_pending_ops(space, true)) {
+ /* tablespace being dropped */
+ return false;
+ }
+
+ /* keep track of whether we have a pending op */
+ bool pending_op = true;
+
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ /**
+ * space has no crypt data
+ * start encrypting it...
+ */
+ pending_op = fil_crypt_start_encrypting_space(space, recheck);
+
+ crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ if (pending_op) {
+ fil_decr_pending_ops(space);
+ }
+ return false;
+ }
+ }
+
+ mutex_enter(&crypt_data->mutex);
+
+ do {
+ /* prevent threads from starting to rotate space */
+ if (crypt_data->rotate_state.starting) {
+ /* recheck this space later */
+ *recheck = true;
+ break;
+ }
+
+ /* skip a tablespace that is being closed */
+ if (crypt_data->closing) {
+ break;
+ }
+
+ if (crypt_data->rotate_state.flushing) {
+ break;
+ }
+
+ if (crypt_data->key_id != key_state->key_id) {
+ key_state->key_id= crypt_data->key_id;
+ fil_crypt_get_key_state(key_state);
+ }
+
+ bool need_key_rotation = fil_crypt_needs_rotation(
+ crypt_data->encryption,
+ crypt_data->min_key_version,
+ key_state->key_version, key_state->rotate_key_age);
+
+ crypt_data->rotate_state.scrubbing.is_active =
+ btr_scrub_start_space(space, &state->scrub_data);
+
+ time_t diff = time(0) - crypt_data->rotate_state.scrubbing.
+ last_scrub_completed;
+ bool need_scrubbing =
+ crypt_data->rotate_state.scrubbing.is_active
+ && diff >= srv_background_scrub_data_interval;
+
+ if (need_key_rotation == false && need_scrubbing == false)
+ break;
+
+ mutex_exit(&crypt_data->mutex);
+ /* NOTE! fil_decr_pending_ops is performed outside */
+ return true;
+ } while (0);
+
+ mutex_exit(&crypt_data->mutex);
+
+ if (pending_op) {
+ fil_decr_pending_ops(space);
+ }
+
+ return false;
+}
+
+/***********************************************************************
+Update global statistics with thread statistics */
+static void
+fil_crypt_update_total_stat(
+/*========================*/
+ rotate_thread_t *state) /*!< in: Key rotation status */
+{
+ mutex_enter(&crypt_stat_mutex);
+ crypt_stat.pages_read_from_cache +=
+ state->crypt_stat.pages_read_from_cache;
+ crypt_stat.pages_read_from_disk +=
+ state->crypt_stat.pages_read_from_disk;
+ crypt_stat.pages_modified += state->crypt_stat.pages_modified;
+ crypt_stat.pages_flushed += state->crypt_stat.pages_flushed;
+ // remove old estimate
+ crypt_stat.estimated_iops -= state->crypt_stat.estimated_iops;
+ // add new estimate
+ crypt_stat.estimated_iops += state->estimated_max_iops;
+ mutex_exit(&crypt_stat_mutex);
+
+ // reset the thread-local counters and remember the estimate we
+ // just published so it can be subtracted on the next update
+ memset(&state->crypt_stat, 0, sizeof(state->crypt_stat));
+ state->crypt_stat.estimated_iops = state->estimated_max_iops;
+}
+
+/***********************************************************************
+Allocate iops to thread from global setting,
+used before starting to rotate a space.
+@return true if allocation succeeded, false if failed */
+static
+bool
+fil_crypt_alloc_iops(
+/*=================*/
+ rotate_thread_t *state) /*!< in: Key rotation status */
+{
+ ut_ad(state->allocated_iops == 0);
+
+ uint max_iops = state->estimated_max_iops;
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ if (n_fil_crypt_iops_allocated >= srv_n_fil_crypt_iops) {
+ /* this can happen when the user decreases srv_n_fil_crypt_iops */
+ mutex_exit(&fil_crypt_threads_mutex);
+ return false;
+ }
+
+ uint alloc = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated;
+
+ if (alloc > max_iops) {
+ alloc = max_iops;
+ }
+
+ n_fil_crypt_iops_allocated += alloc;
+ mutex_exit(&fil_crypt_threads_mutex);
+
+ state->allocated_iops = alloc;
+
+ return alloc > 0;
+}
+
+/***********************************************************************
+Reallocate iops to thread,
+used when inside a space */
+static
+void
+fil_crypt_realloc_iops(
+/*===================*/
+ rotate_thread_t *state) /*!< in: Key rotation status */
+{
+ ut_a(state->allocated_iops > 0);
+
+ if (10 * state->cnt_waited > state->batch) {
+ /* if we waited more than 10% of the batch, re-estimate max_iops */
+ uint avg_wait_time_us =
+ state->sum_waited_us / state->cnt_waited;
+
+#if DEBUG_KEYROTATION_THROTTLING
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "thr_no: %u - update estimated_max_iops from %u to %u.",
+ state->thread_no,
+ state->estimated_max_iops,
+ 1000000 / avg_wait_time_us);
+#endif
+ if (avg_wait_time_us == 0) {
+ avg_wait_time_us = 1; // prevent division by zero
+ }
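+ /* e.g. an average wait of 2500 us per page gives an
+ * estimate of 1000000 / 2500 = 400 iops */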
+
+ state->estimated_max_iops = 1000000 / avg_wait_time_us;
+ state->cnt_waited = 0;
+ state->sum_waited_us = 0;
+ } else {
+#if DEBUG_KEYROTATION_THROTTLING
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "thr_no: %u only waited %lu%% skip re-estimate.",
+ state->thread_no,
+ (100 * state->cnt_waited) / state->batch);
+#endif
+ }
+
+ if (state->estimated_max_iops <= state->allocated_iops) {
+ /* return extra iops */
+ uint extra = state->allocated_iops - state->estimated_max_iops;
+
+ if (extra > 0) {
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < extra) {
+ /* unknown bug!
+ * crash in debug
+ * keep n_fil_crypt_iops_allocated unchanged
+ * in release */
+ ut_ad(0);
+ extra = 0;
+ }
+ n_fil_crypt_iops_allocated -= extra;
+ state->allocated_iops -= extra;
+
+ if (state->allocated_iops == 0) {
+ /* no matter how slow io system seems to be
+ * never decrease allocated_iops to 0... */
+ state->allocated_iops ++;
+ n_fil_crypt_iops_allocated ++;
+ }
+ mutex_exit(&fil_crypt_threads_mutex);
+ os_event_set(fil_crypt_threads_event);
+ }
+ } else {
+ /* see if there are more to get */
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < srv_n_fil_crypt_iops) {
+ /* there are extra iops free */
+ uint extra = srv_n_fil_crypt_iops -
+ n_fil_crypt_iops_allocated;
+ if (state->allocated_iops + extra >
+ state->estimated_max_iops) {
+ /* but don't alloc more than our max */
+ extra = state->estimated_max_iops -
+ state->allocated_iops;
+ }
+ n_fil_crypt_iops_allocated += extra;
+ state->allocated_iops += extra;
+#if DEBUG_KEYROTATION_THROTTLING
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "thr_no: %u increased iops from %u to %u.",
+ state->thread_no,
+ state->allocated_iops - extra,
+ state->allocated_iops);
+#endif
+ }
+ mutex_exit(&fil_crypt_threads_mutex);
+ }
+
+ fil_crypt_update_total_stat(state);
+}
+
+/***********************************************************************
+Return allocated iops to global */
+static
+void
+fil_crypt_return_iops(
+/*==================*/
+ rotate_thread_t *state) /*!< in: Key rotation status */
+{
+ if (state->allocated_iops > 0) {
+ uint iops = state->allocated_iops;
+ mutex_enter(&fil_crypt_threads_mutex);
+ if (n_fil_crypt_iops_allocated < iops) {
+ /* unknown bug!
+ * crash in debug
+ * keep n_fil_crypt_iops_allocated unchanged
+ * in release */
+ ut_ad(0);
+ iops = 0;
+ }
+ n_fil_crypt_iops_allocated -= iops;
+ mutex_exit(&fil_crypt_threads_mutex);
+ state->allocated_iops = 0;
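+ /* wake up any threads waiting (e.g. in
+ * fil_crypt_find_space_to_rotate()) for iops to become available */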
+ os_event_set(fil_crypt_threads_event);
+ }
+
+ fil_crypt_update_total_stat(state);
+}
+
+/***********************************************************************
+Search for a space needing rotation */
+UNIV_INTERN
+bool
+fil_crypt_find_space_to_rotate(
+/*===========================*/
+ key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state, /*!< in: Key rotation state */
+ bool* recheck) /*!< out: true if recheck
+ needed */
+{
+ /* we need iops to start rotating */
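+ /* note (added for clarity): the wait below is bounded to 1 s, and
+ * fil_crypt_return_iops() sets fil_crypt_threads_event, so we retry
+ * as soon as another thread gives iops back */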
+ while (!state->should_shutdown() && !fil_crypt_alloc_iops(state)) {
+ os_event_reset(fil_crypt_threads_event);
+ os_event_wait_time(fil_crypt_threads_event, 1000000);
+ }
+
+ if (state->should_shutdown())
+ return false;
+
+ if (state->first) {
+ state->first = false;
+ state->space = fil_get_first_space_safe();
+ } else {
+ state->space = fil_get_next_space_safe(state->space);
+ }
+
+ while (!state->should_shutdown() && state->space != ULINT_UNDEFINED) {
+
+ if (fil_crypt_space_needs_rotation(state, key_state, recheck)) {
+ ut_ad(key_state->key_id);
+ /* init state->min_key_version_found before
+ * starting on a space */
+ state->min_key_version_found = key_state->key_version;
+ return true;
+ }
+
+ state->space = fil_get_next_space_safe(state->space);
+ }
+
+ /* if we didn't find any space return iops */
+ fil_crypt_return_iops(state);
+
+ return false;
+
+}
+
+/***********************************************************************
+Start rotating a space */
+static
+void
+fil_crypt_start_rotate_space(
+/*=========================*/
+ const key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state) /*!< in: Key rotation state */
+{
+ ulint space = state->space;
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ ut_ad(key_state->key_id == crypt_data->key_id);
+
+ if (crypt_data->rotate_state.active_threads == 0) {
+ /* only first thread needs to init */
+ crypt_data->rotate_state.next_offset = 1; // skip page 0
+ /* no need to rotate beyond the current max;
+ * if the space extends, new pages will be encrypted with the newer version */
+ crypt_data->rotate_state.max_offset = fil_space_get_size(space);
+
+ crypt_data->rotate_state.end_lsn = 0;
+ crypt_data->rotate_state.min_key_version_found =
+ key_state->key_version;
+
+ crypt_data->rotate_state.start_time = time(0);
+
+ if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
+ crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF &&
+ key_state->key_version != 0) {
+ /* this is rotation unencrypted => encrypted */
+ crypt_data->type = CRYPT_SCHEME_1;
+ }
+ }
+
+ /* count active threads in space */
+ crypt_data->rotate_state.active_threads++;
+
+ /* Initialize thread local state */
+ state->end_lsn = crypt_data->rotate_state.end_lsn;
+ state->min_key_version_found =
+ crypt_data->rotate_state.min_key_version_found;
+
+ mutex_exit(&crypt_data->mutex);
+}
+
+/***********************************************************************
+Search for batch of pages needing rotation
+@return true if page needing key rotation found, false if not found */
+static
+bool
+fil_crypt_find_page_to_rotate(
+/*==========================*/
+ const key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state) /*!< in: Key rotation state */
+{
+ ulint batch = srv_alloc_time * state->allocated_iops;
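+ /* note (added for clarity): the batch scales with the allocated
+ * iops, srv_alloc_time acting as the per-batch budget
+ * (e.g. 100 iops with srv_alloc_time = 3 gives a 300-page batch) */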
+ ulint space = state->space;
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ /* Space might already be dropped */
+ if (crypt_data) {
+ mutex_enter(&crypt_data->mutex);
+ ut_ad(key_state->key_id == crypt_data->key_id);
+
+ if (crypt_data->closing == false &&
+ crypt_data->rotate_state.next_offset <
+ crypt_data->rotate_state.max_offset) {
+
+ state->offset = crypt_data->rotate_state.next_offset;
+ ulint remaining = crypt_data->rotate_state.max_offset -
+ crypt_data->rotate_state.next_offset;
+
+ if (batch <= remaining) {
+ state->batch = batch;
+ } else {
+ state->batch = remaining;
+ }
+
+ crypt_data->rotate_state.next_offset += batch;
+ mutex_exit(&crypt_data->mutex);
+ return true;
+ }
+
+ mutex_exit(&crypt_data->mutex);
+ }
+
+ return false;
+}
+
+/***********************************************************************
+Check if a page is uninitialized (doesn't need to be rotated)
+@return true if page is uninitialized, false if not.*/
+static
+bool
+fil_crypt_is_page_uninitialized(
+/*============================*/
+ const byte *frame, /*!< in: Page */
+ uint zip_size) /*!< in: compressed size if
+ row_format compressed */
+{
+ if (zip_size) {
+ ulint stored_checksum = mach_read_from_4(
+ frame + FIL_PAGE_SPACE_OR_CHKSUM);
+ /* empty pages aren't encrypted */
+ if (stored_checksum == 0) {
+ return true;
+ }
+ } else {
+ ulint size = UNIV_PAGE_SIZE;
+ ulint checksum_field1 = mach_read_from_4(
+ frame + FIL_PAGE_SPACE_OR_CHKSUM);
+ ulint checksum_field2 = mach_read_from_4(
+ frame + size - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ /* empty pages are not encrypted */
+ if (checksum_field1 == 0 && checksum_field2 == 0
+ && mach_read_from_4(frame + FIL_PAGE_LSN) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
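+ /* convenience wrapper that passes the caller's __FILE__/__LINE__ on
+ * to the buffer pool page fetch for diagnostics */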
+#define fil_crypt_get_page_throttle(state,space,zip_size,offset,mtr,sleeptime_ms) \
+ fil_crypt_get_page_throttle_func(state, space, zip_size, offset, mtr, \
+ sleeptime_ms, __FILE__, __LINE__)
+
+/***********************************************************************
+Get a page and compute sleep time
+@return page */
+static
+buf_block_t*
+fil_crypt_get_page_throttle_func(
+/*=============================*/
+ rotate_thread_t* state, /*!< in/out: Key rotation state */
+ ulint space, /*!< in: FIL space id */
+ uint zip_size, /*!< in: compressed size if
+ row_format compressed */
+ ulint offset, /*!< in: page offset */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ulint* sleeptime_ms, /*!< out: sleep time */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: file line */
+{
+ buf_block_t* block = buf_page_try_get_func(space, offset, RW_X_LATCH,
+ true,
+ file, line, mtr);
+ if (block != NULL) {
+ /* page was in buffer pool */
+ state->crypt_stat.pages_read_from_cache++;
+ return block;
+ }
+
+ /* Before reading from the tablespace we need to make sure that
+ the tablespace exists and is not just being dropped. */
+
+ if (fil_crypt_is_closing(space) ||
+ fil_space_found_by_id(space) == NULL) {
+ return NULL;
+ }
+
+ state->crypt_stat.pages_read_from_disk++;
+
+ ullint start = ut_time_us(NULL);
+ block = buf_page_get_gen(space, zip_size, offset,
+ RW_X_LATCH,
+ NULL, BUF_GET_POSSIBLY_FREED,
+ file, line, mtr);
+ ullint end = ut_time_us(NULL);
+
+ if (end < start) {
+ end = start; // safety...
+ }
+
+ state->cnt_waited++;
+ state->sum_waited_us += (end - start);
+
+ /* average page load */
+ ulint add_sleeptime_ms = 0;
+ ulint avg_wait_time_us = state->sum_waited_us / state->cnt_waited;
+ ulint alloc_wait_us = 1000000 / state->allocated_iops;
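+ /* e.g. 100 allocated iops target 10000 us per page; if the average
+ * read so far took 3000 us we add ~7 ms of sleep for this page to
+ * stay within the allocation */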
+
+ if (avg_wait_time_us < alloc_wait_us) {
+ /* we are reading faster than our allocation allows */
+ add_sleeptime_ms = (alloc_wait_us - avg_wait_time_us) / 1000;
+ } else {
+ /* if page load time is longer than we want, skip sleeping */
+ }
+
+ *sleeptime_ms += add_sleeptime_ms;
+ return block;
+}
+
+
+/***********************************************************************
+Get block and allocation status
+
+note: innodb locks fil_space_latch and then block when allocating page
+but locks block and then fil_space_latch when freeing page.
+@return block
+*/
+static
+buf_block_t*
+btr_scrub_get_block_and_allocation_status(
+/*======================================*/
+ rotate_thread_t* state, /*!< in/out: Key rotation state */
+ ulint space, /*!< in: FIL space id */
+ uint zip_size, /*!< in: compressed size if
+ row_format compressed */
+ ulint offset, /*!< in: page offset */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ btr_scrub_page_allocation_status_t *allocation_status,
+ /*!< in/out: allocation status */
+ ulint* sleeptime_ms) /*!< out: sleep time */
+{
+ mtr_t local_mtr;
+ buf_block_t *block = NULL;
+ mtr_start(&local_mtr);
+ *allocation_status = fsp_page_is_free(space, offset, &local_mtr) ?
+ BTR_SCRUB_PAGE_FREE :
+ BTR_SCRUB_PAGE_ALLOCATED;
+
+ if (*allocation_status == BTR_SCRUB_PAGE_FREE) {
+ /* this is the easy case: we lock fil_space_latch first and
+ then the block */
+ block = fil_crypt_get_page_throttle(state,
+ space, zip_size,
+ offset, mtr,
+ sleeptime_ms);
+ mtr_commit(&local_mtr);
+ } else {
+ /* page is allocated according to xdes */
+
+ /* release fil_space_latch *before* fetching block */
+ mtr_commit(&local_mtr);
+
+ /* NOTE: when we have locked dict_index_get_lock(),
+ * it is safe to release fil_space_latch and then fetch the block,
+ * as dict_index_get_lock() is needed for tree modifications
+ * such as freeing a page
+ */
+
+ block = fil_crypt_get_page_throttle(state,
+ space, zip_size,
+ offset, mtr,
+ sleeptime_ms);
+ }
+
+ return block;
+}
+
+
+/***********************************************************************
+Rotate one page */
+static
+void
+fil_crypt_rotate_page(
+/*==================*/
+ const key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state) /*!< in: Key rotation state */
+{
+ ulint space = state->space;
+ ulint offset = state->offset;
+ const uint zip_size = fil_space_get_zip_size(space);
+ ulint sleeptime_ms = 0;
+
+ /* check if tablespace is closing before reading page */
+ if (fil_crypt_is_closing(space) || fil_space_found_by_id(space) == NULL) {
+ return;
+ }
+
+ if (space == TRX_SYS_SPACE && offset == TRX_SYS_PAGE_NO) {
+ /* don't encrypt this as it contains address to dblwr buffer */
+ return;
+ }
+
+ mtr_t mtr;
+ mtr_start(&mtr);
+ buf_block_t* block = fil_crypt_get_page_throttle(state,
+ space, zip_size,
+ offset, &mtr,
+ &sleeptime_ms);
+
+ if (block) {
+
+ bool modified = false;
+ int needs_scrubbing = BTR_SCRUB_SKIP_PAGE;
+ lsn_t block_lsn = block->page.newest_modification;
+ uint kv = block->page.key_version;
+
+ /* check if tablespace is closing after reading page */
+ if (!fil_crypt_is_closing(space)) {
+ byte* frame = buf_block_get_frame(block);
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ if (kv == 0 &&
+ fil_crypt_is_page_uninitialized(frame, zip_size)) {
+ ;
+ } else if (fil_crypt_needs_rotation(
+ crypt_data->encryption,
+ kv, key_state->key_version,
+ key_state->rotate_key_age)) {
+
+ /* the page can be "fresh", i.e. never written (kv == 0),
+ * or it must have a key version at least as big as the
+ * space minimum key version */
+ ut_a(kv == 0 || kv >= crypt_data->min_key_version);
+
+ modified = true;
+
+ /* force rotation by dummy updating page */
+ mlog_write_ulint(frame +
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ space, MLOG_4BYTES, &mtr);
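+
+ /* note (added for clarity): rewriting the space id
+ * with its current value leaves the page contents
+ * unchanged but marks the page dirty, so the next
+ * flush rewrites it under the current key state */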
+
+ /* update block */
+ block->page.key_version = key_state->key_version;
+
+ /* statistics */
+ state->crypt_stat.pages_modified++;
+ } else {
+ if (crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF) {
+ ut_a(kv >= crypt_data->min_key_version ||
+ (kv == 0 && key_state->key_version == 0));
+
+ if (kv < state->min_key_version_found) {
+ state->min_key_version_found = kv;
+ }
+ }
+ }
+
+ needs_scrubbing = btr_page_needs_scrubbing(
+ &state->scrub_data, block,
+ BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN);
+ }
+
+ mtr_commit(&mtr);
+ lsn_t end_lsn = mtr.end_lsn;
+
+ if (needs_scrubbing == BTR_SCRUB_PAGE) {
+ mtr_start(&mtr);
+ /*
+ * refetch page and allocation status
+ */
+ btr_scrub_page_allocation_status_t allocated;
+ block = btr_scrub_get_block_and_allocation_status(
+ state, space, zip_size, offset, &mtr,
+ &allocated,
+ &sleeptime_ms);
+
+ if (block) {
+
+ /* get required table/index and index-locks */
+ needs_scrubbing = btr_scrub_recheck_page(
+ &state->scrub_data, block, allocated, &mtr);
+
+ if (needs_scrubbing == BTR_SCRUB_PAGE) {
+ /* we need to refetch it once more now that we have
+ * index locked */
+ block = btr_scrub_get_block_and_allocation_status(
+ state, space, zip_size, offset, &mtr,
+ &allocated,
+ &sleeptime_ms);
+
+ needs_scrubbing = btr_scrub_page(&state->scrub_data,
+ block, allocated,
+ &mtr);
+ }
+
+ /* NOTE: the mtr is committed inside btr_scrub_recheck_page()
+ * and/or btr_scrub_page(). This makes sure that locks and
+ * pages are latched in the correct order; in some
+ * circumstances the mtr is restarted
+ * (mtr_commit() + mtr_start())
+ */
+ }
+ }
+
+ if (needs_scrubbing != BTR_SCRUB_PAGE) {
+ /* if the page didn't need scrubbing, cleanups may still be
+ needed; do those outside of any mtr to prevent deadlocks.
+
+ the kind of cleanup needed is encoded inside needs_scrubbing,
+ but that is opaque to this function (except for the value
+ BTR_SCRUB_PAGE) */
+ btr_scrub_skip_page(&state->scrub_data, needs_scrubbing);
+ }
+
+ if (needs_scrubbing == BTR_SCRUB_TURNED_OFF) {
+ /* if we just detected that scrubbing was turned off
+ * update global state to reflect this */
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->rotate_state.scrubbing.is_active = false;
+ mutex_exit(&crypt_data->mutex);
+ }
+
+ if (modified) {
+ /* if we modified page, we take lsn from mtr */
+ ut_a(end_lsn > state->end_lsn);
+ ut_a(end_lsn > block_lsn);
+ state->end_lsn = end_lsn;
+ } else {
+ /* if we did not modify page, check for max lsn */
+ if (block_lsn > state->end_lsn) {
+ state->end_lsn = block_lsn;
+ }
+ }
+ }
+
+ if (sleeptime_ms) {
+ os_event_reset(fil_crypt_throttle_sleep_event);
+ os_event_wait_time(fil_crypt_throttle_sleep_event,
+ 1000 * sleeptime_ms);
+ }
+}
+
+/***********************************************************************
+Rotate a batch of pages */
+static
+void
+fil_crypt_rotate_pages(
+/*===================*/
+ const key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state) /*!< in: Key rotation state */
+{
+ ulint space = state->space;
+ ulint end = state->offset + state->batch;
+
+ for (; state->offset < end; state->offset++) {
+
+ /* we can't rotate pages in the dblwr buffer, as it is
+ * not possible to read those due to lots of asserts
+ * in the buffer pool.
+ *
+ * However, since these are only (short-lived) copies of
+ * real pages, they will be updated anyway when the
+ * real page is updated
+ */
+ if (space == TRX_SYS_SPACE &&
+ buf_dblwr_page_inside(state->offset)) {
+ continue;
+ }
+
+ fil_crypt_rotate_page(key_state, state);
+ }
+}
+
+/***********************************************************************
+Flush rotated pages and then update page 0 */
+static
+void
+fil_crypt_flush_space(
+/*==================*/
+ rotate_thread_t* state, /*!< in: Key rotation state */
+ ulint space) /*!< in: FIL space id */
+{
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ /* flush tablespace pages so that there are no pages left with old key */
+ lsn_t end_lsn = crypt_data->rotate_state.end_lsn;
+
+ if (end_lsn > 0 && !fil_crypt_is_closing(space)) {
+ bool success = false;
+ ulint n_pages = 0;
+ ulint sum_pages = 0;
+ ullint start = ut_time_us(NULL);
+
+ do {
+ success = buf_flush_list(ULINT_MAX, end_lsn, &n_pages);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ sum_pages += n_pages;
+ } while (!success && !fil_crypt_is_closing(space));
+
+ ullint end = ut_time_us(NULL);
+
+ if (sum_pages && end > start) {
+ state->cnt_waited += sum_pages;
+ state->sum_waited_us += (end - start);
+
+ /* statistics */
+ state->crypt_stat.pages_flushed += sum_pages;
+ }
+ }
+
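+ /* note (added for clarity): min_key_version == 0 at this point means
+ * the space no longer contains any encrypted pages, so the scheme
+ * reverts to unencrypted (e.g. after encryption has been turned off) */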
+ if (crypt_data->min_key_version == 0) {
+ crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
+ }
+
+ /* update page 0 */
+ if (!fil_crypt_is_closing(space)) {
+ mtr_t mtr;
+ mtr_start(&mtr);
+ ulint offset = 0; // page 0
+ const uint zip_size = fil_space_get_zip_size(space);
+ buf_block_t* block = buf_page_get_gen(space, zip_size, offset,
+ RW_X_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__, &mtr);
+ byte* frame = buf_block_get_frame(block);
+ ulint maxsize;
+ crypt_data->page0_offset =
+ fsp_header_get_crypt_offset(zip_size, &maxsize);
+
+ fil_space_write_crypt_data(space, frame,
+ crypt_data->page0_offset,
+ ULINT_MAX, &mtr);
+ mtr_commit(&mtr);
+ }
+}
+
+/***********************************************************************
+Complete rotating a space */
+static
+void
+fil_crypt_complete_rotate_space(
+/*============================*/
+ const key_state_t* key_state, /*!< in: Key state */
+ rotate_thread_t* state) /*!< in: Key rotation state */
+{
+ ulint space = state->space;
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
+
+ /* Space might already be dropped */
+ if (crypt_data) {
+ mutex_enter(&crypt_data->mutex);
+
+ /**
+ * Update crypt data state with state from thread
+ */
+ if (state->min_key_version_found <
+ crypt_data->rotate_state.min_key_version_found) {
+ crypt_data->rotate_state.min_key_version_found =
+ state->min_key_version_found;
+ }
+
+ if (state->end_lsn > crypt_data->rotate_state.end_lsn) {
+ crypt_data->rotate_state.end_lsn = state->end_lsn;
+ }
+
+ ut_a(crypt_data->rotate_state.active_threads > 0);
+ crypt_data->rotate_state.active_threads--;
+ bool last = crypt_data->rotate_state.active_threads == 0;
+
+ /**
+ * check if the space is fully done;
+ * when threads shut down, it could be that we "complete"
+ * iterating before we have scanned the full space.
+ */
+ bool done = crypt_data->rotate_state.next_offset >=
+ crypt_data->rotate_state.max_offset;
+
+ /**
+ * we should flush space if we're last thread AND
+ * the iteration is done
+ */
+ bool should_flush = last && done;
+
+ if (should_flush) {
+ /* we're the last active thread */
+ crypt_data->rotate_state.flushing = true;
+ crypt_data->min_key_version =
+ crypt_data->rotate_state.min_key_version_found;
+ }
+
+ /* inform scrubbing */
+ crypt_data->rotate_state.scrubbing.is_active = false;
+ mutex_exit(&crypt_data->mutex);
+
+ /* all threads must call btr_scrub_complete_space() without the mutex held */
+ if (btr_scrub_complete_space(&state->scrub_data) == true) {
+ if (should_flush) {
+ /* only last thread updates last_scrub_completed */
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->rotate_state.scrubbing.
+ last_scrub_completed = time(0);
+ mutex_exit(&crypt_data->mutex);
+ }
+ }
+
+ if (should_flush) {
+ fil_crypt_flush_space(state, space);
+
+ ut_ad(crypt_data);
+ mutex_enter(&crypt_data->mutex);
+ crypt_data->rotate_state.flushing = false;
+ mutex_exit(&crypt_data->mutex);
+ }
+ }
+}
+
+/*********************************************************************//**
+A thread which monitors global key state and rotates tablespaces accordingly
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(fil_crypt_thread)(
+/*=============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter required
+ * by os_thread_create */
+{
+ UT_NOT_USED(arg);
+
+ mutex_enter(&fil_crypt_threads_mutex);
+ uint thread_no = srv_n_fil_crypt_threads_started;
+ srv_n_fil_crypt_threads_started++;
+ mutex_exit(&fil_crypt_threads_mutex);
+ os_event_set(fil_crypt_event); /* signal that we started */
+
+ /* state of this thread */
+ rotate_thread_t thr(thread_no);
+
+ /* if we find a space that is starting, skip over it and recheck it later */
+ bool recheck = false;
+
+ while (!thr.should_shutdown()) {
+
+ key_state_t new_state;
+
+ time_t wait_start = time(0);
+
+ while (!thr.should_shutdown()) {
+
+ /* wait for key state changes,
+ * i.e. either a new key version or a
+ * new rotate_key_age */
+ os_event_reset(fil_crypt_threads_event);
+ if (os_event_wait_time(fil_crypt_threads_event, 1000000) == 0) {
+ break;
+ }
+
+ if (recheck) {
+ /* check recheck here, after the sleep, so
+ * that we don't busy-loop while one thread is starting
+ * a space */
+ break;
+ }
+
+ time_t waited = time(0) - wait_start;
+
+ if (waited >= srv_background_scrub_data_check_interval) {
+ break;
+ }
+ }
+
+ recheck = false;
+ thr.first = true; // restart from first tablespace
+
+ /* iterate all spaces searching for those needing rotation */
+ while (!thr.should_shutdown() &&
+ fil_crypt_find_space_to_rotate(&new_state, &thr, &recheck)) {
+
+ /* we found a space to rotate */
+ fil_crypt_start_rotate_space(&new_state, &thr);
+
+ /* release the pending op that was taken in
+ * fil_crypt_space_needs_rotation
+ * (called from fil_crypt_find_space_to_rotate);
+ * it made sure the tablespace was not dropped
+ * just after we decided to start processing it. */
+ fil_decr_pending_ops(thr.space);
+
+ /* iterate all pages (cooperatively with other threads) */
+ while (!thr.should_shutdown() &&
+ fil_crypt_find_page_to_rotate(&new_state, &thr)) {
+
+ /* rotate a batch of pages */
+ fil_crypt_rotate_pages(&new_state, &thr);
+
+ /* realloc iops */
+ fil_crypt_realloc_iops(&thr);
+ }
+
+ /* complete rotation */
+ fil_crypt_complete_rotate_space(&new_state, &thr);
+
+ /* force key state refresh */
+ new_state.key_id= 0;
+
+ /* return iops */
+ fil_crypt_return_iops(&thr);
+ }
+ }
+
+ /* return iops if shutting down */
+ fil_crypt_return_iops(&thr);
+
+ mutex_enter(&fil_crypt_threads_mutex);
+ srv_n_fil_crypt_threads_started--;
+ mutex_exit(&fil_crypt_threads_mutex);
+ os_event_set(fil_crypt_event); /* signal that we stopped */
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************
+Adjust thread count for key rotation */
+UNIV_INTERN
+void
+fil_crypt_set_thread_cnt(
+/*=====================*/
+ uint new_cnt) /*!< in: New key rotation thread count */
+{
+ if (new_cnt > srv_n_fil_crypt_threads) {
+ uint add = new_cnt - srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = new_cnt;
+ for (uint i = 0; i < add; i++) {
+ os_thread_id_t rotation_thread_id;
+ os_thread_create(fil_crypt_thread, NULL, &rotation_thread_id);
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Creating #%d thread id %lu total threads %u\n",
+ i+1, os_thread_pf(rotation_thread_id), new_cnt);
+ }
+ } else if (new_cnt < srv_n_fil_crypt_threads) {
+ srv_n_fil_crypt_threads = new_cnt;
+ os_event_set(fil_crypt_threads_event);
+ }
+
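+ /* note (added for clarity): threads whose thread_no is now >=
+ * srv_n_fil_crypt_threads notice this via should_shutdown() and exit;
+ * wait here until the number of started threads matches the setting */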
+ while(srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
+ os_event_reset(fil_crypt_event);
+ os_event_wait_time(fil_crypt_event, 1000000);
+ }
+}
+
+/*********************************************************************
+Adjust max key age */
+UNIV_INTERN
+void
+fil_crypt_set_rotate_key_age(
+/*=========================*/
+ uint val) /*!< in: New max key age */
+{
+ srv_fil_crypt_rotate_key_age = val;
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Adjust rotation iops */
+UNIV_INTERN
+void
+fil_crypt_set_rotation_iops(
+/*========================*/
+ uint val) /*!< in: New iops setting */
+{
+ srv_n_fil_crypt_iops = val;
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Adjust encrypt tables */
+UNIV_INTERN
+void
+fil_crypt_set_encrypt_tables(
+/*=========================*/
+ uint val) /*!< in: New srv_encrypt_tables setting */
+{
+ srv_encrypt_tables = val;
+ os_event_set(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Init threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_init()
+/*====================*/
+{
+ fil_crypt_event = os_event_create();
+ fil_crypt_threads_event = os_event_create();
+ mutex_create(fil_crypt_threads_mutex_key,
+ &fil_crypt_threads_mutex, SYNC_NO_ORDER_CHECK);
+ fil_crypt_threads_inited = true;
+
+ uint cnt = srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = 0;
+ fil_crypt_set_thread_cnt(cnt);
+}
+
+/*********************************************************************
+End threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_end()
+/*===================*/
+{
+ /* stop threads */
+ fil_crypt_set_thread_cnt(0);
+}
+
+/*********************************************************************
+Clean up key rotation threads resources */
+UNIV_INTERN
+void
+fil_crypt_threads_cleanup()
+/*=======================*/
+{
+ os_event_free(fil_crypt_event);
+ os_event_free(fil_crypt_threads_event);
+}
+
+/*********************************************************************
+Mark a space as closing */
+UNIV_INTERN
+void
+fil_space_crypt_mark_space_closing(
+/*===============================*/
+ ulint space) /*!< in: Space id */
+{
+ if (!fil_crypt_threads_inited) {
+ return;
+ }
+
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return;
+ }
+
+ mutex_enter(&crypt_data->mutex);
+ mutex_exit(&fil_crypt_threads_mutex);
+ crypt_data->closing = true;
+ mutex_exit(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space */
+UNIV_INTERN
+void
+fil_space_crypt_close_tablespace(
+/*=============================*/
+ ulint space) /*!< in: Space id */
+{
+ if (!srv_encrypt_tables) {
+ return;
+ }
+
+ mutex_enter(&fil_crypt_threads_mutex);
+
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+
+ if (crypt_data == NULL) {
+ mutex_exit(&fil_crypt_threads_mutex);
+ return;
+ }
+
+ uint start = time(0);
+ uint last = start;
+
+ mutex_enter(&crypt_data->mutex);
+ mutex_exit(&fil_crypt_threads_mutex);
+ crypt_data->closing = true;
+
+ uint cnt = crypt_data->rotate_state.active_threads;
+ bool flushing = crypt_data->rotate_state.flushing;
+
+ while (cnt > 0 || flushing) {
+ mutex_exit(&crypt_data->mutex);
+ /* release dict mutex so that scrub threads can release their
+ * table references */
+ dict_mutex_exit_for_mysql();
+ /* wakeup throttle (all) sleepers */
+ os_event_set(fil_crypt_throttle_sleep_event);
+ os_thread_sleep(20000);
+ dict_mutex_enter_for_mysql();
+ mutex_enter(&crypt_data->mutex);
+ cnt = crypt_data->rotate_state.active_threads;
+ flushing = crypt_data->rotate_state.flushing;
+
+ uint now = time(0);
+
+ if (now >= last + 30) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Waited %u seconds to drop space: %lu.",
+ now - start, space);
+ last = now;
+ }
+ }
+
+ mutex_exit(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+return 0 if crypt data present */
+UNIV_INTERN
+int
+fil_space_crypt_get_status(
+/*=======================*/
+ ulint id, /*!< in: space id */
+ struct fil_space_crypt_status_t* status) /*!< out: status */
+{
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
+
+ if (crypt_data != NULL) {
+ status->space = id;
+ status->scheme = crypt_data->type;
+ mutex_enter(&crypt_data->mutex);
+ status->keyserver_requests = crypt_data->keyserver_requests;
+ status->min_key_version = crypt_data->min_key_version;
+
+ if (crypt_data->rotate_state.active_threads > 0 ||
+ crypt_data->rotate_state.flushing) {
+ status->rotating = true;
+ status->flushing =
+ crypt_data->rotate_state.flushing;
+ status->rotate_next_page_number =
+ crypt_data->rotate_state.next_offset;
+ status->rotate_max_page_number =
+ crypt_data->rotate_state.max_offset;
+ } else {
+ status->rotating = false;
+ }
+ mutex_exit(&crypt_data->mutex);
+
+ if (srv_encrypt_tables || crypt_data->min_key_version) {
+ status->current_key_version =
+ fil_crypt_get_latest_key_version(crypt_data);
+ } else {
+ status->current_key_version = 0;
+ }
+ } else {
+ memset(status, 0, sizeof(*status));
+ if (srv_encrypt_tables) {
+ os_event_set(fil_crypt_threads_event);
+ }
+ }
+
+ return crypt_data == NULL ? 1 : 0;
+}
+
+/*********************************************************************
+Return crypt statistics */
+UNIV_INTERN
+void
+fil_crypt_total_stat(
+/*=================*/
+ fil_crypt_stat_t *stat) /*!< out: Crypt statistics */
+{
+ mutex_enter(&crypt_stat_mutex);
+ *stat = crypt_stat;
+ mutex_exit(&crypt_stat_mutex);
+}
+
+/*********************************************************************
+Get scrub status for a space (used by information_schema)
+return 0 if data found */
+UNIV_INTERN
+int
+fil_space_get_scrub_status(
+/*=======================*/
+ ulint id, /*!< in: space id */
+ struct fil_space_scrub_status_t* status) /*!< out: status */
+{
+ fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
+ memset(status, 0, sizeof(*status));
+
+ if (crypt_data != NULL) {
+ status->space = id;
+ status->compressed = fil_space_get_zip_size(id) > 0;
+ mutex_enter(&crypt_data->mutex);
+ status->last_scrub_completed =
+ crypt_data->rotate_state.scrubbing.last_scrub_completed;
+ if (crypt_data->rotate_state.active_threads > 0 &&
+ crypt_data->rotate_state.scrubbing.is_active) {
+ status->scrubbing = true;
+ status->current_scrub_started =
+ crypt_data->rotate_state.start_time;
+ status->current_scrub_active_threads =
+ crypt_data->rotate_state.active_threads;
+ status->current_scrub_page_number =
+ crypt_data->rotate_state.next_offset;
+ status->current_scrub_max_page_number =
+ crypt_data->rotate_state.max_offset;
+ } else {
+ status->scrubbing = false;
+ }
+ mutex_exit(&crypt_data->mutex);
+ } else {
+ memset(status, 0, sizeof(*status));
+ }
+
+ return crypt_data == NULL ? 1 : 0;
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 506ba320853..9d1662802ba 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,6 +25,9 @@ Created 10/25/1995 Heikki Tuuri
*******************************************************/
#include "fil0fil.h"
+#include "fil0pagecompress.h"
+#include "fsp0pagecompress.h"
+#include "fil0crypt.h"
#include <debug_sync.h>
#include <my_dbug.h>
@@ -45,6 +49,7 @@ Created 10/25/1995 Heikki Tuuri
#include "page0zip.h"
#include "trx0sys.h"
#include "row0mysql.h"
+#include "os0file.h"
#ifndef UNIV_HOTBACKUP
# include "buf0lru.h"
# include "ibuf0ibuf.h"
@@ -54,6 +59,13 @@ Created 10/25/1995 Heikki Tuuri
# include "srv0srv.h"
static ulint srv_data_read, srv_data_written;
#endif /* !UNIV_HOTBACKUP */
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#endif
+#include "row0mysql.h"
MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system;
@@ -262,11 +274,16 @@ fil_read(
block size multiple */
void* buf, /*!< in/out: buffer where to store data read;
in aio this must be appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
+ void* message, /*!< in: message for aio handler if non-sync
aio used, else ignored */
+ ulint* write_size) /*!< in/out: Actual write size initialized
+ after the first successful trim
+ operation for this page; if
+ initialized we do not trim again if
+ the actual page size does not decrease. */
{
return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
- byte_offset, len, buf, message));
+ byte_offset, len, buf, message, write_size));
}
/********************************************************************//**
@@ -291,18 +308,22 @@ fil_write(
be a block size multiple */
void* buf, /*!< in: buffer from which to write; in aio
this must be appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
+ void* message, /*!< in: message for aio handler if non-sync
aio used, else ignored */
+ ulint* write_size) /*!< in/out: Actual write size initialized
+ after the first successful trim
+ operation for this page; if
+ initialized we do not trim again if
+ the actual page size does not decrease. */
{
ut_ad(!srv_read_only_mode);
return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
- byte_offset, len, buf, message));
+ byte_offset, len, buf, message, write_size));
}
/*******************************************************************//**
Returns the table space by a given id, NULL if not found. */
-UNIV_INLINE
fil_space_t*
fil_space_get_by_id(
/*================*/
@@ -321,6 +342,39 @@ fil_space_get_by_id(
}
/*******************************************************************//**
+Returns the table space by a given id, NULL if not found or if the space is being dropped. */
+fil_space_t*
+fil_space_found_by_id(
+/*==================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space = NULL;
+ mutex_enter(&fil_system->mutex);
+ space = fil_space_get_by_id(id);
+
+ /* Not found if space is being deleted */
+ if (space && space->stop_new_ops) {
+ space = NULL;
+ }
+
+ mutex_exit(&fil_system->mutex);
+ return space;
+}
+
+/****************************************************************//**
+Get space id from fil node */
+ulint
+fil_node_get_space_id(
+/*==================*/
+ fil_node_t* node) /*!< in: file node */
+{
+ ut_ad(node);
+ ut_ad(node->space);
+
+ return (node->space->id);
+}
+
+/*******************************************************************//**
Returns the table space by a given name, NULL if not found. */
UNIV_INLINE
fil_space_t*
@@ -540,8 +594,9 @@ fil_node_open_file(
byte* buf2;
byte* page;
ulint space_id;
- ulint flags;
+ ulint flags=0;
ulint page_size;
+ ulint atomic_writes=0;
ut_ad(mutex_own(&(system->mutex)));
ut_a(node->n_pending == 0);
@@ -558,7 +613,7 @@ fil_node_open_file(
node->handle = os_file_create_simple_no_error_handling(
innodb_file_data_key, node->name, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &success);
+ OS_FILE_READ_ONLY, &success, 0);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(true);
@@ -575,6 +630,10 @@ fil_node_open_file(
size_bytes = os_file_get_size(node->handle);
ut_a(size_bytes != (os_offset_t) -1);
+
+ node->file_block_size = os_file_get_block_size(node->handle, node->name);
+ space->file_block_size = node->file_block_size;
+
#ifdef UNIV_HOTBACKUP
if (space->id == 0) {
node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
@@ -607,9 +666,13 @@ fil_node_open_file(
page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
+
space_id = fsp_header_get_space_id(page);
flags = fsp_header_get_flags(page);
+
page_size = fsp_flags_get_page_size(flags);
+ atomic_writes = fsp_flags_get_atomic_writes(flags);
+
ut_free(buf2);
@@ -660,13 +723,28 @@ fil_node_open_file(
ut_error;
}
- if (size_bytes >= 1024 * 1024) {
- /* Truncate the size to whole megabytes. */
- size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+ if (UNIV_UNLIKELY(space->flags != flags)) {
+ if (!dict_tf_verify_flags(space->flags, flags)) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags are 0x%lx"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file %s are 0x%lx!\n",
+ space->flags, node->name, flags);
+ ut_error;
+ }
+ }
+
+ if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) {
+ /* Truncate the size to whole extent size. */
+ size_bytes = ut_2pow_round(size_bytes,
+ FSP_EXTENT_SIZE *
+ UNIV_PAGE_SIZE);
}
if (!fsp_flags_is_compressed(flags)) {
- node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ node->size = (ulint)
+ (size_bytes
+ / fsp_flags_get_page_size(flags));
} else {
node->size = (ulint)
(size_bytes
@@ -679,6 +757,8 @@ add_size:
space->size += node->size;
}
+ atomic_writes = fsp_flags_get_atomic_writes(space->flags);
+
/* printf("Opening file %s\n", node->name); */
/* Open the file for reading and writing, in Windows normally in the
@@ -689,18 +769,23 @@ add_size:
node->handle = os_file_create(innodb_file_log_key,
node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_LOG_FILE,
- &ret);
+ &ret, atomic_writes);
} else if (node->is_raw_disk) {
node->handle = os_file_create(innodb_file_data_key,
node->name,
OS_FILE_OPEN_RAW,
OS_FILE_AIO, OS_DATA_FILE,
- &ret);
+ &ret, atomic_writes);
} else {
node->handle = os_file_create(innodb_file_data_key,
node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_DATA_FILE,
- &ret);
+ &ret, atomic_writes);
+ }
+
+ if (node->file_block_size == 0) {
+ node->file_block_size = os_file_get_block_size(node->handle, node->name);
+ space->file_block_size = node->file_block_size;
}
ut_a(ret);
@@ -1064,14 +1149,14 @@ fil_space_create(
const char* name, /*!< in: space name */
ulint id, /*!< in: space id */
ulint flags, /*!< in: tablespace flags */
- ulint purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+ ulint purpose,/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+ fil_space_crypt_t* crypt_data) /*!< in: crypt data */
{
fil_space_t* space;
DBUG_EXECUTE_IF("fil_space_create_failure", return(false););
ut_a(fil_system);
- ut_a(fsp_flags_is_valid(flags));
/* Look for a matching tablespace and if found free it. */
do {
@@ -1148,6 +1233,7 @@ fil_space_create(
space->flags = flags;
space->magic_n = FIL_SPACE_MAGIC_N;
+ space->printed_compression_failure = false;
rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
@@ -1159,6 +1245,8 @@ fil_space_create(
UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
+ space->crypt_data = crypt_data;
+
mutex_exit(&fil_system->mutex);
return(TRUE);
@@ -1293,6 +1381,8 @@ fil_space_free(
rw_lock_free(&(space->latch));
+ fil_space_destroy_crypt_data(&(space->crypt_data));
+
mem_free(space->name);
mem_free(space);
@@ -1338,21 +1428,26 @@ fil_space_get_space(
}
/* The following code must change when InnoDB supports
- multiple datafiles per tablespace. */
- ut_a(1 == UT_LIST_GET_LEN(space->chain));
+ multiple datafiles per tablespace. Note that there is a small
+ chance that the space is found in the tablespace list but
+ we have not yet created a node for it; as we hold the
+ fil_system mutex here, fil_node_create() can't continue. */
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1 || UT_LIST_GET_LEN(space->chain) == 0);
node = UT_LIST_GET_FIRST(space->chain);
- /* It must be a single-table tablespace and we have not opened
- the file yet; the following calls will open it and update the
- size fields */
+ if (node) {
+ /* It must be a single-table tablespace and we have not opened
+ the file yet; the following calls will open it and update the
+ size fields */
- if (!fil_node_prepare_for_io(node, fil_system, space)) {
- /* The single-table tablespace can't be opened,
- because the ibd file is missing. */
- return(NULL);
+ if (!fil_node_prepare_for_io(node, fil_system, space)) {
+ /* The single-table tablespace can't be opened,
+ because the ibd file is missing. */
+ return(NULL);
+ }
+ fil_node_complete_io(node, fil_system, OS_FILE_READ);
}
- fil_node_complete_io(node, fil_system, OS_FILE_READ);
}
return(space);
@@ -1526,6 +1621,8 @@ fil_init(
UT_LIST_INIT(fil_system->LRU);
fil_system->max_n_open = max_n_open;
+
+ fil_space_crypt_init();
}
/*******************************************************************//**
@@ -1725,12 +1822,13 @@ fil_write_lsn_and_arch_no_to_file(
buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
err = fil_read(TRUE, space, 0, sum_of_sizes, 0,
- UNIV_PAGE_SIZE, buf, NULL);
+ UNIV_PAGE_SIZE, buf, NULL, 0);
if (err == DB_SUCCESS) {
- mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+ mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ lsn);
err = fil_write(TRUE, space, 0, sum_of_sizes, 0,
- UNIV_PAGE_SIZE, buf, NULL);
+ UNIV_PAGE_SIZE, buf, NULL, 0);
}
mem_free(buf1);
@@ -1818,6 +1916,11 @@ fil_check_first_page(
flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) {
+ fprintf(stderr,
+ "InnoDB: Error: Current page size %lu != "
+ " page size on page %lu\n",
+ UNIV_PAGE_SIZE, fsp_flags_get_page_size(flags));
+
return("innodb-page-size mismatch");
}
@@ -1870,13 +1973,15 @@ fil_read_first_page(
#endif /* UNIV_LOG_ARCHIVE */
lsn_t* min_flushed_lsn, /*!< out: min of flushed
lsn values in data files */
- lsn_t* max_flushed_lsn) /*!< out: max of flushed
+ lsn_t* max_flushed_lsn, /*!< out: max of flushed
lsn values in data files */
+ fil_space_crypt_t** crypt_data) /*!< out: crypt data */
{
byte* buf;
byte* page;
lsn_t flushed_lsn;
const char* check_msg = NULL;
+ fil_space_crypt_t* cdata;
buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
@@ -1893,11 +1998,38 @@ fil_read_first_page(
if (!one_read_already) {
*flags = fsp_header_get_flags(page);
*space_id = fsp_header_get_space_id(page);
+ }
+ if (!one_read_already) {
check_msg = fil_check_first_page(page);
}
- flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+ flushed_lsn = mach_read_from_8(page +
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ ulint space = fsp_header_get_space_id(page);
+ ulint offset = fsp_header_get_crypt_offset(
+ fsp_flags_get_zip_size(*flags), NULL);
+ cdata = fil_space_read_crypt_data(space, page, offset);
+
+ /* If file space is encrypted we need to have at least some
+ encryption service available where to get keys */
+ if ((cdata && cdata->encryption == FIL_SPACE_ENCRYPTION_ON) ||
+ (srv_encrypt_tables &&
+ cdata && cdata->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+
+ if (!encryption_key_id_exists(cdata->key_id)) {
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Tablespace id %ld encrypted but encryption service"
+ " not available. Can't continue opening tablespace.\n",
+ space);
+ ut_error;
+ }
+ }
+
+ if (crypt_data) {
+ *crypt_data = cdata;
+ }
ut_free(buf);
@@ -2382,6 +2514,9 @@ fil_check_pending_operations(
*space = 0;
+ /* Wait for crypt threads to stop accessing space */
+ fil_space_crypt_close_tablespace(id);
+
mutex_enter(&fil_system->mutex);
fil_space_t* sp = fil_space_get_by_id(id);
if (sp) {
@@ -3027,7 +3162,7 @@ fil_create_link_file(
file = os_file_create_simple_no_error_handling(
innodb_file_data_key, link_filepath,
- OS_FILE_CREATE, OS_FILE_READ_WRITE, &success);
+ OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0);
if (!success) {
/* The following call will print an error message */
@@ -3043,10 +3178,10 @@ fil_create_link_file(
ut_print_filename(stderr, filepath);
fputs(" already exists.\n", stderr);
err = DB_TABLESPACE_EXISTS;
-
} else if (error == OS_FILE_DISK_FULL) {
err = DB_OUT_OF_FILE_SPACE;
-
+ } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) {
+ err = DB_UNSUPPORTED;
} else {
err = DB_ERROR;
}
@@ -3057,7 +3192,7 @@ fil_create_link_file(
}
if (!os_file_write(link_filepath, file, filepath, 0,
- strlen(filepath))) {
+ strlen(filepath))) {
err = DB_ERROR;
}
@@ -3136,8 +3271,9 @@ fil_open_linked_file(
/*===============*/
const char* tablename, /*!< in: database/tablename */
char** remote_filepath,/*!< out: remote filepath */
- os_file_t* remote_file) /*!< out: remote file handle */
-
+ os_file_t* remote_file, /*!< out: remote file handle */
+ ulint atomic_writes) /*!< in: atomic writes table option
+ value */
{
ibool success;
@@ -3151,7 +3287,7 @@ fil_open_linked_file(
*remote_file = os_file_create_simple_no_error_handling(
innodb_file_data_key, *remote_filepath,
OS_FILE_OPEN, OS_FILE_READ_ONLY,
- &success);
+ &success, atomic_writes);
if (!success) {
char* link_filepath = fil_make_isl_name(tablename);
@@ -3206,6 +3342,7 @@ fil_create_new_single_table_tablespace(
/* TRUE if a table is created with CREATE TEMPORARY TABLE */
bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
+ ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags);
ut_a(space_id > 0);
ut_ad(!srv_read_only_mode);
@@ -3238,7 +3375,8 @@ fil_create_new_single_table_tablespace(
OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
OS_FILE_NORMAL,
OS_DATA_FILE,
- &ret);
+ &ret,
+ atomic_writes);
if (ret == FALSE) {
/* The following call will print an error message */
@@ -3265,6 +3403,11 @@ fil_create_new_single_table_tablespace(
goto error_exit_3;
}
+ if (error == OS_FILE_OPERATION_NOT_SUPPORTED) {
+ err = DB_UNSUPPORTED;
+ goto error_exit_3;
+ }
+
if (error == OS_FILE_DISK_FULL) {
err = DB_OUT_OF_FILE_SPACE;
goto error_exit_3;
@@ -3303,6 +3446,7 @@ fil_create_new_single_table_tablespace(
flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
fsp_header_init_fields(page, space_id, flags);
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+ ut_ad(fsp_flags_is_valid(flags));
if (!(fsp_flags_is_compressed(flags))) {
buf_flush_init_for_writing(page, NULL, 0);
@@ -3352,7 +3496,9 @@ fil_create_new_single_table_tablespace(
}
}
- success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
+ success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE,
+ fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY));
+
if (!success || !fil_node_create(path, size, space_id, FALSE)) {
err = DB_ERROR;
goto error_exit_1;
@@ -3479,16 +3625,26 @@ fil_open_single_table_tablespace(
fsp_open_info remote;
ulint tablespaces_found = 0;
ulint valid_tablespaces_found = 0;
+ ulint atomic_writes = 0;
+ fil_space_crypt_t* crypt_data = NULL;
#ifdef UNIV_SYNC_DEBUG
ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex)));
- if (!fsp_flags_is_valid(flags)) {
+ /* Table flags can be ULINT_UNDEFINED if
+ dict_tf_to_fsp_flags_failure is set. */
+ if (flags != ULINT_UNDEFINED) {
+ if (!fsp_flags_is_valid(flags)) {
+ return(DB_CORRUPTION);
+ }
+ } else {
return(DB_CORRUPTION);
}
+ atomic_writes = fsp_flags_get_atomic_writes(flags);
+
/* If the tablespace was relocated, we do not
compare the DATA_DIR flag */
ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR;
@@ -3513,7 +3669,7 @@ fil_open_single_table_tablespace(
}
link_file_found = fil_open_linked_file(
- tablename, &remote.filepath, &remote.file);
+ tablename, &remote.filepath, &remote.file, atomic_writes);
remote.success = link_file_found;
if (remote.success) {
/* possibility of multiple files. */
@@ -3541,7 +3697,7 @@ fil_open_single_table_tablespace(
if (dict.filepath) {
dict.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &dict.success);
+ OS_FILE_READ_ONLY, &dict.success, atomic_writes);
if (dict.success) {
/* possibility of multiple files. */
validate = true;
@@ -3553,7 +3709,7 @@ fil_open_single_table_tablespace(
ut_a(def.filepath);
def.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, def.filepath, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &def.success);
+ OS_FILE_READ_ONLY, &def.success, atomic_writes);
if (def.success) {
tablespaces_found++;
}
@@ -3572,7 +3728,7 @@ fil_open_single_table_tablespace(
#ifdef UNIV_LOG_ARCHIVE
&space_arch_log_no, &space_arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- &def.lsn, &def.lsn);
+ &def.lsn, &def.lsn, &def.crypt_data);
def.valid = !def.check_msg;
/* Validate this single-table-tablespace with SYS_TABLES,
@@ -3597,7 +3753,7 @@ fil_open_single_table_tablespace(
#ifdef UNIV_LOG_ARCHIVE
&remote.arch_log_no, &remote.arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- &remote.lsn, &remote.lsn);
+ &remote.lsn, &remote.lsn, &remote.crypt_data);
remote.valid = !remote.check_msg;
/* Validate this single-table-tablespace with SYS_TABLES,
@@ -3623,7 +3779,7 @@ fil_open_single_table_tablespace(
#ifdef UNIV_LOG_ARCHIVE
&dict.arch_log_no, &dict.arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- &dict.lsn, &dict.lsn);
+ &dict.lsn, &dict.lsn, &dict.crypt_data);
dict.valid = !dict.check_msg;
/* Validate this single-table-tablespace with SYS_TABLES,
@@ -3776,9 +3932,17 @@ fil_open_single_table_tablespace(
}
skip_validate:
+ if (remote.success)
+ crypt_data = remote.crypt_data;
+ else if (dict.success)
+ crypt_data = dict.crypt_data;
+ else if (def.success)
+ crypt_data = def.crypt_data;
+
if (err != DB_SUCCESS) {
; // Don't load the tablespace into the cache
- } else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) {
+ } else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE,
+ crypt_data)) {
err = DB_ERROR;
} else {
/* We do not measure the size of the file, that is why
@@ -3798,15 +3962,25 @@ cleanup_and_exit:
if (remote.filepath) {
mem_free(remote.filepath);
}
+ if (remote.crypt_data && remote.crypt_data != crypt_data) {
+ fil_space_destroy_crypt_data(&remote.crypt_data);
+ }
if (dict.success) {
os_file_close(dict.file);
}
if (dict.filepath) {
mem_free(dict.filepath);
}
+ if (dict.crypt_data && dict.crypt_data != crypt_data) {
+ fil_space_destroy_crypt_data(&dict.crypt_data);
+ }
if (def.success) {
os_file_close(def.file);
}
+ if (def.crypt_data && def.crypt_data != crypt_data) {
+ fil_space_destroy_crypt_data(&def.crypt_data);
+ }
+
mem_free(def.filepath);
return(err);
@@ -4000,7 +4174,7 @@ fil_user_tablespace_restore_page(
err = os_file_write(fsp->filepath, fsp->file, page,
(zip_size ? zip_size : page_size) * page_no,
- buflen);
+ buflen);
os_file_flush(fsp->file);
out:
@@ -4022,12 +4196,13 @@ fil_validate_single_table_tablespace(
check_first_page:
fsp->success = TRUE;
+ fsp->encryption_error = 0;
if (const char* check_msg = fil_read_first_page(
fsp->file, FALSE, &fsp->flags, &fsp->id,
#ifdef UNIV_LOG_ARCHIVE
&fsp->arch_log_no, &fsp->arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- &fsp->lsn, &fsp->lsn)) {
+ &fsp->lsn, &fsp->lsn, &fsp->crypt_data)) {
ib_logf(IB_LOG_LEVEL_ERROR,
"%s in tablespace %s (table %s)",
check_msg, fsp->filepath, tablename);
@@ -4100,9 +4275,7 @@ fil_load_single_table_tablespace(
fsp_open_info def;
fsp_open_info remote;
os_offset_t size;
-#ifdef UNIV_HOTBACKUP
fil_space_t* space;
-#endif
memset(&def, 0, sizeof(def));
memset(&remote, 0, sizeof(remote));
@@ -4135,7 +4308,8 @@ fil_load_single_table_tablespace(
one of them is sent to this function. So if this table has
already been loaded, there is nothing to do.*/
mutex_enter(&fil_system->mutex);
- if (fil_space_get_by_name(tablename)) {
+ space = fil_space_get_by_name(tablename);
+ if (space) {
mem_free(tablename);
mutex_exit(&fil_system->mutex);
return;
@@ -4160,7 +4334,7 @@ fil_load_single_table_tablespace(
/* Check for a link file which locates a remote tablespace. */
remote.success = fil_open_linked_file(
- tablename, &remote.filepath, &remote.file);
+ tablename, &remote.filepath, &remote.file, FALSE);
/* Read the first page of the remote tablespace */
if (remote.success) {
@@ -4175,7 +4349,7 @@ fil_load_single_table_tablespace(
/* Try to open the tablespace in the datadir. */
def.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, def.filepath, OS_FILE_OPEN,
- OS_FILE_READ_WRITE, &def.success);
+ OS_FILE_READ_WRITE, &def.success, FALSE);
/* Read the first page of the remote tablespace */
if (def.success) {
@@ -4186,6 +4360,14 @@ fil_load_single_table_tablespace(
}
if (!def.success && !remote.success) {
+
+ if (def.encryption_error || remote.encryption_error) {
+ fprintf(stderr,
+ "InnoDB: Error: could not open single-table"
+ " tablespace file %s. Encryption error!\n", def.filepath);
+ return;
+ }
+
/* The following call prints an error message */
os_file_get_last_error(true);
fprintf(stderr,
@@ -4369,7 +4551,8 @@ will_not_choose:
mutex_exit(&fil_system->mutex);
#endif /* UNIV_HOTBACKUP */
ibool file_space_create_success = fil_space_create(
- tablename, fsp->id, fsp->flags, FIL_TABLESPACE);
+ tablename, fsp->id, fsp->flags, FIL_TABLESPACE,
+ fsp->crypt_data);
if (!file_space_create_success) {
if (srv_force_recovery > 0) {
@@ -4903,6 +5086,7 @@ retry:
}
page_size = fsp_flags_get_zip_size(space->flags);
+
if (!page_size) {
page_size = UNIV_PAGE_SIZE;
}
@@ -4940,6 +5124,12 @@ retry:
start_page_no = space->size;
file_start_page_no = space->size - node->size;
+ /* Determine correct file block size */
+ if (node->file_block_size == 0) {
+ node->file_block_size = os_file_get_block_size(node->handle, node->name);
+ space->file_block_size = node->file_block_size;
+ }
+
#ifdef HAVE_POSIX_FALLOCATE
if (srv_use_posix_fallocate) {
os_offset_t start_offset = start_page_no * page_size;
@@ -4951,7 +5141,7 @@ retry:
"space for file \'%s\' failed. Current size "
INT64PF ", desired size " INT64PF "\n",
node->name, start_offset, len+start_offset);
- os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE);
+ os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__);
success = FALSE;
} else {
success = TRUE;
@@ -4961,9 +5151,11 @@ retry:
success = FALSE; errno = 28; os_has_said_disk_full = TRUE;);
mutex_enter(&fil_system->mutex);
+
if (success) {
- node->size += n_pages;
- space->size += n_pages;
+ node->size += (size_after_extend - start_page_no);
+ space->size += (size_after_extend - start_page_no);
+
os_has_said_disk_full = FALSE;
}
@@ -4999,7 +5191,7 @@ retry:
success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
node->name, node->handle, buf,
offset, page_size * n_pages,
- NULL, NULL);
+ node, NULL, 0);
#endif /* UNIV_HOTBACKUP */
@@ -5098,7 +5290,7 @@ fil_extend_tablespaces_to_stored_len(void)
single-threaded operation */
error = fil_read(TRUE, space->id,
fsp_flags_get_zip_size(space->flags),
- 0, 0, UNIV_PAGE_SIZE, buf, NULL);
+ 0, 0, UNIV_PAGE_SIZE, buf, NULL, 0);
ut_a(error == DB_SUCCESS);
size_in_header = fsp_get_size_low(buf);
@@ -5348,6 +5540,76 @@ fil_report_invalid_page_access(
}
/********************************************************************//**
+Find the node of a file space that contains a given block offset
+@return node, or NULL if no node covers the offset */
+static
+fil_node_t*
+fil_space_get_node(
+	fil_space_t*	space,		/*!< in: file space */
+ ulint space_id, /*!< in: space id */
+ ulint* block_offset, /*!< in/out: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len) /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+{
+ fil_node_t* node;
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ for (;;) {
+ if (node == NULL) {
+ return(NULL);
+ } else if (fil_is_user_tablespace_id(space->id)
+ && node->size == 0) {
+
+ /* We do not know the size of a single-table tablespace
+ before we open the file */
+ break;
+ } else if (node->size > *block_offset) {
+ /* Found! */
+ break;
+ } else {
+ *block_offset -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+
+ return (node);
+}
+/********************************************************************//**
+Return block size of node in file space
+@return file block size */
+UNIV_INTERN
+ulint
+fil_space_get_block_size(
+/*=====================*/
+ ulint space_id,
+ ulint block_offset,
+ ulint len)
+{
+ ulint block_size = 512;
+ ut_ad(!mutex_own(&fil_system->mutex));
+
+ mutex_enter(&fil_system->mutex);
+ fil_space_t* space = fil_space_get_space(space_id);
+
+ if (space) {
+ fil_node_t* node = fil_space_get_node(space, space_id, &block_offset, 0, len);
+
+ if (node) {
+ block_size = node->file_block_size;
+ }
+ }
+ mutex_exit(&fil_system->mutex);
+
+ return block_size;
+}
+
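/* Illustrative sketch (not part of this patch): one way a write path could
use fil_space_get_block_size() to round a compressed payload up to the file
block size, mirroring what fil_compress_page() does later in this change.
The function name and parameters below are placeholders. */
static ulint
example_align_to_block_size(
	ulint	space_id,	/* tablespace id */
	ulint	block_offset,	/* page offset inside the space */
	ulint	payload_len)	/* compressed payload length in bytes */
{
	ulint	block_size = fil_space_get_block_size(
		space_id, block_offset, payload_len);

	/* fil_space_get_block_size() falls back to 512 if the
	tablespace is unknown. */
	return((ulint) ut_uint64_align_up(
			(ib_uint64_t) payload_len,
			(ib_uint64_t) block_size));
}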
+/********************************************************************//**
Reads or writes data. This operation is asynchronous (aio).
@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
i/o on a tablespace which does not exist */
@@ -5378,8 +5640,13 @@ fil_io(
void* buf, /*!< in/out: buffer where to store read data
or from where to write; in aio this must be
appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
+ void* message, /*!< in: message for aio handler if non-sync
aio used, else ignored */
+	ulint*	write_size)	/*!< in/out: actual write size, initialized
+				after the first successful trim
+				operation for this page; once
+				initialized we do not trim again if
+				the actual page size does not decrease. */
{
ulint mode;
fil_space_t* space;
@@ -5388,7 +5655,7 @@ fil_io(
ulint is_log;
ulint wake_later;
os_offset_t offset;
- ibool ignore_nonexistent_pages;
+ bool ignore_nonexistent_pages;
is_log = type & OS_FILE_LOG;
type = type & ~OS_FILE_LOG;
@@ -5442,6 +5709,11 @@ fil_io(
} else if (type == OS_FILE_WRITE) {
ut_ad(!srv_read_only_mode);
srv_stats.data_written.add(len);
+ if (fil_page_is_index_page((byte *)buf)) {
+ srv_stats.index_pages_written.inc();
+ } else {
+ srv_stats.non_index_pages_written.inc();
+ }
}
/* Reserve the fil_system mutex and make sure that we can open at
@@ -5468,34 +5740,18 @@ fil_io(
ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE);
- node = UT_LIST_GET_FIRST(space->chain);
-
- for (;;) {
- if (node == NULL) {
- if (ignore_nonexistent_pages) {
- mutex_exit(&fil_system->mutex);
- return(DB_ERROR);
- }
+ node = fil_space_get_node(space, space_id, &block_offset, byte_offset, len);
- fil_report_invalid_page_access(
+ if (!node) {
+ if (ignore_nonexistent_pages) {
+ mutex_exit(&fil_system->mutex);
+ return(DB_ERROR);
+ }
+ fil_report_invalid_page_access(
block_offset, space_id, space->name,
byte_offset, len, type);
- ut_error;
-
- } else if (fil_is_user_tablespace_id(space->id)
- && node->size == 0) {
-
- /* We do not know the size of a single-table tablespace
- before we open the file */
- break;
- } else if (node->size > block_offset) {
- /* Found! */
- break;
- } else {
- block_offset -= node->size;
- node = UT_LIST_GET_NEXT(chain, node);
- }
+ ut_error;
}
/* Open file if closed */
@@ -5578,8 +5834,18 @@ fil_io(
}
#else
/* Queue the aio request */
- ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
- offset, len, node, message);
+ ret = os_aio(
+ type,
+ mode | wake_later,
+ node->name,
+ node->handle,
+ buf,
+ offset,
+ len,
+ node,
+ message,
+ write_size);
+
#endif /* UNIV_HOTBACKUP */
@@ -5599,7 +5865,8 @@ fil_io(
if (!ret) {
return(DB_OUT_OF_FILE_SPACE);
} else {
- } return(DB_SUCCESS);
+ return(DB_SUCCESS);
+ }
}
#ifndef UNIV_HOTBACKUP
@@ -5726,7 +5993,7 @@ fil_flush(
node != NULL;
node = UT_LIST_GET_NEXT(chain, node)) {
- ib_int64_t old_mod_counter = node->modification_counter;;
+ ib_int64_t old_mod_counter = node->modification_counter;
if (old_mod_counter <= node->flush_counter) {
continue;
@@ -6007,6 +6274,8 @@ void
fil_close(void)
/*===========*/
{
+ fil_space_crypt_cleanup();
+
#ifndef UNIV_HOTBACKUP
/* The mutex should already have been freed. */
ut_ad(fil_system->mutex.magic_n == 0);
@@ -6056,6 +6325,8 @@ struct fil_iterator_t {
ulint n_io_buffers; /*!< Number of pages to use
for IO */
byte* io_buffer; /*!< Buffer to use for IO */
+ fil_space_crypt_t *crypt_data; /*!< Crypt data (if encrypted) */
+ byte* crypt_io_buffer; /*!< IO buffer when encrypted */
};
/********************************************************************//**
@@ -6118,8 +6389,12 @@ fil_iterate(
ut_ad(n_bytes > 0);
ut_ad(!(n_bytes % iter.page_size));
- if (!os_file_read(iter.file, io_buffer, offset,
- (ulint) n_bytes)) {
+ byte* readptr = io_buffer;
+ if (iter.crypt_data != NULL) {
+ readptr = iter.crypt_io_buffer;
+ }
+
+ if (!os_file_read(iter.file, readptr, offset, (ulint) n_bytes)) {
ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed");
@@ -6132,6 +6407,23 @@ fil_iterate(
for (ulint i = 0; i < n_pages_read; ++i) {
+ if (iter.crypt_data != NULL) {
+ ulint size = iter.page_size;
+ bool decrypted = fil_space_decrypt(
+ iter.crypt_data,
+ io_buffer + i * size, //dst
+ iter.page_size,
+ readptr + i * size); // src
+
+ if (decrypted) {
+ /* write back unencrypted page */
+ updated = true;
+ } else {
+ /* TODO: remove unnecessary memcpy's */
+ memcpy(io_buffer + i * size, readptr + i * size, size);
+ }
+ }
+
buf_block_set_file_page(block, space_id, page_no++);
dberr_t err;
@@ -6206,7 +6498,7 @@ fil_tablespace_iterate(
file = os_file_create_simple_no_error_handling(
innodb_file_data_key, filepath,
- OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+ OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE);
DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
{
@@ -6273,6 +6565,13 @@ fil_tablespace_iterate(
iter.n_io_buffers = n_io_buffers;
iter.page_size = callback.get_page_size();
+ ulint crypt_data_offset = fsp_header_get_crypt_offset(
+ callback.get_zip_size(), 0);
+
+ /* read (optional) crypt data */
+ iter.crypt_data = fil_space_read_crypt_data(
+ 0, page, crypt_data_offset);
+
/* Compressed pages can't be optimised for block IO for now.
We do the IMPORT page by page. */
@@ -6281,6 +6580,14 @@ fil_tablespace_iterate(
ut_a(iter.page_size == callback.get_zip_size());
}
+ /** If tablespace is encrypted, it needs extra buffers */
+ if (iter.crypt_data != NULL) {
+		/* decrease io buffers so that memory
+		 * consumption doesn't double
+		 * note: the +1 is to avoid n_io_buffers getting down to 0 */
+ iter.n_io_buffers = (iter.n_io_buffers + 1) / 2;
+ }
+
/** Add an extra page for compressed page scratch area. */
void* io_buffer = mem_alloc(
@@ -6289,9 +6596,45 @@ fil_tablespace_iterate(
iter.io_buffer = static_cast<byte*>(
ut_align(io_buffer, UNIV_PAGE_SIZE));
+ void* crypt_io_buffer = NULL;
+ if (iter.crypt_data != NULL) {
+ crypt_io_buffer = mem_alloc(
+ iter.n_io_buffers * UNIV_PAGE_SIZE);
+ iter.crypt_io_buffer = static_cast<byte*>(
+ crypt_io_buffer);
+ }
+
err = fil_iterate(iter, &block, callback);
mem_free(io_buffer);
+
+ if (iter.crypt_data != NULL) {
+ /* clear crypt data from page 0 and write it back */
+ os_file_read(file, page, 0, UNIV_PAGE_SIZE);
+ fil_space_clear_crypt_data(page, crypt_data_offset);
+ lsn_t lsn = mach_read_from_8(page + FIL_PAGE_LSN);
+ if (callback.get_zip_size() == 0) {
+ buf_flush_init_for_writing(
+ page, 0, lsn);
+ } else {
+ buf_flush_update_zip_checksum(
+ page, callback.get_zip_size(), lsn);
+ }
+
+ if (!os_file_write(
+ iter.filepath, iter.file, page,
+ 0, iter.page_size)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "os_file_write() failed");
+
+ return(DB_IO_ERROR);
+ }
+
+ mem_free(crypt_io_buffer);
+ iter.crypt_io_buffer = NULL;
+ fil_space_destroy_crypt_data(&iter.crypt_data);
+ }
}
if (err == DB_SUCCESS) {
@@ -6424,3 +6767,245 @@ fil_mtr_rename_log(
0, 0, new_name, old_name, mtr);
}
}
+
+/****************************************************************//**
+Acquire fil_system mutex */
+void
+fil_system_enter(void)
+/*==================*/
+{
+ ut_ad(!mutex_own(&fil_system->mutex));
+ mutex_enter(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Release fil_system mutex */
+void
+fil_system_exit(void)
+/*=================*/
+{
+ ut_ad(mutex_own(&fil_system->mutex));
+ mutex_exit(&fil_system->mutex);
+}
+
+
+/******************************************************************
+Get id of first tablespace or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_first_space()
+/*=================*/
+{
+ ulint out_id = ULINT_UNDEFINED;
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+ if (space != NULL) {
+ do
+ {
+ if (!space->stop_new_ops) {
+ out_id = space->id;
+ break;
+ }
+ space = UT_LIST_GET_NEXT(space_list, space);
+ } while (space != NULL);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return out_id;
+}
+
+/******************************************************************
+Get id of first tablespace that has node or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_first_space_safe()
+/*======================*/
+{
+ ulint out_id = ULINT_UNDEFINED;
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+ if (space != NULL) {
+ do
+ {
+ if (!space->stop_new_ops && UT_LIST_GET_LEN(space->chain) > 0) {
+ out_id = space->id;
+ break;
+ }
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+ } while (space != NULL);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return out_id;
+}
+
+/******************************************************************
+Get id of next tablespace or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_next_space(
+/*===============*/
+ ulint id) /*!< in: previous space id */
+{
+ bool found;
+ fil_space_t* space;
+ ulint out_id = ULINT_UNDEFINED;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+ if (space == NULL) {
+ /* we didn't find it...search for space with space->id > id */
+ found = false;
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+ } else {
+ /* we found it, take next available space */
+ found = true;
+ }
+
+ while ((space = UT_LIST_GET_NEXT(space_list, space)) != NULL) {
+
+ if (!found && space->id <= id)
+ continue;
+
+ if (!space->stop_new_ops && UT_LIST_GET_LEN(space->chain) > 0) {
+			/* found the next usable tablespace */
+ out_id = space->id;
+ break;
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return out_id;
+}
+
+/******************************************************************
+Get id of next tablespace that has node or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_next_space_safe(
+/*====================*/
+ ulint id) /*!< in: previous space id */
+{
+ bool found;
+ fil_space_t* space;
+ ulint out_id = ULINT_UNDEFINED;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+ if (space == NULL) {
+ /* we didn't find it...search for space with space->id > id */
+ found = false;
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+ } else {
+ /* we found it, take next available space */
+ found = true;
+ }
+
+ while ((space = UT_LIST_GET_NEXT(space_list, space)) != NULL) {
+
+ if (!found && space->id <= id)
+ continue;
+
+ if (!space->stop_new_ops) {
+			/* found the next usable tablespace */
+ out_id = space->id;
+ break;
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return out_id;
+}
+
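/* Illustrative sketch (not part of this patch): the intended iteration
pattern for fil_get_first_space_safe() / fil_get_next_space_safe(), e.g.
from a background key rotation thread.  Everything except those two calls
is a placeholder. */
static void
example_visit_all_spaces(void)
{
	for (ulint space_id = fil_get_first_space_safe();
	     space_id != ULINT_UNDEFINED;
	     space_id = fil_get_next_space_safe(space_id)) {

		/* Work on space_id here.  No reference is held, so the
		tablespace may still be dropped concurrently and lookups
		by id must tolerate a miss. */
	}
}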
+/******************************************************************
+Get crypt data for a tablespace */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_get_crypt_data(
+/*=====================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ fil_space_crypt_t* crypt_data = NULL;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ crypt_data = space->crypt_data;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(crypt_data);
+}
+
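/* Illustrative sketch (not part of this patch): a caller testing whether a
tablespace carries crypt metadata before scheduling work on it.  Note that
the presence of crypt data does not by itself mean the pages are encrypted. */
static bool
example_space_has_crypt_metadata(ulint space_id)
{
	return(fil_space_get_crypt_data(space_id) != NULL);
}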
+/******************************************************************
+Get crypt data for a tablespace */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_set_crypt_data(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ fil_space_crypt_t* crypt_data) /*!< in: crypt data */
+{
+ fil_space_t* space;
+ fil_space_crypt_t* free_crypt_data = NULL;
+ fil_space_crypt_t* ret_crypt_data = NULL;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ if (space->crypt_data != NULL) {
+			/* Here we need to release the fil_system mutex to
+			avoid a mutex deadlock assertion: here we would take
+			the mutexes in the order fil_system, crypt_data,
+			whereas fil_crypt_start_encrypting_space takes them
+			in the order crypt_data, fil_system
+			(at fil_space_get_flags -> fil_space_get_space). */
+ mutex_exit(&fil_system->mutex);
+ fil_space_merge_crypt_data(space->crypt_data,
+ crypt_data);
+ ret_crypt_data = space->crypt_data;
+ free_crypt_data = crypt_data;
+ } else {
+ space->crypt_data = crypt_data;
+ ret_crypt_data = space->crypt_data;
+ mutex_exit(&fil_system->mutex);
+ }
+ } else {
+ /* there is a small risk that tablespace has been deleted */
+ free_crypt_data = crypt_data;
+ mutex_exit(&fil_system->mutex);
+ }
+
+ if (free_crypt_data != NULL) {
+ /* there was already crypt data present and the new crypt
+ * data provided as argument to this function has been merged
+ * into that => free new crypt data
+ */
+ fil_space_destroy_crypt_data(&free_crypt_data);
+ }
+
+ return ret_crypt_data;
+}
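/* Illustrative sketch (not part of this patch): attaching crypt data that
was read from page 0 of a tablespace.  The wiring below is assumed from the
functions introduced in this patch; "page" is page 0 of the space and
zip_size is its compressed page size (0 for uncompressed). */
static void
example_attach_crypt_data(ulint space_id, byte* page, ulint zip_size)
{
	ulint			offset = fsp_header_get_crypt_offset(
						zip_size, NULL);
	fil_space_crypt_t*	crypt_data = fil_space_read_crypt_data(
						space_id, page, offset);

	if (crypt_data != NULL) {
		/* Ownership passes to fil_system; if the space already has
		crypt data, the two are merged and the duplicate is freed
		inside fil_space_set_crypt_data(). */
		fil_space_set_crypt_data(space_id, crypt_data);
	}
}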
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
new file mode 100644
index 00000000000..e508d4733db
--- /dev/null
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -0,0 +1,643 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fil/fil0pagecompress.cc
+Implementation for page compressed file spaces.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com
+Updated 14/02/2015
+***********************************************************************/
+
+#include "fil0fil.h"
+#include "fil0pagecompress.h"
+
+#include <debug_sync.h>
+#include <my_dbug.h>
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h" // IB_LOG_
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "ibuf0ibuf.h"
+# include "sync0sync.h"
+# include "os0sync.h"
+#else /* !UNIV_HOTBACKUP */
+# include "srv0srv.h"
+static ulint srv_data_read, srv_data_written;
+#endif /* !UNIV_HOTBACKUP */
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#endif
+#include "row0mysql.h"
+#ifdef HAVE_LZ4
+#include "lz4.h"
+#endif
+#ifdef HAVE_LZO
+#include "lzo/lzo1x.h"
+#endif
+#ifdef HAVE_LZMA
+#include "lzma.h"
+#endif
+#ifdef HAVE_BZIP2
+#include "bzlib.h"
+#endif
+#ifdef HAVE_SNAPPY
+#include "snappy-c.h"
+#endif
+
+/* Used for debugging */
+//#define UNIV_PAGECOMPRESS_DEBUG 1
+
+/****************************************************************//**
+For page compressed pages, compress the page before the actual write
+operation.
+@return compressed page to be written */
+UNIV_INTERN
+byte*
+fil_compress_page(
+/*==============*/
+ ulint space_id, /*!< in: tablespace id of the
+ table. */
+ byte* buf, /*!< in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ byte* out_buf, /*!< out: compressed buffer */
+ ulint len, /*!< in: length of input buffer.*/
+	ulint	level,		/*!< in: compression level */
+ ulint block_size, /*!< in: block size */
+ bool encrypted, /*!< in: is page also encrypted */
+ ulint* out_len, /*!< out: actual length of compressed
+ page */
+	byte*	lzo_mem)	/*!< in: temporary work memory used by LZO */
+{
+ int err = Z_OK;
+ int comp_level = level;
+ ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
+ ulint write_size=0;
+ /* Cache to avoid change during function execution */
+ ulint comp_method = innodb_compression_algorithm;
+ ulint orig_page_type;
+
+ if (encrypted) {
+ header_len += FIL_PAGE_COMPRESSION_METHOD_SIZE;
+ }
+
+ ut_ad(buf);
+ ut_ad(out_buf);
+ ut_ad(len);
+ ut_ad(out_len);
+
+ /* read original page type */
+ orig_page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);
+
+ /* Let's not compress file space header or
+ extent descriptor */
+ if (orig_page_type == 0 ||
+ orig_page_type == FIL_PAGE_TYPE_FSP_HDR ||
+ orig_page_type == FIL_PAGE_TYPE_XDES ||
+ orig_page_type == FIL_PAGE_PAGE_COMPRESSED) {
+ *out_len = len;
+ return (buf);
+ }
+
+ fil_system_enter();
+ fil_space_t* space = fil_space_get_by_id(space_id);
+ fil_system_exit();
+
+ /* If no compression level was provided to this table, use system
+ default level */
+ if (comp_level == 0) {
+ comp_level = page_zip_level;
+ }
+
+#ifdef UNIV_PAGECOMPRESS_DEBUG
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Preparing for compress for space %lu name %s len %lu.",
+ space_id, fil_space_name(space), len);
+#endif /* UNIV_PAGECOMPRESS_DEBUG */
+
+ write_size = UNIV_PAGE_SIZE - header_len;
+
+ switch(comp_method) {
+#ifdef HAVE_LZ4
+ case PAGE_LZ4_ALGORITHM:
+ err = LZ4_compress_limitedOutput((const char *)buf,
+ (char *)out_buf+header_len, len, write_size);
+ write_size = err;
+
+ if (err == 0) {
+ /* If error we leave the actual page as it was */
+
+#ifndef UNIV_PAGECOMPRESS_DEBUG
+ if (space->printed_compression_failure == false) {
+#endif
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu rt %d write %lu.",
+ space_id, fil_space_name(space), len, err, write_size);
+ space->printed_compression_failure = true;
+#ifndef UNIV_PAGECOMPRESS_DEBUG
+ }
+#endif
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+ break;
+#endif /* HAVE_LZ4 */
+#ifdef HAVE_LZO
+ case PAGE_LZO_ALGORITHM:
+ err = lzo1x_1_15_compress(
+ buf, len, out_buf+header_len, &write_size, lzo_mem);
+
+ if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) {
+ if (space->printed_compression_failure == false) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu err %d write_size %lu.",
+ space_id, fil_space_name(space), len, err, write_size);
+ space->printed_compression_failure = true;
+ }
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+
+ break;
+#endif /* HAVE_LZO */
+#ifdef HAVE_LZMA
+ case PAGE_LZMA_ALGORITHM: {
+ size_t out_pos=0;
+
+ err = lzma_easy_buffer_encode(
+ comp_level,
+ LZMA_CHECK_NONE,
+ NULL, /* No custom allocator, use malloc/free */
+ reinterpret_cast<uint8_t*>(buf),
+ len,
+ reinterpret_cast<uint8_t*>(out_buf + header_len),
+ &out_pos,
+ (size_t)write_size);
+
+ if (err != LZMA_OK || out_pos > UNIV_PAGE_SIZE-header_len) {
+ if (space->printed_compression_failure == false) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu err %d write_size %lu",
+ space_id, fil_space_name(space), len, err, out_pos);
+ space->printed_compression_failure = true;
+ }
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+
+ write_size = out_pos;
+
+ break;
+ }
+#endif /* HAVE_LZMA */
+
+#ifdef HAVE_BZIP2
+ case PAGE_BZIP2_ALGORITHM: {
+
+ err = BZ2_bzBuffToBuffCompress(
+ (char *)(out_buf + header_len),
+ (unsigned int *)&write_size,
+ (char *)buf,
+ len,
+ 1,
+ 0,
+ 0);
+
+ if (err != BZ_OK || write_size > UNIV_PAGE_SIZE-header_len) {
+ if (space->printed_compression_failure == false) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu err %d write_size %lu.",
+ space_id, fil_space_name(space), len, err, write_size);
+ space->printed_compression_failure = true;
+ }
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+ break;
+ }
+#endif /* HAVE_BZIP2 */
+
+#ifdef HAVE_SNAPPY
+ case PAGE_SNAPPY_ALGORITHM:
+ {
+ snappy_status cstatus;
+
+ cstatus = snappy_compress((const char *)buf, len, (char *)(out_buf+header_len), &write_size);
+
+ if (cstatus != SNAPPY_OK || write_size > UNIV_PAGE_SIZE-header_len) {
+ if (space->printed_compression_failure == false) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu err %d write_size %lu.",
+ space_id, fil_space_name(space), len, (int)cstatus, write_size);
+ space->printed_compression_failure = true;
+ }
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+ break;
+ }
+#endif /* HAVE_SNAPPY */
+
+ case PAGE_ZLIB_ALGORITHM:
+ err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, comp_level);
+
+ if (err != Z_OK) {
+ /* If error we leave the actual page as it was */
+
+ if (space->printed_compression_failure == false) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Compression failed for space %lu name %s len %lu rt %d write %lu.",
+ space_id, fil_space_name(space), len, err, write_size);
+ space->printed_compression_failure = true;
+ }
+
+ srv_stats.pages_page_compression_error.inc();
+ *out_len = len;
+ return (buf);
+ }
+ break;
+
+ case PAGE_UNCOMPRESSED:
+ *out_len = len;
+ return (buf);
+ break;
+ default:
+ ut_error;
+ break;
+ }
+
+ /* Set up the page header */
+ memcpy(out_buf, buf, FIL_PAGE_DATA);
+ /* Set up the checksum */
+ mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
+
+ /* Set up the compression algorithm */
+ mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, comp_method);
+
+ if (encrypted) {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+ mach_write_to_2(out_buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, comp_method);
+ } else {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
+ }
+
+	/* Set up the actual payload length */
+ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size);
+
+#ifdef UNIV_DEBUG
+ /* Verify */
+ ut_ad(fil_page_is_compressed(out_buf) || fil_page_is_compressed_encrypted(out_buf));
+ ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC);
+ ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size);
+ ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) == (ulint)comp_method ||
+ mach_read_from_2(out_buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE) == (ulint)comp_method);
+
+ /* Verify that page can be decompressed */
+ {
+ byte *comp_page;
+ byte *uncomp_page;
+
+ comp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*3));
+ uncomp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*3));
+ memcpy(comp_page, out_buf, UNIV_PAGE_SIZE);
+
+ fil_decompress_page(uncomp_page, comp_page, len, NULL);
+ if(buf_page_is_corrupted(false, uncomp_page, 0)) {
+ buf_page_print(uncomp_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ ut_error;
+ }
+ ut_free(comp_page);
+ ut_free(uncomp_page);
+ }
+#endif /* UNIV_DEBUG */
+
+ write_size+=header_len;
+
+	/* The actual write needs to be aligned to the file block size */
+ if (write_size % block_size) {
+ size_t tmp = write_size;
+#ifdef UNIV_DEBUG
+ ut_a(block_size > 0);
+#endif
+ write_size = (size_t)ut_uint64_align_up((ib_uint64_t)write_size, block_size);
+#ifdef UNIV_DEBUG
+ ut_a(write_size > 0 && ((write_size % block_size) == 0));
+ ut_a(write_size >= tmp);
+#endif
+ }
+
+#ifdef UNIV_PAGECOMPRESS_DEBUG
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Compression succeeded for space %lu name %s len %lu out_len %lu.",
+ space_id, fil_space_name(space), len, write_size);
+#endif /* UNIV_PAGECOMPRESS_DEBUG */
+
+ srv_stats.page_compression_saved.add((len - write_size));
+ srv_stats.pages_page_compressed.inc();
+
+	/* If we do not persistently trim the rest of the page, we need to
+	write it in full */
+ if (!srv_use_trim) {
+ write_size = len;
+ }
+
+ *out_len = write_size;
+
+ return(out_buf);
+
+}
+
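/* Illustrative sketch (not part of this patch): how a write path is
expected to call fil_compress_page().  The scratch buffers and the wrapper
name are placeholders; level 0 selects the global compression level and
"false" means the page is not additionally encrypted. */
static byte*
example_compress_before_write(
	ulint	space_id,
	byte*	page,		/* uncompressed page image */
	byte*	out_buf,	/* preallocated scratch buffer */
	byte*	lzo_mem,	/* preallocated LZO work memory */
	ulint	block_size,	/* file block size of the tablespace */
	ulint*	out_len)	/* out: number of bytes to write */
{
	/* Returns "page" unchanged (with *out_len == UNIV_PAGE_SIZE) for
	pages that are never compressed, such as the FSP header or XDES
	pages, or when compression fails. */
	return(fil_compress_page(space_id, page, out_buf, UNIV_PAGE_SIZE,
				 0, block_size, false, out_len, lzo_mem));
}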
+/****************************************************************//**
+For page compressed pages, decompress the page after the actual read
+operation. */
+UNIV_INTERN
+void
+fil_decompress_page(
+/*================*/
+	byte*	page_buf,	/*!< in: preallocated temporary buffer
+				or NULL, in which case one is allocated */
+	byte*	buf,		/*!< in/out: compressed page read from disk;
+				on return it holds the uncompressed page */
+	ulong	len,		/*!< in: length of the output buffer */
+	ulint*	write_size)	/*!< in/out: actual payload size of
+				the compressed data, or NULL */
+{
+ int err = 0;
+ ulint actual_size = 0;
+ ulint compression_alg = 0;
+ byte *in_buf;
+ ulint ptype;
+ ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
+
+ ut_ad(buf);
+ ut_ad(len);
+
+ ptype = mach_read_from_2(buf+FIL_PAGE_TYPE);
+
+ if (ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ header_len += FIL_PAGE_COMPRESSION_METHOD_SIZE;
+ }
+
+	/* Do not try to uncompress pages that are not compressed */
+ if (ptype != FIL_PAGE_PAGE_COMPRESSED &&
+ ptype != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED &&
+ ptype != FIL_PAGE_TYPE_COMPRESSED) {
+ return;
+ }
+
+	// If no buffer was given, we need to allocate a temporary buffer
+ if (page_buf == NULL) {
+ in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*3));
+ } else {
+ in_buf = page_buf;
+ }
+
+ /* Before actual decompress, make sure that page type is correct */
+
+ if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC ||
+ (ptype != FIL_PAGE_PAGE_COMPRESSED &&
+ ptype != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: We try to uncompress corrupted page"
+ " CRC %lu type %lu len %lu.",
+ mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM),
+ mach_read_from_2(buf+FIL_PAGE_TYPE), len);
+
+ fflush(stderr);
+ ut_error;
+ }
+
+ /* Get compression algorithm */
+ if (ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ compression_alg = mach_read_from_2(buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE);
+ } else {
+ compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ }
+
+ /* Get the actual size of compressed page */
+ actual_size = mach_read_from_2(buf+FIL_PAGE_DATA);
+ /* Check if payload size is corrupted */
+ if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: We try to uncompress corrupted page"
+ " actual size %lu compression %s.",
+ actual_size, fil_get_compression_alg_name(compression_alg));
+ fflush(stderr);
+ ut_error;
+ }
+
+ /* Store actual payload size of the compressed data. This pointer
+ points to buffer pool. */
+ if (write_size) {
+ *write_size = actual_size;
+ }
+
+#ifdef UNIV_PAGECOMPRESS_DEBUG
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Preparing for decompress for len %lu\n",
+ actual_size);
+#endif /* UNIV_PAGECOMPRESS_DEBUG */
+
+
+ switch(compression_alg) {
+ case PAGE_ZLIB_ALGORITHM:
+ err= uncompress(in_buf, &len, buf+header_len, (unsigned long)actual_size);
+
+ /* If uncompress fails it means that page is corrupted */
+ if (err != Z_OK) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but uncompress failed with error %d "
+ " size %lu len %lu.",
+ err, actual_size, len);
+
+ fflush(stderr);
+
+ ut_error;
+ }
+ break;
+
+#ifdef HAVE_LZ4
+ case PAGE_LZ4_ALGORITHM:
+ err = LZ4_decompress_fast((const char *)buf+header_len, (char *)in_buf, len);
+
+ if (err != (int)actual_size) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but decompression read only %d bytes "
+ " size %lu len %lu.",
+ err, actual_size, len);
+ fflush(stderr);
+
+ ut_error;
+ }
+ break;
+#endif /* HAVE_LZ4 */
+#ifdef HAVE_LZO
+ case PAGE_LZO_ALGORITHM: {
+ ulint olen=0;
+ err = lzo1x_decompress((const unsigned char *)buf+header_len,
+ actual_size,(unsigned char *)in_buf, &olen, NULL);
+
+ if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but decompression read only %ld bytes"
+ " size %lu len %lu.",
+ olen, actual_size, len);
+ fflush(stderr);
+
+ ut_error;
+ }
+ break;
+ }
+#endif /* HAVE_LZO */
+#ifdef HAVE_LZMA
+ case PAGE_LZMA_ALGORITHM: {
+
+ lzma_ret ret;
+ size_t src_pos = 0;
+ size_t dst_pos = 0;
+ uint64_t memlimit = UINT64_MAX;
+
+ ret = lzma_stream_buffer_decode(
+ &memlimit,
+ 0,
+ NULL,
+ buf+header_len,
+ &src_pos,
+ actual_size,
+ in_buf,
+ &dst_pos,
+ len);
+
+
+ if (ret != LZMA_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but decompression read only %ld bytes"
+ " size %lu len %lu.",
+ dst_pos, actual_size, len);
+ fflush(stderr);
+
+ ut_error;
+ }
+
+ break;
+ }
+#endif /* HAVE_LZMA */
+#ifdef HAVE_BZIP2
+ case PAGE_BZIP2_ALGORITHM: {
+ unsigned int dst_pos = UNIV_PAGE_SIZE;
+
+ err = BZ2_bzBuffToBuffDecompress(
+ (char *)in_buf,
+ &dst_pos,
+ (char *)(buf+header_len),
+ actual_size,
+ 1,
+ 0);
+
+ if (err != BZ_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+				" but decompression read only %u bytes"
+ " size %lu len %lu err %d.",
+ dst_pos, actual_size, len, err);
+ fflush(stderr);
+
+ ut_error;
+ }
+ break;
+ }
+#endif /* HAVE_BZIP2 */
+#ifdef HAVE_SNAPPY
+ case PAGE_SNAPPY_ALGORITHM:
+ {
+ snappy_status cstatus;
+ ulint olen = 0;
+
+ cstatus = snappy_uncompress(
+ (const char *)(buf+header_len),
+ actual_size,
+ (char *)in_buf,
+ &olen);
+
+ if (cstatus != SNAPPY_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but decompression read only %lu bytes"
+ " size %lu len %lu err %d.",
+ olen, actual_size, len, (int)cstatus);
+ fflush(stderr);
+
+ ut_error;
+ }
+ break;
+ }
+#endif /* HAVE_SNAPPY */
+ default:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Corruption: Page is marked as compressed"
+ " but compression algorithm %s"
+ " is not known."
+ ,fil_get_compression_alg_name(compression_alg));
+
+ fflush(stderr);
+ ut_error;
+ break;
+ }
+
+ srv_stats.pages_page_decompressed.inc();
+
+	/* Copy the uncompressed page back to the caller's buffer (a buffer
+	pool frame); there is really no other option here. */
+ memcpy(buf, in_buf, len);
+
+	// Free the temporary buffer if none was given by the caller
+ if (page_buf == NULL) {
+ ut_free(in_buf);
+ }
+}
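/* Illustrative sketch (not part of this patch): the matching read path.
fil_decompress_page() copies the uncompressed image back into "page" when
the page type marks it as page compressed; passing NULL lets it allocate
its own temporary buffer. */
static void
example_decompress_after_read(byte* page, ulint* actual_payload)
{
	fil_decompress_page(NULL, page, UNIV_PAGE_SIZE, actual_payload);
}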
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 6ecf97aaeb6..5c5e2d69514 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -31,6 +31,7 @@ Created 11/29/1995 Heikki Tuuri
#include "buf0buf.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#include "mtr0log.h"
#include "ut0byte.h"
#include "page0page.h"
@@ -728,7 +729,12 @@ fsp_header_init(
} else {
fsp_fill_free_list(TRUE, space, header, mtr);
}
+
+ ulint maxsize = 0;
+ ulint offset = fsp_header_get_crypt_offset(zip_size, &maxsize);
+ fil_space_write_crypt_data(space, page, offset, maxsize, mtr);
}
+
#endif /* !UNIV_HOTBACKUP */
/**********************************************************************//**
@@ -4083,3 +4089,61 @@ fsp_print(
fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs);
}
#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Compute offset after xdes where crypt data can be stored
+@return offset */
+ulint
+fsp_header_get_crypt_offset(
+/*========================*/
+ ulint zip_size, /*!< in: zip_size */
+ ulint* max_size) /*!< out: free space available for crypt data */
+{
+ ulint pageno = 0;
+ /* compute first page_no that will have xdes stored on page != 0*/
+ for (ulint i = 0;
+ (pageno = xdes_calc_descriptor_page(zip_size, i)) == 0; )
+ i++;
+
+	/* use the pageno prior to this, i.e. the last page described on page 0 */
+ ut_ad(pageno > 0);
+ pageno--;
+
+ ulint iv_offset = XDES_ARR_OFFSET +
+ XDES_SIZE * (1 + xdes_calc_descriptor_index(zip_size, pageno));
+
+ if (max_size != NULL) {
+ /* return how much free space there is available on page */
+ *max_size = (zip_size ? zip_size : UNIV_PAGE_SIZE) -
+ (FSP_HEADER_OFFSET + iv_offset + FIL_PAGE_DATA_END);
+ }
+
+ return FSP_HEADER_OFFSET + iv_offset;
+}
+
+/**********************************************************************//**
+Checks if a single page is free.
+@return true if free */
+UNIV_INTERN
+bool
+fsp_page_is_free_func(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint page_no, /*!< in: page offset */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ const char *file,
+ ulint line)
+{
+ ulint flags;
+
+ ut_ad(mtr);
+
+ mtr_x_lock_func(fil_space_get_latch(space, &flags), file, line, mtr);
+ ulint zip_size = fsp_flags_get_zip_size(flags);
+
+ xdes_t* descr = xdes_get_descriptor(space, zip_size, page_no, mtr);
+ ut_a(descr);
+
+ return xdes_mtr_get_bit(
+ descr, XDES_FREE_BIT, page_no % FSP_EXTENT_SIZE, mtr);
+}
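/* Illustrative note (not part of this patch): the _func suffix suggests
callers go through a wrapper macro that supplies the latching call site,
along the lines of the hypothetical:

#define fsp_page_is_free(space, page_no, mtr)	\
	fsp_page_is_free_func(space, page_no, mtr, __FILE__, __LINE__)
*/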
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 7b57f072493..b9b8ecd4916 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -4,7 +4,7 @@ Copyright (c) 2000, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2014 SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0flu.h"
#include "buf0dblwr.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0start.h"
@@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "trx0sys.h"
-#include "mtr0mtr.h"
#include "rem0types.h"
#include "row0ins.h"
#include "row0mysql.h"
@@ -79,6 +79,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "fsp0fsp.h"
#include "sync0sync.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#include "trx0xa.h"
#include "row0merge.h"
#include "dict0boot.h"
@@ -86,6 +87,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "dict0stats_bg.h"
#include "ha_prototypes.h"
#include "ut0mem.h"
+#include "ut0timer.h"
#include "ibuf0ibuf.h"
#include "dict0dict.h"
#include "srv0mon.h"
@@ -101,6 +103,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#endif /* UNIV_DEBUG */
#include "fts0priv.h"
#include "page0zip.h"
+#include "fil0pagecompress.h"
#define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
@@ -112,10 +115,40 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "ha_innodb.h"
#include "i_s.h"
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
+
# ifndef MYSQL_PLUGIN_IMPORT
# define MYSQL_PLUGIN_IMPORT /* nothing */
# endif /* MYSQL_PLUGIN_IMPORT */
+#ifdef WITH_WSREP
+#include "dict0priv.h"
+#include "../storage/innobase/include/ut0byte.h"
+#include <mysql/service_md5.h>
+
+class binlog_trx_data;
+extern handlerton *binlog_hton;
+
+extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log;
+
+static inline wsrep_ws_handle_t*
+wsrep_ws_handle(THD* thd, const trx_t* trx) {
+ return wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd),
+ (wsrep_trx_id_t)trx->id);
+}
+
+extern TC_LOG* tc_log;
+extern void wsrep_cleanup_transaction(THD *thd);
+static int
+wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd,
+ my_bool signal);
+static void
+wsrep_fake_trx_id(handlerton* hton, THD *thd);
+static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
+static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
+#endif /* WITH_WSREP */
+
/** to protect innobase_open_files */
static mysql_mutex_t innobase_share_mutex;
/** to force correct commit order in binlog */
@@ -203,6 +236,19 @@ static char* internal_innobase_data_file_path = NULL;
static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+extern uint srv_n_fil_crypt_threads;
+extern uint srv_fil_crypt_rotate_key_age;
+extern uint srv_n_fil_crypt_iops;
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+extern my_bool srv_background_scrub_data_uncompressed;
+extern my_bool srv_background_scrub_data_compressed;
+extern uint srv_background_scrub_data_interval;
+extern uint srv_background_scrub_data_check_interval;
+#ifdef UNIV_DEBUG
+extern my_bool srv_scrub_force_testing;
+#endif
+
/** Possible values for system variable "innodb_stats_method". The values
are defined the same as its corresponding MyISAM system variable
"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */
@@ -224,12 +270,12 @@ static TYPELIB innodb_stats_method_typelib = {
/** Possible values for system variable "innodb_checksum_algorithm". */
static const char* innodb_checksum_algorithm_names[] = {
- "crc32",
- "strict_crc32",
- "innodb",
- "strict_innodb",
- "none",
- "strict_none",
+ "CRC32",
+ "STRICT_CRC32",
+ "INNODB",
+ "STRICT_INNODB",
+ "NONE",
+ "STRICT_NONE",
NullS
};
@@ -502,6 +548,42 @@ ib_cb_t innodb_api_cb[] = {
(ib_cb_t) ib_trx_read_only
};
+
+static void innodb_remember_check_sysvar_funcs();
+mysql_var_check_func check_sysvar_enum;
+
+static MYSQL_THDVAR_UINT(default_encryption_key_id, PLUGIN_VAR_RQCMDARG,
+ "Default encryption key id used for table encryption.",
+ NULL, NULL,
+ FIL_DEFAULT_ENCRYPTION_KEY, 1, UINT_MAX32, 0);
+
+/**
+ Structure for CREATE TABLE options (table options).
+ It needs to be called ha_table_option_struct.
+
+ The option values can be specified in the CREATE TABLE at the end:
+ CREATE TABLE ( ... ) *here*
+*/
+
+ha_create_table_option innodb_table_option_list[]=
+{
+	/* With this option the user can enable the page compression feature
+	for the table */
+	HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0),
+	/* With this option the user can set the compression level used for
+	page compression of this table */
+	HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, 0, 1, 9, 1),
+	/* With this option the user can enable the atomic writes feature for
+	this table */
+	HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0),
+	/* With this option the user can enable encryption for the table */
+	HA_TOPTION_ENUM("ENCRYPTED", encryption, "DEFAULT,YES,NO", 0),
+	/* With this option the user defines the key identifier used for the
+	encryption */
+ HA_TOPTION_SYSVAR("ENCRYPTION_KEY_ID", encryption_key_id, default_encryption_key_id),
+
+ HA_TOPTION_END
+};
+
+
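/*
  Illustrative usage (not part of this patch): with the table options above
  a user could request page compression and encryption per table, e.g.

    CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB
      PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL=6
      ENCRYPTED=YES ENCRYPTION_KEY_ID=2;

  When ENCRYPTION_KEY_ID is omitted it falls back to the
  default_encryption_key_id session variable declared above.
*/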
/*************************************************************//**
Check whether valid argument given to innodb_ft_*_stopword_table.
This function is registered as a callback with MySQL.
@@ -537,7 +619,39 @@ static inline
ulint
innobase_map_isolation_level(
/*=========================*/
- enum_tx_isolation iso); /*!< in: MySQL isolation level code */
+ enum_tx_isolation iso); /*!< in: MySQL isolation level code
+ */
+
+/*************************************************************//**
+Check for a valid value of innobase_compression_algorithm.
+@return 0 for valid innodb_compression_algorithm. */
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
+
+static ibool innodb_have_lzo=IF_LZO(1, 0);
+static ibool innodb_have_lz4=IF_LZ4(1, 0);
+static ibool innodb_have_lzma=IF_LZMA(1, 0);
+static ibool innodb_have_bzip2=IF_BZIP2(1, 0);
+static ibool innodb_have_snappy=IF_SNAPPY(1, 0);
+
+static
+int
+innodb_encrypt_tables_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
static const char innobase_hton_name[]= "InnoDB";
@@ -690,6 +804,112 @@ static SHOW_VAR innodb_status_variables[]= {
{"purge_view_trx_id_age",
(char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG},
#endif /* UNIV_DEBUG */
+ /* Status variables for page compression */
+ {"page_compression_saved",
+ (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG},
+ {"page_compression_trim_sect512",
+ (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG},
+ {"page_compression_trim_sect1024",
+ (char*) &export_vars.innodb_page_compression_trim_sect1024, SHOW_LONGLONG},
+ {"page_compression_trim_sect2048",
+ (char*) &export_vars.innodb_page_compression_trim_sect2048, SHOW_LONGLONG},
+ {"page_compression_trim_sect4096",
+ (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG},
+ {"page_compression_trim_sect8192",
+ (char*) &export_vars.innodb_page_compression_trim_sect8192, SHOW_LONGLONG},
+ {"page_compression_trim_sect16384",
+ (char*) &export_vars.innodb_page_compression_trim_sect16384, SHOW_LONGLONG},
+ {"page_compression_trim_sect32768",
+ (char*) &export_vars.innodb_page_compression_trim_sect32768, SHOW_LONGLONG},
+ {"num_index_pages_written",
+ (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG},
+ {"num_non_index_pages_written",
+ (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG},
+ {"num_pages_page_compressed",
+ (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG},
+ {"num_page_compressed_trim_op",
+ (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG},
+ {"num_page_compressed_trim_op_saved",
+ (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG},
+ {"num_pages_page_decompressed",
+ (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG},
+ {"num_pages_page_compression_error",
+ (char*) &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG},
+ {"num_pages_encrypted",
+ (char*) &export_vars.innodb_pages_encrypted, SHOW_LONGLONG},
+ {"num_pages_decrypted",
+ (char*) &export_vars.innodb_pages_decrypted, SHOW_LONGLONG},
+ {"have_lz4",
+ (char*) &innodb_have_lz4, SHOW_BOOL},
+ {"have_lzo",
+ (char*) &innodb_have_lzo, SHOW_BOOL},
+ {"have_lzma",
+ (char*) &innodb_have_lzma, SHOW_BOOL},
+ {"have_bzip2",
+ (char*) &innodb_have_bzip2, SHOW_BOOL},
+ {"have_snappy",
+ (char*) &innodb_have_snappy, SHOW_BOOL},
+
+ /* Defragmentation */
+ {"defragment_compression_failures",
+ (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
+ {"defragment_failures",
+ (char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
+ {"defragment_count",
+ (char*) &export_vars.innodb_defragment_count, SHOW_LONG},
+
+ /* Online alter table status variables */
+ {"onlineddl_rowlog_rows",
+ (char*) &export_vars.innodb_onlineddl_rowlog_rows, SHOW_LONG},
+ {"onlineddl_rowlog_pct_used",
+ (char*) &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_LONG},
+ {"onlineddl_pct_progress",
+ (char*) &export_vars.innodb_onlineddl_pct_progress, SHOW_LONG},
+
+ /* Times secondary index lookup triggered cluster lookup and
+ times prefix optimization avoided triggering cluster lookup */
+ {"secondary_index_triggered_cluster_reads",
+ (char*) &export_vars.innodb_sec_rec_cluster_reads, SHOW_LONG},
+ {"secondary_index_triggered_cluster_reads_avoided",
+ (char*) &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_LONG},
+
+ /* Encryption */
+ {"encryption_rotation_pages_read_from_cache",
+ (char*) &export_vars.innodb_encryption_rotation_pages_read_from_cache,
+ SHOW_LONG},
+ {"encryption_rotation_pages_read_from_disk",
+ (char*) &export_vars.innodb_encryption_rotation_pages_read_from_disk,
+ SHOW_LONG},
+ {"encryption_rotation_pages_modified",
+ (char*) &export_vars.innodb_encryption_rotation_pages_modified,
+ SHOW_LONG},
+ {"encryption_rotation_pages_flushed",
+ (char*) &export_vars.innodb_encryption_rotation_pages_flushed,
+ SHOW_LONG},
+ {"encryption_rotation_estimated_iops",
+ (char*) &export_vars.innodb_encryption_rotation_estimated_iops,
+ SHOW_LONG},
+
+	/* scrubbing */
+ {"scrub_background_page_reorganizations",
+ (char*) &export_vars.innodb_scrub_page_reorganizations,
+ SHOW_LONG},
+ {"scrub_background_page_splits",
+ (char*) &export_vars.innodb_scrub_page_splits,
+ SHOW_LONG},
+ {"scrub_background_page_split_failures_underflow",
+ (char*) &export_vars.innodb_scrub_page_split_failures_underflow,
+ SHOW_LONG},
+ {"scrub_background_page_split_failures_out_of_filespace",
+ (char*) &export_vars.innodb_scrub_page_split_failures_out_of_filespace,
+ SHOW_LONG},
+ {"scrub_background_page_split_failures_missing_index",
+ (char*) &export_vars.innodb_scrub_page_split_failures_missing_index,
+ SHOW_LONG},
+ {"scrub_background_page_split_failures_unknown",
+ (char*) &export_vars.innodb_scrub_page_split_failures_unknown,
+ SHOW_LONG},
+
{NullS, NullS, SHOW_LONG}
};
@@ -1191,6 +1411,10 @@ innobase_srv_conc_enter_innodb(
/*===========================*/
trx_t* trx) /*!< in: transaction handle */
{
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return;
+#endif /* WITH_WSREP */
if (srv_thread_concurrency) {
if (trx->n_tickets_to_enter_innodb > 0) {
@@ -1225,6 +1449,10 @@ innobase_srv_conc_exit_innodb(
#ifdef UNIV_SYNC_DEBUG
ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return;
+#endif /* WITH_WSREP */
/* This is to avoid making an unnecessary function call. */
if (trx->declared_to_be_inside_innodb
@@ -1345,6 +1573,15 @@ thd_to_trx(
{
return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
}
+#ifdef WITH_WSREP
+ulonglong
+thd_to_trx_id(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(thd_to_trx(thd)->id);
+}
+#endif /* WITH_WSREP */
/********************************************************************//**
Call this function when mysqld passes control to the client. That is to
@@ -1839,6 +2076,9 @@ int
innobase_mysql_tmpfile(void)
/*========================*/
{
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ os_event_wait(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
int fd2 = -1;
File fd;
@@ -2295,9 +2535,11 @@ ha_innobase::ha_innobase(
HA_BINLOG_ROW_CAPABLE |
HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT |
+ (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) |
HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
start_of_scan(0),
- num_write_row(0)
+ num_write_row(0),
+ ha_partition_stats(NULL)
{}
/*********************************************************************//**
@@ -2905,11 +3147,21 @@ innobase_init(
innobase_hton->release_temporary_latches =
innobase_release_temporary_latches;
+#ifdef WITH_WSREP
+ innobase_hton->abort_transaction=wsrep_abort_transaction;
+ innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint;
+ innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint;
+ innobase_hton->fake_trx_id=wsrep_fake_trx_id;
+#endif /* WITH_WSREP */
innobase_hton->kill_query = innobase_kill_query;
if (srv_file_per_table)
innobase_hton->tablefile_extensions = ha_innobase_exts;
+ innobase_hton->table_options = innodb_table_option_list;
+
+ innodb_remember_check_sysvar_funcs();
+
ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
#ifndef DBUG_OFF
@@ -2944,6 +3196,74 @@ innobase_init(
}
}
+ if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_DEF) {
+ fprintf(stderr,
+ "InnoDB: Warning: innodb_page_size has been "
+			"changed from the default value %d to %lu. (###EXPERIMENTAL### "
+ "operation)\n", UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE);
+
+ /* There is hang on buffer pool when trying to get a new
+ page if buffer pool size is too small for large page sizes */
+ if (innobase_buffer_pool_size < (24 * 1024 * 1024)) {
+ fprintf(stderr, "InnoDB: Error: innobase_page_size %lu requires "
+ "innodb_buffer_pool_size > 24M current %lld",
+ UNIV_PAGE_SIZE, innobase_buffer_pool_size);
+ goto error;
+ }
+ }
+
+#ifndef HAVE_LZ4
+ if (innodb_compression_algorithm == PAGE_LZ4_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblz4 is not installed. \n",
+ innodb_compression_algorithm);
+ goto error;
+ }
+#endif
+
+#ifndef HAVE_LZO
+ if (innodb_compression_algorithm == PAGE_LZO_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzo is not installed. \n",
+ innodb_compression_algorithm);
+ goto error;
+ }
+#endif
+
+#ifndef HAVE_LZMA
+ if (innodb_compression_algorithm == PAGE_LZMA_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzma is not installed. \n",
+ innodb_compression_algorithm);
+ goto error;
+ }
+#endif
+
+#ifndef HAVE_BZIP2
+ if (innodb_compression_algorithm == PAGE_BZIP2_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libbz2 is not installed. \n",
+ innodb_compression_algorithm);
+ goto error;
+ }
+#endif
+
+#ifndef HAVE_SNAPPY
+ if (innodb_compression_algorithm == PAGE_SNAPPY_ALGORITHM) {
+ sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libsnappy is not installed. \n",
+ innodb_compression_algorithm);
+ goto error;
+ }
+#endif
+
+ if ((srv_encrypt_tables || srv_encrypt_log)
+ && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+ sql_print_error("InnoDB: cannot enable encryption, "
+ "encryption plugin is not available");
+ goto error;
+ }
+
os_innodb_umask = (ulint) my_umask;
/* First calculate the default path for innodb_data_home_dir etc.,
@@ -3517,10 +3837,30 @@ innobase_commit_low(
/*================*/
trx_t* trx) /*!< in: transaction handle */
{
+#ifdef WITH_WSREP
+ THD* thd = (THD*)trx->mysql_thd;
+ const char* tmp = 0;
+ if (wsrep_on(thd)) {
+#ifdef WSREP_PROC_INFO
+ char info[64];
+ info[sizeof(info) - 1] = '\0';
+ snprintf(info, sizeof(info) - 1,
+ "innobase_commit_low():trx_commit_for_mysql(%lld)",
+ (long long) wsrep_thd_trx_seqno(thd));
+ tmp = thd_proc_info(thd, info);
+
+#else
+ tmp = thd_proc_info(thd, "innobase_commit_low()");
+#endif /* WSREP_PROC_INFO */
+ }
+#endif /* WITH_WSREP */
if (trx_is_started(trx)) {
trx_commit_for_mysql(trx);
}
+#ifdef WITH_WSREP
+ if (wsrep_on(thd)) { thd_proc_info(thd, tmp); }
+#endif /* WITH_WSREP */
}
/*****************************************************************//**
@@ -4245,6 +4585,20 @@ innobase_kill_query(
DBUG_ENTER("innobase_kill_query");
DBUG_ASSERT(hton == innodb_hton_ptr);
+#ifdef WITH_WSREP
+ wsrep_thd_LOCK(thd);
+ if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) {
+	/* If the victim has already been signaled by the BF thread and/or
+	the abort is already in progress, aborting the query again is not
+	necessary.
+	Also, the BF thread would own the trx mutex for the victim, which
+	would conflict with trx_mutex_enter() below.
+	*/
+ wsrep_thd_UNLOCK(thd);
+ DBUG_VOID_RETURN;
+ }
+ wsrep_thd_UNLOCK(thd);
+#endif /* WITH_WSREP */
trx = thd_to_trx(thd);
if (trx) {
@@ -4252,7 +4606,7 @@ innobase_kill_query(
THD *owner = trx->current_lock_mutex_owner;
/* Cancel a pending lock request. */
- if (owner != cur) {
+ if (!owner || owner != cur) {
lock_mutex_enter();
}
trx_mutex_enter(trx);
@@ -4260,7 +4614,7 @@ innobase_kill_query(
lock_cancel_waiting_and_release(trx->lock.wait_lock);
}
trx_mutex_exit(trx);
- if (owner != cur) {
+ if (!owner || owner != cur) {
lock_mutex_exit();
}
}
@@ -4419,7 +4773,11 @@ ha_innobase::max_supported_key_length() const
case 8192:
return(1536);
default:
+#ifdef WITH_WSREP
+ return(3500);
+#else
return(3500);
+#endif
}
}
@@ -5528,6 +5886,117 @@ get_field_offset(
return((uint) (field->ptr - table->record[0]));
}
+#ifdef WITH_WSREP
+UNIV_INTERN
+int
+wsrep_innobase_mysql_sort(
+/*===============*/
+ /* out: str contains sort string */
+ int mysql_type, /* in: MySQL type */
+ uint charset_number, /* in: number of the charset */
+ unsigned char* str, /* in: data field */
+ unsigned int str_length, /* in: data field length,
+ not UNIV_SQL_NULL */
+ unsigned int buf_length) /* in: total str buffer length */
+
+{
+ CHARSET_INFO* charset;
+ enum_field_types mysql_tp;
+ int ret_length = str_length;
+
+ DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+
+ mysql_tp = (enum_field_types) mysql_type;
+
+ switch (mysql_tp) {
+
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ {
+ uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
+ uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+
+ /* Use the charset number to pick the right charset struct for
+ the comparison. Since the MySQL function get_charset may be
+ slow before Bar removes the mutex operation there, we first
+ look at 2 common charsets directly. */
+
+ if (charset_number == default_charset_info->number) {
+ charset = default_charset_info;
+ } else if (charset_number == my_charset_latin1.number) {
+ charset = &my_charset_latin1;
+ } else {
+ charset = get_charset(charset_number, MYF(MY_WME));
+
+ if (charset == NULL) {
+ sql_print_error("InnoDB needs charset %lu for doing "
+ "a comparison, but MySQL cannot "
+ "find that charset.",
+ (ulong) charset_number);
+ ut_a(0);
+ }
+ }
+
+ ut_a(str_length <= tmp_length);
+ memcpy(tmp_str, str, str_length);
+
+ if (wsrep_protocol_version < 3) {
+ tmp_length = charset->coll->strnxfrm(
+ charset, str, str_length,
+ str_length, tmp_str, tmp_length, 0);
+ DBUG_ASSERT(tmp_length <= str_length);
+ } else {
+ /* strnxfrm will expand the destination string;
+ protocols < 3 truncated the sorted string,
+ protocols >= 3 get the full sorted string
+ */
+ tmp_length = charset->coll->strnxfrm(
+ charset, str, buf_length,
+ str_length, tmp_str, str_length, 0);
+ DBUG_ASSERT(tmp_length <= buf_length);
+ ret_length = tmp_length;
+ }
+
+ break;
+ }
+ case MYSQL_TYPE_DECIMAL :
+ case MYSQL_TYPE_TINY :
+ case MYSQL_TYPE_SHORT :
+ case MYSQL_TYPE_LONG :
+ case MYSQL_TYPE_FLOAT :
+ case MYSQL_TYPE_DOUBLE :
+ case MYSQL_TYPE_NULL :
+ case MYSQL_TYPE_TIMESTAMP :
+ case MYSQL_TYPE_LONGLONG :
+ case MYSQL_TYPE_INT24 :
+ case MYSQL_TYPE_DATE :
+ case MYSQL_TYPE_TIME :
+ case MYSQL_TYPE_DATETIME :
+ case MYSQL_TYPE_YEAR :
+ case MYSQL_TYPE_NEWDATE :
+ case MYSQL_TYPE_NEWDECIMAL :
+ case MYSQL_TYPE_ENUM :
+ case MYSQL_TYPE_SET :
+ case MYSQL_TYPE_GEOMETRY :
+ break;
+ default:
+ break;
+ }
+
+ return ret_length;
+}
+#endif /* WITH_WSREP */
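
The helper above exists so that a key value ends up in a form whose plain byte comparison follows the column's collation: strnxfrm() produces a "sort key" that the replication layer can compare with memcmp() during certification without knowing anything about MySQL charsets. A rough standalone illustration of the same idea follows; it uses the C locale API (std::strxfrm) as a stand-in for charset->coll->strnxfrm(), which is an assumption of this sketch and not the server's API.

    // Sketch: normalize a string so that plain byte comparison follows
    // the active locale's collation order (std::strxfrm stands in for
    // charset->coll->strnxfrm()).
    #include <algorithm>
    #include <clocale>
    #include <cstring>
    #include <iostream>
    #include <string>
    #include <vector>

    static std::string normalize_for_memcmp(const std::string& in)
    {
        // A first call with a null buffer returns the required length.
        size_t need = std::strxfrm(nullptr, in.c_str(), 0);
        std::vector<char> buf(need + 1);
        std::strxfrm(buf.data(), in.c_str(), buf.size());
        return std::string(buf.data(), need);
    }

    int main()
    {
        std::setlocale(LC_COLLATE, "");
        std::string a = normalize_for_memcmp("abc");
        std::string b = normalize_for_memcmp("ABD");
        // memcmp() on the transformed strings follows collation order,
        // which is what byte-wise key comparison needs.
        std::cout << std::memcmp(a.data(), b.data(),
                                 std::min(a.size(), b.size())) << "\n";
    }
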
+
/*************************************************************//**
InnoDB uses this function to compare two data fields for which the data type
is such that we must use MySQL code to compare them. NOTE that the prototype
@@ -6028,11 +6497,313 @@ innobase_read_from_2_little_endian(
return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))));
}
+#ifdef WITH_WSREP
/*******************************************************************//**
Stores a key value for a row to a buffer.
@return key value length as stored in buff */
UNIV_INTERN
uint
+wsrep_store_key_val_for_row(
+/*===============================*/
+ THD* thd,
+ TABLE* table,
+ uint keynr, /*!< in: key number */
+ char* buff, /*!< in/out: buffer for the key value (in MySQL
+ format) */
+ uint buff_len,/*!< in: buffer length */
+ const uchar* record,
+ ibool* key_is_null)/*!< out: full key was null */
+{
+ KEY* key_info = table->key_info + keynr;
+ KEY_PART_INFO* key_part = key_info->key_part;
+ KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
+ char* buff_start = buff;
+ enum_field_types mysql_type;
+ Field* field;
+ uint buff_space = buff_len;
+
+ DBUG_ENTER("wsrep_store_key_val_for_row");
+
+ memset(buff, 0, buff_len);
+ *key_is_null = TRUE;
+
+ for (; key_part != end; key_part++) {
+
+ uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
+ ibool part_is_null = FALSE;
+
+ if (key_part->null_bit) {
+ if (buff_space > 0) {
+ if (record[key_part->null_offset]
+ & key_part->null_bit) {
+ *buff = 1;
+ part_is_null = TRUE;
+ } else {
+ *buff = 0;
+ }
+ buff++;
+ buff_space--;
+ } else {
+ fprintf (stderr, "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ }
+ }
+ if (!part_is_null) *key_is_null = FALSE;
+
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* >= 5.0.3 true VARCHAR */
+ ulint lenlen;
+ ulint len;
+ const byte* data;
+ ulint key_len;
+ ulint true_len;
+ const CHARSET_INFO* cs;
+ int error=0;
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len + 2;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+ continue;
+ }
+ cs = field->charset();
+
+ lenlen = (ulint)
+ (((Field_varstring*)field)->length_bytes);
+
+ data = row_mysql_read_true_varchar(&len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ lenlen);
+
+ true_len = len;
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char *) data,
+ (const char *) data + len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* In a column prefix index, we may need to truncate
+ the stored value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ memcpy(sorted, data, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ REC_VERSION_56_MAX_INDEX_COL_LEN);
+
+ if (wsrep_protocol_version > 1) {
+ /* Note that we always reserve the maximum possible
+ length of the true VARCHAR in the key value, though
+ only the first len bytes after the 2 length bytes contain
+ actual data. The rest of the space was reset to zero
+ in the memset() call above. */
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ memcpy(buff, sorted, true_len);
+ buff += true_len;
+ buff_space -= true_len;
+ } else {
+ buff += key_len;
+ }
+ } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB
+ /* MYSQL_TYPE_GEOMETRY data is treated
+ as BLOB data in innodb. */
+ || mysql_type == MYSQL_TYPE_GEOMETRY) {
+
+ const CHARSET_INFO* cs;
+ ulint key_len;
+ ulint true_len;
+ int error=0;
+ ulint blob_len;
+ const byte* blob_data;
+
+ ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len + 2;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+
+ continue;
+ }
+
+ cs = field->charset();
+
+ blob_data = row_mysql_read_blob_ref(&blob_len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ (ulint) field->pack_length());
+
+ true_len = blob_len;
+
+ ut_a(get_field_offset(table, field)
+ == key_part->offset);
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (blob_len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char *) blob_data,
+ (const char *) blob_data
+ + blob_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* All indexes on BLOB and TEXT are column prefix
+ indexes, and we may need to truncate the data to be
+ stored in the key value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ memcpy(sorted, blob_data, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ REC_VERSION_56_MAX_INDEX_COL_LEN);
+
+
+ /* Note that we always reserve the maximum possible
+ length of the BLOB prefix in the key value. */
+ if (wsrep_protocol_version > 1) {
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ memcpy(buff, sorted, true_len);
+ buff += true_len;
+ buff_space -= true_len;
+ } else {
+ buff += key_len;
+ }
+ } else {
+ /* Here we handle all other data types except the
+ true VARCHAR, BLOB and TEXT. Note that the column
+ value we store may be also in a column prefix
+ index. */
+
+ const CHARSET_INFO* cs = NULL;
+ ulint true_len;
+ ulint key_len;
+ const uchar* src_start;
+ int error=0;
+ enum_field_types real_type;
+
+ key_len = key_part->length;
+
+ if (part_is_null) {
+ true_len = key_len;
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ buff += true_len;
+ buff_space -= true_len;
+
+ continue;
+ }
+
+ src_start = record + key_part->offset;
+ real_type = field->real_type();
+ true_len = key_len;
+
+ /* Character set for the field is defined only
+ to fields whose type is string and real field
+ type is not enum or set. For these fields check
+ if character set is multi byte. */
+
+ if (real_type != MYSQL_TYPE_ENUM
+ && real_type != MYSQL_TYPE_SET
+ && ( mysql_type == MYSQL_TYPE_VAR_STRING
+ || mysql_type == MYSQL_TYPE_STRING)) {
+
+ cs = field->charset();
+
+ /* For multi byte character sets we need to
+ calculate the true length of the key */
+
+ if (key_len > 0 && cs->mbmaxlen > 1) {
+
+ true_len = (ulint)
+ cs->cset->well_formed_len(cs,
+ (const char *)src_start,
+ (const char *)src_start
+ + key_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+ memcpy(sorted, src_start, true_len);
+ true_len = wsrep_innobase_mysql_sort(
+ mysql_type, cs->number, sorted, true_len,
+ REC_VERSION_56_MAX_INDEX_COL_LEN);
+
+ if (true_len > buff_space) {
+ fprintf (stderr,
+ "WSREP: key truncated: %s\n",
+ wsrep_thd_query(thd));
+ true_len = buff_space;
+ }
+ memcpy(buff, sorted, true_len);
+ } else {
+ memcpy(buff, src_start, true_len);
+ }
+ buff += true_len;
+ buff_space -= true_len;
+ }
+ }
+
+ ut_a(buff <= buff_start + buff_len);
+
+ DBUG_RETURN((uint)(buff - buff_start));
+}
+#endif /* WITH_WSREP */
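
wsrep_store_key_val_for_row() above repeats the same pattern for every key part: check the remaining buffer space, truncate the value (and warn) rather than overflow, then advance the write position. A minimal sketch of that bounded-append pattern, with hypothetical names that are not part of the server code:

    // Sketch of the per-key-part bounded append: truncate instead of overflow.
    #include <algorithm>
    #include <cstdio>
    #include <cstring>

    // Copies up to *space bytes of src into *dst, advancing both cursors.
    // Returns false when the value had to be truncated.
    static bool append_bounded(char** dst, unsigned* space,
                               const char* src, unsigned len)
    {
        bool fits = (len <= *space);
        unsigned n = std::min(len, *space);
        if (!fits) {
            std::fprintf(stderr, "key truncated\n");
        }
        std::memcpy(*dst, src, n);
        *dst += n;
        *space -= n;
        return fits;
    }

    int main()
    {
        char buf[8];
        char* pos = buf;
        unsigned space = sizeof(buf);
        append_bounded(&pos, &space, "abcdef", 6);
        append_bounded(&pos, &space, "ghij", 4);   // truncated to 2 bytes
        std::printf("used %zu bytes\n", (size_t)(pos - buf));
    }
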
+UNIV_INTERN
+uint
ha_innobase::store_key_val_for_row(
/*===============================*/
uint keynr, /*!< in: key number */
@@ -6414,11 +7185,20 @@ build_template_field(
templ->col_no = i;
templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+ templ->rec_field_is_prefix = FALSE;
if (dict_index_is_clust(index)) {
templ->rec_field_no = templ->clust_rec_field_no;
+ templ->rec_prefix_field_no = ULINT_UNDEFINED;
} else {
- templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
+ /* If we're in a secondary index, keep track
+ * of the original index position even if this
+ * is just a prefix index; we will use this
+ * later to avoid a clustered index lookup in
+ * some cases. */
+
+ templ->rec_field_no = dict_index_get_nth_col_pos(index, i,
+ &templ->rec_prefix_field_no);
}
if (field->real_maybe_null()) {
@@ -6449,6 +7229,13 @@ build_template_field(
if (!dict_index_is_clust(index)
&& templ->rec_field_no == ULINT_UNDEFINED) {
prebuilt->need_to_access_clustered = TRUE;
+
+ if (templ->rec_prefix_field_no != ULINT_UNDEFINED) {
+ dict_field_t* field = dict_index_get_nth_field(
+ index,
+ templ->rec_prefix_field_no);
+ templ->rec_field_is_prefix = (field->prefix_len != 0);
+ }
}
if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
@@ -6610,7 +7397,8 @@ ha_innobase::build_template(
} else {
templ->icp_rec_field_no
= dict_index_get_nth_col_pos(
- prebuilt->index, i);
+ prebuilt->index, i,
+ NULL);
}
if (dict_index_is_clust(prebuilt->index)) {
@@ -6640,7 +7428,7 @@ ha_innobase::build_template(
templ->icp_rec_field_no
= dict_index_get_nth_col_or_prefix_pos(
- prebuilt->index, i, TRUE);
+ prebuilt->index, i, TRUE, NULL);
ut_ad(templ->icp_rec_field_no
!= ULINT_UNDEFINED);
@@ -6878,6 +7666,9 @@ ha_innobase::write_row(
dberr_t error;
int error_result= 0;
ibool auto_inc_used= FALSE;
+#ifdef WITH_WSREP
+ ibool auto_inc_inserted= FALSE; /* if NULL was inserted */
+#endif
ulint sql_command;
trx_t* trx = thd_to_trx(user_thd);
@@ -6911,8 +7702,20 @@ ha_innobase::write_row(
if ((sql_command == SQLCOM_ALTER_TABLE
|| sql_command == SQLCOM_OPTIMIZE
|| sql_command == SQLCOM_CREATE_INDEX
+#ifdef WITH_WSREP
+ || (wsrep_on(user_thd) && wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(
+ user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+#endif /* WITH_WSREP */
|| sql_command == SQLCOM_DROP_INDEX)
&& num_write_row >= 10000) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) {
+ WSREP_DEBUG("forced trx split for LOAD: %s",
+ wsrep_thd_query(user_thd));
+ }
+#endif /* WITH_WSREP */
/* ALTER TABLE is COMMITted at every 10000 copied rows.
The IX table lock for the original table has to be re-issued.
As this method will be called on a temporary table where the
@@ -6946,6 +7749,23 @@ no_commit:
*/
;
} else if (src_table == prebuilt->table) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd)) {
+ switch (wsrep_run_wsrep_commit(user_thd, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
+
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
/* Source table is not in InnoDB format:
no need to re-acquire locks on it. */
@@ -6956,6 +7776,23 @@ no_commit:
/* We will need an IX lock on the destination table. */
prebuilt->sql_stat_start = TRUE;
} else {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd)) {
+ switch (wsrep_run_wsrep_commit(user_thd, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
+
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
/* Ensure that there are no other table locks than
LOCK_IX and LOCK_AUTO_INC on the destination table. */
@@ -6985,6 +7822,10 @@ no_commit:
innobase_get_auto_increment(). */
prebuilt->autoinc_error = DB_SUCCESS;
+#ifdef WITH_WSREP
+ auto_inc_inserted= (table->next_number_field->val_int() == 0);
+#endif
+
if ((error_result = update_auto_increment())) {
/* We don't want to mask autoinc overflow errors. */
@@ -7063,6 +7904,40 @@ no_commit:
case SQLCOM_REPLACE_SELECT:
goto set_max_autoinc;
+#ifdef WITH_WSREP
+ /* workaround for LP bug #355000, retrying the insert */
+ case SQLCOM_INSERT:
+
+ WSREP_DEBUG("DUPKEY error for autoinc\n"
+ "THD %ld, value %llu, off %llu inc %llu",
+ thd_get_thread_id(current_thd),
+ auto_inc,
+ prebuilt->autoinc_offset,
+ prebuilt->autoinc_increment);
+
+ if (wsrep_on(current_thd) &&
+ auto_inc_inserted &&
+ wsrep_drupal_282555_workaround &&
+ wsrep_thd_retry_counter(current_thd) == 0 &&
+ !thd_test_options(current_thd,
+ OPTION_NOT_AUTOCOMMIT |
+ OPTION_BEGIN)) {
+ WSREP_DEBUG(
+ "retrying insert: %s",
+ (*wsrep_thd_query(current_thd)) ?
+ wsrep_thd_query(current_thd) :
+ (char *)"void");
+ error= DB_SUCCESS;
+ wsrep_thd_set_conflict_state(
+ current_thd, MUST_ABORT);
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+ /* jump straight to func exit over
+ * later wsrep hooks */
+ goto func_exit;
+ }
+ break;
+#endif /* WITH_WSREP */
+
default:
break;
}
@@ -7122,6 +7997,21 @@ report_error:
prebuilt->table->flags,
user_thd);
+#ifdef WITH_WSREP
+ if (!error_result && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
+ wsrep_on(user_thd) && !wsrep_consistency_check(user_thd) &&
+ (sql_command != SQLCOM_LOAD ||
+ thd_binlog_format(user_thd) == BINLOG_FORMAT_ROW)) {
+
+ if (wsrep_append_keys(user_thd, false, record, NULL)) {
+ DBUG_PRINT("wsrep", ("row key failed"));
+ error_result = HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
+
if (error_result == HA_FTS_INVALID_DOCID) {
my_error(HA_FTS_INVALID_DOCID, MYF(0));
}
@@ -7409,6 +8299,88 @@ calc_row_difference(
return(DB_SUCCESS);
}
+#ifdef WITH_WSREP
+static
+int
+wsrep_calc_row_hash(
+/*================*/
+ byte* digest, /*!< in/out: md5 sum */
+ const uchar* row, /*!< in: row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
+{
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint len;
+ const byte* ptr;
+ ulint col_type;
+ uint i;
+
+ void *ctx = alloca(my_md5_context_size());
+ my_md5_init(ctx);
+
+ n_fields = table->s->fields;
+
+ for (i = 0; i < n_fields; i++) {
+ byte null_byte=0;
+ byte true_byte=1;
+
+ field = table->field[i];
+
+ ptr = (const byte*) row + get_field_offset(table, field);
+ len = field->pack_length();
+
+ field_mysql_type = field->type();
+
+ col_type = prebuilt->table->cols[i].mtype;
+
+ switch (col_type) {
+
+ case DATA_BLOB:
+ ptr = row_mysql_read_blob_ref(&len, ptr, len);
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
+
+ ptr = row_mysql_read_true_varchar(
+ &len, ptr,
+ (ulint)
+ (((Field_varstring*)field)->length_bytes));
+
+ }
+
+ break;
+ default:
+ ;
+ }
+ /*
+ if (field->null_ptr &&
+ field_in_record_is_null(table, field, (char*) row)) {
+ */
+
+ if (field->is_null_in_record(row)) {
+ my_md5_input(ctx, &null_byte, 1);
+ } else {
+ my_md5_input(ctx, &true_byte, 1);
+ my_md5_input(ctx, ptr, len);
+ }
+ }
+
+ my_md5_result(ctx, digest);
+
+ return(0);
+}
+#endif /* WITH_WSREP */
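
When a table has no usable key, the row itself is digested field by field (a NULL field contributes only a marker byte) and the digest is later appended as the certification key. The following standalone sketch shows that per-field hashing scheme; it uses FNV-1a instead of the server's MD5 service purely so the example compiles on its own, which is an assumption of this illustration:

    // Sketch: digest a row field by field; NULLs contribute only a marker byte.
    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <string>
    #include <vector>

    struct Fnv1a {
        uint64_t h = 1469598103934665603ULL;
        void input(const void* p, size_t n) {
            const unsigned char* b = static_cast<const unsigned char*>(p);
            for (size_t i = 0; i < n; i++) {
                h ^= b[i];
                h *= 1099511628211ULL;
            }
        }
    };

    static uint64_t row_digest(const std::vector<std::optional<std::string>>& row)
    {
        Fnv1a ctx;
        for (const auto& field : row) {
            unsigned char marker = field ? 1 : 0;
            ctx.input(&marker, 1);               // NULL / not-NULL marker
            if (field) {
                ctx.input(field->data(), field->size());
            }
        }
        return ctx.h;
    }

    int main()
    {
        std::vector<std::optional<std::string>> row = {"42", std::nullopt, "abc"};
        std::printf("digest: %016llx\n",
                    (unsigned long long) row_digest(row));
    }
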
/**********************************************************************//**
Updates a row given as a parameter to a new value. Note that we are given
whole rows, not just the fields which are updated: this incurs some
@@ -7546,6 +8518,24 @@ func_exit:
innobase_active_small();
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS &&
+ wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
+ wsrep_on(user_thd)) {
+
+ DBUG_PRINT("wsrep", ("update row key"));
+
+ if (wsrep_append_keys(user_thd, false, old_row, new_row)) {
+ WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED");
+ DBUG_PRINT("wsrep", ("row key failed"));
+ err = HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
+
DBUG_RETURN(err);
}
@@ -7593,6 +8583,19 @@ ha_innobase::delete_row(
innobase_active_small();
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS &&
+ wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
+ wsrep_on(user_thd)) {
+
+ if (wsrep_append_keys(user_thd, false, record, NULL)) {
+ DBUG_PRINT("wsrep", ("delete fail"));
+ error = (dberr_t)HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif
DBUG_RETURN(convert_error_code_to_mysql(
error, prebuilt->table->flags, user_thd));
}
@@ -8794,6 +9797,393 @@ ha_innobase::ft_end()
rnd_end();
}
+#ifdef WITH_WSREP
+extern dict_index_t*
+wsrep_dict_foreign_find_index(
+ dict_table_t* table,
+ const char** col_names,
+ const char** columns,
+ ulint n_cols,
+ dict_index_t* types_idx,
+ ibool check_charsets,
+ ulint check_null);
+
+
+extern dberr_t
+wsrep_append_foreign_key(
+/*===========================*/
+ trx_t* trx, /*!< in: trx */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!<in: clustered index record */
+ dict_index_t* index, /*!<in: clustered index */
+ ibool referenced, /*!<in: is check for referenced table */
+ ibool shared) /*!<in: is shared access */
+{
+ ut_a(trx);
+ THD* thd = (THD*)trx->mysql_thd;
+ ulint rcode = DB_SUCCESS;
+ char cache_key[513] = {'\0'};
+ int cache_key_len;
+ bool const copy = true;
+
+ if (!wsrep_on(trx->mysql_thd) ||
+ wsrep_thd_exec_mode(thd) != LOCAL_STATE)
+ return DB_SUCCESS;
+
+ if (!thd || !foreign ||
+ (!foreign->referenced_table && !foreign->foreign_table))
+ {
+ WSREP_INFO("FK: %s missing in: %s",
+ (!thd) ? "thread" :
+ ((!foreign) ? "constraint" :
+ ((!foreign->referenced_table) ?
+ "referenced table" : "foreign table")),
+ (thd && wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ return DB_ERROR;
+ }
+
+ if ( !((referenced) ?
+ foreign->referenced_table : foreign->foreign_table))
+ {
+ WSREP_DEBUG("pulling %s table into cache",
+ (referenced) ? "referenced" : "foreign");
+ mutex_enter(&(dict_sys->mutex));
+ if (referenced)
+ {
+ foreign->referenced_table =
+ dict_table_get_low(
+ foreign->referenced_table_name_lookup);
+ if (foreign->referenced_table)
+ {
+ foreign->referenced_index =
+ wsrep_dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ foreign->referenced_col_names,
+ foreign->n_fields,
+ foreign->foreign_index,
+ TRUE, FALSE);
+ }
+ }
+ else
+ {
+ foreign->foreign_table =
+ dict_table_get_low(
+ foreign->foreign_table_name_lookup);
+ if (foreign->foreign_table)
+ {
+ foreign->foreign_index =
+ wsrep_dict_foreign_find_index(
+ foreign->foreign_table, NULL,
+ foreign->foreign_col_names,
+ foreign->n_fields,
+ foreign->referenced_index,
+ TRUE, FALSE);
+ }
+ }
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if ( !((referenced) ?
+ foreign->referenced_table : foreign->foreign_table))
+ {
+ WSREP_WARN("FK: %s missing in query: %s",
+ (!foreign->referenced_table) ?
+ "referenced table" : "foreign table",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ return DB_ERROR;
+ }
+ byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH;
+
+ dict_index_t *idx_target = (referenced) ?
+ foreign->referenced_index : index;
+ dict_index_t *idx = (referenced) ?
+ UT_LIST_GET_FIRST(foreign->referenced_table->indexes) :
+ UT_LIST_GET_FIRST(foreign->foreign_table->indexes);
+ int i = 0;
+ while (idx != NULL && idx != idx_target) {
+ if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) {
+ i++;
+ }
+ idx = UT_LIST_GET_NEXT(indexes, idx);
+ }
+ ut_a(idx);
+ key[0] = (char)i;
+
+ rcode = wsrep_rec_get_foreign_key(
+ &key[1], &len, rec, index, idx,
+ wsrep_protocol_version > 1);
+ if (rcode != DB_SUCCESS) {
+ WSREP_ERROR(
+ "FK key set failed: %lu (%lu %lu), index: %s %s, %s",
+ rcode, referenced, shared,
+ (index && index->name) ? index->name :
+ "void index",
+ (index && index->table_name) ? index->table_name :
+ "void table",
+ wsrep_thd_query(thd));
+ return DB_ERROR;
+ }
+ strncpy(cache_key,
+ (wsrep_protocol_version > 1) ?
+ ((referenced) ?
+ foreign->referenced_table->name :
+ foreign->foreign_table->name) :
+ foreign->foreign_table->name, sizeof(cache_key) - 1);
+ cache_key_len = strlen(cache_key);
+#ifdef WSREP_DEBUG_PRINT
+ ulint j;
+ fprintf(stderr, "FK parent key, table: %s %s len: %lu ",
+ cache_key, (shared) ? "shared" : "exclusive", len+1);
+ for (j=0; j<len+1; j++) {
+ fprintf(stderr, " %hhX, ", key[j]);
+ }
+ fprintf(stderr, "\n");
+#endif
+ char *p = strchr(cache_key, '/');
+ if (p) {
+ *p = '\0';
+ } else {
+ WSREP_WARN("unexpected foreign key table %s %s",
+ foreign->referenced_table->name,
+ foreign->foreign_table->name);
+ }
+
+ wsrep_buf_t wkey_part[3];
+ wsrep_key_t wkey = {wkey_part, 3};
+ if (!wsrep_prepare_key(
+ (const uchar*)cache_key,
+ cache_key_len + 1,
+ (const uchar*)key, len+1,
+ wkey_part,
+ (size_t*)&wkey.key_parts_num)) {
+ WSREP_WARN("key prepare failed for cascaded FK: %s",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ return DB_ERROR;
+ }
+ wsrep_t *wsrep= get_wsrep();
+ rcode = (int)wsrep->append_key(
+ wsrep,
+ wsrep_ws_handle(thd, trx),
+ &wkey,
+ 1,
+ shared ? WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE,
+ copy);
+ if (rcode) {
+ DBUG_PRINT("wsrep", ("row key failed: %lu", rcode));
+ WSREP_ERROR("Appending cascaded fk row key failed: %s, %lu",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void", rcode);
+ return DB_ERROR;
+ }
+
+ return DB_SUCCESS;
+}
+
+static int
+wsrep_append_key(
+/*==================*/
+ THD *thd,
+ trx_t *trx,
+ TABLE_SHARE *table_share,
+ TABLE *table,
+ const char* key,
+ uint16_t key_len,
+ bool shared
+)
+{
+ DBUG_ENTER("wsrep_append_key");
+ bool const copy = true;
+#ifdef WSREP_DEBUG_PRINT
+ fprintf(stderr, "%s conn %ld, trx %llu, keylen %d, table %s\n Query: %s ",
+ (shared) ? "Shared" : "Exclusive",
+ thd_get_thread_id(thd), (long long)trx->id, key_len,
+ table_share->table_name.str, wsrep_thd_query(thd));
+ for (int i=0; i<key_len; i++) {
+ fprintf(stderr, "%hhX, ", key[i]);
+ }
+ fprintf(stderr, "\n");
+#endif
+ wsrep_buf_t wkey_part[3];
+ wsrep_key_t wkey = {wkey_part, 3};
+ if (!wsrep_prepare_key(
+ (const uchar*)table_share->table_cache_key.str,
+ table_share->table_cache_key.length,
+ (const uchar*)key, key_len,
+ wkey_part,
+ (size_t*)&wkey.key_parts_num)) {
+ WSREP_WARN("key prepare failed for: %s",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+
+ wsrep_t *wsrep= get_wsrep();
+ int rcode = (int)wsrep->append_key(
+ wsrep,
+ wsrep_ws_handle(thd, trx),
+ &wkey,
+ 1,
+ shared ? WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE,
+ copy);
+ if (rcode) {
+ DBUG_PRINT("wsrep", ("row key failed: %d", rcode));
+ WSREP_WARN("Appending row key failed: %s, %d",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void", rcode);
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ }
+ DBUG_RETURN(0);
+}
+
+int
+ha_innobase::wsrep_append_keys(
+/*==================*/
+ THD *thd,
+ bool shared,
+ const uchar* record0, /* in: row in MySQL format */
+ const uchar* record1) /* in: row in MySQL format */
+{
+ int rcode;
+ DBUG_ENTER("wsrep_append_keys");
+
+ bool key_appended = false;
+ trx_t *trx = thd_to_trx(thd);
+
+ if (table_share && table_share->tmp_table != NO_TMP_TABLE) {
+ WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s",
+ thd_get_thread_id(thd),
+ table_share->tmp_table,
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void");
+ DBUG_RETURN(0);
+ }
+
+ if (wsrep_protocol_version == 0) {
+ uint len;
+ char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ char *key = &keyval[0];
+ ibool is_null;
+
+ len = wsrep_store_key_val_for_row(
+ thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record0, &is_null);
+
+ if (!is_null) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share, table, keyval,
+ len, shared);
+ if (rcode) DBUG_RETURN(rcode);
+ }
+ else
+ {
+ WSREP_DEBUG("NULL key skipped (proto 0): %s",
+ wsrep_thd_query(thd));
+ }
+ } else {
+ ut_a(table->s->keys <= 256);
+ uint i;
+ bool hasPK= false;
+
+ for (i=0; i<table->s->keys; ++i) {
+ KEY* key_info = table->key_info + i;
+ if (key_info->flags & HA_NOSAME) {
+ hasPK = true;
+ }
+ }
+
+ for (i=0; i<table->s->keys; ++i) {
+ uint len;
+ char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'};
+ char* key0 = &keyval0[1];
+ char* key1 = &keyval1[1];
+ KEY* key_info = table->key_info + i;
+ ibool is_null;
+
+ dict_index_t* idx = innobase_get_index(i);
+ dict_table_t* tab = (idx) ? idx->table : NULL;
+
+ keyval0[0] = (char)i;
+ keyval1[0] = (char)i;
+
+ if (!tab) {
+ WSREP_WARN("MySQL-InnoDB key mismatch %s %s",
+ table->s->table_name.str,
+ key_info->name);
+ }
+ /* !hasPK == table with no PK, must append all non-unique keys */
+ if (!hasPK || key_info->flags & HA_NOSAME ||
+ ((tab &&
+ dict_table_get_referenced_constraint(tab, idx)) ||
+ (!tab && referenced_by_foreign_key()))) {
+
+ len = wsrep_store_key_val_for_row(
+ thd, table, i, key0,
+ WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record0, &is_null);
+ if (!is_null) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share, table,
+ keyval0, len+1, shared);
+ if (rcode) DBUG_RETURN(rcode);
+
+ if (key_info->flags & HA_NOSAME || shared)
+ key_appended = true;
+ }
+ else
+ {
+ WSREP_DEBUG("NULL key skipped: %s",
+ wsrep_thd_query(thd));
+ }
+ if (record1) {
+ len = wsrep_store_key_val_for_row(
+ thd, table, i, key1,
+ WSREP_MAX_SUPPORTED_KEY_LENGTH,
+ record1, &is_null);
+ if (!is_null && memcmp(key0, key1, len)) {
+ rcode = wsrep_append_key(
+ thd, trx, table_share,
+ table,
+ keyval1, len+1, shared);
+ if (rcode) DBUG_RETURN(rcode);
+ }
+ }
+ }
+ }
+ }
+
+ /* if no PK, calculate hash of full row, to be the key value */
+ if (!key_appended && wsrep_certify_nonPK) {
+ uchar digest[16];
+ int rcode;
+
+ wsrep_calc_row_hash(digest, record0, table, prebuilt, thd);
+ if ((rcode = wsrep_append_key(thd, trx, table_share, table,
+ (const char*) digest, 16,
+ shared))) {
+ DBUG_RETURN(rcode);
+ }
+
+ if (record1) {
+ wsrep_calc_row_hash(
+ digest, record1, table, prebuilt, thd);
+ if ((rcode = wsrep_append_key(thd, trx, table_share,
+ table,
+ (const char*) digest,
+ 16, shared))) {
+ DBUG_RETURN(rcode);
+ }
+ }
+ DBUG_RETURN(0);
+ }
+
+ DBUG_RETURN(0);
+}
+#endif /* WITH_WSREP */
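
For protocol versions above 0, each appended key is prefixed with one byte holding the index ordinal, so identical column values in different indexes do not collide; on UPDATE both the old and the new key are appended when they differ. A tiny sketch of that packing, with illustrative names only:

    // Sketch: pack (index ordinal, key bytes) the way appended keys are laid out.
    #include <cstdio>
    #include <string>

    static std::string pack_key(unsigned char index_no, const std::string& key_val)
    {
        std::string out;
        out.push_back(static_cast<char>(index_no)); // byte 0: index ordinal
        out.append(key_val);                        // normalized key value
        return out;
    }

    int main()
    {
        std::string old_key = pack_key(1, "alice");
        std::string new_key = pack_key(1, "bob");
        if (old_key != new_key) {
            // An UPDATE that changes the key appends both values.
            std::printf("append %zu + %zu bytes\n",
                        old_key.size(), new_key.size());
        }
    }
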
/*********************************************************************//**
Stores a reference to the current row to 'ref' field of the handle. Note
@@ -9668,11 +11058,16 @@ innobase_table_flags(
enum row_type row_format;
rec_format_t innodb_row_format = REC_FORMAT_COMPACT;
bool use_data_dir;
+ ha_table_option_struct *options= form->s->option_struct;
/* Cache the value of innodb_file_format, in case it is
modified by another thread while the table is being created. */
const ulint file_format_allowed = srv_file_format;
+ /* Cache the value of innobase_compression_level, in case it is
+ modified by another thread while the table is being created. */
+ const ulint default_compression_level = page_zip_level;
+
*flags = 0;
*flags2 = 0;
@@ -9726,6 +11121,8 @@ index_bad:
}
}
+ row_format = form->s->row_type;
+
if (create_info->key_block_size) {
/* The requested compressed page size (key_block_size)
is given in kilobytes. If it is a valid number, store
@@ -9735,7 +11132,7 @@ index_bad:
ulint kbsize; /* Key Block Size */
for (zssize = kbsize = 1;
zssize <= ut_min(UNIV_PAGE_SSIZE_MAX,
- PAGE_ZIP_SSIZE_MAX);
+ PAGE_ZIP_SSIZE_MAX);
zssize++, kbsize <<= 1) {
if (kbsize == create_info->key_block_size) {
zip_ssize = zssize;
@@ -9763,8 +11160,8 @@ index_bad:
}
if (!zip_allowed
- || zssize > ut_min(UNIV_PAGE_SSIZE_MAX,
- PAGE_ZIP_SSIZE_MAX)) {
+ || zssize > ut_min(UNIV_PAGE_SSIZE_MAX,
+ PAGE_ZIP_SSIZE_MAX)) {
push_warning_printf(
thd, Sql_condition::WARN_LEVEL_WARN,
ER_ILLEGAL_HA_CREATE_OPTION,
@@ -9773,8 +11170,6 @@ index_bad:
}
}
- row_format = form->s->row_type;
-
if (zip_ssize && zip_allowed) {
/* if ROW_FORMAT is set to default,
automatically change it to COMPRESSED.*/
@@ -9811,7 +11206,6 @@ index_bad:
case ROW_TYPE_REDUNDANT:
innodb_row_format = REC_FORMAT_REDUNDANT;
break;
-
case ROW_TYPE_COMPRESSED:
case ROW_TYPE_DYNAMIC:
if (!use_tablespace) {
@@ -9829,10 +11223,18 @@ index_bad:
" innodb_file_format > Antelope.",
get_row_format_name(row_format));
} else {
- innodb_row_format = (row_format == ROW_TYPE_DYNAMIC
- ? REC_FORMAT_DYNAMIC
- : REC_FORMAT_COMPRESSED);
- break;
+ switch(row_format) {
+ case ROW_TYPE_COMPRESSED:
+ innodb_row_format = REC_FORMAT_COMPRESSED;
+ break;
+ case ROW_TYPE_DYNAMIC:
+ innodb_row_format = REC_FORMAT_DYNAMIC;
+ break;
+ default:
+ /* Not possible, avoid compiler warning */
+ break;
+ }
+ break; /* Correct row_format */
}
zip_allowed = FALSE;
/* fall through to set row_format = COMPACT */
@@ -9859,7 +11261,15 @@ index_bad:
&& ((create_info->data_file_name != NULL)
&& !(create_info->options & HA_LEX_CREATE_TMP_TABLE));
- dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir);
+ /* Set up table dictionary flags */
+ dict_tf_set(flags,
+ innodb_row_format,
+ zip_ssize,
+ use_data_dir,
+ options->page_compressed,
+ options->page_compression_level == 0 ?
+ default_compression_level : options->page_compression_level,
+ options->atomic_writes);
if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
*flags2 |= DICT_TF2_TEMPORARY;
@@ -9877,6 +11287,144 @@ index_bad:
DBUG_RETURN(true);
}
+
+/*****************************************************************//**
+Check engine specific table options not handled by SQL-parser.
+@return NULL if valid, string if not */
+UNIV_INTERN
+const char*
+ha_innobase::check_table_options(
+ THD *thd, /*!< in: thread handle */
+ TABLE* table, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info, /*!< in: more information of the
+ created table, contains also the
+ create statement string */
+ const bool use_tablespace, /*!< in: use file par table */
+ const ulint file_format)
+{
+ enum row_type row_format = table->s->row_type;
+ ha_table_option_struct *options= table->s->option_struct;
+ atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes;
+ fil_encryption_t encrypt = (fil_encryption_t)options->encryption;
+
+ if (encrypt != FIL_SPACE_ENCRYPTION_DEFAULT && !use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTED requires innodb_file_per_table");
+ return "ENCRYPTED";
+ }
+
+ if (encrypt == FIL_SPACE_ENCRYPTION_OFF && srv_encrypt_tables == 2) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTED=OFF cannot be used when innodb_encrypt_tables=FORCE");
+ return "ENCRYPTED";
+ }
+
+ /* Check page compression requirements */
+ if (options->page_compressed) {
+
+ if (row_format == ROW_TYPE_COMPRESSED) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " ROW_TYPE=COMPRESSED");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (row_format == ROW_TYPE_REDUNDANT) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " ROW_TYPE=REDUNDANT");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (!use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED requires"
+ " innodb_file_per_table.");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (file_format < UNIV_FORMAT_B) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED requires"
+ " innodb_file_format > Antelope.");
+ return "PAGE_COMPRESSED";
+ }
+
+ if (create_info->key_block_size) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED table can't have"
+ " key_block_size");
+ return "PAGE_COMPRESSED";
+ }
+ }
+
+ /* Check page compression level requirements, some of them are
+ already checked above */
+ if (options->page_compression_level != 0) {
+ if (options->page_compressed == false) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSION_LEVEL requires"
+ " PAGE_COMPRESSED");
+ return "PAGE_COMPRESSION_LEVEL";
+ }
+
+ if (options->page_compression_level < 1 || options->page_compression_level > 9) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu."
+ " Valid values are [1, 2, 3, 4, 5, 6, 7, 8, 9]",
+ options->page_compression_level);
+ return "PAGE_COMPRESSION_LEVEL";
+ }
+ }
+
+ if (encrypt == FIL_SPACE_ENCRYPTION_ON ||
+ (encrypt == FIL_SPACE_ENCRYPTION_DEFAULT && srv_encrypt_tables)) {
+ if (!encryption_key_id_exists(options->encryption_key_id)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ENCRYPTION_KEY_ID %lu not available",
+ options->encryption_key_id
+ );
+ return "ENCRYPTION_KEY_ID";
+ }
+ }
+
+ /* Check atomic writes requirements */
+ if (awrites == ATOMIC_WRITES_ON ||
+ (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) {
+ if (!use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_WRONG_CREATE_OPTION,
+ "InnoDB: ATOMIC_WRITES requires"
+ " innodb_file_per_table.");
+ return "ATOMIC_WRITES";
+ }
+ }
+
+ return 0;
+}
+
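check_table_options() below validates combinations of engine-defined options and, on failure, pushes a warning and returns the name of the offending option so the caller can raise HA_WRONG_CREATE_OPTION. A compact sketch of that validate-and-name-the-culprit pattern, with hypothetical option fields:

    // Sketch: validate interdependent options; return the offending option's name.
    #include <cstdio>

    struct TableOptions {
        bool page_compressed = false;
        unsigned page_compression_level = 0;   // 0 = unset
    };

    static const char* check_options(const TableOptions& o)
    {
        if (o.page_compression_level != 0 && !o.page_compressed) {
            return "PAGE_COMPRESSION_LEVEL";   // requires PAGE_COMPRESSED
        }
        if (o.page_compression_level > 9) {
            return "PAGE_COMPRESSION_LEVEL";   // valid range is 1..9
        }
        return nullptr;                        // all options consistent
    }

    int main()
    {
        TableOptions opts;
        opts.page_compression_level = 5;       // but PAGE_COMPRESSED not set
        if (const char* bad = check_options(opts)) {
            std::printf("wrong create option: %s\n", bad);
            return 1;
        }
    }
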
/*****************************************************************//**
Creates a new table to an InnoDB database.
@return error number */
@@ -9908,6 +11456,7 @@ ha_innobase::create(
while creating the table. So we read the current value here
and make all further decisions based on this. */
bool use_tablespace = srv_file_per_table;
+ const ulint file_format = srv_file_format;
/* Zip Shift Size - log2 - 9 of compressed page size,
zero for uncompressed */
@@ -9917,6 +11466,10 @@ ha_innobase::create(
const char* stmt;
size_t stmt_len;
+ /* Cache table options */
+ ha_table_option_struct *options= form->s->option_struct;
+ fil_encryption_t encrypt = (fil_encryption_t)options->encryption;
+ ulint key_id = options->encryption_key_id;
DBUG_ENTER("ha_innobase::create");
@@ -9931,6 +11484,12 @@ ha_innobase::create(
/* Create the table definition in InnoDB */
+ /* Validate table options not handled by the SQL-parser */
+ if(check_table_options(thd, form, create_info, use_tablespace,
+ file_format)) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
/* Validate create options if innodb_strict_mode is set. */
if (create_options_are_invalid(
thd, form, create_info, use_tablespace)) {
@@ -10163,6 +11722,48 @@ ha_innobase::create(
DBUG_ASSERT(innobase_table != 0);
+ /* If user has requested that table should be encrypted or table
+ should remain as unencrypted store crypt data */
+ if (encrypt != FIL_SPACE_ENCRYPTION_DEFAULT) {
+ ulint maxsize=0;
+ ulint zip_size = fil_space_get_zip_size(innobase_table->space);
+ fil_space_crypt_t* old_crypt_data = fil_space_get_crypt_data(innobase_table->space);
+ fil_space_crypt_t* crypt_data;
+
+ crypt_data = fil_space_create_crypt_data(encrypt, key_id);
+ crypt_data->page0_offset = fsp_header_get_crypt_offset(zip_size, &maxsize);
+ crypt_data->encryption = encrypt;
+
+ /* If there is old crypt data, copy IV */
+ if (old_crypt_data) {
+ memcpy(crypt_data->iv, old_crypt_data->iv, sizeof(crypt_data->iv));
+ }
+
+ mtr_t mtr;
+ mtr_start(&mtr);
+ /* Get page 0*/
+ ulint offset = 0;
+ buf_block_t* block = buf_page_get_gen(innobase_table->space,
+ zip_size,
+ offset,
+ RW_X_LATCH,
+ NULL,
+ BUF_GET,
+ __FILE__, __LINE__,
+ &mtr);
+
+ /* Set up new crypt data */
+ crypt_data = fil_space_set_crypt_data(innobase_table->space, crypt_data);
+
+ /* Compute location to store crypt data */
+ byte* frame = buf_block_get_frame(block);
+
+ /* Write crypt data to page 0 */
+ fil_space_write_crypt_data(innobase_table->space, frame, crypt_data->page0_offset, maxsize, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
innobase_copy_frm_flags_from_create_info(innobase_table, create_info);
dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE);
@@ -10501,6 +12102,71 @@ ha_innobase::delete_table(
DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
}
+/*****************************************************************//**
+Defragment table.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::defragment_table(
+/*==========================*/
+ const char* name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ bool async) /*!< in: whether to wait until finish */
+{
+ char norm_name[FN_REFLEN];
+ dict_table_t* table;
+ dict_index_t* index;
+ ibool one_index = (index_name != 0);
+ int ret = 0;
+ if (!srv_defragment) {
+ return ER_FEATURE_DISABLED;
+ }
+ normalize_table_name(norm_name, name);
+ table = dict_table_open_on_name(norm_name, FALSE,
+ FALSE, DICT_ERR_IGNORE_NONE);
+ for (index = dict_table_get_first_index(table); index;
+ index = dict_table_get_next_index(index)) {
+ if (one_index && strcasecmp(index_name, index->name) != 0)
+ continue;
+ if (btr_defragment_find_index(index)) {
+ // We borrow this error code. When the same index is
+ // already in the defragmentation queue, issuing another
+ // defragmentation only introduces overhead. We return
+ // an error here to let the user know this is not
+ // necessary. Note that this will fail a query that's
+ // trying to defragment a full table if one of the
+ // indices in that table is already being defragmented.
+ // We choose this behavior so the user is aware of this
+ // rather than silently defragmenting the other indices
+ // of that table.
+ ret = ER_SP_ALREADY_EXISTS;
+ break;
+ }
+ os_event_t event = btr_defragment_add_index(index, async);
+ if (!async && event) {
+ while(os_event_wait_time(event, 1000000)) {
+ if (thd_killed(current_thd)) {
+ btr_defragment_remove_index(index);
+ ret = ER_QUERY_INTERRUPTED;
+ break;
+ }
+ }
+ os_event_free(event);
+ }
+ if (ret) {
+ break;
+ }
+ if (one_index) {
+ one_index = FALSE;
+ break;
+ }
+ }
+ dict_table_close(table, FALSE, FALSE);
+ if (ret == 0 && one_index) {
+ ret = ER_NO_SUCH_INDEX;
+ }
+ return ret;
+}
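
defragment_table() above waits synchronously on a per-index event but wakes up once a second to check whether the issuing query has been killed, so OPTIMIZE TABLE stays interruptible. The same wait-with-periodic-check pattern, sketched with standard C++ primitives instead of os_event_t (an assumption of this illustration):

    // Sketch: wait for completion, polling a kill flag once per second.
    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <mutex>
    #include <thread>

    std::mutex m;
    std::condition_variable cv;
    bool done = false;
    std::atomic<bool> killed{false};

    // Returns true if the wait finished normally, false if it was interrupted.
    bool wait_interruptible()
    {
        std::unique_lock<std::mutex> lk(m);
        while (!done) {
            cv.wait_for(lk, std::chrono::seconds(1));
            if (killed.load()) {
                return false;   // caller cleans up, e.g. removes the index
            }
        }
        return true;
    }

    int main()
    {
        std::thread worker([] {
            std::this_thread::sleep_for(std::chrono::seconds(2));
            { std::lock_guard<std::mutex> lk(m); done = true; }
            cv.notify_one();
        });
        bool ok = wait_interruptible();
        worker.join();
        return ok ? 0 : 1;
    }
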
/*****************************************************************//**
Removes all tables in the named database inside InnoDB. */
@@ -11669,6 +13335,27 @@ ha_innobase::optimize(
This works OK otherwise, but MySQL locks the entire table during
calls to OPTIMIZE, which is undesirable. */
+ if (srv_defragment) {
+ int err;
+
+ err = defragment_table(prebuilt->table->name, NULL, false);
+
+ if (err == 0) {
+ return (HA_ADMIN_OK);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ err,
+ "InnoDB: Cannot defragment table %s: returned error code %d\n",
+ prebuilt->table->name, err);
+
+ if(err == ER_SP_ALREADY_EXISTS) {
+ return (HA_ADMIN_OK);
+ } else {
+ return (HA_ADMIN_TRY_ALTER);
+ }
+ }
+ }
+
if (innodb_optimize_fulltext_only) {
if (prebuilt->table->fts && prebuilt->table->fts->cache
&& !dict_table_is_discarded(prebuilt->table)) {
@@ -11768,7 +13455,7 @@ ha_innobase::check(
CHECK TABLE. */
os_increment_counter_by_amount(
server_mutex,
- srv_fatal_semaphore_wait_threshold,
+ srv_fatal_semaphore_wait_threshold,
SRV_SEMAPHORE_WAIT_EXTENSION);
bool valid = btr_validate_index(index, prebuilt->trx);
@@ -11776,7 +13463,7 @@ ha_innobase::check(
CHECK TABLE. */
os_decrement_counter_by_amount(
server_mutex,
- srv_fatal_semaphore_wait_threshold,
+ srv_fatal_semaphore_wait_threshold,
SRV_SEMAPHORE_WAIT_EXTENSION);
if (!valid) {
@@ -12584,11 +14271,18 @@ ha_innobase::external_lock(
/* used by test case */
DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;);
if (!skip) {
+#ifdef WITH_WSREP
+ if (!wsrep_on(thd) || wsrep_thd_exec_mode(thd) == LOCAL_STATE)
+ {
+#endif /* WITH_WSREP */
my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0),
" InnoDB is limited to row-logging when "
"transaction isolation level is "
"READ COMMITTED or READ UNCOMMITTED.");
DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
}
}
@@ -13678,7 +15372,16 @@ ha_innobase::get_auto_increment(
next value in the series. */
if (prebuilt->autoinc_increment > increment) {
+ WSREP_DEBUG("autoinc decrease: %llu -> %llu\n"
+ "THD: %ld, current: %llu, autoinc: %llu",
+ prebuilt->autoinc_increment,
+ increment,
+ thd_get_thread_id(ha_thd()),
+ current, autoinc);
+ if (!wsrep_on(ha_thd()))
+ {
current = autoinc - prebuilt->autoinc_increment;
+ }
current = innobase_next_autoinc(
current, 1, increment, 1, col_max_value);
@@ -14040,6 +15743,9 @@ innobase_xa_prepare(
to the session variable take effect only in the next transaction */
if (!trx->support_xa) {
+#ifdef WITH_WSREP
+ thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
+#endif /* WITH_WSREP */
return(0);
}
@@ -14227,6 +15933,12 @@ ha_innobase::check_if_incompatible_data(
HA_CREATE_INFO* info,
uint table_changes)
{
+ ha_table_option_struct *param_old, *param_new;
+
+ /* Cache engine specific options */
+ param_new = info->option_struct;
+ param_old = table->s->option_struct;
+
innobase_copy_frm_flags_from_create_info(prebuilt->table, info);
if (table_changes != IS_EQUAL_YES) {
@@ -14253,6 +15965,13 @@ ha_innobase::check_if_incompatible_data(
return(COMPATIBLE_DATA_NO);
}
+ /* Changes on engine specific table options requests a rebuild of the table. */
+ if (param_new->page_compressed != param_old->page_compressed ||
+ param_new->page_compression_level != param_old->page_compression_level ||
+ param_new->atomic_writes != param_old->atomic_writes) {
+ return(COMPATIBLE_DATA_NO);
+ }
+
return(COMPATIBLE_DATA_YES);
}
@@ -14392,6 +16111,13 @@ innodb_max_dirty_pages_pct_lwm_update(
srv_max_dirty_pages_pct_lwm = in_val;
}
+UNIV_INTERN
+void
+ha_innobase::set_partition_owner_stats(ha_statistics *stats)
+{
+ ha_partition_stats= stats;
+}
+
/************************************************************//**
Validate the file format name and return its corresponding id.
@return valid file format id */
@@ -15645,6 +17371,23 @@ innodb_reset_all_monitor_update(
TRUE);
}
+static
+void
+innodb_defragment_frequency_update(
+/*===============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ srv_defragment_frequency = (*static_cast<const uint*>(save));
+ srv_defragment_interval = ut_microseconds_to_timer(
+ 1000000.0 / srv_defragment_frequency);
+}
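
The update callback above simply derives a minimum interval between defragmentation rounds from the requested per-second frequency (interval = 1000000 / frequency microseconds). A minimal sketch of the same conversion, using std::chrono rather than the server's timer helpers (an assumption of this sketch):

    // Sketch: turn an operations-per-second setting into a minimum interval.
    #include <chrono>
    #include <cstdio>

    static std::chrono::microseconds interval_from_frequency(unsigned freq_per_sec)
    {
        if (freq_per_sec == 0) {
            freq_per_sec = 1;   // guard; the real variable has a minimum of 1
        }
        // e.g. frequency 40 -> one round at most every 25000 us.
        return std::chrono::microseconds(
            static_cast<long long>(1000000.0 / freq_per_sec));
    }

    int main()
    {
        auto interval = interval_from_frequency(40);
        std::printf("min interval: %lld us\n",
                    static_cast<long long>(interval.count()));
    }
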
+
/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
@@ -16078,6 +17821,7 @@ which control InnoDB "status monitor" output to the error log.
static
void
innodb_status_output_update(
+/*========================*/
THD* thd __attribute__((unused)),
struct st_mysql_sys_var* var __attribute__((unused)),
void* var_ptr __attribute__((unused)),
@@ -16088,6 +17832,74 @@ innodb_status_output_update(
os_event_set(srv_monitor_event);
}
+/******************************************************************
+Update the system variable innodb_encryption_threads */
+static
+void
+innodb_encryption_threads_update(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ fil_crypt_set_thread_cnt(*static_cast<const uint*>(save));
+}
+
+/******************************************************************
+Update the system variable innodb_encryption_rotate_key_age */
+static
+void
+innodb_encryption_rotate_key_age_update(
+/*====================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ fil_crypt_set_rotate_key_age(*static_cast<const uint*>(save));
+}
+
+/******************************************************************
+Update the system variable innodb_encryption_rotation_iops */
+static
+void
+innodb_encryption_rotation_iops_update(
+/*===================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ fil_crypt_set_rotation_iops(*static_cast<const uint*>(save));
+}
+
+/******************************************************************
+Update the system variable innodb_encrypt_tables*/
+static
+void
+innodb_encrypt_tables_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ fil_crypt_set_encrypt_tables(*static_cast<const uint*>(save));
+}
+
static SHOW_VAR innodb_status_variables_export[]= {
{"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
{NullS, NullS, SHOW_LONG}
@@ -16096,6 +17908,290 @@ static SHOW_VAR innodb_status_variables_export[]= {
static struct st_mysql_storage_engine innobase_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+#ifdef WITH_WSREP
+void
+wsrep_abort_slave_trx(wsrep_seqno_t bf_seqno, wsrep_seqno_t victim_seqno)
+{
+ WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be "
+ "caused by:\n\t"
+ "1) unsupported configuration options combination, please check documentation.\n\t"
+ "2) a bug in the code.\n\t"
+ "3) a database corruption.\n Node consistency compromized, "
+ "need to abort. Restart the node to resync with cluster.",
+ (long long)bf_seqno, (long long)victim_seqno);
+ abort();
+}
+/*******************************************************************//**
+This function is used to kill one transaction in BF. */
+
+int
+wsrep_innobase_kill_one_trx(void * const bf_thd_ptr,
+ const trx_t * const bf_trx,
+ trx_t *victim_trx, ibool signal)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(victim_trx));
+ ut_ad(bf_thd_ptr);
+ ut_ad(victim_trx);
+
+ DBUG_ENTER("wsrep_innobase_kill_one_trx");
+ THD *bf_thd = bf_thd_ptr ? (THD*) bf_thd_ptr : NULL;
+ THD *thd = (THD *) victim_trx->mysql_thd;
+ int64_t bf_seqno = (bf_thd) ? wsrep_thd_trx_seqno(bf_thd) : 0;
+
+ if (!thd) {
+ DBUG_PRINT("wsrep", ("no thd for conflicting lock"));
+ WSREP_WARN("no THD for trx: %lu", victim_trx->id);
+ DBUG_RETURN(1);
+ }
+ if (!bf_thd) {
+ DBUG_PRINT("wsrep", ("no BF thd for conflicting lock"));
+ WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? bf_trx->id : 0);
+ DBUG_RETURN(1);
+ }
+
+ WSREP_LOG_CONFLICT(bf_thd, thd, TRUE);
+
+ WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %lu",
+ signal, (long long)bf_seqno,
+ thd_get_thread_id(thd),
+ victim_trx->id);
+
+ WSREP_DEBUG("Aborting query: %s",
+ (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void");
+
+ wsrep_thd_LOCK(thd);
+
+ if (wsrep_thd_query_state(thd) == QUERY_EXITING) {
+ WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id);
+ wsrep_thd_UNLOCK(thd);
+ DBUG_RETURN(0);
+ }
+ if(wsrep_thd_exec_mode(thd) != LOCAL_STATE) {
+ WSREP_DEBUG("withdraw for BF trx: %lu, state: %d",
+ victim_trx->id,
+ wsrep_thd_get_conflict_state(thd));
+ }
+
+ switch (wsrep_thd_get_conflict_state(thd)) {
+ case NO_CONFLICT:
+ wsrep_thd_set_conflict_state(thd, MUST_ABORT);
+ break;
+ case MUST_ABORT:
+ WSREP_DEBUG("victim %lu in MUST ABORT state",
+ victim_trx->id);
+ wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
+ DBUG_RETURN(0);
+ break;
+ case ABORTED:
+ case ABORTING: // fall through
+ default:
+ WSREP_DEBUG("victim %lu in state %d",
+ victim_trx->id, wsrep_thd_get_conflict_state(thd));
+ wsrep_thd_UNLOCK(thd);
+ DBUG_RETURN(0);
+ break;
+ }
+
+ switch (wsrep_thd_query_state(thd)) {
+ case QUERY_COMMITTING:
+ enum wsrep_status rcode;
+
+ WSREP_DEBUG("kill query for: %ld",
+ thd_get_thread_id(thd));
+ WSREP_DEBUG("kill trx QUERY_COMMITTING for %lu",
+ victim_trx->id);
+
+ if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
+ wsrep_abort_slave_trx(bf_seqno,
+ wsrep_thd_trx_seqno(thd));
+ } else {
+ wsrep_t *wsrep= get_wsrep();
+ rcode = wsrep->abort_pre_commit(
+ wsrep, bf_seqno,
+ (wsrep_trx_id_t)victim_trx->id
+ );
+
+ switch (rcode) {
+ case WSREP_WARNING:
+ WSREP_DEBUG("cancel commit warning: %lu",
+ victim_trx->id);
+ wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
+ DBUG_RETURN(1);
+ break;
+ case WSREP_OK:
+ break;
+ default:
+ WSREP_ERROR(
+ "cancel commit bad exit: %d %lu",
+ rcode,
+ victim_trx->id);
+ /* unable to interrupt, must abort */
+ /* note: kill_mysql() will block if we
+ * cannot kill the lock holder first.
+ */
+ abort();
+ break;
+ }
+ }
+ wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
+ break;
+ case QUERY_EXEC:
+ /* it is possible that victim trx is itself waiting for some
+ * other lock. We need to cancel this waiting
+ */
+ WSREP_DEBUG("kill trx QUERY_EXEC for %lu", victim_trx->id);
+
+ victim_trx->lock.was_chosen_as_deadlock_victim= TRUE;
+ if (victim_trx->lock.wait_lock) {
+ WSREP_DEBUG("victim has wait flag: %ld",
+ thd_get_thread_id(thd));
+ lock_t* wait_lock = victim_trx->lock.wait_lock;
+ if (wait_lock) {
+ WSREP_DEBUG("canceling wait lock");
+ victim_trx->lock.was_chosen_as_deadlock_victim= TRUE;
+ lock_cancel_waiting_and_release(wait_lock);
+ }
+
+ wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
+ } else {
+ /* abort currently executing query */
+ DBUG_PRINT("wsrep",("sending KILL_QUERY to: %ld",
+ thd_get_thread_id(thd)));
+ WSREP_DEBUG("kill query for: %ld",
+ thd_get_thread_id(thd));
+ /* Note that innobase_kill_connection will take lock_mutex
+ and trx_mutex */
+ wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
+
+ /* for BF thd, we need to prevent him from committing */
+ if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
+ wsrep_abort_slave_trx(bf_seqno,
+ wsrep_thd_trx_seqno(thd));
+ }
+ }
+ break;
+ case QUERY_IDLE:
+ {
+ WSREP_DEBUG("kill IDLE for %lu", victim_trx->id);
+
+ if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
+ WSREP_DEBUG("kill BF IDLE, seqno: %lld",
+ (long long)wsrep_thd_trx_seqno(thd));
+ wsrep_thd_UNLOCK(thd);
+ wsrep_abort_slave_trx(bf_seqno,
+ wsrep_thd_trx_seqno(thd));
+ DBUG_RETURN(0);
+ }
+ /* This will lock thd from proceeding after net_read() */
+ wsrep_thd_set_conflict_state(thd, ABORTING);
+
+ wsrep_lock_rollback();
+
+ if (wsrep_aborting_thd_contains(thd)) {
+ WSREP_WARN("duplicate thd aborter %lu",
+ thd_get_thread_id(thd));
+ } else {
+ wsrep_aborting_thd_enqueue(thd);
+ DBUG_PRINT("wsrep",("enqueuing trx abort for %lu",
+ thd_get_thread_id(thd)));
+ WSREP_DEBUG("enqueuing trx abort for (%lu)",
+ thd_get_thread_id(thd));
+ }
+
+ DBUG_PRINT("wsrep",("signalling wsrep rollbacker"));
+ WSREP_DEBUG("signaling aborter");
+ wsrep_unlock_rollback();
+ wsrep_thd_UNLOCK(thd);
+
+ break;
+ }
+ default:
+ WSREP_WARN("bad wsrep query state: %d",
+ wsrep_thd_query_state(thd));
+ wsrep_thd_UNLOCK(thd);
+ break;
+ }
+
+ DBUG_RETURN(0);
+}
+
+static
+int
+wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd,
+ my_bool signal)
+{
+ DBUG_ENTER("wsrep_innobase_abort_thd");
+ trx_t* victim_trx = thd_to_trx(victim_thd);
+ trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL;
+ WSREP_DEBUG("abort transaction: BF: %s victim: %s",
+ wsrep_thd_query(bf_thd),
+ wsrep_thd_query(victim_thd));
+
+ if (victim_trx)
+ {
+ lock_mutex_enter();
+ trx_mutex_enter(victim_trx);
+ int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx,
+ victim_trx, signal);
+ trx_mutex_exit(victim_trx);
+ lock_mutex_exit();
+ wsrep_srv_conc_cancel_wait(victim_trx);
+
+ DBUG_RETURN(rcode);
+ } else {
+ WSREP_DEBUG("victim does not have transaction");
+ wsrep_thd_LOCK(victim_thd);
+ wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT);
+ wsrep_thd_UNLOCK(victim_thd);
+ wsrep_thd_awake(victim_thd, signal);
+ }
+ DBUG_RETURN(-1);
+}
+
+static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ if (wsrep_is_wsrep_xid(xid)) {
+ mtr_t mtr;
+ mtr_start(&mtr);
+ trx_sysf_t* sys_header = trx_sysf_get(&mtr);
+ trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr);
+ mtr_commit(&mtr);
+ innobase_flush_logs(hton);
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ trx_sys_read_wsrep_checkpoint(xid);
+ return 0;
+}
+
+static void
+wsrep_fake_trx_id(
+/*==================*/
+ handlerton *hton,
+ THD *thd) /*!< in: user thread handle */
+{
+ mutex_enter(&trx_sys->mutex);
+ trx_id_t trx_id = trx_sys_get_new_trx_id();
+ mutex_exit(&trx_sys->mutex);
+
+ (void *)wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), trx_id);
+}
+
+#endif /* WITH_WSREP */
+
/* plugin options */
static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
@@ -16167,6 +18263,13 @@ static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
SRV_MAX_IO_CAPACITY_LIMIT, 0);
+static MYSQL_SYSVAR_ULONG(idle_flush_pct,
+ srv_idle_flush_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Up to what percentage of dirty pages should be flushed when innodb "
+ "finds it has spare resources to do so.",
+ NULL, NULL, 100, 0, 100, 0);
+
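As an illustration of the intended semantics (assumed here, not taken from the flushing code), the percentage can be read as a cap on how much of the dirty-page backlog one idle flush pass may take on:

// Hypothetical helper: cap an idle flush batch at idle_flush_pct percent
// of the current dirty-page count (clamped to [0, 100] like the sysvar).
static unsigned long idle_flush_batch(unsigned long dirty_pages,
                                      unsigned long idle_flush_pct)
{
        if (idle_flush_pct > 100) {
                idle_flush_pct = 100;
        }
        return dirty_pages * idle_flush_pct / 100;
}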
#ifdef UNIV_DEBUG
static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now,
PLUGIN_VAR_OPCMDARG,
@@ -16426,7 +18529,7 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
PLUGIN_VAR_RQCMDARG,
- "Compression level used for compressed row format. 0 is no compression"
+ "Compression level used for zlib compression. 0 is no compression"
", 1 is fastest, 9 is best compression and default is 6.",
NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
@@ -16437,7 +18540,7 @@ static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages,
" the zlib compression algorithm changes."
" When turned OFF, InnoDB will assume that the zlib"
" compression algorithm doesn't change.",
- NULL, NULL, TRUE);
+ NULL, NULL, FALSE);
static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
@@ -16512,6 +18615,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
+ "defragmentation will be paused. And new defragmentation command will fail."
+ "Paused defragmentation commands will resume when this variable is set to "
+ "true again.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages considered at once when merging multiple pages to "
+ "defragment",
+ NULL, NULL, 7, 2, 32, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
+ srv_defragment_stats_accuracy,
+ PLUGIN_VAR_RQCMDARG,
+ "How many defragment stats changes there are before the stats "
+ "are written to persistent storage. Set to 0 meaning disable "
+ "defragment stats tracking.",
+ NULL, NULL, 0, 0, ~0U, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
+ srv_defragment_fill_factor_n_recs,
+ PLUGIN_VAR_RQCMDARG,
+ "How many records of space defragmentation should leave on the page. "
+ "This variable, together with innodb_defragment_fill_factor, is introduced "
+ "so defragmentation won't pack the page too full and cause page split on "
+ "the next insert on every page. The variable indicating more defragmentation"
+ " gain is the one effective.",
+ NULL, NULL, 20, 1, 100, 0);
+
+static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0.7, 1] that tells defragmentation how full it should "
+ "fill a page. Default is 0.9. Number below 0.7 won't make much sense."
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is "
+ "introduced so defragmentation won't pack the page too full and cause "
+ "page split on the next insert on every page. The variable indicating more "
+ "defragmentation gain is the one effective.",
+ NULL, NULL, 0.9, 0.7, 1, 0);
+
+static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "Do not defragment a single index more than this number of time per second."
+ "This controls the number of time defragmentation thread can request X_LOCK "
+ "on an index. Defragmentation thread will check whether "
+ "1/defragment_frequency (s) has passed since it worked on this index last "
+ "time, and put the index back to the queue if not enough time has passed. "
+ "The actual frequency can only be lower than this given number.",
+ NULL, innodb_defragment_frequency_update,
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
+
+
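A minimal sketch of the 1/defragment_frequency pacing described above, assuming a simple elapsed-time check rather than the actual defragmentation queue logic:

#include <chrono>

// Returns true when at least 1/frequency seconds have elapsed since the
// index was last defragmented, i.e. another X_LOCK request is allowed.
static bool defragment_due(std::chrono::steady_clock::time_point last_run,
                           unsigned frequency /* 1..1000, as the sysvar */)
{
        using namespace std::chrono;
        const auto min_interval = duration<double>(1.0 / frequency);
        return steady_clock::now() - last_run >= min_interval;
}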
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@@ -16700,6 +18857,12 @@ static MYSQL_SYSVAR_ULONG(
1000000, 0); /* Maximum value */
#endif /* HAVE_ATOMIC_BUILTINS */
+static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization,
+ srv_prefix_index_cluster_optimization,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable prefix optimization to sometimes avoid cluster index lookups.",
+ NULL, NULL, FALSE);
+
static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
PLUGIN_VAR_RQCMDARG,
"Time of innodb thread sleeping before joining InnoDB queue (usec). "
@@ -16838,6 +19001,48 @@ static MYSQL_SYSVAR_BOOL(disable_background_merge,
NULL, NULL, FALSE);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency,
+ PLUGIN_VAR_RQCMDARG,
+ "A number between [0, 100] that tells how oftern buffer pool dump status "
+ "in percentages should be printed. E.g. 10 means that buffer pool dump "
+ "status is printed when every 10% of number of buffer pool pages are "
+ "dumped. Default is 0 (only start and end status is printed).",
+ NULL, NULL, 0, 0, 100, 0);
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+/*******************************************************
+ * innobase_disallow_writes variable definition *
+ *******************************************************/
+
+/* Must always init to FALSE. */
+static my_bool innobase_disallow_writes = FALSE;
+
+/**************************************************************************
+An "update" method for innobase_disallow_writes variable. */
+static
+void
+innobase_disallow_writes_update(
+/*============================*/
+ THD* thd, /* in: thread handle */
+ st_mysql_sys_var* var, /* in: pointer to system
+ variable */
+ void* var_ptr, /* out: pointer to dynamic
+ variable */
+ const void* save) /* in: temporary storage */
+{
+ *(my_bool*)var_ptr = *(my_bool*)save;
+ ut_a(srv_allow_writes_event);
+ if (*(my_bool*)var_ptr)
+ os_event_reset(srv_allow_writes_event);
+ else
+ os_event_set(srv_allow_writes_event);
+}
+
+static MYSQL_SYSVAR_BOOL(disallow_writes, innobase_disallow_writes,
+ PLUGIN_VAR_NOCMDOPT,
+ "Tell InnoDB to stop any writes to disk",
+ NULL, innobase_disallow_writes_update, FALSE);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
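The update callback above turns the boolean into a gate: setting the variable resets srv_allow_writes_event so writer threads block, and clearing it sets the event again. A self-contained analogy using standard primitives instead of the os_event API (the names here are illustrative):

#include <condition_variable>
#include <mutex>

struct write_gate {
        std::mutex              m;
        std::condition_variable cv;
        bool                    writes_allowed = true;

        void set_disallow(bool disallow)        // analogue of the update callback
        {
                std::lock_guard<std::mutex> g(m);
                writes_allowed = !disallow;
                if (writes_allowed) {
                        cv.notify_all();        // analogue of os_event_set()
                }                               // else: analogue of os_event_reset()
        }

        void wait_until_writes_allowed()        // what a writer thread would do
        {
                std::unique_lock<std::mutex> g(m);
                cv.wait(g, [this] { return writes_allowed; });
        }
};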
static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
PLUGIN_VAR_NOCMDARG,
"Whether to use read ahead for random access within an extent.",
@@ -16945,6 +19150,168 @@ static MYSQL_SYSVAR_UINT(simulate_comp_failures, srv_simulate_comp_failures,
"Simulate compression failures.",
NULL, NULL, 0, 0, 99, 0);
+static MYSQL_SYSVAR_BOOL(force_primary_key,
+ srv_force_primary_key,
+ PLUGIN_VAR_OPCMDARG,
+ "Do not allow to create table without primary key (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim,
+ PLUGIN_VAR_OPCMDARG,
+ "Use trim. Default FALSE.",
+ NULL, NULL, FALSE);
+
+static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 };
+static TYPELIB page_compression_algorithms_typelib=
+{
+ array_elements(page_compression_algorithms) - 1, 0,
+ page_compression_algorithms, 0
+};
+static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm,
+ PLUGIN_VAR_OPCMDARG,
+ "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, or bzip2",
+ innodb_compression_algorithm_validate, NULL,
+ /* We list here the largest set of supported compression methods so that
+ all available methods can be selected. Availability of a compression
+ method is verified in the innodb_compression_algorithm_validate function. */
+ PAGE_UNCOMPRESSED,
+ &page_compression_algorithms_typelib);
+
+static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of multi-threaded flush threads",
+ NULL, NULL,
+ MTFLUSH_DEFAULT_WORKER, /* Default setting */
+ 1, /* Minimum setting */
+ MTFLUSH_MAX_WORKER, /* Max setting */
+ 0);
+
+static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use multi-threaded flush. Default FALSE.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Maximum number of seconds that semaphore times out in InnoDB.",
+ NULL, NULL,
+ DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */
+ 1, /* Minimum setting */
+ UINT_MAX32, /* Maximum setting */
+ 0);
+
+static const char* srv_encrypt_tables_names[] = { "OFF", "ON", "FORCE", 0 };
+static TYPELIB srv_encrypt_tables_typelib = {
+ array_elements(srv_encrypt_tables_names)-1, 0, srv_encrypt_tables_names,
+ NULL
+};
+static MYSQL_SYSVAR_ENUM(encrypt_tables, srv_encrypt_tables,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable encryption for tables. "
+ "Don't forget to enable --innodb-encrypt-log too",
+ innodb_encrypt_tables_validate,
+ innodb_encrypt_tables_update,
+ 0,
+ &srv_encrypt_tables_typelib);
+
+static MYSQL_SYSVAR_UINT(encryption_threads, srv_n_fil_crypt_threads,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of threads performing background key rotation and "
+ "scrubbing",
+ NULL,
+ innodb_encryption_threads_update,
+ srv_n_fil_crypt_threads, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_UINT(encryption_rotate_key_age,
+ srv_fil_crypt_rotate_key_age,
+ PLUGIN_VAR_RQCMDARG,
+ "Key rotation - re-encrypt in background "
+ "all pages that were encrypted with a key that "
+ "many (or more) versions behind",
+ NULL,
+ innodb_encryption_rotate_key_age_update,
+ srv_fil_crypt_rotate_key_age, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops,
+ PLUGIN_VAR_RQCMDARG,
+ "Use this many iops for background key rotation",
+ NULL,
+ innodb_encryption_rotation_iops_update,
+ srv_n_fil_crypt_iops, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(scrub_log, srv_scrub_log,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Enable background redo log (ib_logfile0, ib_logfile1...) scrubbing",
+ 0, 0, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(scrub_log_speed, innodb_scrub_log_speed,
+ PLUGIN_VAR_OPCMDARG,
+ "Background redo log scrubbing speed in bytes/sec",
+ NULL, NULL,
+ 256, /* 256 bytes/sec, corresponds to 2000 ms scrub_log_interval */
+ 1, /* min */
+ 50000, 0); /* 50Kbyte/sec, corresponds to 10 ms scrub_log_interval */
+
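The bounds comments above can be cross-checked against the 512-byte redo log block size: at a given speed the scrubber covers one block every 512/speed seconds, so 256 bytes/sec corresponds to about 2000 ms and 50000 bytes/sec to roughly 10 ms. A worked sketch of that arithmetic:

// Interval (in milliseconds) between scrubbing two 512-byte redo log blocks
// at a given innodb_scrub_log_speed (bytes per second). Assumes the
// standard 512-byte redo log block size.
static double scrub_log_interval_ms(unsigned long long bytes_per_sec)
{
        const double log_block_size = 512.0;
        return 1000.0 * log_block_size / static_cast<double>(bytes_per_sec);
}
// scrub_log_interval_ms(256)   == 2000.0   (the documented default)
// scrub_log_interval_ms(50000) ~= 10.24    (the documented maximum)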
+static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Enable redo log encryption",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed,
+ srv_immediate_scrub_data_uncompressed,
+ 0,
+ "Enable scrubbing of data",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(background_scrub_data_uncompressed,
+ srv_background_scrub_data_uncompressed,
+ 0,
+ "Enable scrubbing of uncompressed data by "
+ "background threads (same as encryption_threads)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(background_scrub_data_compressed,
+ srv_background_scrub_data_compressed,
+ 0,
+ "Enable scrubbing of compressed data by "
+ "background threads (same as encryption_threads)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_UINT(background_scrub_data_check_interval,
+ srv_background_scrub_data_check_interval,
+ 0,
+ "check if spaces needs scrubbing every "
+ "innodb_background_scrub_data_check_interval "
+ "seconds",
+ NULL, NULL,
+ srv_background_scrub_data_check_interval,
+ 1,
+ UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_UINT(background_scrub_data_interval,
+ srv_background_scrub_data_interval,
+ 0,
+ "scrub spaces that were last scrubbed longer than "
+ " innodb_background_scrub_data_interval seconds ago",
+ NULL, NULL,
+ srv_background_scrub_data_interval,
+ 1,
+ UINT_MAX32, 0);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_BOOL(debug_force_scrubbing,
+ srv_scrub_force_testing,
+ 0,
+ "Perform extra scrubbing to increase test exposure",
+ NULL, NULL, FALSE);
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(instrument_semaphores, srv_instrument_semaphores,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable semaphore request instrumentation. This could have some effect on performance but allows better"
+ " information on long semaphore wait problems. (Default: not enabled)",
+ 0, 0, FALSE);
+
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(api_trx_level),
@@ -16961,6 +19328,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_now),
MYSQL_SYSVAR(buffer_pool_load_abort),
MYSQL_SYSVAR(buffer_pool_load_at_startup),
+ MYSQL_SYSVAR(defragment),
+ MYSQL_SYSVAR(defragment_n_pages),
+ MYSQL_SYSVAR(defragment_stats_accuracy),
+ MYSQL_SYSVAR(defragment_fill_factor),
+ MYSQL_SYSVAR(defragment_fill_factor_n_recs),
+ MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),
@@ -17054,6 +19427,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#ifdef HAVE_ATOMIC_BUILTINS
MYSQL_SYSVAR(adaptive_max_sleep_delay),
#endif /* HAVE_ATOMIC_BUILTINS */
+ MYSQL_SYSVAR(prefix_index_cluster_optimization),
MYSQL_SYSVAR(thread_sleep_delay),
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
@@ -17065,11 +19439,15 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(change_buffering_debug),
MYSQL_SYSVAR(disable_background_merge),
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ MYSQL_SYSVAR(disallow_writes),
+#endif /* WITH_INNODB_DISALLOW_WRITES */
MYSQL_SYSVAR(random_read_ahead),
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(read_only),
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(io_capacity_max),
+ MYSQL_SYSVAR(idle_flush_pct),
MYSQL_SYSVAR(monitor_enable),
MYSQL_SYSVAR(monitor_disable),
MYSQL_SYSVAR(monitor_reset),
@@ -17105,6 +19483,33 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(saved_page_number_debug),
#endif /* UNIV_DEBUG */
MYSQL_SYSVAR(simulate_comp_failures),
+ MYSQL_SYSVAR(force_primary_key),
+ MYSQL_SYSVAR(fatal_semaphore_wait_threshold),
+ /* Table page compression feature */
+ MYSQL_SYSVAR(use_trim),
+ MYSQL_SYSVAR(compression_algorithm),
+ MYSQL_SYSVAR(mtflush_threads),
+ MYSQL_SYSVAR(use_mtflush),
+ /* Encryption feature */
+ MYSQL_SYSVAR(encrypt_tables),
+ MYSQL_SYSVAR(encryption_threads),
+ MYSQL_SYSVAR(encryption_rotate_key_age),
+ MYSQL_SYSVAR(encryption_rotation_iops),
+ MYSQL_SYSVAR(scrub_log),
+ MYSQL_SYSVAR(scrub_log_speed),
+ MYSQL_SYSVAR(encrypt_log),
+ MYSQL_SYSVAR(default_encryption_key_id),
+ /* Scrubbing feature */
+ MYSQL_SYSVAR(immediate_scrub_data_uncompressed),
+ MYSQL_SYSVAR(background_scrub_data_uncompressed),
+ MYSQL_SYSVAR(background_scrub_data_compressed),
+ MYSQL_SYSVAR(background_scrub_data_interval),
+ MYSQL_SYSVAR(background_scrub_data_check_interval),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(debug_force_scrubbing),
+#endif
+ MYSQL_SYSVAR(instrument_semaphores),
+ MYSQL_SYSVAR(buf_dump_status_frequency),
NULL
};
@@ -17114,7 +19519,7 @@ maria_declare_plugin(innobase)
&innobase_storage_engine,
innobase_hton_name,
plugin_author,
- "Supports transactions, row-level locking, and foreign keys",
+ "Supports transactions, row-level locking, foreign keys and encryption for tables",
PLUGIN_LICENSE_GPL,
innobase_init, /* Plugin Init */
NULL, /* Plugin Deinit */
@@ -17122,7 +19527,7 @@ maria_declare_plugin(innobase)
innodb_status_variables_export,/* status variables */
innobase_system_variables, /* system variables */
INNODB_VERSION_STR, /* string version */
- MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+ MariaDB_PLUGIN_MATURITY_BETA /* maturity */
},
i_s_innodb_trx,
i_s_innodb_locks,
@@ -17151,8 +19556,11 @@ i_s_innodb_sys_fields,
i_s_innodb_sys_foreign,
i_s_innodb_sys_foreign_cols,
i_s_innodb_sys_tablespaces,
-i_s_innodb_sys_datafiles
-
+i_s_innodb_sys_datafiles,
+i_s_innodb_mutexes,
+i_s_innodb_sys_semaphore_waits,
+i_s_innodb_tablespaces_encryption,
+i_s_innodb_tablespaces_scrubbing
maria_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
@@ -17447,6 +19855,9 @@ ib_senderrf(
case IB_LOG_LEVEL_FATAL:
l = 0;
break;
+ default:
+ l = 0;
+ break;
}
my_printv_error(code, format, MYF(l), args);
@@ -17633,3 +20044,118 @@ ib_warn_row_too_big(const dict_table_t* table)
" ROW_FORMAT=COMPRESSED ": ""
, prefix ? DICT_MAX_FIXED_COL_LEN : 0);
}
+
+/*************************************************************//**
+Check for a valid value of innobase_compression_algorithm.
+@return 0 for valid innodb_compression_algorithm. */
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ long compression_algorithm;
+ DBUG_ENTER("innobase_compression_algorithm_validate");
+
+ if (check_sysvar_enum(thd, var, save, value)) {
+ DBUG_RETURN(1);
+ }
+
+ compression_algorithm = *reinterpret_cast<ulong*>(save);
+ (void)compression_algorithm;
+
+#ifndef HAVE_LZ4
+ if (compression_algorithm == PAGE_LZ4_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblz4 is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_LZO
+ if (compression_algorithm == PAGE_LZO_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzo is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_LZMA
+ if (compression_algorithm == PAGE_LZMA_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: liblzma is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_BZIP2
+ if (compression_algorithm == PAGE_BZIP2_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libbz2 is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+
+#ifndef HAVE_SNAPPY
+ if (compression_algorithm == PAGE_SNAPPY_ALGORITHM) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
+ "InnoDB: libsnappy is not installed. \n",
+ compression_algorithm);
+ DBUG_RETURN(1);
+ }
+#endif
+ DBUG_RETURN(0);
+}
+
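The validator above follows one pattern: let the stock enum check fill in *save, then reject values whose library support was compiled out. A stripped-down, self-contained version of the same idea (hypothetical names; only the LZ4 case shown):

// Hypothetical stand-alone version of the "valid enum but unavailable
// backend" check: an enum value is accepted only if its library was
// compiled in.
enum page_algo { ALGO_NONE, ALGO_ZLIB, ALGO_LZ4 };

static bool algo_available(page_algo a)
{
        switch (a) {
        case ALGO_NONE:
        case ALGO_ZLIB:
                return true;            // always built in
        case ALGO_LZ4:
#ifdef HAVE_LZ4
                return true;
#else
                return false;           // liblz4 not compiled in
#endif
        }
        return false;
}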
+static
+int
+innodb_encrypt_tables_validate(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ if (check_sysvar_enum(thd, var, save, value))
+ return 1;
+
+ long encrypt_tables = *(long*)save;
+
+ if (encrypt_tables
+ && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: cannot enable encryption, "
+ "encryption plugin is not available");
+ return 1;
+ }
+ return 0;
+}
+
+static void innodb_remember_check_sysvar_funcs()
+{
+ /* remember built-in sysvar check functions */
+ ut_ad((MYSQL_SYSVAR_NAME(checksum_algorithm).flags & 0x1FF) == PLUGIN_VAR_ENUM);
+ check_sysvar_enum = MYSQL_SYSVAR_NAME(checksum_algorithm).check;
+}
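The function above borrows the server's built-in enum validator from an existing enum sysvar (checksum_algorithm) so the custom validators can delegate to it. A generic sketch of that function-pointer reuse, with made-up names and untyped parameters:

// Generic sketch of "remember a built-in validator and delegate to it".
// check_fn, builtin_enum_check and my_validator are illustrative names only.
typedef int (*check_fn)(void* thd, void* var, void* save, void* value);

static check_fn builtin_enum_check = 0;

static void remember_builtin_check(check_fn from_existing_enum_sysvar)
{
        builtin_enum_check = from_existing_enum_sysvar;
}

static int my_validator(void* thd, void* var, void* save, void* value)
{
        if (builtin_enum_check(thd, var, save, value)) {
                return 1;               // not a valid enum value at all
        }
        /* ... additional engine-specific checks on *save ... */
        return 0;
}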
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 5cebc425769..b613053a992 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -56,6 +57,24 @@ typedef struct st_innobase_share {
/** Prebuilt structures in an InnoDB table handle used within MySQL */
struct row_prebuilt_t;
+/** Engine specific table options are defined using this struct */
+struct ha_table_option_struct
+{
+ bool page_compressed; /*!< Table is using page compression
+ if this option is true. */
+ int page_compression_level; /*!< Table page compression level
+ or UNIV_UNSPECIFIED. */
+ uint atomic_writes; /*!< Use atomic writes for this
+ table if this option is ON, or
+ if it is DEFAULT and
+ srv_use_atomic_writes=1.
+ Atomic writes are not used if
+ the value is OFF. */
+ uint encryption; /*!< DEFAULT, ON, OFF */
+ int encryption_key_id; /*!< encryption key id*/
+};
+
+
/** The class defining a handle to an Innodb table */
class ha_innobase: public handler
{
@@ -81,6 +100,8 @@ class ha_innobase: public handler
or undefined */
uint num_write_row; /*!< number of write_row() calls */
+ ha_statistics* ha_partition_stats; /*!< stats of the partition owner
+ handler (if there is one) */
uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
const uchar* record);
inline void update_thd(THD* thd);
@@ -95,6 +116,10 @@ class ha_innobase: public handler
void innobase_initialize_autoinc();
dict_index_t* innobase_get_index(uint keynr);
+#ifdef WITH_WSREP
+ int wsrep_append_keys(THD *thd, bool shared,
+ const uchar* record0, const uchar* record1);
+#endif
/* Init values for the class: */
public:
ha_innobase(handlerton *hton, TABLE_SHARE *table_arg);
@@ -175,11 +200,15 @@ class ha_innobase: public handler
char* norm_name,
char* temp_path,
char* remote_path);
+ const char* check_table_options(THD *thd, TABLE* table,
+ HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format);
int create(const char *name, register TABLE *form,
HA_CREATE_INFO *create_info);
int truncate();
int delete_table(const char *name);
int rename_table(const char* from, const char* to);
+ int defragment_table(const char* name, const char* index_name,
+ bool async);
int check(THD* thd, HA_CHECK_OPT* check_opt);
char* update_table_comment(const char* comment);
char* get_foreign_key_create_info();
@@ -283,6 +312,7 @@ class ha_innobase: public handler
Alter_inplace_info* ha_alter_info,
bool commit);
/** @} */
+ void set_partition_owner_stats(ha_statistics *stats);
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
private:
@@ -440,7 +470,9 @@ __attribute__((nonnull));
*/
extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
-struct trx_t;
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
extern const struct _ft_vft ft_vft_result;
@@ -478,6 +510,9 @@ innobase_index_name_is_reserved(
__attribute__((nonnull, warn_unused_result));
+#ifdef WITH_WSREP
+extern "C" int wsrep_trx_is_aborting(void *thd_ptr);
+#endif
/*****************************************************************//**
Determines InnoDB table flags.
@retval true if successful, false if error */
UNIV_INTERN
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index 3f393d9d431..4cb912cd023 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -263,6 +264,29 @@ ha_innobase::check_if_supported_inplace_alter(
update_thd();
trx_search_latch_release_if_reserved(prebuilt->trx);
+ /* Change on engine specific table options require rebuild of the
+ table */
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::CHANGE_CREATE_OPTION) {
+ ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct;
+ ha_table_option_struct *old_options= table->s->option_struct;
+
+ if (new_options->page_compressed != old_options->page_compressed ||
+ new_options->page_compression_level != old_options->page_compression_level ||
+ new_options->atomic_writes != old_options->atomic_writes) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (new_options->encryption != old_options->encryption ||
+ new_options->encryption_key_id != old_options->encryption_key_id) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
if (ha_alter_info->handler_flags
& ~(INNOBASE_INPLACE_IGNORE
| INNOBASE_ALTER_NOREBUILD
@@ -1179,7 +1203,8 @@ innobase_rec_to_mysql(
field->reset();
- ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
+ ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE,
+ NULL);
if (ipos == ULINT_UNDEFINED
|| rec_offs_nth_extern(offsets, ipos)) {
@@ -1231,7 +1256,8 @@ innobase_fields_to_mysql(
field->reset();
- ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
+ ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE,
+ NULL);
if (ipos == ULINT_UNDEFINED
|| dfield_is_ext(&fields[ipos])
@@ -3375,6 +3401,11 @@ ha_innobase::prepare_inplace_alter_table(
DBUG_ASSERT(ha_alter_info->create_info);
DBUG_ASSERT(!srv_read_only_mode);
+ /* Init online ddl status variables */
+ onlineddl_rowlog_rows = 0;
+ onlineddl_rowlog_pct_used = 0;
+ onlineddl_pct_progress = 0;
+
MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE);
#ifdef UNIV_DEBUG
@@ -3397,6 +3428,17 @@ ha_innobase::prepare_inplace_alter_table(
if (ha_alter_info->handler_flags
& Alter_inplace_info::CHANGE_CREATE_OPTION) {
+ /* Check engine specific table options */
+ if (const char* invalid_tbopt = check_table_options(
+ user_thd, altered_table,
+ ha_alter_info->create_info,
+ prebuilt->table->space != 0,
+ srv_file_format)) {
+ my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+ table_type(), invalid_tbopt);
+ goto err_exit_no_heap;
+ }
+
if (const char* invalid_opt = create_options_are_invalid(
user_thd, altered_table,
ha_alter_info->create_info,
@@ -4010,6 +4052,11 @@ oom:
ctx->thr, prebuilt->table, altered_table);
}
+ /* Init online ddl status variables */
+ onlineddl_rowlog_rows = 0;
+ onlineddl_rowlog_pct_used = 0;
+ onlineddl_pct_progress = 0;
+
DEBUG_SYNC_C("inplace_after_index_build");
DBUG_EXECUTE_IF("create_index_fail",
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index 6168ce0f0d2..ef69e7df29d 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
-Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2015, Oracle and/or its affiliates.
+Copyright (c) 2014, 2015, MariaDB Corporation
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -21,8 +22,11 @@ this program; if not, write to the Free Software Foundation, Inc.,
InnoDB INFORMATION SCHEMA tables interface to MySQL.
Created July 18, 2007 Vasil Dimov
+Modified Dec 29, 2014 Jan Lindström (Added sys_semaphore_waits)
*******************************************************/
+#include "univ.i"
+
#include <mysqld_error.h>
#include <sql_acl.h>
@@ -56,6 +60,9 @@ Created July 18, 2007 Vasil Dimov
#include "fts0priv.h"
#include "btr0btr.h"
#include "page0zip.h"
+#include "sync0arr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
/** structure associates a name string with a file page type and/or buffer
page state. */
@@ -92,6 +99,7 @@ static buf_page_desc_t i_s_page_type[] = {
{"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB},
{"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2},
{"IBUF_INDEX", I_S_PAGE_TYPE_IBUF},
+ {"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED},
{"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN}
};
@@ -136,48 +144,6 @@ struct buf_page_info_t{
index_id_t index_id; /*!< Index ID if a index page */
};
-/** maximum number of buffer page info we would cache. */
-#define MAX_BUF_INFO_CACHED 10000
-
-#define OK(expr) \
- if ((expr) != 0) { \
- DBUG_RETURN(1); \
- }
-
-#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \
-do { \
- if (!srv_was_started) { \
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \
- ER_CANT_FIND_SYSTEM_REC, \
- "InnoDB: SELECTing from " \
- "INFORMATION_SCHEMA.%s but " \
- "the InnoDB storage engine " \
- "is not installed", plugin_name); \
- DBUG_RETURN(0); \
- } \
-} while (0)
-
-#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER && !defined __clang__
-#ifdef HAVE_C99_INITIALIZERS
-#define STRUCT_FLD(name, value) .name = value
-#else
-#define STRUCT_FLD(name, value) name: value
-#endif /* HAVE_C99_INITIALIZERS */
-#else
-#define STRUCT_FLD(name, value) value
-#endif
-
-/* Don't use a static const variable here, as some C++ compilers (notably
-HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
-#define END_OF_ST_FIELD_INFO \
- {STRUCT_FLD(field_name, NULL), \
- STRUCT_FLD(field_length, 0), \
- STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \
- STRUCT_FLD(value, 0), \
- STRUCT_FLD(field_flags, 0), \
- STRUCT_FLD(old_name, ""), \
- STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}
-
/*
Use the following types mapping:
@@ -206,6 +172,20 @@ time_t MYSQL_TYPE_DATETIME
---------------------------------
*/
+/** Implemented on sync0arr.cc */
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
+Loop through each item on sync array, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
+@return 0 on success */
+UNIV_INTERN
+int
+sync_arr_fill_sys_semphore_waits_table(
+/*===================================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ); /*!< in: condition (not used) */
+
/*******************************************************************//**
Common function to fill any of the dynamic tables:
INFORMATION_SCHEMA.innodb_trx
@@ -263,7 +243,6 @@ field_store_time_t(
/*******************************************************************//**
Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
@return 0 on success */
-static
int
field_store_string(
/*===============*/
@@ -330,7 +309,6 @@ field_store_index_name(
Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
If the value is ULINT_UNDEFINED then the field it set to NULL.
@return 0 on success */
-static
int
field_store_ulint(
/*==============*/
@@ -805,7 +783,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_trx =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
@@ -1065,7 +1043,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_locks =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
@@ -1248,7 +1226,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/*******************************************************************//**
@@ -1584,7 +1562,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset =
@@ -1634,7 +1612,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic tables
@@ -1934,7 +1912,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index_reset =
@@ -1984,7 +1962,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index_reset =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table information_schema.innodb_cmpmem. */
@@ -2227,7 +2205,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset =
@@ -2277,7 +2255,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */
@@ -2801,7 +2779,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_metrics =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
static ST_FIELD_INFO i_s_stopword_fields_info[] =
@@ -2888,7 +2866,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword =
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
- STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"),
+ STRUCT_FLD(descr, "Default stopword list for InnoDB Full Text Search"),
/* the plugin license (PLUGIN_LICENSE_XXX) */
/* int */
@@ -2914,7 +2892,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
@@ -3082,7 +3060,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/*******************************************************************//**
@@ -3165,7 +3143,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
@@ -3454,7 +3432,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/*******************************************************************//**
@@ -3889,7 +3867,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
@@ -4087,7 +4065,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
@@ -4470,10 +4448,14 @@ i_s_innodb_stats_fill(
info->pages_written_rate));
if (info->n_page_get_delta) {
- OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
- static_cast<double>(
- 1000 - (1000 * info->page_read_delta
- / info->n_page_get_delta))));
+ if (info->page_read_delta <= info->n_page_get_delta) {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
+ static_cast<double>(
+ 1000 - (1000 * info->page_read_delta
+ / info->n_page_get_delta))));
+ } else {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0));
+ }
OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(
static_cast<double>(
@@ -4632,7 +4614,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. */
@@ -5344,7 +5326,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] =
@@ -5891,7 +5873,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/*******************************************************************//**
@@ -6185,7 +6167,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_TABLESTATS ***********************************************/
@@ -6475,7 +6457,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablestats =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_INDEXES **************************************************/
@@ -6727,7 +6709,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_COLUMNS **************************************************/
@@ -6964,7 +6946,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_columns =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_FIELDS ***************************************************/
@@ -7174,7 +7156,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_fields =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_FOREIGN ********************************************/
@@ -7399,7 +7381,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_FOREIGN_COLS ********************************************/
@@ -7616,7 +7598,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign_cols =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_TABLESPACES ********************************************/
@@ -7706,7 +7688,7 @@ i_s_dict_fill_sys_tablespaces(
{
Field** fields;
ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
- ulint page_size = fsp_flags_get_page_size(flags);;
+ ulint page_size = fsp_flags_get_page_size(flags);
ulint zip_size = fsp_flags_get_zip_size(flags);
const char* file_format;
const char* row_format;
@@ -7884,7 +7866,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablespaces =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
/** SYS_DATAFILES ************************************************/
@@ -8073,5 +8055,1078 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_datafiles =
/* Maria extension */
STRUCT_FLD(version_info, INNODB_VERSION_STR),
- STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
+};
+
+/** TABLESPACES_ENCRYPTION ********************************************/
+/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */
+static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[] =
+{
+#define TABLESPACES_ENCRYPTION_SPACE 0
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME 2
+ {STRUCT_FLD(field_name, "ENCRYPTION_SCHEME"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS 3
+ {STRUCT_FLD(field_name, "KEYSERVER_REQUESTS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_MIN_KEY_VERSION 4
+ {STRUCT_FLD(field_name, "MIN_KEY_VERSION"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION 5
+ {STRUCT_FLD(field_name, "CURRENT_KEY_VERSION"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER 6
+ {STRUCT_FLD(field_name, "KEY_ROTATION_PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER 7
+ {STRUCT_FLD(field_name, "KEY_ROTATION_MAX_PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
+with information collected by scanning SYS_TABLESPACES table and then use
+fil_space()
+@return 0 on success */
+static
+int
+i_s_dict_fill_tablespaces_encryption(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ ulint space, /*!< in: space ID */
+ const char* name, /*!< in: tablespace name */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ struct fil_space_crypt_status_t status;
+
+ DBUG_ENTER("i_s_dict_fill_tablespaces_encryption");
+
+ fields = table_to_fill->field;
+
+ fil_space_crypt_get_status(space, &status);
+ OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space));
+
+ OK(field_store_string(fields[TABLESPACES_ENCRYPTION_NAME],
+ name));
+
+ OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store(
+ status.scheme));
+ OK(fields[TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS]->store(
+ status.keyserver_requests));
+ OK(fields[TABLESPACES_ENCRYPTION_MIN_KEY_VERSION]->store(
+ status.min_key_version));
+ OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION]->store(
+ status.current_key_version));
+ if (status.rotating) {
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull();
+ OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->store(
+ status.rotate_next_page_number));
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->set_notnull();
+ OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->store(
+ status.rotate_max_page_number));
+ } else {
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]
+ ->set_null();
+ fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]
+ ->set_null();
+ }
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table.
+Loop through each record in TABLESPACES_ENCRYPTION, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table.
+@return 0 on success */
+static
+int
+i_s_tablespaces_encryption_fill_table(
+/*===========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+ bool found_space_0 = false;
+
+ DBUG_ENTER("i_s_tablespaces_encryption_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without SUPER_ACL privilege */
+ if (check_global_access(thd, SUPER_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
+
+ while (rec) {
+ const char* err_msg;
+ ulint space;
+ const char* name;
+ ulint flags;
+
+ /* Extract necessary information from a SYS_TABLESPACES row */
+ err_msg = dict_process_sys_tablespaces(
+ heap, rec, &space, &name, &flags);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (space == 0) {
+ found_space_0 = true;
+ }
+
+ if (!err_msg) {
+ i_s_dict_fill_tablespaces_encryption(
+ thd, space, name, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ if (found_space_0 == false) {
+ /* space 0, for whatever reason, does not show up in
+ * the iteration above; add it manually */
+ ulint space = 0;
+ const char* name = NULL;
+ i_s_dict_fill_tablespaces_encryption(
+ thd, space, name, tables->table);
+ }
+
+ DBUG_RETURN(0);
+}
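The fill function above follows a common INFORMATION_SCHEMA pattern: hold dict_sys->mutex and the mini-transaction only while fetching a record, release both while converting the row, then re-latch to continue the scan. A generic sketch of that shape (illustrative names, a standard mutex in place of the InnoDB latches):

#include <mutex>

// Cursor is expected to provide first(), copy() and next(); the handler is
// called with the latches released, mirroring the loop above.
template <class Cursor, class RowHandler>
void scan_with_latch_release(std::mutex& dict_mutex, Cursor& cur,
                             RowHandler handle)
{
        dict_mutex.lock();
        auto* rec = cur.first();                // latched fetch
        while (rec) {
                auto row = cur.copy(rec);       // copy out what is needed
                dict_mutex.unlock();            // drop latches while filling
                handle(row);
                dict_mutex.lock();              // re-latch for the next record
                rec = cur.next();
        }
        dict_mutex.unlock();
}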
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
+@return 0 on success */
+static
+int
+innodb_tablespaces_encryption_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_tablespaces_encryption_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_tablespaces_encryption_fields_info;
+ schema->fill_table = i_s_tablespaces_encryption_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_tablespaces_encryption =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TABLESPACES_ENCRYPTION"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Google Inc"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB TABLESPACES_ENCRYPTION"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_BSD),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_tablespaces_encryption_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+/** TABLESPACES_SCRUBBING ********************************************/
+/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING */
+static ST_FIELD_INFO innodb_tablespaces_scrubbing_fields_info[] =
+{
+#define TABLESPACES_SCRUBBING_SPACE 0
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_COMPRESSED 2
+ {STRUCT_FLD(field_name, "COMPRESSED"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED 3
+ {STRUCT_FLD(field_name, "LAST_SCRUB_COMPLETED"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED 4
+ {STRUCT_FLD(field_name, "CURRENT_SCRUB_STARTED"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS 5
+ {STRUCT_FLD(field_name, "CURRENT_SCRUB_ACTIVE_THREADS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER 6
+ {STRUCT_FLD(field_name, "CURRENT_SCRUB_PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER 7
+ {STRUCT_FLD(field_name, "CURRENT_SCRUB_MAX_PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING
+with information collected by scanning the SYS_TABLESPACES table and then
+calling fil_space_get_scrub_status().
+@return 0 on success */
+static
+int
+i_s_dict_fill_tablespaces_scrubbing(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ ulint space, /*!< in: space ID */
+ const char* name, /*!< in: tablespace name */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ struct fil_space_scrub_status_t status;
+
+ DBUG_ENTER("i_s_dict_fill_tablespaces_scrubbing");
+
+ fields = table_to_fill->field;
+
+ fil_space_get_scrub_status(space, &status);
+ OK(fields[TABLESPACES_SCRUBBING_SPACE]->store(space));
+
+ OK(field_store_string(fields[TABLESPACES_SCRUBBING_NAME],
+ name));
+
+ OK(fields[TABLESPACES_SCRUBBING_COMPRESSED]->store(
+ status.compressed ? 1 : 0));
+
+ if (status.last_scrub_completed == 0) {
+ fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED]->set_null();
+ } else {
+ fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED]
+ ->set_notnull();
+ OK(field_store_time_t(
+ fields[TABLESPACES_SCRUBBING_LAST_SCRUB_COMPLETED],
+ status.last_scrub_completed));
+ }
+
+ int field_numbers[] = {
+ TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED,
+ TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS,
+ TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER,
+ TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER };
+ if (status.scrubbing) {
+ for (uint i = 0; i < array_elements(field_numbers); i++) {
+ fields[field_numbers[i]]->set_notnull();
+ }
+
+ OK(field_store_time_t(
+ fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_STARTED],
+ status.current_scrub_started));
+ OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_ACTIVE_THREADS]
+ ->store(status.current_scrub_active_threads));
+ OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_PAGE_NUMBER]
+ ->store(status.current_scrub_page_number));
+ OK(fields[TABLESPACES_SCRUBBING_CURRENT_SCRUB_MAX_PAGE_NUMBER]
+ ->store(status.current_scrub_max_page_number));
+ } else {
+ for (uint i = 0; i < array_elements(field_numbers); i++) {
+ fields[field_numbers[i]]->set_null();
+ }
+ }
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING table.
+Loop through each record in TABLESPACES_SCRUBBING, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING table.
+@return 0 on success */
+static
+int
+i_s_tablespaces_scrubbing_fill_table(
+/*===========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+ bool found_space_0 = false;
+
+ DBUG_ENTER("i_s_tablespaces_scrubbing_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without SUPER_ACL privilege */
+ if (check_global_access(thd, SUPER_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
+
+ while (rec) {
+ const char* err_msg;
+ ulint space;
+ const char* name;
+ ulint flags;
+
+ /* Extract necessary information from a SYS_TABLESPACES row */
+ err_msg = dict_process_sys_tablespaces(
+ heap, rec, &space, &name, &flags);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (space == 0) {
+ found_space_0 = true;
+ }
+
+ if (!err_msg) {
+ i_s_dict_fill_tablespaces_scrubbing(
+ thd, space, name, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ if (found_space_0 == false) {
+		/* Space 0 does not, for whatever reason, show up in the
+		iteration above; add it manually. */
+ ulint space = 0;
+ const char* name = NULL;
+ i_s_dict_fill_tablespaces_scrubbing(
+ thd, space, name, tables->table);
+ }
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING
+@return 0 on success */
+static
+int
+innodb_tablespaces_scrubbing_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_tablespaces_scrubbing_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_tablespaces_scrubbing_fields_info;
+ schema->fill_table = i_s_tablespaces_scrubbing_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_tablespaces_scrubbing =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TABLESPACES_SCRUBBING"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Google Inc"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB TABLESPACES_SCRUBBING"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_BSD),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_tablespaces_scrubbing_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+/** INNODB_MUTEXES *********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES */
+static ST_FIELD_INFO innodb_mutexes_fields_info[] =
+{
+#define MUTEXES_NAME 0
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+#define MUTEXES_CREATE_FILE 1
+ {STRUCT_FLD(field_name, "CREATE_FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+#define MUTEXES_CREATE_LINE 2
+ {STRUCT_FLD(field_name, "CREATE_LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+#define MUTEXES_OS_WAITS 3
+ {STRUCT_FLD(field_name, "OS_WAITS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_MUTEXES table.
+Loop through each record in mutex and rw_lock lists, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_MUTEXES table.
+@return 0 on success */
+static
+int
+i_s_innodb_mutexes_fill_table(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+ ulint block_mutex_oswait_count = 0;
+ ulint block_lock_oswait_count = 0;
+ ib_mutex_t* block_mutex = NULL;
+ rw_lock_t* block_lock = NULL;
+ Field** fields = tables->table->field;
+
+ DBUG_ENTER("i_s_innodb_mutexes_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ mutex_enter(&mutex_list_mutex);
+
+ for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+ if (mutex->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_mutex(mutex)) {
+ block_mutex = mutex;
+ block_mutex_oswait_count += mutex->count_os_wait;
+ continue;
+ }
+
+ OK(field_store_string(fields[MUTEXES_NAME], mutex->cmutex_name));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE], innobase_basename(mutex->cfile_name)));
+ OK(field_store_ulint(fields[MUTEXES_CREATE_LINE], mutex->cline));
+ OK(field_store_ulint(fields[MUTEXES_OS_WAITS], (longlong)mutex->count_os_wait));
+ OK(schema_table_store_record(thd, tables->table));
+ }
+
+ if (block_mutex) {
+ char buf1[IO_SIZE];
+
+ my_snprintf(buf1, sizeof buf1, "combined %s",
+ innobase_basename(block_mutex->cfile_name));
+
+ OK(field_store_string(fields[MUTEXES_NAME], block_mutex->cmutex_name));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE], buf1));
+ OK(field_store_ulint(fields[MUTEXES_CREATE_LINE], block_mutex->cline));
+ OK(field_store_ulint(fields[MUTEXES_OS_WAITS], (longlong)block_mutex_oswait_count));
+ OK(schema_table_store_record(thd, tables->table));
+ }
+
+ mutex_exit(&mutex_list_mutex);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+ lock = UT_LIST_GET_NEXT(list, lock)) {
+ if (lock->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_lock(lock)) {
+ block_lock = lock;
+ block_lock_oswait_count += lock->count_os_wait;
+ continue;
+ }
+
+ OK(field_store_string(fields[MUTEXES_NAME], lock->lock_name));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE], innobase_basename(lock->cfile_name)));
+ OK(field_store_ulint(fields[MUTEXES_CREATE_LINE], lock->cline));
+ OK(field_store_ulint(fields[MUTEXES_OS_WAITS], (longlong)lock->count_os_wait));
+ OK(schema_table_store_record(thd, tables->table));
+ }
+
+ if (block_lock) {
+ char buf1[IO_SIZE];
+
+ my_snprintf(buf1, sizeof buf1, "combined %s",
+ innobase_basename(block_lock->cfile_name));
+
+ OK(field_store_string(fields[MUTEXES_NAME], block_lock->lock_name));
+ OK(field_store_string(fields[MUTEXES_CREATE_FILE], buf1));
+ OK(field_store_ulint(fields[MUTEXES_CREATE_LINE], block_lock->cline));
+ OK(field_store_ulint(fields[MUTEXES_OS_WAITS], (longlong)block_lock_oswait_count));
+ OK(schema_table_store_record(thd, tables->table));
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES
+@return 0 on success */
+static
+int
+innodb_mutexes_init(
+/*================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_mutexes_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_mutexes_fields_info;
+ schema->fill_table = i_s_innodb_mutexes_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_mutexes =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_MUTEXES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+	STRUCT_FLD(descr, "InnoDB MUTEXES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_mutexes_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
+};
+
+/** SYS_SEMAPHORE_WAITS ************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS */
+static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[] =
+{
+ // SYS_SEMAPHORE_WAITS_THREAD_ID 0
+ {STRUCT_FLD(field_name, "THREAD_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
+ {STRUCT_FLD(field_name, "OBJECT_NAME"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_FILE 2
+ {STRUCT_FLD(field_name, "FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LINE 3
+ {STRUCT_FLD(field_name, "LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_WAIT_TIME 4
+ {STRUCT_FLD(field_name, "WAIT_TIME"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
+ {STRUCT_FLD(field_name, "WAIT_OBJECT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
+ {STRUCT_FLD(field_name, "WAIT_TYPE"),
+ STRUCT_FLD(field_length, 16),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
+ {STRUCT_FLD(field_name, "HOLDER_THREAD_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
+ {STRUCT_FLD(field_name, "HOLDER_FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
+ {STRUCT_FLD(field_name, "HOLDER_LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_CREATED_FILE 10
+ {STRUCT_FLD(field_name, "CREATED_FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_CREATED_LINE 11
+ {STRUCT_FLD(field_name, "CREATED_LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
+ {STRUCT_FLD(field_name, "WRITER_THREAD"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
+ {STRUCT_FLD(field_name, "RESERVATION_MODE"),
+ STRUCT_FLD(field_length, 16),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_READERS 14
+ {STRUCT_FLD(field_name, "READERS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
+ {STRUCT_FLD(field_name, "WAITERS_FLAG"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LOCK_WORD 16
+ {STRUCT_FLD(field_name, "LOCK_WORD"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LAST_READER_FILE 17
+ {STRUCT_FLD(field_name, "LAST_READER_FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LAST_READER_LINE 18
+ {STRUCT_FLD(field_name, "LAST_READER_LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 19
+ {STRUCT_FLD(field_name, "LAST_WRITER_FILE"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 20
+ {STRUCT_FLD(field_name, "LAST_WRITER_LINE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ // SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 21
+ {STRUCT_FLD(field_name, "OS_WAIT_COUNT"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS
+@return 0 on success */
+static
+int
+innodb_sys_semaphore_waits_init(
+/*============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_semaphore_waits_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_semaphore_waits_fields_info;
+ schema->fill_table = sync_arr_fill_sys_semphore_waits_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_semaphore_waits =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_SEMAPHORE_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, maria_plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_SEMAPHORE_WAITS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_semaphore_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* Maria extension */
+ STRUCT_FLD(version_info, INNODB_VERSION_STR),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA),
};
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
index a2b324cb314..979d9d80a7f 100644
--- a/storage/innobase/handler/i_s.h
+++ b/storage/innobase/handler/i_s.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2015, MariaDB Corporation
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -21,12 +22,14 @@ this program; if not, write to the Free Software Foundation, Inc.,
InnoDB INFORMATION SCHEMA tables interface to MySQL.
Created July 18, 2007 Vasil Dimov
+Modified Dec 29, 2014 Jan Lindström
*******************************************************/
#ifndef i_s_h
#define i_s_h
const char plugin_author[] = "Oracle Corporation";
+const char maria_plugin_author[] = "MariaDB Corporation";
extern struct st_maria_plugin i_s_innodb_trx;
extern struct st_maria_plugin i_s_innodb_locks;
@@ -56,5 +59,95 @@ extern struct st_maria_plugin i_s_innodb_sys_foreign;
extern struct st_maria_plugin i_s_innodb_sys_foreign_cols;
extern struct st_maria_plugin i_s_innodb_sys_tablespaces;
extern struct st_maria_plugin i_s_innodb_sys_datafiles;
+extern struct st_maria_plugin i_s_innodb_mutexes;
+extern struct st_maria_plugin i_s_innodb_tablespaces_encryption;
+extern struct st_maria_plugin i_s_innodb_tablespaces_scrubbing;
+extern struct st_maria_plugin i_s_innodb_sys_semaphore_waits;
+
+/** maximum number of buffer page info we would cache. */
+#define MAX_BUF_INFO_CACHED 10000
+
+#define OK(expr) \
+ if ((expr) != 0) { \
+ DBUG_RETURN(1); \
+ }
+
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \
+do { \
+ if (!srv_was_started) { \
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \
+ ER_CANT_FIND_SYSTEM_REC, \
+ "InnoDB: SELECTing from " \
+ "INFORMATION_SCHEMA.%s but " \
+ "the InnoDB storage engine " \
+ "is not installed", plugin_name); \
+ DBUG_RETURN(0); \
+ } \
+} while (0)
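+
+/* Typical use of the two macros above in an I_S fill function (an
+illustrative sketch only; my_fill_table and its single stored column are
+hypothetical):
+
+	static int my_fill_table(THD* thd, TABLE_LIST* tables, Item*)
+	{
+		DBUG_ENTER("my_fill_table");
+		RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+		OK(field_store_string(tables->table->field[0], "example"));
+		OK(schema_table_store_record(thd, tables->table));
+
+		DBUG_RETURN(0);
+	}
+*/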
+
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER && !defined __clang__
+#ifdef HAVE_C99_INITIALIZERS
+#define STRUCT_FLD(name, value) .name = value
+#else
+#define STRUCT_FLD(name, value) name: value
+#endif /* HAVE_C99_INITIALIZERS */
+#else
+#define STRUCT_FLD(name, value) value
+#endif
+
+/* Don't use a static const variable here, as some C++ compilers (notably
+HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
+#define END_OF_ST_FIELD_INFO \
+ {STRUCT_FLD(field_name, NULL), \
+ STRUCT_FLD(field_length, 0), \
+ STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \
+ STRUCT_FLD(value, 0), \
+ STRUCT_FLD(field_flags, 0), \
+ STRUCT_FLD(old_name, ""), \
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}
+
+/** Fields on INFORMATION_SCHEMA.SYS_SEMAPHORE_WAITS table */
+#define SYS_SEMAPHORE_WAITS_THREAD_ID 0
+#define SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
+#define SYS_SEMAPHORE_WAITS_FILE 2
+#define SYS_SEMAPHORE_WAITS_LINE 3
+#define SYS_SEMAPHORE_WAITS_WAIT_TIME 4
+#define SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
+#define SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
+#define SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
+#define SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
+#define SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
+#define SYS_SEMAPHORE_WAITS_CREATED_FILE 10
+#define SYS_SEMAPHORE_WAITS_CREATED_LINE 11
+#define SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
+#define SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
+#define SYS_SEMAPHORE_WAITS_READERS 14
+#define SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
+#define SYS_SEMAPHORE_WAITS_LOCK_WORD 16
+#define SYS_SEMAPHORE_WAITS_LAST_READER_FILE 17
+#define SYS_SEMAPHORE_WAITS_LAST_READER_LINE 18
+#define SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 19
+#define SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 20
+#define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 21
+
+/*******************************************************************//**
+Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
+If the value is ULINT_UNDEFINED then the field is set to NULL.
+@return 0 on success */
+int
+field_store_ulint(
+/*==============*/
+ Field* field, /*!< in/out: target field for storage */
+ ulint n); /*!< in: value to store */
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+int
+field_store_string(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ const char* str); /*!< in: NUL-terminated utf-8 string,
+ or NULL */
#endif /* i_s_h */
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 305acf7e322..3cd44e3a39f 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -447,7 +448,7 @@ btr_root_raise_and_insert(
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr) /*!< in: mtr */
- __attribute__((nonnull, warn_unused_result));
+ __attribute__((nonnull(2,3,4,7), warn_unused_result));
/*************************************************************//**
Reorganizes an index page.
@@ -542,7 +543,7 @@ btr_page_split_and_insert(
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr) /*!< in: mtr */
- __attribute__((nonnull, warn_unused_result));
+ __attribute__((nonnull(2,3,4,7), warn_unused_result));
/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
@@ -671,6 +672,21 @@ btr_get_size(
is s-latched */
__attribute__((nonnull, warn_unused_result));
/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size_and_reserved(
+/*======================*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ __attribute__((nonnull));
+
+/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@@ -715,8 +731,36 @@ btr_page_free_low(
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: block to be freed, x-latched */
ulint level, /*!< in: page level */
+ bool blob, /*!< in: blob page */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_block(
+/*======================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
@@ -762,6 +806,60 @@ btr_validate_index(
const trx_t* trx) /*!< in: transaction or 0 */
__attribute__((nonnull(1), warn_unused_result));
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,index,mtr)
+#else /* UNIV_SYNC_DEBUG */
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,mtr)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Removes a page from the level list of pages. */
+UNIV_INTERN
+void
+btr_level_list_remove_func(
+/*=======================*/
+ ulint space, /*!< in: space where removed */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* page, /*!< in/out: page to remove */
+#ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree */
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.
+@return father block */
+UNIV_INTERN
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
#endif /* !UNIV_HOTBACKUP */
@@ -770,4 +868,8 @@ btr_validate_index(
#include "btr0btr.ic"
#endif
+/****************************************************************
+Global variable controlling whether scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
+
#endif
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 00f50b5dcaf..40b468b200a 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -163,9 +163,10 @@ btr_page_get_next(
/*!< in: mini-transaction handle */
{
ut_ad(page && mtr);
+#ifndef UNIV_INNOCHECKSUM
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
|| mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
-
+#endif /* UNIV_INNOCHECKSUM */
return(mach_read_from_4(page + FIL_PAGE_NEXT));
}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
new file mode 100644
index 00000000000..8fef3c6519a
--- /dev/null
+++ b/storage/innobase/include/btr0defragment.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "univ.i"
+
+#ifndef UNIV_HOTBACKUP
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES 32
+
+/** stats in btr_defragment */
+extern ulint btr_defragment_compression_failures;
+extern ulint btr_defragment_failures;
+extern ulint btr_defragment_count;
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+ btr_pcur_t* pcur; /* persistent cursor where
+ btr_defragment_n_pages should start */
+ os_event_t event; /* if not null, signal after work
+ is done */
+ bool removed; /* Mark an item as removed */
+	ulonglong	last_processed; /* timestamp of the last time this index
+					was processed by the defragment thread */
+
+ btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
+ ~btr_defragment_item_t();
+};
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+ dict_index_t* index); /*!< Index to find. */
+/******************************************************************//**
+Add an index to btr_defragment_wq. Return a pointer to os_event if this
+is a synchronized defragmentation. */
+os_event_t
+btr_defragment_add_index(
+ dict_index_t* index, /*!< index to be added */
+ bool async); /*!< whether this is an async defragmentation */
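+/* Illustrative sketch of a synchronous call (the caller shown here is an
+assumption and ownership of the returned event is simplified):
+
+	os_event_t	event = btr_defragment_add_index(index, false);
+	if (event) {
+		os_event_wait(event);	// wait until defragmentation is done
+	}
+*/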
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table as removed
+from btr_defragment_wq. The difference between this function and
+btr_defragment_remove_index() is that this one does not set the event to NULL. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< Table to be removed. */
+/******************************************************************//**
+Mark an index as removed from btr_defragment_wq. */
+void
+btr_defragment_remove_index(
+ dict_index_t* index); /*!< Index to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.*/
+UNIV_INTERN
+void
+btr_defragment_save_defrag_stats_if_needed(
+ dict_index_t* index); /*!< in: index */
+/******************************************************************//**
+Thread that merges consecutive b-tree pages into fewer pages to defragment
+the index. */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(btr_defragment_thread)(
+/*==========================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+
+#endif /* !UNIV_HOTBACKUP */
+#endif
diff --git a/storage/innobase/include/btr0scrub.h b/storage/innobase/include/btr0scrub.h
new file mode 100644
index 00000000000..608266c206d
--- /dev/null
+++ b/storage/innobase/include/btr0scrub.h
@@ -0,0 +1,166 @@
+// Copyright 2014 Google
+
+#ifndef btr0scrub_h
+#define btr0scrub_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+/**
+ * enum describing page allocation status
+ */
+enum btr_scrub_page_allocation_status_t {
+ BTR_SCRUB_PAGE_FREE,
+ BTR_SCRUB_PAGE_ALLOCATED,
+ BTR_SCRUB_PAGE_ALLOCATION_UNKNOWN
+};
+
+/**
+* constants returned by btr_page_needs_scrubbing & btr_scrub_recheck_page
+*/
+#define BTR_SCRUB_PAGE 1 /* page should be scrubbed */
+#define BTR_SCRUB_SKIP_PAGE 2 /* no scrub & no action */
+#define BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE 3 /* no scrub & close table */
+#define BTR_SCRUB_SKIP_PAGE_AND_COMPLETE_SPACE 4 /* no scrub & complete space */
+#define BTR_SCRUB_TURNED_OFF 5 /* we detected that scrubbing
+ was disabled by global
+ variable */
+
+/**************************************************************//**
+struct for keeping scrub statistics. */
+struct btr_scrub_stat_t {
+ /* page reorganizations */
+ ulint page_reorganizations;
+ /* page splits */
+ ulint page_splits;
+ /* scrub failures */
+ ulint page_split_failures_underflow;
+ ulint page_split_failures_out_of_filespace;
+ ulint page_split_failures_missing_index;
+ ulint page_split_failures_unknown;
+};
+
+/**************************************************************//**
+struct for thread local scrub state. */
+struct btr_scrub_t {
+
+ /* current space */
+ ulint space;
+
+ /* is scrubbing enabled for this space */
+ bool scrubbing;
+
+ /* is current space compressed */
+ bool compressed;
+
+ dict_table_t* current_table;
+ dict_index_t* current_index;
+ /* savepoint for X_LATCH of block */
+ ulint savepoint;
+
+ /* statistic counters */
+ btr_scrub_stat_t scrub_stat;
+};
+
+/*********************************************************************
+Init scrub global variables */
+UNIV_INTERN
+void
+btr_scrub_init();
+
+/*********************************************************************
+Cleanup scrub globals */
+UNIV_INTERN
+void
+btr_scrub_cleanup();
+
+/***********************************************************************
+Return crypt statistics */
+UNIV_INTERN
+void
+btr_scrub_total_stat(
+/*==================*/
+ btr_scrub_stat_t *stat); /*!< out: stats to update */
+
+/**************************************************************//**
+Check if a page needs scrubbing
+* @return BTR_SCRUB_PAGE if page should be scrubbed
+* else btr_scrub_skip_page should be called
+* with this return value (and without any latches held)
+*/
+UNIV_INTERN
+int
+btr_page_needs_scrubbing(
+/*=====================*/
+ btr_scrub_t* scrub_data, /*!< in: scrub data */
+ buf_block_t* block, /*!< in: block to check, latched */
+ btr_scrub_page_allocation_status_t allocated); /*!< in: is block
+ allocated, free or
+ unknown */
+
+/****************************************************************
+Recheck if a page needs scrubbing, and if it does load appropriate
+table and index
+* @return BTR_SCRUB_PAGE if page should be scrubbed
+* else btr_scrub_skip_page should be called
+* with this return value (and without any latches held)
+*/
+UNIV_INTERN
+int
+btr_scrub_recheck_page(
+/*====================*/
+	btr_scrub_t*	scrub_data,	/*!< in/out: scrub data */
+ buf_block_t* block, /*!< in: block */
+ btr_scrub_page_allocation_status_t allocated, /*!< in: is block
+ allocated or free */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************
+Perform actual scrubbing of page */
+UNIV_INTERN
+int
+btr_scrub_page(
+/*============*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ buf_block_t* block, /*!< in: block */
+ btr_scrub_page_allocation_status_t allocated, /*!< in: is block
+ allocated or free */
+ mtr_t* mtr); /*!< in: mtr */
+
+/****************************************************************
+Perform cleanup needed for a page not needing scrubbing */
+UNIV_INTERN
+void
+btr_scrub_skip_page(
+/*============*/
+ btr_scrub_t* scrub_data, /*!< in/out: scrub data */
+ int needs_scrubbing); /*!< in: return value from
+ btr_page_needs_scrubbing or
+ btr_scrub_recheck_page which encodes what kind
+ of cleanup is needed */
+
+/****************************************************************
+Start iterating a space
+* @return true if scrubbing is turned on */
+UNIV_INTERN
+bool
+btr_scrub_start_space(
+/*===================*/
+ ulint space, /*!< in: space */
+ btr_scrub_t* scrub_data); /*!< in/out: scrub data */
+
+/****************************************************************
+Complete iterating a space
+* @return true if space was scrubbed */
+UNIV_INTERN
+bool
+btr_scrub_complete_space(
+/*=====================*/
+ btr_scrub_t* scrub_data); /*!< in/out: scrub data */
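+
+/* Expected call sequence over one tablespace (an illustrative sketch; the
+driving loop lives in the background scrubbing code and is simplified here):
+
+	btr_scrub_t	scrub_data;
+
+	if (btr_scrub_start_space(space, &scrub_data)) {
+		for each page of the space {
+			int st = btr_page_needs_scrubbing(
+				&scrub_data, block, allocated);
+			if (st == BTR_SCRUB_PAGE) {
+				st = btr_scrub_recheck_page(
+					&scrub_data, block, allocated, &mtr);
+			}
+			if (st == BTR_SCRUB_PAGE) {
+				btr_scrub_page(&scrub_data, block,
+					       allocated, &mtr);
+			} else {
+				btr_scrub_skip_page(&scrub_data, st);
+			}
+		}
+		btr_scrub_complete_space(&scrub_data);
+	}
+*/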
+
+#endif
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 4b2556524fa..dda70a5cbe9 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -374,11 +375,13 @@ Given a tablespace id and page number tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for using when holding the lock_sys_t::mutex. */
UNIV_INTERN
-const buf_block_t*
+buf_block_t*
buf_page_try_get_func(
/*==================*/
ulint space_id,/*!< in: tablespace id */
ulint page_no,/*!< in: page number */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ bool possibly_freed, /*!< in: don't mind if page is freed */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */
@@ -390,7 +393,8 @@ not loaded. Suitable for using when holding the lock_sys_t::mutex.
@param mtr in: mini-transaction
@return the page if in buffer pool, NULL if not */
#define buf_page_try_get(space_id, page_no, mtr) \
- buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr);
+ buf_page_try_get_func(space_id, page_no, RW_S_LATCH, false, \
+ __FILE__, __LINE__, mtr);
/********************************************************************//**
Get read access to a compressed page (usually of type
@@ -1195,7 +1199,9 @@ UNIV_INTERN
bool
buf_page_io_complete(
/*=================*/
- buf_page_t* bpage); /*!< in: pointer to the block in question */
+ buf_page_t* bpage, /*!< in: pointer to the block in question */
+ bool evict = false);/*!< in: whether or not to evict
+ the page from LRU list. */
/********************************************************************//**
Calculates a folded value of a file page address to use in the page hash
table.
@@ -1435,6 +1441,76 @@ buf_flush_update_zip_checksum(
#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+The hook that is called just before a page is written to disk.
+The function encrypts the content of the page and returns a pointer
+to a frame that will be written instead of the real frame. */
+UNIV_INTERN
+byte*
+buf_page_encrypt_before_write(
+/*==========================*/
+ buf_page_t* page, /*!< in/out: buffer page to be flushed */
+ byte* frame, /*!< in: src frame */
+ ulint space_id); /*!< in: space id */
+
+/**********************************************************************
+The hook that is called after page is written to disk.
+The function releases any resources needed for encryption that were allocated
+in buf_page_encrypt_before_write */
+UNIV_INTERN
+ibool
+buf_page_encrypt_after_write(
+/*=========================*/
+ buf_page_t* page); /*!< in/out: buffer page that was flushed */
+
+/********************************************************************//**
+The hook that is called just before a page is read from disk.
+The function allocates memory that is used to temporarily store disk content
+before getting decrypted */
+UNIV_INTERN
+byte*
+buf_page_decrypt_before_read(
+/*=========================*/
+ buf_page_t* page, /*!< in/out: buffer page read from disk */
+ ulint zip_size); /*!< in: compressed page size, or 0 */
+
+/********************************************************************//**
+The hook that is called just after a page is read from disk.
+The function decrypts disk content into buf_page_t and releases the
+temporary buffer that was allocated in buf_page_decrypt_before_read */
+UNIV_INTERN
+ibool
+buf_page_decrypt_after_read(
+/*========================*/
+ buf_page_t* page); /*!< in/out: buffer page read from disk */
+
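+/* The four hooks above pair up around a single page I/O roughly as follows
+(an illustrative sketch; the surrounding fil/buf I/O code is simplified):
+
+	write path:
+		frame = buf_page_encrypt_before_write(bpage, frame, space_id);
+		... the returned frame is what gets written to disk ...
+		buf_page_encrypt_after_write(bpage);
+
+	read path:
+		dst = buf_page_decrypt_before_read(bpage, zip_size);
+		... disk content is read into dst ...
+		buf_page_decrypt_after_read(bpage);
+*/
+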
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+typedef struct {
+ bool reserved; /*!< true if this slot is reserved
+ */
+#ifdef HAVE_LZO
+	byte*		lzo_mem;	/*!< Temporary memory used by LZO */
+#endif
+ byte* crypt_buf; /*!< for encryption the data needs to be
+					copied to a separate buffer before it is
+					encrypted and written, because a page can
+					be read while it is being flushed */
+	byte*		crypt_buf_free; /*!< for encryption, allocated buffer
+					that is then aligned */
+ byte* comp_buf; /*!< for compression we need
+					a temporary buffer because a page
+					can be read while it is being flushed */
+	byte*		comp_buf_free;	/*!< for compression, allocated
+					buffer that is then aligned */
+ byte* out_buf; /*!< resulting buffer after
+ encryption/compression. This is a
+ pointer and not allocated. */
+} buf_tmp_buffer_t;
+
/** The common buffer control block structure
for compressed and uncompressed frames */
@@ -1499,7 +1575,23 @@ struct buf_page_t{
state == BUF_BLOCK_ZIP_PAGE and
zip.data == NULL means an active
buf_pool->watch */
-#ifndef UNIV_HOTBACKUP
+
+	ulint           write_size;	/* Write size is set when this
+					page is written for the first time;
+					on subsequent writes we check whether
+					a TRIM operation is needed. */
+
+ unsigned key_version; /*!< key version for this block */
+	ulint           real_size;	/*!< Real size of the page:
+					UNIV_PAGE_SIZE for normal pages;
+					for page compressed pages, the payload
+					size aligned to the sector boundary.
+					*/
+
+ buf_tmp_buffer_t* slot; /*!< Slot for temporary memory
+ used for encryption/compression
+ or NULL */
+#ifndef UNIV_HOTBACKUP
buf_page_t* hash; /*!< node used in chaining to
buf_pool->page_hash or
buf_pool->zip_hash */
@@ -1757,6 +1849,133 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */
#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
/* @} */
+/** A "Hazard Pointer" class used to iterate over page lists
+inside the buffer pool. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and which we want to remain valid
+even after we release the buffer pool mutex. */
+class HazardPointer {
+
+public:
+ /** Constructor
+ @param buf_pool buffer pool instance
+ @param mutex mutex that is protecting the hp. */
+ HazardPointer(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
+ :
+ m_buf_pool(buf_pool)
+#ifdef UNIV_DEBUG
+ , m_mutex(mutex)
+#endif /* UNIV_DEBUG */
+ , m_hp() {}
+
+ /** Destructor */
+ virtual ~HazardPointer() {}
+
+ /** Get current value */
+ buf_page_t* get()
+ {
+ ut_ad(mutex_own(m_mutex));
+ return(m_hp);
+ }
+
+ /** Set current value
+ @param bpage buffer block to be set as hp */
+ void set(buf_page_t* bpage);
+
+ /** Checks if a bpage is the hp
+ @param bpage buffer block to be compared
+ @return true if it is hp */
+ bool is_hp(const buf_page_t* bpage);
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list. Must be implemented
+ by the derived classes.
+ @param bpage buffer block to be compared */
+ virtual void adjust(const buf_page_t*) = 0;
+
+protected:
+ /** Disable copying */
+ HazardPointer(const HazardPointer&);
+ HazardPointer& operator=(const HazardPointer&);
+
+ /** Buffer pool instance */
+ const buf_pool_t* m_buf_pool;
+
+#ifdef UNIV_DEBUG
+ /** mutex that protects access to the m_hp. */
+ const ib_mutex_t* m_mutex;
+#endif /* UNIV_DEBUG */
+
+ /** hazard pointer. */
+ buf_page_t* m_hp;
+};
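+
+/* Typical scan pattern (an illustrative sketch; the flush_list names below
+match members declared further down in this file, but the loop body is
+simplified): the scanning thread records the page it intends to visit next
+in the hazard pointer before releasing the list mutex, and any thread that
+removes a page from the list calls adjust() so the stored pointer never
+dangles.
+
+	mutex_enter(&buf_pool->flush_list_mutex);
+
+	for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+	     bpage != NULL;
+	     bpage = buf_pool->flush_hp.get()) {
+
+		buf_pool->flush_hp.set(UT_LIST_GET_PREV(list, bpage));
+		mutex_exit(&buf_pool->flush_list_mutex);
+
+		... flush bpage without holding the list mutex ...
+
+		mutex_enter(&buf_pool->flush_list_mutex);
+	}
+
+	buf_pool->flush_hp.set(NULL);
+	mutex_exit(&buf_pool->flush_list_mutex);
+*/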
+
+/** Class implementing buf_pool->flush_list hazard pointer */
+class FlushHp: public HazardPointer {
+
+public:
+ /** Constructor
+ @param buf_pool buffer pool instance
+ @param mutex mutex that is protecting the hp. */
+ FlushHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
+ :
+ HazardPointer(buf_pool, mutex) {}
+
+ /** Destructor */
+ virtual ~FlushHp() {}
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ void adjust(const buf_page_t* bpage);
+};
+
+/** Class implementing buf_pool->LRU hazard pointer */
+class LRUHp: public HazardPointer {
+
+public:
+ /** Constructor
+ @param buf_pool buffer pool instance
+ @param mutex mutex that is protecting the hp. */
+ LRUHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
+ :
+ HazardPointer(buf_pool, mutex) {}
+
+ /** Destructor */
+ virtual ~LRUHp() {}
+
+ /** Adjust the value of hp. This happens when some
+ other thread working on the same list attempts to
+ remove the hp from the list.
+ @param bpage buffer block to be compared */
+ void adjust(const buf_page_t* bpage);
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes the scan it leaves the
+itr in that position and another thread can start its scan from
+there */
+class LRUItr: public LRUHp {
+
+public:
+ /** Constructor
+ @param buf_pool buffer pool instance
+ @param mutex mutex that is protecting the hp. */
+ LRUItr(const buf_pool_t* buf_pool, const ib_mutex_t* mutex)
+ :
+ LRUHp(buf_pool, mutex) {}
+
+ /** Destructor */
+ virtual ~LRUItr() {}
+
+ /** Selects from where to start a scan. If we have scanned
+ too deep into the LRU list it resets the value to the tail
+ of the LRU list.
+ @return buf_page_t from where to start scan. */
+ buf_page_t* start();
+};
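+
+/* Illustrative sketch of a resumable LRU scan using the iterator (the
+caller shown here is an assumption; a real scan also honours scan-depth
+limits):
+
+	for (buf_page_t* bpage = buf_pool->lru_scan_itr.start();
+	     bpage != NULL;
+	     bpage = buf_pool->lru_scan_itr.get()) {
+
+		buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
+		buf_pool->lru_scan_itr.set(prev);
+
+		... try to free bpage; the iterator keeps our position
+		for the next caller ...
+	}
+*/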
+
/** Struct that is embedded in the free zip blocks */
struct buf_buddy_free_t {
union {
@@ -1816,6 +2035,17 @@ struct buf_buddy_stat_t {
ib_uint64_t relocated_usec;
};
+/** @brief The temporary memory array structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+typedef struct {
+ ulint n_slots; /*!< Total number of slots */
+ buf_tmp_buffer_t *slots; /*!< Pointer to the slots in the
+ array */
+} buf_tmp_array_t;
+
/** @brief The buffer pool structure.
NOTE! The definition appears here only for other modules of this
@@ -1889,7 +2119,7 @@ struct buf_pool_t{
also protects writes to
bpage::oldest_modification and
flush_list_hp */
- const buf_page_t* flush_list_hp;/*!< "hazard pointer"
+ FlushHp flush_hp;/*!< "hazard pointer"
used during scan of flush_list
while doing flush list batch.
Protected by flush_list_mutex */
@@ -1947,6 +2177,19 @@ struct buf_pool_t{
UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free
block list */
+
+ /** "hazard pointer" used during scan of LRU while doing
+ LRU list batch. Protected by buf_pool::mutex */
+ LRUHp lru_hp;
+
+ /** Iterator used to scan the LRU list when searching for
+	replaceable victim. Protected by buf_pool::mutex. */
+ LRUItr lru_scan_itr;
+
+ /** Iterator used to scan the LRU list when searching for
+ single page flushing victim. Protected by buf_pool::mutex. */
+ LRUItr single_scan_itr;
+
UT_LIST_BASE_NODE_T(buf_page_t) LRU;
/*!< base node of the LRU list */
buf_page_t* LRU_old; /*!< pointer to the about
@@ -1986,6 +2229,10 @@ struct buf_pool_t{
pool watches. Protected by
buf_pool->mutex. */
+ buf_tmp_array_t* tmp_arr;
+					/*!< Array for temporary memory
+ used in compression and encryption */
+
#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
#endif
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index 56616c6deeb..6c128b097b0 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2014, 2015, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -275,7 +276,6 @@ buf_page_set_state(
}
#endif /* UNIV_DEBUG */
bpage->state = state;
- ut_ad(buf_page_get_state(bpage) == state);
}
/*********************************************************************//**
@@ -1457,4 +1457,24 @@ buf_get_nth_chunk_block(
*chunk_size = chunk->size;
return(chunk->blocks);
}
+
+/********************************************************************//**
+Get buf frame. */
+UNIV_INLINE
+void *
+buf_page_get_frame(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: buffer pool page */
+{
+	/* With encryption/compression the buffer pool page may contain an
+	extra buffer where the result is stored. */
+ if (bpage->slot && bpage->slot->out_buf) {
+ return bpage->slot->out_buf;
+ } else if (bpage->zip.data) {
+ return bpage->zip.data;
+ } else {
+ return ((buf_block_t*) bpage)->frame;
+ }
+}
+
#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index f116720574b..3ab3f7c308a 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2014, SkySQL Ab.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +37,17 @@ Created 11/5/1995 Heikki Tuuri
/** Flag indicating if the page_cleaner is in active state. */
extern ibool buf_page_cleaner_is_active;
+/** Event to synchronise with the flushing. */
+extern os_event_t buf_flush_event;
+
+/** Handled page counters for a single flush */
+struct flush_counters_t {
+ ulint flushed; /*!< number of dirty pages flushed */
+ ulint evicted; /*!< number of clean pages evicted */
+ ulint unzip_LRU_evicted;/*!< number of uncompressed page images
+ evicted */
+};
+
/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
@@ -110,12 +122,12 @@ buf_flush_list(
which were processed is passed
back to caller. Ignored if NULL */
/******************************************************************//**
-This function picks up a single dirty page from the tail of the LRU
-list, flushes it, removes it from page_hash and LRU list and puts
-it on the free list. It is called from user threads when they are
-unable to find a replacable page at the tail of the LRU list i.e.:
-when the background LRU flushing in the page_cleaner thread is not
-fast enough to keep pace with the workload.
+This function picks up a single page from the tail of the LRU
+list, flushes it (if it is dirty), removes it from page_hash and LRU
+list and puts it on the free list. It is called from user threads when
+they are unable to find a replaceable page at the tail of the LRU
+list i.e.: when the background LRU flushing in the page_cleaner thread
+is not fast enough to keep pace with the workload.
@return TRUE if success. */
UNIV_INTERN
ibool
@@ -279,6 +291,57 @@ buf_flush_get_dirty_pages_count(
#endif /* !UNIV_HOTBACKUP */
+/******************************************************************//**
+Start a buffer flush batch for LRU or flush list */
+ibool
+buf_flush_start(
+/*============*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+/******************************************************************//**
+End a buffer flush batch for LRU or flush list */
+void
+buf_flush_end(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+/******************************************************************//**
+Gather the aggregated stats for both flush list and LRU list flushing */
+void
+buf_flush_common(
+/*=============*/
+ buf_flush_t flush_type, /*!< in: type of flush */
+ ulint page_count); /*!< in: number of pages flushed */
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages! */
+__attribute__((nonnull))
+void
+buf_flush_batch(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST
+ all blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ flush_counters_t* n); /*!< out: flushed/evicted page
+ counts */
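+
+/* Rough call sequence for one flush batch (an illustrative sketch; the real
+driver also coordinates with the page_cleaner and multi-threaded flush):
+
+	if (buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
+		flush_counters_t	n;
+
+		buf_flush_batch(buf_pool, BUF_FLUSH_LIST,
+				min_n, lsn_limit, &n);
+		buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+		buf_flush_common(BUF_FLUSH_LIST, n.flushed);
+	}
+*/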
+
+
#ifndef UNIV_NONINL
#include "buf0flu.ic"
#endif
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index ecdaef685a1..f1f6abd2d68 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -117,7 +117,7 @@ buf_LRU_get_free_only(
buf_pool_t* buf_pool); /*!< buffer pool instance */
/******************************************************************//**
Returns a free block from the buf_pool. The block is taken off the
-free list. If it is empty, blocks are moved from the end of the
+free list. If the free list is empty, blocks are moved from the end of the
LRU list to the free list.
This function is called from a user thread when it needs a clean
block to read in a page. Note that we only ever get a block from
@@ -125,8 +125,6 @@ the free list. Even when we flush a page or find a page in LRU scan
we put it to free list to be used.
* iteration 0:
* get a block from free list, success:done
- * if there is an LRU flush batch in progress:
- * wait for batch to end: retry free list
* if buf_pool->try_LRU_scan is set
* scan LRU up to srv_LRU_scan_depth to find a clean block
* the above will put the block on free list
@@ -139,7 +137,7 @@ we put it to free list to be used.
* scan whole LRU list
* scan LRU list even if buf_pool->try_LRU_scan is not set
* iteration > 1:
- * same as iteration 1 but sleep 100ms
+ * same as iteration 1 but sleep 10ms
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
UNIV_INTERN
buf_block_t*
@@ -231,6 +229,15 @@ buf_LRU_free_one_page(
may or may not be a hash index to the page */
__attribute__((nonnull));
+/******************************************************************//**
+Adjust LRU hazard pointers if needed. */
+
+void
+buf_LRU_adjust_hp(
+/*==============*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_page_t* bpage); /*!< in: control block */
+
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************//**
Validates the LRU list.
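
The comment on buf_LRU_get_free_block above spells out the search order (free list, bounded LRU scan, single-page flush, whole-list scan, then a 10 ms back-off on later iterations). The sketch below is a self-contained model of that retry loop; the toy_* helpers are placeholders for the InnoDB calls, not the real ones.

// Standalone model of the buf_LRU_get_free_block() retry loop.
#include <chrono>
#include <cstdio>
#include <thread>

static bool toy_get_from_free_list()      { return false; }  // pretend the free list is empty
static bool toy_scan_lru_for_clean_block(bool whole_list) { return whole_list; }
static void toy_flush_single_page_from_lru() {}

int main()
{
	for (unsigned long n_iterations = 0; ; n_iterations++) {
		if (toy_get_from_free_list()) {
			break;				// done
		}
		if (n_iterations > 0) {
			// iteration 1 and later may flush a single dirty page
			toy_flush_single_page_from_lru();
		}
		// iteration 0: bounded scan; iteration >= 1: scan the whole list
		if (toy_scan_lru_for_clean_block(n_iterations > 0)) {
			break;				// block was put on the free list
		}
		if (n_iterations > 1) {
			// back off so the page_cleaner thread can make progress
			std::this_thread::sleep_for(std::chrono::milliseconds(10));
		}
	}
	printf("got a free block\n");
	return 0;
}
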
diff --git a/storage/innobase/include/buf0mtflu.h b/storage/innobase/include/buf0mtflu.h
new file mode 100644
index 00000000000..0475335bbf5
--- /dev/null
+++ b/storage/innobase/include/buf0mtflu.h
@@ -0,0 +1,95 @@
+/*****************************************************************************
+
+Copyright (C) 2014 SkySQL Ab. All Rights Reserved.
+Copyright (C) 2014 Fusion-io. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/buf0mtflu.h
+Multi-threaded flush method interface function prototypes
+
+Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com
+ Dhananjoy Das DDas@fusionio.com
+***********************************************************************/
+
+#ifndef buf0mtflu_h
+#define buf0mtflu_h
+
+/******************************************************************//**
+Add exit work item to work queue to signal multi-threaded flush
+threads that they should exit.
+*/
+void
+buf_mtflu_io_thread_exit(void);
+/*===========================*/
+
+/******************************************************************//**
+Initialize multi-threaded flush thread synchronization data.
+@return Initialized multi-threaded flush thread synchronization data. */
+void*
+buf_mtflu_handler_init(
+/*===================*/
+ ulint n_threads, /*!< in: Number of threads to create */
+ ulint wrk_cnt); /*!< in: Number of work items */
+
+/******************************************************************//**
+Return true if multi-threaded flush is initialized
+@return true if initialized, false if not */
+bool
+buf_mtflu_init_done(void);
+/*======================*/
+
+/*********************************************************************//**
+Clears up tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_mtflu_flush_LRU_tail(void);
+/*===========================*/
+
+/*******************************************************************//**
+Multi-threaded version of buf_flush_list
+*/
+bool
+buf_mtflu_flush_list(
+/*=================*/
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+				flushed (it is not guaranteed that the
+				actual number is that big, though) */
+	lsn_t		lsn_limit,	/*!< in: in the case of BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ ulint* n_processed); /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
+
+/*********************************************************************//**
+Set the correct thread identifiers in the I/O thread array based on
+the information we have. */
+void
+buf_mtflu_set_thread_ids(
+/*=====================*/
+ ulint n_threads, /*!<in: Number of threads to fill */
+ void* ctx, /*!<in: thread context */
+ os_thread_id_t* thread_ids); /*!<in: thread id array */
+
+#endif
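
buf0mtflu.h introduces a pool of flush worker threads that are fed through work queues and are told to terminate by posting an exit work item (buf_mtflu_io_thread_exit). The following standalone sketch models that shutdown pattern with C++11 primitives; the real code uses InnoDB's work queue and OS thread facilities, so this is only a conceptual stand-in.

// Conceptual model of the work-queue shutdown pattern: workers block on a
// queue of work items and terminate when they dequeue an EXIT item.
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

enum toy_wrk_type { TOY_WRK_FLUSH, TOY_WRK_EXIT };

struct toy_queue {
	std::mutex			mtx;
	std::condition_variable		cv;
	std::queue<toy_wrk_type>	items;

	void post(toy_wrk_type t) {
		std::lock_guard<std::mutex> lk(mtx);
		items.push(t);
		cv.notify_one();
	}
	toy_wrk_type wait() {
		std::unique_lock<std::mutex> lk(mtx);
		cv.wait(lk, [this] { return !items.empty(); });
		toy_wrk_type t = items.front();
		items.pop();
		return t;
	}
};

static void toy_flush_worker(toy_queue* wq, int id)
{
	for (;;) {
		toy_wrk_type t = wq->wait();
		if (t == TOY_WRK_EXIT) {
			printf("worker %d exiting\n", id);
			return;
		}
		/* ... flush the pages described by the work item ... */
	}
}

int main()
{
	const int		n_threads = 4;
	toy_queue		wq;
	std::vector<std::thread> workers;

	for (int i = 0; i < n_threads; i++) {
		workers.emplace_back(toy_flush_worker, &wq, i);
	}
	wq.post(TOY_WRK_FLUSH);			/* normal work */
	for (int i = 0; i < n_threads; i++) {	/* one EXIT item per thread */
		wq.post(TOY_WRK_EXIT);
	}
	for (std::thread& t : workers) {
		t.join();
	}
	return 0;
}
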
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 9e007809471..820353a0c93 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri
#include "ut0byte.h"
#include "trx0types.h"
#include "row0types.h"
+#include "fsp0fsp.h"
+#include "dict0pagecompress.h"
extern bool innodb_table_stats_not_found;
extern bool innodb_index_stats_not_found;
@@ -120,7 +123,9 @@ enum dict_table_op_t {
DICT_TABLE_OP_DROP_ORPHAN,
/** Silently load the tablespace if it does not exist,
and do not load the definitions of incomplete indexes. */
- DICT_TABLE_OP_LOAD_TABLESPACE
+ DICT_TABLE_OP_LOAD_TABLESPACE,
+ /** Open the table only if it's in table cache. */
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
};
/**********************************************************************//**
@@ -134,6 +139,17 @@ dict_table_open_on_id(
ibool dict_locked, /*!< in: TRUE=data dictionary locked */
dict_table_op_t table_op) /*!< in: operation to perform */
__attribute__((warn_unused_result));
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_index_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ bool dict_locked) /*!< in: TRUE=data dictionary locked */
+ __attribute__((warn_unused_result));
/********************************************************************//**
Decrements the count of open handles to a table. */
UNIV_INTERN
@@ -918,7 +934,14 @@ dict_tf_set(
ulint* flags, /*!< in/out: table */
rec_format_t format, /*!< in: file format */
ulint zip_ssize, /*!< in: zip shift size */
- bool remote_path) /*!< in: table uses DATA DIRECTORY */
+	bool		remote_path,	/*!< in: table uses
+					DATA DIRECTORY */
+ bool page_compressed,/*!< in: table uses page compressed
+ pages */
+ ulint page_compression_level, /*!< in: table page compression
+ level */
+ ulint atomic_writes) /*!< in: table atomic
+ writes option value*/
__attribute__((nonnull));
/********************************************************************//**
Convert a 32 bit integer table flags to the 32 bit integer that is
@@ -946,6 +969,7 @@ dict_tf_get_zip_size(
/*=================*/
ulint flags) /*!< in: flags */
__attribute__((const));
+
/********************************************************************//**
Check whether the table uses the compressed compact page format.
@return compressed page size, or 0 if not compressed */
@@ -1157,8 +1181,9 @@ ulint
dict_index_get_nth_col_pos(
/*=======================*/
const dict_index_t* index, /*!< in: index */
- ulint n) /*!< in: column number */
- __attribute__((nonnull, warn_unused_result));
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ __attribute__((nonnull(1), warn_unused_result));
/********************************************************************//**
Looks for column n in an index.
@return position in internal representation of the index;
@@ -1169,9 +1194,11 @@ dict_index_get_nth_col_or_prefix_pos(
/*=================================*/
const dict_index_t* index, /*!< in: index */
ulint n, /*!< in: column number */
- ibool inc_prefix) /*!< in: TRUE=consider
+ ibool inc_prefix, /*!< in: TRUE=consider
column prefixes too */
- __attribute__((nonnull, warn_unused_result));
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+
+ __attribute__((nonnull(1), warn_unused_result));
/********************************************************************//**
Returns TRUE if the index contains a column or a prefix of that column.
@return TRUE if contains the column or its prefix */
@@ -1441,8 +1468,12 @@ dict_index_calc_min_rec_len(
Reserves the dictionary system mutex for MySQL. */
UNIV_INTERN
void
-dict_mutex_enter_for_mysql(void);
+dict_mutex_enter_for_mysql_func(const char * file, ulint line);
/*============================*/
+
+#define dict_mutex_enter_for_mysql() \
+ dict_mutex_enter_for_mysql_func(__FILE__, __LINE__)
+
/********************************************************************//**
Releases the dictionary system mutex for MySQL. */
UNIV_INTERN
@@ -1521,6 +1552,16 @@ dict_table_get_index_on_name(
const char* name) /*!< in: name of the index to find */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return index or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+ const dict_table_t* table, /*!< in: table instance */
+ index_id_t id) /*!< in: index id */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
In case there is more than one index with the same name return the index
with the min(id).
@return index, NULL if does not exist */
@@ -1848,6 +1889,7 @@ dict_table_get_index_on_first_col(
#endif /* !UNIV_HOTBACKUP */
+
#ifndef UNIV_NONINL
#include "dict0dict.ic"
#endif
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
index 066ffe47e4a..dd42b478c1f 100644
--- a/storage/innobase/include/dict0dict.ic
+++ b/storage/innobase/include/dict0dict.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -537,9 +538,25 @@ dict_tf_is_valid(
ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
ulint unused = DICT_TF_GET_UNUSED(flags);
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+ ulint data_dir = DICT_TF_HAS_DATA_DIR(flags);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags);
/* Make sure there are no bits that we do not know about. */
if (unused != 0) {
+ fprintf(stderr,
+ "InnoDB: Error: table unused flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ unused,
+ compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
return(false);
@@ -550,12 +567,34 @@ dict_tf_is_valid(
data stored off-page in the clustered index. */
if (!compact) {
+ fprintf(stderr,
+ "InnoDB: Error: table compact flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ compact, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
return(false);
}
} else if (zip_ssize) {
/* Antelope does not support COMPRESSED row format. */
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
return(false);
}
@@ -568,6 +607,58 @@ dict_tf_is_valid(
|| !atomic_blobs
|| zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ fprintf(stderr,
+ "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags,
+ compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+
+ );
+ return(false);
+ }
+ }
+
+ if (page_compression || page_compression_level) {
+ /* Page compression format must have compact and
+ atomic_blobs and page_compression_level requires
+ page_compression */
+ if (!compact
+ || !page_compression
+ || !atomic_blobs) {
+
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
+ return(false);
+ }
+ }
+
+ if (atomic_writes) {
+
+ if(atomic_writes > ATOMIC_WRITES_OFF) {
+
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
return(false);
}
}
@@ -594,6 +685,11 @@ dict_sys_tables_type_validate(
ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type);
ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type);
ulint unused = DICT_TF_GET_UNUSED(type);
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type);
+
+ ut_a(atomic_writes <= ATOMIC_WRITES_OFF);
/* The low order bit of SYS_TABLES.TYPE is always set to 1.
If the format is UNIV_FORMAT_B or higher, this field is the same
@@ -604,12 +700,16 @@ dict_sys_tables_type_validate(
if (redundant) {
if (zip_ssize || atomic_blobs) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n",
+ zip_ssize, atomic_blobs);
return(ULINT_UNDEFINED);
}
}
/* Make sure there are no bits that we do not know about. */
if (unused) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n",
+ type, unused);
return(ULINT_UNDEFINED);
}
@@ -624,6 +724,8 @@ dict_sys_tables_type_validate(
} else if (zip_ssize) {
/* Antelope does not support COMPRESSED format. */
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n",
+ type, zip_ssize);
return(ULINT_UNDEFINED);
}
@@ -633,11 +735,15 @@ dict_sys_tables_type_validate(
should be in N_COLS, but we already know about the
low_order_bit and DICT_N_COLS_COMPACT flags. */
if (!atomic_blobs) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n",
+ type, zip_ssize, atomic_blobs);
return(ULINT_UNDEFINED);
}
/* Validate that the number is within allowed range. */
if (zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n",
+ type, zip_ssize, PAGE_ZIP_SSIZE_MAX);
return(ULINT_UNDEFINED);
}
}
@@ -647,6 +753,27 @@ dict_sys_tables_type_validate(
format, so the DATA_DIR flag is compatible with any other
table flags. However, it is not used with TEMPORARY tables.*/
+ if (page_compression || page_compression_level) {
+ /* page compressed row format must have low_order_bit and
+ atomic_blobs bits set and the DICT_N_COLS_COMPACT flag
+ should be in N_COLS, but we already know about the
+ low_order_bit and DICT_N_COLS_COMPACT flags. */
+
+ if (!atomic_blobs || !page_compression) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n"
+ "InnoDB: Error: atomic_blobs %lu\n",
+ type, page_compression, page_compression_level, atomic_blobs);
+ return(ULINT_UNDEFINED);
+ }
+ }
+
+ /* Validate that the atomic writes number is within allowed range. */
+ if (atomic_writes > ATOMIC_WRITES_OFF) {
+ fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n",
+ type, atomic_writes);
+ return(ULINT_UNDEFINED);
+ }
+
/* Return the validated SYS_TABLES.TYPE. */
return(type);
}
@@ -719,8 +846,16 @@ dict_tf_set(
ulint* flags, /*!< in/out: table flags */
rec_format_t format, /*!< in: file format */
ulint zip_ssize, /*!< in: zip shift size */
- bool use_data_dir) /*!< in: table uses DATA DIRECTORY */
+	bool		use_data_dir,	/*!< in: table uses
+					DATA DIRECTORY */
+ bool page_compressed,/*!< in: table uses page compressed
+ pages */
+ ulint page_compression_level, /*!< in: table page compression
+ level */
+ ulint atomic_writes) /*!< in: table atomic writes setup */
{
+ atomic_writes_t awrites = (atomic_writes_t)atomic_writes;
+
switch (format) {
case REC_FORMAT_REDUNDANT:
*flags = 0;
@@ -742,6 +877,19 @@ dict_tf_set(
break;
}
+ if (page_compressed) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+
+ *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES);
+ ut_a(dict_tf_get_atomic_writes(*flags) == awrites);
+
if (use_data_dir) {
*flags |= (1 << DICT_TF_POS_DATA_DIR);
}
@@ -765,6 +913,9 @@ dict_tf_to_fsp_flags(
ulint table_flags) /*!< in: dict_table_t::flags */
{
ulint fsp_flags;
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags);
DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
return(ULINT_UNDEFINED););
@@ -783,7 +934,20 @@ dict_tf_to_fsp_flags(
fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags)
? FSP_FLAGS_MASK_DATA_DIR : 0;
+ /* In addition, tablespace flags also contain if the page
+ compression is used for this table. */
+ fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression);
+
+ /* In addition, tablespace flags also contain page compression level
+ if page compression is used for this table. */
+ fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level);
+
+ /* In addition, tablespace flags also contain flag if atomic writes
+ is used for this table */
+ fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes);
+
ut_a(fsp_flags_is_valid(fsp_flags));
+ ut_a(dict_tf_verify_flags(table_flags, fsp_flags));
return(fsp_flags);
}
@@ -811,10 +975,16 @@ dict_sys_tables_type_to_tf(
/* Adjust bit zero. */
flags = redundant ? 0 : 1;
- /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */
flags |= type & (DICT_TF_MASK_ZIP_SSIZE
| DICT_TF_MASK_ATOMIC_BLOBS
- | DICT_TF_MASK_DATA_DIR);
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_ATOMIC_WRITES
+
+ );
return(flags);
}
@@ -842,10 +1012,14 @@ dict_tf_to_sys_tables_type(
/* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
type = 1;
- /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */
type |= flags & (DICT_TF_MASK_ZIP_SSIZE
| DICT_TF_MASK_ATOMIC_BLOBS
- | DICT_TF_MASK_DATA_DIR);
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_ATOMIC_WRITES);
return(type);
}
@@ -1048,7 +1222,8 @@ dict_index_get_sys_col_pos(
}
return(dict_index_get_nth_col_pos(
- index, dict_table_get_sys_col_no(index->table, type)));
+ index, dict_table_get_sys_col_no(index->table, type),
+ NULL));
}
/*********************************************************************//**
@@ -1100,9 +1275,11 @@ ulint
dict_index_get_nth_col_pos(
/*=======================*/
const dict_index_t* index, /*!< in: index */
- ulint n) /*!< in: column number */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
{
- return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE,
+ prefix_col_pos));
}
#ifndef UNIV_HOTBACKUP
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 5f6811f0719..24db728ae08 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -125,11 +126,34 @@ This flag prevents older engines from attempting to open the table and
allows InnoDB to update_create_info() accordingly. */
#define DICT_TF_WIDTH_DATA_DIR 1
+/**
+Width of the page compression flag and the page compression level
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+Width of the page encryption flag and the page encryption key
+*/
+#define DICT_TF_WIDTH_PAGE_ENCRYPTION 1
+#define DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY 8
+
+/**
+Width of atomic writes flag
+DEFAULT=0, ON = 1, OFF = 2
+*/
+#define DICT_TF_WIDTH_ATOMIC_WRITES 2
+
/** Width of all the currently known table flags */
#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ DICT_TF_WIDTH_ZIP_SSIZE \
+ DICT_TF_WIDTH_ATOMIC_BLOBS \
- + DICT_TF_WIDTH_DATA_DIR)
+ + DICT_TF_WIDTH_DATA_DIR \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_ATOMIC_WRITES \
+ + DICT_TF_WIDTH_PAGE_ENCRYPTION \
+ + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)
/** A mask of all the known/used bits in table flags */
#define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS))
@@ -145,9 +169,23 @@ allows InnoDB to update_create_info() accordingly. */
/** Zero relative shift position of the DATA_DIR field */
#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ DICT_TF_WIDTH_ATOMIC_BLOBS)
-/** Zero relative shift position of the start of the UNUSED bits */
-#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \
- + DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+/** Zero relative shift position of the PAGE_ENCRYPTION field */
+#define DICT_TF_POS_PAGE_ENCRYPTION (DICT_TF_POS_ATOMIC_WRITES \
+ + DICT_TF_WIDTH_ATOMIC_WRITES)
+/** Zero relative shift position of the PAGE_ENCRYPTION_KEY field */
+#define DICT_TF_POS_PAGE_ENCRYPTION_KEY (DICT_TF_POS_PAGE_ENCRYPTION \
+ + DICT_TF_WIDTH_PAGE_ENCRYPTION)
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_PAGE_ENCRYPTION_KEY \
+ + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)
/** Bit mask of the COMPACT field */
#define DICT_TF_MASK_COMPACT \
@@ -165,6 +203,26 @@ allows InnoDB to update_create_info() accordingly. */
#define DICT_TF_MASK_DATA_DIR \
((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \
<< DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+ << DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the ATOMIC_WRITES field */
+#define DICT_TF_MASK_ATOMIC_WRITES \
+ ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \
+ << DICT_TF_POS_ATOMIC_WRITES)
+/** Bit mask of the PAGE_ENCRYPTION field */
+#define DICT_TF_MASK_PAGE_ENCRYPTION \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_ENCRYPTION)) \
+ << DICT_TF_POS_PAGE_ENCRYPTION)
+/** Bit mask of the PAGE_ENCRYPTION_KEY field */
+#define DICT_TF_MASK_PAGE_ENCRYPTION_KEY \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)) \
+ << DICT_TF_POS_PAGE_ENCRYPTION_KEY)
/** Return the value of the COMPACT field */
#define DICT_TF_GET_COMPACT(flags) \
@@ -182,6 +240,27 @@ allows InnoDB to update_create_info() accordingly. */
#define DICT_TF_HAS_DATA_DIR(flags) \
((flags & DICT_TF_MASK_DATA_DIR) \
>> DICT_TF_POS_DATA_DIR)
+/** Return the value of the PAGE_COMPRESSION field */
+#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
+ >> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
+ >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Return the value of the ATOMIC_WRITES field */
+#define DICT_TF_GET_ATOMIC_WRITES(flags) \
+ ((flags & DICT_TF_MASK_ATOMIC_WRITES) \
+ >> DICT_TF_POS_ATOMIC_WRITES)
+/** Return the contents of the PAGE_ENCRYPTION field */
+#define DICT_TF_GET_PAGE_ENCRYPTION(flags) \
+ ((flags & DICT_TF_MASK_PAGE_ENCRYPTION) \
+ >> DICT_TF_POS_PAGE_ENCRYPTION)
+/** Return the contents of the PAGE_ENCRYPTION KEY field */
+#define DICT_TF_GET_PAGE_ENCRYPTION_KEY(flags) \
+ ((flags & DICT_TF_MASK_PAGE_ENCRYPTION_KEY) \
+ >> DICT_TF_POS_PAGE_ENCRYPTION_KEY)
+
/** Return the contents of the UNUSED bits */
#define DICT_TF_GET_UNUSED(flags) \
(flags >> DICT_TF_POS_UNUSED)
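
The new PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES, PAGE_ENCRYPTION and PAGE_ENCRYPTION_KEY fields follow the same WIDTH/POS/MASK pattern as the existing table flags. The standalone sketch below reproduces that arithmetic for the compression and atomic-writes fields and packs a flags word the way dict_tf_set() does for a page-compressed table; it is a reading aid for the macros, not the header itself.

// Self-contained illustration of the WIDTH/POS/MASK scheme: each field
// sits immediately after the previous one, its mask is derived from its
// width, and GET shifts the masked value back down.
#include <cassert>
#include <cstdio>

typedef unsigned long ulint;

static const ulint WIDTH_COMPACT			= 1;
static const ulint WIDTH_ZIP_SSIZE			= 4;
static const ulint WIDTH_ATOMIC_BLOBS			= 1;
static const ulint WIDTH_DATA_DIR			= 1;
static const ulint WIDTH_PAGE_COMPRESSION		= 1;
static const ulint WIDTH_PAGE_COMPRESSION_LEVEL	= 4;
static const ulint WIDTH_ATOMIC_WRITES			= 2;

static const ulint POS_COMPACT			= 0;
static const ulint POS_ZIP_SSIZE		= POS_COMPACT + WIDTH_COMPACT;
static const ulint POS_ATOMIC_BLOBS		= POS_ZIP_SSIZE + WIDTH_ZIP_SSIZE;
static const ulint POS_DATA_DIR			= POS_ATOMIC_BLOBS + WIDTH_ATOMIC_BLOBS;
static const ulint POS_PAGE_COMPRESSION		= POS_DATA_DIR + WIDTH_DATA_DIR;
static const ulint POS_PAGE_COMPRESSION_LEVEL	= POS_PAGE_COMPRESSION + WIDTH_PAGE_COMPRESSION;
static const ulint POS_ATOMIC_WRITES		= POS_PAGE_COMPRESSION_LEVEL + WIDTH_PAGE_COMPRESSION_LEVEL;

static ulint field_mask(ulint width, ulint pos) { return ((1UL << width) - 1) << pos; }
static ulint field_get(ulint flags, ulint width, ulint pos) { return (flags & field_mask(width, pos)) >> pos; }

int main()
{
	/* Pack flags the way dict_tf_set() does for a COMPACT,
	page-compressed table with compression level 6. */
	ulint flags = 1UL << POS_COMPACT;
	flags |= (1UL << POS_ATOMIC_BLOBS)
		| (1UL << POS_PAGE_COMPRESSION)
		| (6UL << POS_PAGE_COMPRESSION_LEVEL);

	assert(field_get(flags, WIDTH_PAGE_COMPRESSION, POS_PAGE_COMPRESSION) == 1);
	assert(field_get(flags, WIDTH_PAGE_COMPRESSION_LEVEL, POS_PAGE_COMPRESSION_LEVEL) == 6);
	assert(field_get(flags, WIDTH_ATOMIC_WRITES, POS_ATOMIC_WRITES) == 0);	/* DEFAULT */

	printf("flags = 0x%lx\n", flags);
	return 0;
}
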
@@ -493,6 +572,9 @@ be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
/** Defines the maximum fixed length column size */
#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
/** Data structure for a field in an index */
struct dict_field_t{
@@ -567,6 +649,10 @@ struct zip_pad_info_t {
/*!< Creation state of mutex member */
};
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
+
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t{
@@ -658,6 +744,23 @@ struct dict_index_t{
/*!< has persistent statistics error printed
for this index ? */
/* @} */
+ /** Statistics for defragmentation, these numbers are estimations and
+ could be very inaccurate at certain times, e.g. right after restart,
+ during defragmentation, etc. */
+ /* @{ */
+ ulint stat_defrag_modified_counter;
+ ulint stat_defrag_n_pages_freed;
+ /* number of pages freed by defragmentation. */
+ ulint stat_defrag_n_page_split;
+ /* number of page splits since last full index
+ defragmentation. */
+ ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+ /* data size when compression failure happened
+ the most recent 10 times. */
+ ulint stat_defrag_sample_next_slot;
+ /* in which slot the next sample should be
+ saved. */
+ /* @} */
rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
trx_id_t trx_id; /*!< id of the transaction that created this
@@ -1120,20 +1223,29 @@ struct dict_table_t{
calculation; this counter is not protected by
any latch, because this is only used for
heuristics */
-#define BG_STAT_NONE 0
-#define BG_STAT_IN_PROGRESS (1 << 0)
+
+#define BG_STAT_IN_PROGRESS ((byte)(1 << 0))
/*!< BG_STAT_IN_PROGRESS is set in
stats_bg_flag when the background
stats code is working on this table. The DROP
TABLE code waits for this to be cleared
before proceeding. */
-#define BG_STAT_SHOULD_QUIT (1 << 1)
+#define BG_STAT_SHOULD_QUIT ((byte)(1 << 1))
/*!< BG_STAT_SHOULD_QUIT is set in
stats_bg_flag when DROP TABLE starts
waiting on BG_STAT_IN_PROGRESS to be cleared,
the background stats thread will detect this
and will eventually quit sooner */
- byte stats_bg_flag;
+#define BG_SCRUB_IN_PROGRESS ((byte)(1 << 2))
+ /*!< BG_SCRUB_IN_PROGRESS is set in
+ stats_bg_flag when the background
+ scrub code is working on this table. The DROP
+ TABLE code waits for this to be cleared
+ before proceeding. */
+
+#define BG_IN_PROGRESS (BG_STAT_IN_PROGRESS | BG_SCRUB_IN_PROGRESS)
+
+ byte stats_bg_flag;
/*!< see BG_STAT_* above.
Writes are covered by dict_sys->mutex.
Dirty reads are possible. */
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000000..19a2a6c52f3
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef dict0pagecompress_h
+#define dict0pagecompress_h
+
+/********************************************************************//**
+Extract the page compression level from table flags.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Extract the page compression flag from table flags
+@return page compression flag, or false if not compressed */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*==========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((const));
+
+/********************************************************************//**
+Verify that dictionary flags match tablespace flags
+@return true if flags match, false if not */
+UNIV_INLINE
+ibool
+dict_tf_verify_flags(
+/*=================*/
+ ulint table_flags, /*!< in: dict_table_t::flags */
+ ulint fsp_flags) /*!< in: fil_space_t::flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Extract the atomic writes flag from table flags.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_tf_get_atomic_writes(
+/*======================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Check whether the table uses atomic writes.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_table_get_atomic_writes(
+/*=========================*/
+ const dict_table_t* table); /*!< in: table */
+
+
+#ifndef UNIV_NONINL
+#include "dict0pagecompress.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic
new file mode 100644
index 00000000000..811976434a8
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.ic
@@ -0,0 +1,191 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.ic
+Inline implementation for helper functions for extracting/storing
+page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Verify that dictionary flags match tablespace flags
+@return true if flags match, false if not */
+UNIV_INLINE
+ibool
+dict_tf_verify_flags(
+/*=================*/
+ ulint table_flags, /*!< in: dict_table_t::flags */
+ ulint fsp_flags) /*!< in: fil_space_t::flags */
+{
+ ulint table_unused = DICT_TF_GET_UNUSED(table_flags);
+ ulint compact = DICT_TF_GET_COMPACT(table_flags);
+ ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags);
+ ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags);
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags);
+ ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags);
+ ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags);
+ ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags);
+ ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags);
+ ulint fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags);
+ ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags);
+ ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags);
+ ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags);
+
+ DBUG_EXECUTE_IF("dict_tf_verify_flags_failure",
+ return(ULINT_UNDEFINED););
+
+ ut_a(!table_unused);
+ ut_a(!fsp_unused);
+ ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */
+ ut_a(compact == 0 || compact == 1); /* silence compiler */
+ ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */
+ ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */
+
+ if (ssize != zip_ssize) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags has zip_ssize %ld"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file has zip_ssize %ld\n",
+ ssize, zip_ssize);
+ return (FALSE);
+ }
+ if (atomic_blobs != fsp_atomic_blobs) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags has atomic_blobs %ld"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file has atomic_blobs %ld\n",
+ atomic_blobs, fsp_atomic_blobs);
+
+ return (FALSE);
+ }
+ if (page_compression != fsp_page_compression) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags has page_compression %ld"
+ " in the data dictionary\n"
+			"InnoDB: but the flags in file has page_compression %ld\n",
+ page_compression, fsp_page_compression);
+
+ return (FALSE);
+ }
+ if (page_compression_level != fsp_page_compression_level) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags has page_compression_level %ld"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file has page_compression_level %ld\n",
+ page_compression_level, fsp_page_compression_level);
+
+ return (FALSE);
+ }
+
+ if (atomic_writes != fsp_atomic_writes) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags has atomic writes %ld"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file has atomic_writes %ld\n",
+ atomic_writes, fsp_atomic_writes);
+
+ return (FALSE);
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Extract the page compression level from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+ ut_ad(page_compression_level <= 9);
+
+ return(page_compression_level);
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(dict_tf_get_page_compression(table->flags));
+
+ return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+{
+ return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ return (dict_tf_get_page_compression(table->flags));
+}
+
+/********************************************************************//**
+Extract the atomic writes flag from table flags.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_tf_get_atomic_writes(
+/*======================*/
+ ulint flags) /*!< in: flags */
+{
+ return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses atomic writes.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_table_get_atomic_writes(
+/*=========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags));
+}
diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h
index 9a3c8e22992..e034662aba0 100644
--- a/storage/innobase/include/dict0priv.h
+++ b/storage/innobase/include/dict0priv.h
@@ -53,8 +53,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*=====================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache);
#ifndef UNIV_NONINL
#include "dict0priv.ic"
diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic
index 30ba8fb60aa..983218af78a 100644
--- a/storage/innobase/include/dict0priv.ic
+++ b/storage/innobase/include/dict0priv.ic
@@ -74,8 +74,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*======================*/
table_id_t table_id, /*!< in: table id */
- dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
+ ibool open_only_if_in_cache)
{
dict_table_t* table;
ulint fold;
@@ -88,7 +89,7 @@ dict_table_open_on_id_low(
HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
dict_table_t*, table, ut_ad(table->cached),
table->id == table_id);
- if (table == NULL) {
+ if (table == NULL && !open_only_if_in_cache) {
table = dict_load_table_on_id(table_id, ignore_err);
}
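
The new open_only_if_in_cache argument makes dict_table_open_on_id_low() skip the dictionary load when the table is not already cached, which is presumably what the DICT_TABLE_OP_OPEN_ONLY_IF_CACHED value added to dict0dict.h maps to. A minimal standalone model of that branch follows; std::unordered_map and toy_load() stand in for the dict_sys hash table and dict_load_table_on_id().

// Minimal model of the open_only_if_in_cache branch.
#include <cstdio>
#include <string>
#include <unordered_map>

typedef unsigned long long table_id_t;

static std::unordered_map<table_id_t, std::string> cache;

static const std::string* toy_load(table_id_t id)
{
	/* pretend we loaded the definition from SYS_TABLES */
	cache[id] = "loaded_table";
	return &cache[id];
}

static const std::string* toy_open_on_id_low(table_id_t id, bool open_only_if_in_cache)
{
	std::unordered_map<table_id_t, std::string>::iterator it = cache.find(id);

	if (it != cache.end()) {
		return &it->second;
	}
	if (open_only_if_in_cache) {
		return NULL;		/* not cached: do not touch the dictionary */
	}
	return toy_load(id);
}

int main()
{
	printf("%p\n", (const void*) toy_open_on_id_low(42, true));	/* NULL   */
	printf("%p\n", (const void*) toy_open_on_id_low(42, false));	/* loaded */
	return 0;
}
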
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 186f90e3694..abf56b2f0c7 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -195,6 +195,39 @@ dict_stats_rename_table(
is returned */
size_t errstr_sz); /*!< in: errstr size */
+/*********************************************************************//**
+Save defragmentation result.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_summary(
+ dict_index_t* index); /*!< in: index */
+
+/*********************************************************************//**
+Save defragmentation stats for a given index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_save_defrag_stats(
+ dict_index_t* index); /*!< in: index */
+
+/**********************************************************************//**
+Clear defragmentation summary. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_summary(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+/**********************************************************************//**
+Clear defragmentation related index stats. */
+UNIV_INTERN
+void
+dict_stats_empty_defrag_stats(
+/*==================*/
+ dict_index_t* index); /*!< in: index to clear defragmentation stats */
+
+
#ifndef UNIV_NONINL
#include "dict0stats.ic"
#endif
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
index e866ab419fe..32fac3015e8 100644
--- a/storage/innobase/include/dict0stats_bg.h
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -56,6 +56,28 @@ dict_stats_recalc_pool_del(
/*=======================*/
const dict_table_t* table); /*!< in: table to remove */
+/*****************************************************************//**
+Add an index in a table to the defrag pool, which is processed by the
+background stats gathering thread. Only the table id and index id are
+added to the list, so the table can be closed after being enqueued and
+it will be opened when needed. If the table or index does not exist later
+(has been DROPped), then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+	const dict_index_t*	index);	/*!< in: index to add */
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+UNIV_INTERN
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!<in: if given, remove
+ all entries for the table */
+ const dict_index_t* index); /*!< in: index to remove */
+
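
The defrag pool declared above keeps only id pairs, so a table can be closed after one of its indexes is enqueued and is re-opened (or silently skipped, if it has since been dropped) when the background thread gets to the entry. A simplified, single-threaded stand-in for that pool; the real one lives in dict0stats_bg.cc and is mutex-protected.

// Sketch of an id-only defrag pool.
#include <cstdio>
#include <vector>

typedef unsigned long long table_id_t;
typedef unsigned long long index_id_t;

struct defrag_pool_item_t {
	table_id_t	table_id;
	index_id_t	index_id;
};

static std::vector<defrag_pool_item_t> defrag_pool;

static void defrag_pool_add(table_id_t t, index_id_t i)
{
	for (size_t k = 0; k < defrag_pool.size(); k++) {
		if (defrag_pool[k].table_id == t && defrag_pool[k].index_id == i) {
			return;		/* already queued */
		}
	}
	defrag_pool_item_t item = { t, i };
	defrag_pool.push_back(item);
}

static void defrag_pool_del(table_id_t t)
{
	/* drop every entry belonging to the table, e.g. on DROP TABLE */
	for (size_t k = 0; k < defrag_pool.size(); ) {
		if (defrag_pool[k].table_id == t) {
			defrag_pool.erase(defrag_pool.begin() + k);
		} else {
			k++;
		}
	}
}

int main()
{
	defrag_pool_add(10, 100);
	defrag_pool_add(10, 101);
	defrag_pool_del(10);
	printf("pool size: %zu\n", defrag_pool.size());	/* 0 */
	return 0;
}
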
/** Yield the data dictionary latch when waiting
for the background thread to stop accessing a table.
@param trx transaction holding the data dictionary locks */
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index d34b6f7eab3..35430e8ea62 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -75,6 +76,13 @@ enum ib_quiesce_t {
QUIESCE_COMPLETE /*!< All done */
};
+/** Enum values for atomic_writes table option */
+typedef enum {
+ ATOMIC_WRITES_DEFAULT = 0,
+ ATOMIC_WRITES_ON = 1,
+ ATOMIC_WRITES_OFF = 2
+} atomic_writes_t;
+
/** Prefix for tmp tables, adopted from sql/table.h */
#define tmp_file_prefix "#sql"
#define tmp_file_prefix_length 4
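
The two-bit ATOMIC_WRITES table-flag field can encode four values, but only DEFAULT, ON and OFF are legal; dict_tf_is_valid() and dict_sys_tables_type_validate() above reject anything greater than ATOMIC_WRITES_OFF. A tiny standalone illustration of that range check, not the InnoDB code itself:

// Range check applied to the two-bit atomic_writes field.
#include <cstdio>

typedef enum {
	ATOMIC_WRITES_DEFAULT = 0,
	ATOMIC_WRITES_ON = 1,
	ATOMIC_WRITES_OFF = 2
} atomic_writes_t;

static bool atomic_writes_is_valid(unsigned long field_value)
{
	return(field_value <= ATOMIC_WRITES_OFF);
}

int main()
{
	for (unsigned long v = 0; v < 4; v++) {
		printf("atomic_writes=%lu valid=%d\n", v, (int) atomic_writes_is_valid(v));
	}
	return 0;
}
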
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
new file mode 100644
index 00000000000..ce0901e81f8
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.h
@@ -0,0 +1,385 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2015, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+/* This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+/** Enum values for encryption table option */
+typedef enum {
+ FIL_SPACE_ENCRYPTION_DEFAULT = 0, /* Tablespace encrypted if
+ srv_encrypt_tables = ON */
+ FIL_SPACE_ENCRYPTION_ON = 1, /* Tablespace is encrypted always */
+ FIL_SPACE_ENCRYPTION_OFF = 2 /* Tablespace is not encrypted */
+} fil_encryption_t;
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as an intermediate state when converting a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cypher with the same key and iv lengths)
+ * L = AES_ECB(KEY, IV)
+ * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
+
+/* Cached L or key for given key_version */
+struct key_struct
+{
+ uint key_version; /*!< Version of the key */
+ uint key_length; /*!< Key length */
+ unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+ (that is L in CRYPT_SCHEME_1) */
+};
+
+struct fil_space_rotate_state_t
+{
+ time_t start_time; // time when rotation started
+ ulint active_threads; // active threads in space
+ ulint next_offset; // next "free" offset
+ ulint max_offset; // max offset needing to be rotated
+ uint min_key_version_found; // min key version found but not rotated
+ lsn_t end_lsn; // max lsn created when rotating this space
+ bool starting; // initial write of IV
+ bool flushing; // space is being flushed at end of rotate
+ struct {
+ bool is_active; // is scrubbing active in this space
+ time_t last_scrub_completed; // when was last scrub completed
+ } scrubbing;
+};
+
+struct fil_space_crypt_struct : st_encryption_scheme
+{
+ uint min_key_version; // min key version for this space
+ ulint page0_offset; // byte offset on page 0 for crypt data
+ fil_encryption_t encryption; // Encryption setup
+
+ ib_mutex_t mutex; // mutex protecting following variables
+ bool closing; // is tablespace being closed
+ fil_space_rotate_state_t rotate_state;
+};
+
+/* structure containing encryption specification */
+typedef struct fil_space_crypt_struct fil_space_crypt_t;
+
+/*********************************************************************
+Init global resources needed for tablespace encryption/decryption */
+UNIV_INTERN
+void
+fil_space_crypt_init();
+
+/*********************************************************************
+Cleanup global resources needed for tablespace encryption/decryption */
+UNIV_INTERN
+void
+fil_space_crypt_cleanup();
+
+/*********************************************************************
+Create crypt data, i.e. data that is used for a single tablespace */
+UNIV_INTERN
+fil_space_crypt_t *
+fil_space_create_crypt_data(
+/*========================*/
+ fil_encryption_t encrypt_mode, /*!< in: encryption mode */
+ uint key_id); /*!< in: encryption key id */
+
+/*********************************************************************
+Destroy crypt data */
+UNIV_INTERN
+void
+fil_space_destroy_crypt_data(
+/*=========================*/
+ fil_space_crypt_t **crypt_data); /*!< in/out: crypt data */
+
+/*********************************************************************
+Get crypt data for a space*/
+UNIV_INTERN
+fil_space_crypt_t *
+fil_space_get_crypt_data(
+/*=====================*/
+ ulint space); /*!< in: tablespace id */
+
+/*********************************************************************
+Set crypt data for a space*/
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_set_crypt_data(
+/*=====================*/
+ ulint space, /*!< in: tablespace id */
+ fil_space_crypt_t* crypt_data); /*!< in: crypt data to set */
+
+/*********************************************************************
+Merge crypt data */
+UNIV_INTERN
+void
+fil_space_merge_crypt_data(
+/*=======================*/
+ fil_space_crypt_t* dst_crypt_data, /*!< in: crypt_data */
+ const fil_space_crypt_t* src_crypt_data); /*!< in: crypt data */
+
+/*********************************************************************
+Read crypt data from buffer page */
+UNIV_INTERN
+fil_space_crypt_t *
+fil_space_read_crypt_data(
+/*======================*/
+ ulint space, /*!< in: tablespace id */
+ const byte* page, /*!< in: buffer page */
+ ulint offset); /*!< in: offset where crypt data is stored */
+
+/*********************************************************************
+Write crypt data to buffer page */
+UNIV_INTERN
+void
+fil_space_write_crypt_data(
+/*=======================*/
+ ulint space, /*!< in: tablespace id */
+ byte* page, /*!< in: buffer page */
+ ulint offset, /*!< in: offset where to store data */
+ ulint maxsize, /*!< in: max space available to store crypt data in */
+ mtr_t * mtr); /*!< in: mini-transaction */
+
+/*********************************************************************
+Clear crypt data from page 0 (used for import tablespace) */
+UNIV_INTERN
+void
+fil_space_clear_crypt_data(
+/*=======================*/
+ byte* page, /*!< in: buffer page */
+ ulint offset); /*!< in: offset where crypt data is stored */
+
+/*********************************************************************
+Parse crypt data log record */
+UNIV_INTERN
+byte*
+fil_parse_write_crypt_data(
+/*=======================*/
+ byte* ptr, /*!< in: start of log record */
+ byte* end_ptr, /*!< in: end of log record */
+ buf_block_t*); /*!< in: buffer page to apply record to */
+
+/*********************************************************************
+Check if extra buffer shall be allocated for decrypting after read */
+UNIV_INTERN
+bool
+fil_space_check_encryption_read(
+/*============================*/
+ ulint space); /*!< in: tablespace id */
+/******************************************************************
+Decrypt a page
+@return true if page is decrypted, false if not. */
+UNIV_INTERN
+bool
+fil_space_decrypt(
+/*==============*/
+ fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+ byte* tmp_frame, /*!< in: temporary buffer */
+ ulint page_size, /*!< in: page size */
+	byte*		src_frame);	/*!< in/out: page buffer */
+
+/*********************************************************************
+Encrypt buffer page
+@return encrypted page, or the original unencrypted page if
+encryption is not needed. */
+UNIV_INTERN
+byte*
+fil_space_encrypt(
+/*==============*/
+ ulint space, /*!< in: tablespace id */
+ ulint offset, /*!< in: page no */
+ lsn_t lsn, /*!< in: page lsn */
+ byte* src_frame, /*!< in: page frame */
+ ulint size, /*!< in: size of data to encrypt */
+ byte* dst_frame); /*!< in: where to encrypt to */
+
+/*********************************************************************
+Decrypt buffer page
+@return decrypted page, or the original page if decryption is
+not needed. */
+UNIV_INTERN
+byte*
+fil_space_decrypt(
+/*==============*/
+ ulint space, /*!< in: tablespace id */
+ byte* src_frame, /*!< in: page frame */
+	ulint		page_size,	/*!< in: size of data to decrypt */
+ byte* dst_frame); /*!< in: where to decrypt to */
+
+/*********************************************************************
+fil_space_verify_crypt_checksum
+NOTE: currently this function can only be run in single threaded mode
+as it modifies srv_checksum_algorithm (temporarily)
+@return true if page is encrypted AND OK, false otherwise */
+UNIV_INTERN
+bool
+fil_space_verify_crypt_checksum(
+/*============================*/
+ const byte* src_frame,/*!< in: page frame */
+	ulint 		zip_size);	/*!< in: compressed page size, or 0 */
+
+/*********************************************************************
+Init threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_init();
+
+/*********************************************************************
+Set thread count (e.g start or stops threads) used for key rotation */
+UNIV_INTERN
+void
+fil_crypt_set_thread_cnt(
+/*=====================*/
+ uint new_cnt); /*!< in: requested #threads */
+
+/*********************************************************************
+End threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_end();
+
+/*********************************************************************
+Cleanup resources for threads for key rotation */
+UNIV_INTERN
+void
+fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Set rotate key age */
+UNIV_INTERN
+void
+fil_crypt_set_rotate_key_age(
+/*=========================*/
+ uint rotate_age); /*!< in: requested rotate age */
+
+/*********************************************************************
+Set rotation threads iops */
+UNIV_INTERN
+void
+fil_crypt_set_rotation_iops(
+/*========================*/
+ uint iops); /*!< in: requested iops */
+
+/*********************************************************************
+Mark a space as closing */
+UNIV_INTERN
+void
+fil_space_crypt_mark_space_closing(
+/*===============================*/
+ ulint space); /*!< in: tablespace id */
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space */
+UNIV_INTERN
+void
+fil_space_crypt_close_tablespace(
+/*=============================*/
+ ulint space); /*!< in: tablespace id */
+
+/** Struct for retrieving info about encryption */
+struct fil_space_crypt_status_t {
+ ulint space; /*!< tablespace id */
+ ulint scheme; /*!< encryption scheme */
+ uint min_key_version; /*!< min key version */
+ uint current_key_version;/*!< current key version */
+ uint keyserver_requests;/*!< no of key requests to key server */
+ bool rotating; /*!< is key rotation ongoing */
+ bool flushing; /*!< is flush at end of rotation ongoing */
+ ulint rotate_next_page_number; /*!< next page if key rotating */
+ ulint rotate_max_page_number; /*!< max page if key rotating */
+};
+
+/*********************************************************************
+Get crypt status for a space
+@return 0 if crypt data found */
+UNIV_INTERN
+int
+fil_space_crypt_get_status(
+/*=======================*/
+ ulint id, /*!< in: space id */
+ struct fil_space_crypt_status_t * status); /*!< out: status */
+
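A hedged usage sketch for fil_space_crypt_get_status() (not part of the patch); the output format is illustrative only.

/* Hedged sketch: print the encryption status of one tablespace. */
static void
print_crypt_status(ulint space_id)
{
	fil_space_crypt_status_t	status;

	if (fil_space_crypt_get_status(space_id, &status) == 0) {
		fprintf(stderr,
			"space %lu scheme %lu min_key_version %u"
			" current_key_version %u rotating %d\n",
			status.space, status.scheme,
			status.min_key_version,
			status.current_key_version,
			(int) status.rotating);
	}
}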
+/** Struct for retrieving statistics about encryption key rotation */
+struct fil_crypt_stat_t {
+ ulint pages_read_from_cache;
+ ulint pages_read_from_disk;
+ ulint pages_modified;
+ ulint pages_flushed;
+ ulint estimated_iops;
+};
+
+/*********************************************************************
+Get crypt rotation statistics */
+UNIV_INTERN
+void
+fil_crypt_total_stat(
+/*==================*/
+ fil_crypt_stat_t* stat); /*!< out: crypt stat */
+
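A hedged sketch showing how the statistics struct above might be consumed (not part of the patch), e.g. by an INFORMATION_SCHEMA handler.

/* Hedged sketch: dump the aggregated key rotation statistics. */
static void
report_rotation_stat(void)
{
	fil_crypt_stat_t	stat;

	fil_crypt_total_stat(&stat);

	fprintf(stderr,
		"rotation: cache reads %lu disk reads %lu modified %lu"
		" flushed %lu estimated iops %lu\n",
		stat.pages_read_from_cache, stat.pages_read_from_disk,
		stat.pages_modified, stat.pages_flushed,
		stat.estimated_iops);
}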
+/** Struct for retrieving info about scrubbing */
+struct fil_space_scrub_status_t {
+ ulint space; /*!< tablespace id */
+ bool compressed; /*!< is space compressed */
+ time_t last_scrub_completed; /*!< when was last scrub completed */
+ bool scrubbing; /*!< is scrubbing ongoing */
+ time_t current_scrub_started; /*!< when started current scrubbing */
+ ulint current_scrub_active_threads; /*!< current scrub active threads */
+ ulint current_scrub_page_number; /*!< current scrub page no */
+ ulint current_scrub_max_page_number; /*!< current scrub max page no */
+};
+
+/*********************************************************************
+Get scrub status for a space
+@return 0 if no scrub info found */
+UNIV_INTERN
+int
+fil_space_get_scrub_status(
+/*=======================*/
+ ulint id, /*!< in: space id */
+ struct fil_space_scrub_status_t * status); /*!< out: status */
+
+/*********************************************************************
+Adjust the srv_encrypt_tables setting */
+UNIV_INTERN
+void
+fil_crypt_set_encrypt_tables(
+/*=========================*/
+ uint val); /*!< in: New srv_encrypt_tables setting */
+
+
+#ifndef UNIV_NONINL
+#include "fil0crypt.ic"
+#endif
+
+#endif /* fil0crypt_h */
diff --git a/storage/innobase/include/fil0crypt.ic b/storage/innobase/include/fil0crypt.ic
new file mode 100644
index 00000000000..5fafa6cd3f0
--- /dev/null
+++ b/storage/innobase/include/fil0crypt.ic
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 2015, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.ic
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+/*******************************************************************//**
+Find out whether the page is page encrypted
+@return true if page is page encrypted, false if not */
+UNIV_INLINE
+bool
+fil_page_is_encrypted(
+/*==================*/
+ const byte *buf) /*!< in: page */
+{
+ return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0);
+}
+
+/*******************************************************************//**
+Find out whether the page can be decrypted.
+This should be called after decryption of the page has been attempted.
+@return 1 if the key provider or the key is not available,
+	0 if decryption should be possible
+*/
+UNIV_INLINE
+bool
+fil_page_encryption_status(
+/*===================*/
+ const byte *buf, /*!< in: page */
+ ulint space_id) /*!< in: space_id */
+{
+ fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space_id);
+ ulint page_type = mach_read_from_2(buf+FIL_PAGE_TYPE);
+
+ if (page_type == FIL_PAGE_TYPE_FSP_HDR) {
+ if (crypt_data != NULL) {
+ if (!encryption_key_id_exists(crypt_data->key_id)) {
+ /* accessing table would surely fail, because no key or no key provider available */
+ return 1;
+ }
+ }
+	} else {
+		ulint key = mach_read_from_4(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+		/* crypt_data may be NULL if the space has no crypt data. */
+		if (crypt_data != NULL
+		    && !encryption_key_version_exists(crypt_data->key_id, key)) {
+			return 1;
+		}
+	}
+ return 0;
+}
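A hedged sketch combining the two inline functions above (not part of the patch); the caller name is an assumption.

/* Hedged sketch: decide whether a page read from disk needs, and can
   take, the decryption path. */
UNIV_INLINE
bool
page_needs_decrypt(
	const byte*	read_buf,	/* page as read from disk */
	ulint		space_id)	/* tablespace id */
{
	if (!fil_page_is_encrypted(read_buf)) {
		return(false);	/* key version field is zero */
	}

	/* A non-zero status means the key or key provider is missing. */
	return(fil_page_encryption_status(read_buf, space_id) == 0);
}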
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index da2ee1c5730..a3785292115 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,7 +26,6 @@ Created 10/25/1995 Heikki Tuuri
#ifndef fil0fil_h
#define fil0fil_h
-
#include "univ.i"
#ifndef UNIV_INNOCHECKSUM
@@ -126,16 +126,27 @@ extern fil_addr_t fil_addr_null;
MySQL/InnoDB 5.1.7 or later, the
contents of this field is valid
for all uncompressed pages. */
-#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
- first page in a system tablespace
- data file (ibdata*, not *.ibd):
- the file has been flushed to disk
- at least up to this lsn */
+#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26 /*!< for the first page
+ in a system tablespace data file
+ (ibdata*, not *.ibd): the file has
+ been flushed to disk at least up
+ to this lsn
+ for other pages: a 32-bit key version
+ used to encrypt the page + 32-bit checksum
+ or 64 bits of zero if no encryption
+ */
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
contains the space id of the page */
#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
+/* Following are used when page compression is used */
+#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store
+ actual payload data size on
+ compressed pages. */
+#define FIL_PAGE_COMPRESSION_METHOD_SIZE 2
+ /*!< Number of bytes used to store
+ actual compression method. */
/* @} */
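To make the redefinition of offset 26 above concrete, a hedged helper (not part of the patch) that reads the field on a non-system page:

/* Hedged sketch: read the 32-bit key version stored at offset 26 of a
   regular page; zero means the page is not encrypted. */
UNIV_INLINE
ulint
page_get_key_version(const byte* page)
{
	return(mach_read_from_4(
		page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
}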
/** File page trailer @{ */
#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
@@ -146,6 +157,9 @@ extern fil_addr_t fil_addr_null;
/* @} */
/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED 37401 /*!< Page is compressed and
+ then encrypted */
+#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */
#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
#define FIL_PAGE_INODE 3 /*!< Index node */
@@ -160,7 +174,8 @@ extern fil_addr_t fil_addr_null;
#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */
#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */
#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */
-#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2
+#define FIL_PAGE_TYPE_COMPRESSED 13 /*!< Compressed page */
+#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_COMPRESSED
/*!< Last page type */
/* @} */
@@ -171,6 +186,9 @@ extern fil_addr_t fil_addr_null;
#ifndef UNIV_INNOCHECKSUM
+/* structure containing encryption specification */
+typedef struct fil_space_crypt_struct fil_space_crypt_t;
+
/** The number of fsyncs done to the log */
extern ulint fil_n_log_flushes;
@@ -191,9 +209,11 @@ struct fsp_open_info {
lsn_t lsn; /*!< Flushed LSN from header page */
ulint id; /*!< Space ID */
ulint flags; /*!< Tablespace flags */
+ ulint encryption_error; /*!< if an encryption error occurs */
#ifdef UNIV_LOG_ARCHIVE
ulint arch_log_no; /*!< latest archived log file number */
#endif /* UNIV_LOG_ARCHIVE */
+ fil_space_crypt_t* crypt_data; /*!< crypt data */
};
struct fil_space_t;
@@ -228,6 +248,7 @@ struct fil_node_t {
ib_int64_t flush_counter;/*!< up to what
modification_counter value we have
flushed the modifications to disk */
+ ulint file_block_size;/*!< file system block size */
UT_LIST_NODE_T(fil_node_t) chain;
/*!< link field for the file chain */
UT_LIST_NODE_T(fil_node_t) LRU;
@@ -305,8 +326,13 @@ struct fil_space_t {
bool is_in_unflushed_spaces;
/*!< true if this space is currently in
unflushed_spaces */
+ bool printed_compression_failure;
+ /*!< true if we have already printed
+ compression failure */
UT_LIST_NODE_T(fil_space_t) space_list;
/*!< list of all spaces */
+ fil_space_crypt_t* crypt_data;
+ ulint file_block_size;/*!< file system block size */
ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
};
@@ -401,6 +427,7 @@ ulint
fil_space_get_type(
/*===============*/
ulint id); /*!< in: space id */
+
#endif /* !UNIV_HOTBACKUP */
/*******************************************************************//**
Appends a new file to the chain of files of a space. File must be closed.
@@ -441,7 +468,9 @@ fil_space_create(
ulint id, /*!< in: space id */
ulint zip_size,/*!< in: compressed page size, or
0 for uncompressed tablespaces */
- ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+ ulint purpose, /*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+ fil_space_crypt_t* crypt_data); /*!< in: crypt data */
+
/*******************************************************************//**
Assigns a new space id for a new single-table tablespace. This works simply by
incrementing the global counter. If 4 billion id's is not enough, we may need
@@ -580,8 +609,10 @@ fil_read_first_page(
#endif /* UNIV_LOG_ARCHIVE */
lsn_t* min_flushed_lsn, /*!< out: min of flushed
lsn values in data files */
- lsn_t* max_flushed_lsn) /*!< out: max of flushed
+ lsn_t* max_flushed_lsn, /*!< out: max of flushed
lsn values in data files */
+ fil_space_crypt_t** crypt_data) /*!< out: crypt data */
+
__attribute__((warn_unused_result));
/*******************************************************************//**
Increments the count of pending operation, if space is not being deleted.
@@ -945,8 +976,13 @@ fil_io(
void* buf, /*!< in/out: buffer where to store read data
or from where to write; in aio this must be
appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
+ void* message, /*!< in: message for aio handler if non-sync
aio used, else ignored */
+ ulint* write_size) /*!< in/out: Actual write size initialized
+				after the first successful trim
+				operation for this page; once
+				initialized we do not trim again
+				unless the actual page size decreases. */
__attribute__((nonnull(8)));
/**********************************************************************//**
Waits for an aio operation to complete. This function is used to write the
@@ -1197,5 +1233,90 @@ fil_user_tablespace_restore_page(
ulint page_no); /* in: page_no to obtain from double
write buffer */
+/*******************************************************************//**
+Return space flags */
+UNIV_INLINE
+ulint
+fil_space_flags(
+/*===========*/
+ fil_space_t* space); /*!< in: space */
+
#endif /* !UNIV_INNOCHECKSUM */
+
+/****************************************************************//**
+Acquire fil_system mutex */
+void
+fil_system_enter(void);
+/*==================*/
+/****************************************************************//**
+Release fil_system mutex */
+void
+fil_system_exit(void);
+/*==================*/
+
+#ifndef UNIV_INNOCHECKSUM
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+fil_space_t*
+fil_space_found_by_id(
+/*==================*/
+ ulint id); /*!< in: space id */
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id); /*!< in: space id */
+
+/******************************************************************
+Get id of first tablespace or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_first_space();
+/*=================*/
+
+/******************************************************************
+Get id of next tablespace or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_next_space(
+/*===============*/
+ ulint id); /*!< in: space id */
+
+/******************************************************************
+Get id of first tablespace that has node or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_first_space_safe();
+/*======================*/
+
+/******************************************************************
+Get id of next tablespace that has node or ULINT_UNDEFINED if none */
+UNIV_INTERN
+ulint
+fil_get_next_space_safe(
+/*====================*/
+ ulint id); /*!< in: previous space id */
+
+
+/*******************************************************************//**
+Returns the block size of the file space
+@return block size */
+UNIV_INTERN
+ulint
+fil_space_get_block_size(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ ulint len); /*!< in: page len */
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_INNOCHECKSUM
+#ifndef UNIV_NONINL
+#include "fil0fil.ic"
+#endif
+#endif
+
#endif /* fil0fil_h */
diff --git a/storage/innobase/include/fil0fil.ic b/storage/innobase/include/fil0fil.ic
new file mode 100644
index 00000000000..33800650bae
--- /dev/null
+++ b/storage/innobase/include/fil0fil.ic
@@ -0,0 +1,177 @@
+/*****************************************************************************
+
+Copyright (c) 2015, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.ic
+The low-level file system support functions
+
+Created 31/03/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0fil_ic
+#define fil0fil_ic
+
+/*******************************************************************//**
+Return space name */
+UNIV_INLINE
+char*
+fil_space_name(
+/*===========*/
+ fil_space_t* space) /*!< in: space */
+{
+ return (space->name);
+}
+
+/*******************************************************************//**
+Return space flags */
+UNIV_INLINE
+ulint
+fil_space_flags(
+/*===========*/
+ fil_space_t* space) /*!< in: space */
+{
+ return (space->flags);
+}
+
+/*******************************************************************//**
+Return page type name */
+UNIV_INLINE
+const char*
+fil_get_page_type_name(
+/*===================*/
+ ulint page_type) /*!< in: FIL_PAGE_TYPE */
+{
+ switch(page_type) {
+ case FIL_PAGE_PAGE_COMPRESSED:
+ return (const char*)"PAGE_COMPRESSED";
+ case FIL_PAGE_INDEX:
+ return (const char*)"INDEX";
+ case FIL_PAGE_UNDO_LOG:
+ return (const char*)"UNDO LOG";
+ case FIL_PAGE_INODE:
+ return (const char*)"INODE";
+ case FIL_PAGE_IBUF_FREE_LIST:
+ return (const char*)"IBUF_FREE_LIST";
+ case FIL_PAGE_TYPE_ALLOCATED:
+ return (const char*)"ALLOCATED";
+ case FIL_PAGE_IBUF_BITMAP:
+ return (const char*)"IBUF_BITMAP";
+ case FIL_PAGE_TYPE_SYS:
+ return (const char*)"SYS";
+ case FIL_PAGE_TYPE_TRX_SYS:
+ return (const char*)"TRX_SYS";
+ case FIL_PAGE_TYPE_FSP_HDR:
+ return (const char*)"FSP_HDR";
+ case FIL_PAGE_TYPE_XDES:
+ return (const char*)"XDES";
+ case FIL_PAGE_TYPE_BLOB:
+ return (const char*)"BLOB";
+ case FIL_PAGE_TYPE_ZBLOB:
+ return (const char*)"ZBLOB";
+ case FIL_PAGE_TYPE_ZBLOB2:
+ return (const char*)"ZBLOB2";
+ case FIL_PAGE_TYPE_COMPRESSED:
+ return (const char*)"ORACLE PAGE COMPRESSED";
+ default:
+ return (const char*)"PAGE TYPE CORRUPTED";
+ }
+}
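A hedged one-liner showing the intended use of fil_get_page_type_name() (not part of the patch).

/* Hedged sketch: log the type of a page during diagnostics. */
UNIV_INLINE
void
log_page_type(const byte* page)
{
	ulint	type = mach_read_from_2(page + FIL_PAGE_TYPE);

	fprintf(stderr, "InnoDB: page type %lu (%s)\n",
		type, fil_get_page_type_name(type));
}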
+
+/****************************************************************//**
+Get block size from fil node
+@return block size*/
+UNIV_INLINE
+ulint
+fil_node_get_block_size(
+/*====================*/
+ fil_node_t* node) /*!< in: Node where to get block
+ size */
+{
+ return (node->file_block_size);
+}
+
+/****************************************************************//**
+Validate page type.
+@return true if valid, false if not */
+UNIV_INLINE
+bool
+fil_page_type_validate(
+ const byte* page) /*!< in: page */
+{
+#ifdef UNIV_DEBUG
+ ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
+#ifdef UNIV_ENCRYPTION_EXTRA_DEBUG
+ uint key_version = mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ bool page_compressed = (page_type == FIL_PAGE_PAGE_COMPRESSED);
+ bool page_compressed_encrypted = (page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+ ulint space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ ulint offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+ ib_uint64_t lsn = mach_read_from_8(page + FIL_PAGE_LSN);
+ ulint compressed_len = mach_read_from_2(page + FIL_PAGE_DATA);
+ fil_system_enter();
+ fil_space_t* rspace = fil_space_get_by_id(space);
+ fil_system_exit();
+
+ /* Dump out the page info */
+ fprintf(stderr, "InnoDB: Space %lu offset %lu name %s page_type %lu page_type_name %s\n"
+ "InnoDB: key_version %u page_compressed %d lsn %lu compressed_len %lu\n",
+ space, offset, rspace->name, page_type, fil_get_page_type_name(page_type),
+ key_version, page_compressed, lsn, compressed_len);
+ fflush(stderr);
+#endif
+
+ /* Validate page type */
+ if (!((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED ||
+ page_type == FIL_PAGE_INDEX ||
+ page_type == FIL_PAGE_UNDO_LOG ||
+ page_type == FIL_PAGE_INODE ||
+ page_type == FIL_PAGE_IBUF_FREE_LIST ||
+ page_type == FIL_PAGE_TYPE_ALLOCATED ||
+ page_type == FIL_PAGE_IBUF_BITMAP ||
+ page_type == FIL_PAGE_TYPE_SYS ||
+ page_type == FIL_PAGE_TYPE_TRX_SYS ||
+ page_type == FIL_PAGE_TYPE_FSP_HDR ||
+ page_type == FIL_PAGE_TYPE_XDES ||
+ page_type == FIL_PAGE_TYPE_BLOB ||
+ page_type == FIL_PAGE_TYPE_ZBLOB ||
+ page_type == FIL_PAGE_TYPE_COMPRESSED))) {
+
+ ut_ad(page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED ||
+ page_type == FIL_PAGE_INDEX ||
+ page_type == FIL_PAGE_UNDO_LOG ||
+ page_type == FIL_PAGE_INODE ||
+ page_type == FIL_PAGE_IBUF_FREE_LIST ||
+ page_type == FIL_PAGE_TYPE_ALLOCATED ||
+ page_type == FIL_PAGE_IBUF_BITMAP ||
+ page_type == FIL_PAGE_TYPE_SYS ||
+ page_type == FIL_PAGE_TYPE_TRX_SYS ||
+ page_type == FIL_PAGE_TYPE_FSP_HDR ||
+ page_type == FIL_PAGE_TYPE_XDES ||
+ page_type == FIL_PAGE_TYPE_BLOB ||
+ page_type == FIL_PAGE_TYPE_ZBLOB ||
+ page_type == FIL_PAGE_TYPE_COMPRESSED);
+ return false;
+ }
+
+#endif /* UNIV_DEBUG */
+ return true;
+}
+
+#endif /* fil0fil_ic */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000000..8316083d52d
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,166 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015 MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+#include "fsp0pagecompress.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/*******************************************************************//**
+Returns the page compression level flag of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return page compression level if page compressed, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_page_compression_level(
+/*=================================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the page compression flag of the space, or false if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return true if page compressed, false if not or space not found */
+UNIV_INTERN
+ibool
+fil_space_is_page_compressed(
+/*=========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the page compression flag of the space, or false if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return true if page compressed, false if not or space not found */
+UNIV_INTERN
+ibool
+fil_space_get_page_compressed(
+/*=========================*/
+	fil_space_t*	space);	/*!< in: space */
+/*******************************************************************//**
+Returns the atomic writes flag of the space, or false if the space
+is not using atomic writes. The tablespace must be cached in the memory cache.
+@return atomic write table option value */
+UNIV_INTERN
+atomic_writes_t
+fil_space_get_atomic_writes(
+/*=========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Find out whether the page is an index page or not
+@return true if page type index page, false if not */
+UNIV_INTERN
+ibool
+fil_page_is_index_page(
+/*===================*/
+ byte *buf); /*!< in: page */
+
+/****************************************************************//**
+Get the name of the compression algorithm used for page
+compression.
+@return compression algorithm name or "UNKNOWN" if not known*/
+UNIV_INTERN
+const char*
+fil_get_compression_alg_name(
+/*=========================*/
+ ulint comp_alg); /*!<in: compression algorithm number */
+
+/****************************************************************//**
+For page compressed pages compress the page before actual write
+operation.
+@return compressed page to be written*/
+UNIV_INTERN
+byte*
+fil_compress_page(
+/*==============*/
+ ulint space_id, /*!< in: tablespace id of the
+ table. */
+ byte* buf, /*!< in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ byte* out_buf, /*!< out: compressed buffer */
+ ulint len, /*!< in: length of input buffer.*/
+ ulint level, /* in: compression level */
+ ulint block_size, /*!< in: block size */
+ bool encrypted, /*!< in: is page also encrypted */
+ ulint* out_len, /*!< out: actual length of compressed
+ page */
+	byte*	lzo_mem);	/*!< in: temporary memory used by LZO */
+
+/****************************************************************//**
+For page compressed pages decompress the page after actual read
+operation. */
+UNIV_INTERN
+void
+fil_decompress_page(
+/*================*/
+ byte* page_buf, /*!< in: preallocated buffer or NULL */
+ byte* buf, /*!< out: buffer from which to read; in aio
+ this must be appropriately aligned */
+ ulong len, /*!< in: length of output buffer.*/
+ ulint* write_size); /*!< in/out: Actual payload size of
+ the compressed data. */
+
+/****************************************************************//**
+Get space id from fil node
+@return space id*/
+UNIV_INTERN
+ulint
+fil_node_get_space_id(
+/*==================*/
+ fil_node_t* node); /*!< in: Node where to get space id*/
+
+/****************************************************************//**
+Get block size from fil node
+@return block size*/
+UNIV_INTERN
+ulint
+fil_node_get_block_size(
+ fil_node_t* node); /*!< in: Node where to get block
+ size */
+/*******************************************************************//**
+Find out whether the page is page compressed
+@return true if page is page compressed*/
+UNIV_INTERN
+ibool
+fil_page_is_compressed(
+/*===================*/
+ byte* buf); /*!< in: page */
+
+/*******************************************************************//**
+Find out whether the page is page compressed and encrypted
+@return true if page is page compressed and encrypted*/
+UNIV_INTERN
+ibool
+fil_page_is_compressed_encrypted(
+/*=============================*/
+ byte* buf); /*!< in: page */
+
+/*******************************************************************//**
+Find out whether the page is page compressed with the lzo method
+@return true if page is page compressed with lzo method*/
+UNIV_INTERN
+ibool
+fil_page_is_lzo_compressed(
+/*=======================*/
+ byte* buf); /*!< in: page */
+#endif
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index a587ccc9f20..2bac42eb081 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,13 +29,14 @@ Created 12/18/1995 Heikki Tuuri
#include "univ.i"
+#include "fsp0types.h"
+
#ifndef UNIV_INNOCHECKSUM
#include "mtr0mtr.h"
#include "fut0lst.h"
#include "ut0byte.h"
#include "page0types.h"
-#include "fsp0types.h"
#endif /* !UNIV_INNOCHECKSUM */
@@ -53,12 +55,22 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */
/** Width of the DATA_DIR flag. This flag indicates that the tablespace
is found in a remote location, not the default data directory. */
#define FSP_FLAGS_WIDTH_DATA_DIR 1
+/** Number of flag bits used to indicate the page compression and compression level */
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/** Number of flag bits used to indicate atomic writes for this tablespace */
+#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 2
+
/** Width of all the currently known tablespace flags */
#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \
+ FSP_FLAGS_WIDTH_ZIP_SSIZE \
+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS \
+ FSP_FLAGS_WIDTH_PAGE_SSIZE \
- + FSP_FLAGS_WIDTH_DATA_DIR)
+ + FSP_FLAGS_WIDTH_DATA_DIR \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + FSP_FLAGS_WIDTH_ATOMIC_WRITES )
/** A mask of all the known/used bits in tablespace flags */
#define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH))
@@ -71,9 +83,20 @@ is found in a remote location, not the default data directory. */
/** Zero relative shift position of the ATOMIC_BLOBS field */
#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \
+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
-/** Zero relative shift position of the PAGE_SSIZE field */
-#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \
+/** Note that these need to be before the page size to be compatible with
+dictionary */
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL (FSP_FLAGS_POS_PAGE_COMPRESSION \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define FSP_FLAGS_POS_ATOMIC_WRITES (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_WRITES \
+ + FSP_FLAGS_WIDTH_ATOMIC_WRITES)
/** Zero relative shift position of the start of the UNUSED bits */
#define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \
+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
@@ -101,7 +124,18 @@ is found in a remote location, not the default data directory. */
#define FSP_FLAGS_MASK_DATA_DIR \
((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \
<< FSP_FLAGS_POS_DATA_DIR)
-
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION \
+ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the ATOMIC_WRITES field */
+#define FSP_FLAGS_MASK_ATOMIC_WRITES \
+ ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_WRITES)) \
+ << FSP_FLAGS_POS_ATOMIC_WRITES)
/** Return the value of the POST_ANTELOPE field */
#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \
((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \
@@ -126,11 +160,39 @@ is found in a remote location, not the default data directory. */
#define FSP_FLAGS_GET_UNUSED(flags) \
(flags >> FSP_FLAGS_POS_UNUSED)
+/** Return the value of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)
+/** Return the value of the ATOMIC_WRITES field */
+#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \
+ ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \
+ >> FSP_FLAGS_POS_ATOMIC_WRITES)
+
/** Set a PAGE_SSIZE into the correct bits in a given
tablespace flags. */
#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \
(flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE))
+/** Set a PAGE_COMPRESSION into the correct bits in a given
+tablespace flags. */
+#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \
+ (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION))
+
+/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given
+tablespace flags. */
+#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \
+ (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL))
+
+/** Set a ATOMIC_WRITES into the correct bits in a given
+tablespace flags. */
+#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \
+ (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES))
+
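A hedged sketch of how the new SET macros compose (not part of the patch); the compression level 6 is illustrative, and clearing previously set bits is out of scope here.

/* Hedged sketch: add page compression information to existing
   tablespace flags. */
static ulint
make_page_compressed_flags(ulint base_flags)
{
	ulint	flags = base_flags;

	flags = FSP_FLAGS_SET_PAGE_COMPRESSION(flags, 1);
	flags = FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, 6);

	return(flags);
}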
/* @} */
/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
@@ -198,7 +260,6 @@ descriptor page, but used only in the first. */
FSP_FREE_LIMIT at a time */
/* @} */
-#ifndef UNIV_INNOCHECKSUM
/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
@@ -336,6 +397,7 @@ the extent are free and which contain old tuple version to clean. */
/* @} */
+#ifndef UNIV_INNOCHECKSUM
/**********************************************************************//**
Initializes the file space system. */
UNIV_INTERN
@@ -740,6 +802,33 @@ fsp_flags_get_page_size(
/*====================*/
ulint flags); /*!< in: tablespace flags */
+/*********************************************************************/
+/* @return offset into fsp header where crypt data is stored */
+UNIV_INTERN
+ulint
+fsp_header_get_crypt_offset(
+/*========================*/
+ ulint zip_size, /*!< in: zip_size */
+ ulint* max_size); /*!< out: free space after offset */
+
+#define fsp_page_is_free(space,page,mtr) \
+ fsp_page_is_free_func(space,page,mtr, __FILE__, __LINE__)
+
+#ifndef UNIV_INNOCHECKSUM
+/**********************************************************************//**
+Checks if a single page is free.
+@return true if free */
+UNIV_INTERN
+bool
+fsp_page_is_free_func(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ const char *file,
+ ulint line);
+#endif
+
#ifndef UNIV_NONINL
#include "fsp0fsp.ic"
#endif
diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic
index 0d81e817cc9..9f09a9d53e1 100644
--- a/storage/innobase/include/fsp0fsp.ic
+++ b/storage/innobase/include/fsp0fsp.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -63,12 +64,17 @@ fsp_flags_is_valid(
ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
ulint unused = FSP_FLAGS_GET_UNUSED(flags);
+ ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags);
+ ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags);
+ ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags);
DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false););
/* fsp_flags is zero unless atomic_blobs is set. */
/* Make sure there are no bits that we do not know about. */
if (unused != 0 || flags == 1) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n",
+ flags, unused);
return(false);
} else if (post_antelope) {
/* The Antelope row formats REDUNDANT and COMPACT did
@@ -76,6 +82,8 @@ fsp_flags_is_valid(
4-byte field is zero for Antelope row formats. */
if (!atomic_blobs) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n",
+ flags, atomic_blobs);
return(false);
}
}
@@ -87,10 +95,14 @@ fsp_flags_is_valid(
externally stored parts. */
if (post_antelope || zip_ssize != 0) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n",
+ flags, zip_ssize, atomic_blobs);
return(false);
}
} else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n",
+ flags, zip_ssize, PAGE_ZIP_SSIZE_MAX);
return(false);
} else if (page_ssize > UNIV_PAGE_SSIZE_MAX) {
@@ -98,12 +110,33 @@ fsp_flags_is_valid(
be zero for an original 16k page size.
Validate the page shift size is within allowed range. */
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n",
+ flags, page_ssize, UNIV_PAGE_SSIZE_MAX);
return(false);
} else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n",
+ flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG);
return(false);
}
+ /* Page compression level requires page compression and atomic blobs
+ to be set */
+ if (page_compression_level || page_compression) {
+ if (!page_compression || !atomic_blobs) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n"
+ "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n",
+ flags, page_compression, page_compression_level, atomic_blobs);
+ return(false);
+ }
+ }
+
+ if (atomic_writes > ATOMIC_WRITES_OFF) {
+ fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n",
+ flags, atomic_writes);
+ return (false);
+ }
+
#if UNIV_FORMAT_MAX != UNIV_FORMAT_B
# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations."
#endif
@@ -242,6 +275,7 @@ xdes_calc_descriptor_index(
return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
}
}
+#endif /* !UNIV_INNOCHECKSUM */
/**********************************************************************//**
Gets a descriptor bit of a page.
@@ -269,6 +303,7 @@ xdes_get_bit(
bit_index));
}
+#ifndef UNIV_INNOCHECKSUM
/********************************************************************//**
Calculates the page where the descriptor of a page resides.
@return descriptor page offset */
@@ -312,3 +347,4 @@ xdes_calc_descriptor_page(
}
#endif /* !UNIV_INNOCHECKSUM */
+
diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h
new file mode 100644
index 00000000000..5f943ee2b83
--- /dev/null
+++ b/storage/innobase/include/fsp0pagecompress.h
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fsp0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to file space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef fsp0pagecompress_h
+#define fsp0pagecompress_h
+
+/* Supported page compression methods */
+
+#define PAGE_UNCOMPRESSED 0
+#define PAGE_ZLIB_ALGORITHM 1
+#define PAGE_LZ4_ALGORITHM 2
+#define PAGE_LZO_ALGORITHM 3
+#define PAGE_LZMA_ALGORITHM 4
+#define PAGE_BZIP2_ALGORITHM 5
+#define PAGE_SNAPPY_ALGORITHM 6
+#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM
+
+/**********************************************************************//**
+Reads the page compression level from the first page of a tablespace.
+@return page compression level, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_compression_level(
+/*=============================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+
+/********************************************************************//**
+Determine if the tablespace is page compressed from dict_table_t::flags.
+@return TRUE if page compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_page_compressed(
+/*=========================*/
+ ulint flags); /*!< in: tablespace flags */
+
+/********************************************************************//**
+Extract the page compression level from tablespace flags.
+A tablespace has only one physical page compression level
+whether that page is compressed or not.
+@return page compression level of the file-per-table tablespace,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_page_compression_level(
+/*=================================*/
+ ulint flags); /*!< in: tablespace flags */
+
+/********************************************************************//**
+Determine whether the tablespace is using atomic writes from dict_table_t::flags.
+@return true if atomic writes is used, false if not */
+UNIV_INLINE
+atomic_writes_t
+fsp_flags_get_atomic_writes(
+/*========================*/
+ ulint flags); /*!< in: tablespace flags */
+
+#ifndef UNIV_NONINL
+#include "fsp0pagecompress.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic
new file mode 100644
index 00000000000..1ac80defd89
--- /dev/null
+++ b/storage/innobase/include/fsp0pagecompress.ic
@@ -0,0 +1,211 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fsp0pagecompress.ic
+Implementation for helper functions for extracting/storing page
+compression and atomic writes information to file space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Determine if the tablespace is page compressed from dict_table_t::flags.
+@return TRUE if page compressed, FALSE if not page compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_page_compressed(
+/*=========================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Determine the page compression level of the tablespace from dict_table_t::flags.
+@return page compression level or 0 if not compressed*/
+UNIV_INLINE
+ulint
+fsp_flags_get_page_compression_level(
+/*=================================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
+}
+
+/********************************************************************//**
+Determine whether the tablespace is using atomic writes from dict_table_t::flags.
+@return true if atomic writes is used, false if not */
+UNIV_INLINE
+atomic_writes_t
+fsp_flags_get_atomic_writes(
+/*========================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags));
+}
+
+/*******************************************************************//**
+Find out whether the page is an index page or not
+@return true if page type index page, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_index_page(
+/*===================*/
+ byte* buf) /*!< in: page */
+{
+ return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX);
+}
+
+/*******************************************************************//**
+Find out whether the page is page compressed
+@return true if page is page compressed, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_compressed(
+/*===================*/
+ byte* buf) /*!< in: page */
+{
+ return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED);
+}
+
+/*******************************************************************//**
+Find out whether the page is page compressed and encrypted
+@return true if page is page compressed and encrypted, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_compressed_encrypted(
+/*=============================*/
+ byte* buf) /*!< in: page */
+{
+ return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+}
+
+/*******************************************************************//**
+Returns the page compression level of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return page compression level, ULINT_UNDEFINED if space not found */
+UNIV_INLINE
+ulint
+fil_space_get_page_compression_level(
+/*=================================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(fsp_flags_get_page_compression_level(flags));
+ }
+
+ return(flags);
+}
+
+/*******************************************************************//**
+Extract the page compression from space.
+@return true if space is page compressed, false if space is not found
+or space is not page compressed. */
+UNIV_INLINE
+ibool
+fil_space_is_page_compressed(
+/*=========================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(fsp_flags_is_page_compressed(flags));
+ }
+
+ return(flags);
+}
+
+/****************************************************************//**
+Get the name of the compression algorithm used for page
+compression.
+@return compression algorithm name or "UNKNOWN" if not known*/
+UNIV_INLINE
+const char*
+fil_get_compression_alg_name(
+/*=========================*/
+ ulint comp_alg) /*!<in: compression algorithm number */
+{
+ switch(comp_alg) {
+ case PAGE_UNCOMPRESSED:
+ return ("uncompressed");
+ break;
+ case PAGE_ZLIB_ALGORITHM:
+ return ("ZLIB");
+ break;
+ case PAGE_LZ4_ALGORITHM:
+ return ("LZ4");
+ break;
+ case PAGE_LZO_ALGORITHM:
+ return ("LZO");
+ break;
+ case PAGE_LZMA_ALGORITHM:
+ return ("LZMA");
+ break;
+ default:
+ return("UNKNOWN");
+ ut_error;
+ break;
+ }
+}
+
+/*******************************************************************//**
+Returns the atomic writes flag of the space, or false if the space
+is not using atomic writes. The tablespace must be cached in the memory cache.
+@return atomic writes table option value */
+UNIV_INLINE
+atomic_writes_t
+fil_space_get_atomic_writes(
+/*========================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return((atomic_writes_t)fsp_flags_get_atomic_writes(flags));
+ }
+
+ return((atomic_writes_t)0);
+}
+
+/*******************************************************************//**
+Find out whether the page is page compressed with the lzo method
+@return true if page is page compressed with lzo method, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_lzo_compressed(
+/*=======================*/
+ byte* buf) /*!< in: page */
+{
+ return((mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED &&
+ mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) == PAGE_LZO_ALGORITHM) ||
+ (mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED &&
+ mach_read_from_2(buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE) == PAGE_LZO_ALGORITHM));
+}
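Generalizing the lzo check above, a hedged helper (not part of the patch) that extracts the compression algorithm code from either page_compressed format; the offsets follow the expression used in fil_page_is_lzo_compressed().

/* Hedged sketch: return the PAGE_*_ALGORITHM code of a page, or
   PAGE_UNCOMPRESSED if the page is not page compressed. */
UNIV_INLINE
ulint
fil_page_get_compression_alg(const byte* buf)
{
	ulint	type = mach_read_from_2(buf + FIL_PAGE_TYPE);

	if (type == FIL_PAGE_PAGE_COMPRESSED) {
		/* Plain page_compressed page: the algorithm is stored
		in the 8-byte field at offset 26. */
		return((ulint) mach_read_from_8(
			buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION));
	} else if (type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
		/* Compressed-and-encrypted page: the algorithm follows
		the payload length in the page data area. */
		return(mach_read_from_2(
			buf + FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE));
	}

	return(PAGE_UNCOMPRESSED);
}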
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index 94fd908ab0c..a6797cd66de 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -29,6 +29,7 @@ Created May 26, 2009 Vasil Dimov
#include "univ.i"
#include "fil0fil.h" /* for FIL_PAGE_DATA */
+#include "ut0byte.h"
/** @name Flags for inserting records in order
If records are inserted in order, there are the following
@@ -41,14 +42,17 @@ fseg_alloc_free_page) */
#define FSP_NO_DIR ((byte)113) /*!< no order */
/* @} */
-/** File space extent size (one megabyte) in pages */
-#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE)
+/** File space extent size in pages (one megabyte for the default page size;
+two or four megabytes for larger page sizes) */
+#define FSP_EXTENT_SIZE ((UNIV_PAGE_SIZE <= (1 << 14) ? \
+ (1048576U / UNIV_PAGE_SIZE) : \
+ ((UNIV_PAGE_SIZE <= 1 << 15) ? \
+ (2097152U / UNIV_PAGE_SIZE) : (4194304U / UNIV_PAGE_SIZE))))
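Worked values of the new FSP_EXTENT_SIZE expression, computed directly from the macro above.

/*
   UNIV_PAGE_SIZE = 16384 (16k): 1048576 / 16384 = 64 pages per extent
   UNIV_PAGE_SIZE = 32768 (32k): 2097152 / 32768 = 64 pages per extent
   UNIV_PAGE_SIZE = 65536 (64k): 4194304 / 65536 = 64 pages per extent
   i.e. for page sizes above 16k the extent size in bytes is enlarged
   so that an extent never drops below 64 pages.
*/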
-/** File space extent size (one megabyte) in pages for MAX page size */
-#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX)
+/** File space extent size (four megabytes) in pages for MAX page size */
+#define FSP_EXTENT_SIZE_MAX (4194304U / UNIV_PAGE_SIZE_MAX)
/** File space extent size (one megabyte) in pages for MIN page size */
-#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
+#define FSP_EXTENT_SIZE_MIN (1048576U / UNIV_PAGE_SIZE_MIN)
/** On a page of any file segment, data may be put starting from this
offset */
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index 20409f85e0c..4d523cf1289 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -286,6 +286,16 @@ innobase_casedn_str(
/*================*/
char* a); /*!< in/out: string to put in lower case */
+#ifdef WITH_WSREP
+UNIV_INTERN
+int
+wsrep_innobase_kill_one_trx(void *thd_ptr,
+ const trx_t *bf_trx, trx_t *victim_trx, ibool signal);
+int wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
+ unsigned char* str, unsigned int str_length,
+ unsigned int buf_length);
+#endif /* WITH_WSREP */
+
/**********************************************************************//**
Determines the connection character set.
@return connection character set */
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 6f9a628df5d..9a4077befb1 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -144,6 +144,33 @@ do {\
}\
} while (0)
+#ifdef WITH_WSREP
+/*******************************************************************//**
+Inserts a struct to the head of hash table. */
+
+#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \
+do { \
+ hash_cell_t* cell3333; \
+ TYPE* struct3333; \
+ \
+ HASH_ASSERT_OWN(TABLE, FOLD) \
+ \
+ (DATA)->NAME = NULL; \
+ \
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+ \
+ if (cell3333->node == NULL) { \
+ cell3333->node = DATA; \
+ DATA->NAME = NULL; \
+ } else { \
+ struct3333 = (TYPE*) cell3333->node; \
+ \
+ DATA->NAME = struct3333; \
+ \
+ cell3333->node = DATA; \
+ } \
+} while (0)
+#endif /*WITH_WSREP */
#ifdef UNIV_HASH_DEBUG
# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index bf4a4ae1c35..d96fdfa9d89 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -181,6 +181,16 @@ lock_update_merge_left(
const buf_block_t* right_block); /*!< in: merged index page
which will be discarded */
/*************************************************************//**
+Updates the lock table when a page is split and merged into
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
@@ -972,6 +982,16 @@ extern lock_sys_t* lock_sys;
mutex_exit(&lock_sys->wait_mutex); \
} while (0)
+#ifdef WITH_WSREP
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in/out: waiting lock request */
+#endif /* WITH_WSREP */
#ifndef UNIV_NONINL
#include "lock0lock.ic"
#endif
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000000..b04f16d2a29
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,68 @@
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+*******************************************************/
+#ifndef log0crypt_h
+#define log0crypt_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "my_crypt.h"
+
+typedef int Crypt_result;
+
+/* If true, enable redo log encryption. */
+extern my_bool srv_encrypt_log;
+
+/***********************************************************************
+Set the next checkpoint's key version to the latest one, and generate a new key */
+UNIV_INTERN
+void
+log_crypt_set_ver_and_key(
+/*======================*/
+ ib_uint64_t next_checkpoint_no);
+
+
+/*********************************************************************//**
+Writes the crypto (version, msg and iv) info, which has been used for
+log blocks with lsn <= this checkpoint's lsn, to a log header's
+checkpoint buf. */
+UNIV_INTERN
+void
+log_crypt_write_checkpoint_buf(
+/*===========================*/
+ byte* buf); /*!< in/out: checkpoint buffer */
+
+/*********************************************************************//**
+Read the crypto (version, msg and iv) info, which has been used for
+log blocks with lsn <= this checkpoint's lsn, from a log header's
+checkpoint buf. */
+UNIV_INTERN
+bool
+log_crypt_read_checkpoint_buf(
+/*===========================*/
+ const byte* buf); /*!< in: checkpoint buffer */
+
+/********************************************************
+Encrypt one or more log blocks before they are flushed to disk */
+UNIV_INTERN
+void
+log_encrypt_before_write(
+/*===========================*/
+	ib_uint64_t next_checkpoint_no,	/*!< in: next checkpoint number */
+ byte* block, /*!< in/out: pointer to a log block */
+ const ulint size); /*!< in: size of log blocks */
+
+/********************************************************
+Decrypt a specified log segment after it has been read from a log file
+into a buffer. */
+UNIV_INTERN
+void
+log_decrypt_after_read(
+/*==========================*/
+ byte* frame, /*!< in/out: log segment */
+ const ulint size); /*!< in: log segment size */
+
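A hedged sketch of the intended pairing of the two functions above (not part of the patch); the buffer handling is illustrative, and the implementation is expected to skip blocks that were never encrypted.

/* Hedged sketch: encrypt log blocks on the write path and decrypt the
   same region on the recovery/read path. */
static void
log_block_roundtrip(
	ib_uint64_t	checkpoint_no,	/* next checkpoint number */
	byte*		blocks,		/* one or more log blocks */
	ulint		size)		/* size of the blocks, in bytes */
{
	if (srv_encrypt_log) {
		log_encrypt_before_write(checkpoint_no, blocks, size);
	}

	/* ... blocks are written to and later read back from the log ... */

	log_decrypt_after_read(blocks, size);
}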
+#endif // log0crypt.h
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index ad9710b1870..a960bc50454 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -40,9 +40,8 @@ Created 12/9/1995 Heikki Tuuri
#include "sync0sync.h"
#include "sync0rw.h"
#endif /* !UNIV_HOTBACKUP */
+#include "log0crypt.h"
-/* Type used for all log sequence number storage and arithmetics */
-typedef ib_uint64_t lsn_t;
#define LSN_MAX IB_UINT64_MAX
#define LSN_PF UINT64PF
@@ -677,8 +676,16 @@ extern log_t* log_sys;
is valid */
#endif
#define LOG_CHECKPOINT_OFFSET_HIGH32 (16 + LOG_CHECKPOINT_ARRAY_END)
-#define LOG_CHECKPOINT_SIZE (20 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CRYPT_VER (20 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CRYPT_MAX_ENTRIES (5)
+#define LOG_CRYPT_ENTRY_SIZE (4 + 4 + 2 * MY_AES_BLOCK_SIZE)
+#define LOG_CRYPT_SIZE (1 + 1 + \
+ (LOG_CRYPT_MAX_ENTRIES * \
+ LOG_CRYPT_ENTRY_SIZE))
+
+#define LOG_CHECKPOINT_SIZE (20 + LOG_CHECKPOINT_ARRAY_END + \
+ LOG_CRYPT_SIZE)
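Worked arithmetic for the new checkpoint layout, assuming MY_AES_BLOCK_SIZE is 16 bytes (the AES block size); LOG_CHECKPOINT_ARRAY_END is unchanged and not spelled out here.

/*
   LOG_CRYPT_ENTRY_SIZE = 4 + 4 + 2 * 16 = 40 bytes
   LOG_CRYPT_SIZE       = 1 + 1 + 5 * 40 = 202 bytes
   LOG_CHECKPOINT_SIZE  = 20 + LOG_CHECKPOINT_ARRAY_END + 202
   i.e. each checkpoint grows by 202 bytes to carry up to
   LOG_CRYPT_MAX_ENTRIES (5) crypt entries.
*/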
/* Offsets of a log file header */
#define LOG_GROUP_ID 0 /* log group number */
@@ -1006,6 +1013,22 @@ struct log_t{
/* @} */
#endif /* UNIV_LOG_ARCHIVE */
+extern os_event_t log_scrub_event;
+/* log scrubbing speed, in bytes/sec */
+extern ulonglong innodb_scrub_log_speed;
+
+/*****************************************************************//**
+This is the main thread for log scrub. It waits for an event and
+when waked up fills current log block with dummy records and
+sleeps again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(log_scrub_thread)(
+/*===============================*/
+ void* arg); /*!< in: a dummy parameter
+ required by os_thread_create */
+
#ifndef UNIV_NONINL
#include "log0log.ic"
#endif
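A quick size check of the new checkpoint layout: the crypt area appended by LOG_CRYPT_SIZE can be computed from the macros in the hunk above alone, assuming MY_AES_BLOCK_SIZE is the usual 16 bytes (its definition comes from my_crypt.h and is not shown here). Each of the 5 entries is then 40 bytes, plus 2 leading bytes, so LOG_CHECKPOINT_SIZE grows by 202 bytes over the old 20 + LOG_CHECKPOINT_ARRAY_END value. A standalone sketch:

#include <cstddef>

// Mirror of the macros added above; MY_AES_BLOCK_SIZE is assumed to be 16.
constexpr std::size_t MY_AES_BLOCK_SIZE_ASSUMED = 16;
constexpr std::size_t LOG_CRYPT_MAX_ENTRIES = 5;
constexpr std::size_t LOG_CRYPT_ENTRY_SIZE =
	4 + 4 + 2 * MY_AES_BLOCK_SIZE_ASSUMED;			// 40 bytes per entry
constexpr std::size_t LOG_CRYPT_SIZE =
	1 + 1 + LOG_CRYPT_MAX_ENTRIES * LOG_CRYPT_ENTRY_SIZE;	// 202 bytes

static_assert(LOG_CRYPT_ENTRY_SIZE == 40,
	      "two 4-byte fields plus the AES-block-sized msg and iv");
static_assert(LOG_CRYPT_SIZE == 202,
	      "the checkpoint buffer grows by 202 bytes");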
diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic
index c46fcec107e..fe55adaf002 100644
--- a/storage/innobase/include/mach0data.ic
+++ b/storage/innobase/include/mach0data.ic
@@ -42,6 +42,7 @@ mach_write_to_1(
b[0] = (byte) n;
}
+#endif /* !UNIV_INNOCHECKSUM */
/********************************************************//**
The following function is used to fetch data from one byte.
@@ -73,8 +74,6 @@ mach_write_to_2(
b[1] = (byte)(n);
}
-#endif /* !UNIV_INNOCHECKSUM */
-
/********************************************************//**
The following function is used to fetch data from 2 consecutive
bytes. The most significant byte is at the lowest address.
@@ -860,7 +859,10 @@ mach_write_ulonglong(
*dest ^= 0x80;
}
}
+#endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
+#ifndef UNIV_HOTBACKUP
/********************************************************//**
Reads 1 - 4 bytes from a file page buffered in the buffer pool.
@return value read */
@@ -886,4 +888,3 @@ mach_read_ulint(
}
#endif /* !UNIV_HOTBACKUP */
-#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic
index 3ed4876eeab..6457e02d455 100644
--- a/storage/innobase/include/mtr0log.ic
+++ b/storage/innobase/include/mtr0log.ic
@@ -191,7 +191,7 @@ mlog_write_initial_log_record_fast(
ulint offset;
ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
- ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type));
ut_ad(ptr && log_ptr);
page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE);
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index b91dbd0353c..eae981f2fbb 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -189,6 +189,14 @@ For 1 - 8 bytes, the flag value must give the length also! @{ */
page */
#define MLOG_BIGGEST_TYPE ((byte)53) /*!< biggest value (used in
assertions) */
+
+#define MLOG_FILE_WRITE_CRYPT_DATA ((byte)100) /*!< log record for
+ writing/updating crypt data of
+ a tablespace */
+
+#define EXTRA_CHECK_MLOG_NUMBER(x) \
+ ((x) == MLOG_FILE_WRITE_CRYPT_DATA)
+
/* @} */
/** @name Flags for MLOG_FILE operations
@@ -251,6 +259,18 @@ mtr_release_s_latch_at_savepoint(
#else /* !UNIV_HOTBACKUP */
# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Releases a buf_page stored in an mtr memo after a
+savepoint. */
+UNIV_INTERN
+void
+mtr_release_buf_page_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ buf_block_t* block); /*!< in: block to release */
+
/***************************************************************//**
Gets the logging mode of a mini-transaction.
@return logging mode: MTR_LOG_NONE, ... */
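MLOG_FILE_WRITE_CRYPT_DATA is deliberately placed above MLOG_BIGGEST_TYPE, which is why the assertion in mlog_write_initial_log_record_fast() (see the mtr0log.ic hunk above) is relaxed to also accept EXTRA_CHECK_MLOG_NUMBER() types. A self-contained sketch of the relaxed check, with the constants copied from these hunks:

#include <cassert>

typedef unsigned char byte;

const byte MLOG_BIGGEST_TYPE = 53;		// unchanged upper bound of the core types
const byte MLOG_FILE_WRITE_CRYPT_DATA = 100;	// new out-of-band record type

#define EXTRA_CHECK_MLOG_NUMBER(x) ((x) == MLOG_FILE_WRITE_CRYPT_DATA)

// Mirrors the relaxed ut_ad(): a type is valid if it is a core type
// or one of the explicitly whitelisted extra types.
static bool mlog_type_is_valid(byte type)
{
	return type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type);
}

int main()
{
	assert(mlog_type_is_valid(53));		// core type: accepted
	assert(mlog_type_is_valid(100));	// crypt-data record: accepted
	assert(!mlog_type_is_valid(77));	// anything else above 53: rejected
	return 0;
}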
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index 5077c9e37eb..04593f5b754 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
@@ -151,10 +152,9 @@ enum os_file_create_t {
#define OS_FILE_INSUFFICIENT_RESOURCE 78
#define OS_FILE_AIO_INTERRUPTED 79
#define OS_FILE_OPERATION_ABORTED 80
-
#define OS_FILE_ACCESS_VIOLATION 81
-
-#define OS_FILE_ERROR_MAX 100
+#define OS_FILE_OPERATION_NOT_SUPPORTED 125
+#define OS_FILE_ERROR_MAX 200
/* @} */
/** Types for aio operations @{ */
@@ -295,36 +295,38 @@ os_file_write
The wrapper functions have the prefix of "innodb_". */
#ifdef UNIV_PFS_IO
-# define os_file_create(key, name, create, purpose, type, success) \
+# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \
pfs_os_file_create_func(key, name, create, purpose, type, \
- success, __FILE__, __LINE__)
+ success, atomic_writes, __FILE__, __LINE__)
# define os_file_create_simple(key, name, create, access, success) \
pfs_os_file_create_simple_func(key, name, create, access, \
success, __FILE__, __LINE__)
# define os_file_create_simple_no_error_handling( \
- key, name, create_mode, access, success) \
+ key, name, create_mode, access, success, atomic_writes) \
pfs_os_file_create_simple_no_error_handling_func( \
- key, name, create_mode, access, success, __FILE__, __LINE__)
+ key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__)
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
# define os_aio(type, mode, name, file, buf, offset, \
- n, message1, message2) \
+ n, message1, message2, write_size) \
pfs_os_aio_func(type, mode, name, file, buf, offset, \
- n, message1, message2, __FILE__, __LINE__)
+ n, message1, message2, write_size, \
+ __FILE__, __LINE__)
+
# define os_file_read(file, buf, offset, n) \
pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__)
# define os_file_read_no_error_handling(file, buf, offset, n) \
pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \
- __FILE__, __LINE__)
+ __FILE__, __LINE__)
-# define os_file_write(name, file, buf, offset, n) \
- pfs_os_file_write_func(name, file, buf, offset, \
+# define os_file_write(name, file, buf, offset, n) \
+ pfs_os_file_write_func(name, file, buf, offset, \
n, __FILE__, __LINE__)
# define os_file_flush(file) \
@@ -342,24 +344,25 @@ The wrapper functions have the prefix of "innodb_". */
/* If UNIV_PFS_IO is not defined, these I/O APIs point
to original un-instrumented file I/O APIs */
-# define os_file_create(key, name, create, purpose, type, success) \
- os_file_create_func(name, create, purpose, type, success)
+# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \
+ os_file_create_func(name, create, purpose, type, success, atomic_writes)
-# define os_file_create_simple(key, name, create_mode, access, success) \
+# define os_file_create_simple(key, name, create_mode, access, success) \
os_file_create_simple_func(name, create_mode, access, success)
# define os_file_create_simple_no_error_handling( \
- key, name, create_mode, access, success) \
- os_file_create_simple_no_error_handling_func( \
- name, create_mode, access, success)
+ key, name, create_mode, access, success, atomic_writes) \
+ os_file_create_simple_no_error_handling_func( \
+ name, create_mode, access, success, atomic_writes)
# define os_file_close(file) os_file_close_func(file)
-# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \
+# define os_aio(type, mode, name, file, buf, offset, n, message1, \
+ message2, write_size) \
os_aio_func(type, mode, name, file, buf, offset, n, \
- message1, message2)
+ message1, message2, write_size)
-# define os_file_read(file, buf, offset, n) \
+# define os_file_read(file, buf, offset, n) \
os_file_read_func(file, buf, offset, n)
# define os_file_read_no_error_handling(file, buf, offset, n) \
@@ -524,7 +527,9 @@ os_file_create_simple_no_error_handling_func(
OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes)/*!< in: atomic writes table option
+ value */
__attribute__((nonnull, warn_unused_result));
/****************************************************************//**
Tries to disable OS caching on an opened file descriptor. */
@@ -558,7 +563,9 @@ os_file_create_func(
async i/o or unbuffered i/o: look in the
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes)/*!< in: atomic writes table option
+ value */
__attribute__((nonnull, warn_unused_result));
/***********************************************************************//**
Deletes a file. The file has to be closed before calling this.
@@ -648,6 +655,8 @@ pfs_os_file_create_simple_no_error_handling_func(
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
__attribute__((nonnull, warn_unused_result));
@@ -676,6 +685,8 @@ pfs_os_file_create_func(
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value*/
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
__attribute__((nonnull, warn_unused_result));
@@ -754,6 +765,11 @@ pfs_os_aio_func(
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
+ ulint* write_size,/*!< in/out: actual write size, initialized
+ after the first successful trim
+ operation for this page; once
+ initialized, we do not trim again
+ unless the actual page size decreases. */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line);/*!< in: line where the func invoked */
/*******************************************************************//**
@@ -952,6 +968,7 @@ os_file_write_func(
const void* buf, /*!< in: buffer from which to write */
os_offset_t offset, /*!< in: file offset where to write */
ulint n); /*!< in: number of bytes to write */
+
/*******************************************************************//**
Check the existence and type of the given file.
@return TRUE if call succeeded */
@@ -1114,10 +1131,15 @@ os_aio_func(
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
- void* message2);/*!< in: message for the aio handler
+ void* message2,/*!< in: message for the aio handler
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
+ ulint* write_size);/*!< in/out: actual write size, initialized
+ after the first successful trim
+ operation for this page; once
+ initialized, we do not trim again
+ unless the actual page size decreases. */
/************************************************************************//**
Wakes up all async i/o threads so that they know to exit themselves in
shutdown. */
@@ -1291,8 +1313,20 @@ os_file_handle_error_no_exit(
/*=========================*/
const char* name, /*!< in: name of a file or NULL */
const char* operation, /*!< in: operation */
- ibool on_error_silent);/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
any message to the log. */
+ const char* file, /*!< in: file name */
+ const ulint line); /*!< in: line */
+
+/***********************************************************************//**
+Try to get number of bytes per sector from file system.
+@return file block size */
+UNIV_INTERN
+ulint
+os_file_get_block_size(
+/*===================*/
+ os_file_t file, /*!< in: handle to a file */
+ const char* name); /*!< in: file name */
#ifndef UNIV_NONINL
#include "os0file.ic"
diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
index defd8204ba3..db525bcea19 100644
--- a/storage/innobase/include/os0file.ic
+++ b/storage/innobase/include/os0file.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -88,6 +89,8 @@ pfs_os_file_create_simple_no_error_handling_func(
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -103,7 +106,7 @@ pfs_os_file_create_simple_no_error_handling_func(
name, src_file, src_line);
file = os_file_create_simple_no_error_handling_func(
- name, create_mode, access_type, success);
+ name, create_mode, access_type, success, atomic_writes);
register_pfs_file_open_end(locker, file);
@@ -134,6 +137,8 @@ pfs_os_file_create_func(
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes, /*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -148,7 +153,7 @@ pfs_os_file_create_func(
: PSI_FILE_OPEN),
name, src_file, src_line);
- file = os_file_create_func(name, create_mode, purpose, type, success);
+ file = os_file_create_func(name, create_mode, purpose, type, success, atomic_writes);
register_pfs_file_open_end(locker, file);
@@ -210,6 +215,11 @@ pfs_os_aio_func(
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
+ ulint* write_size,/*!< in/out: actual write size, initialized
+ after the first successful trim
+ operation for this page; once
+ initialized, we do not trim again
+ unless the actual page size decreases. */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -225,7 +235,7 @@ pfs_os_aio_func(
src_file, src_line);
result = os_aio_func(type, mode, name, file, buf, offset,
- n, message1, message2);
+ n, message1, message2, write_size);
register_pfs_file_io_end(locker, n);
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index bd9bce75e57..d83b6e1985d 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -162,6 +162,8 @@ directory. */
#define PAGE_DIR_SLOT_MAX_N_OWNED 8
#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+extern my_bool srv_immediate_scrub_data_uncompressed;
+
/************************************************************//**
Gets the start of a page.
@return start of the page */
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
index 99e17001c0a..cde3cad33f0 100644
--- a/storage/innobase/include/page0page.ic
+++ b/storage/innobase/include/page0page.ic
@@ -1169,6 +1169,13 @@ page_mem_free(
ut_ad(rec_offs_validate(rec, index, offsets));
free = page_header_get_ptr(page, PAGE_FREE);
+ bool scrub = srv_immediate_scrub_data_uncompressed;
+ if (scrub) {
+ /* scrub record */
+ uint size = rec_offs_data_size(offsets);
+ memset(rec, 0, size);
+ }
+
page_rec_set_next(rec, free);
page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
index cfd0f10642a..94453cc7b89 100644
--- a/storage/innobase/include/rem0rec.h
+++ b/storage/innobase/include/rem0rec.h
@@ -984,6 +984,15 @@ are given in one byte (resp. two byte) format. */
two upmost bits in a two byte offset for special purposes */
#define REC_MAX_DATA_SIZE (16 * 1024)
+#ifdef WITH_WSREP
+int wsrep_rec_get_foreign_key(
+ byte *buf, /* out: extracted key */
+ ulint *buf_len, /* in/out: length of buf */
+ const rec_t* rec, /* in: physical record */
+ dict_index_t* index_for, /* in: index for foreign table */
+ dict_index_t* index_ref, /* in: index for referenced table */
+ ibool new_protocol); /* in: protocol > 1 */
+#endif /* WITH_WSREP */
#ifndef UNIV_NONINL
#include "rem0rec.ic"
#endif
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
index 62715fe8808..f105838eece 100644
--- a/storage/innobase/include/row0log.h
+++ b/storage/innobase/include/row0log.h
@@ -35,6 +35,10 @@ Created 2011-05-26 Marko Makela
#include "trx0types.h"
#include "que0types.h"
+extern ulint onlineddl_rowlog_rows;
+extern ulint onlineddl_rowlog_pct_used;
+extern ulint onlineddl_pct_progress;
+
/******************************************************//**
Allocate the row log for an index and flag the index
for online creation.
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index de353d46202..f280644de70 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -40,6 +40,18 @@ Created 13/06/2005 Jan Lindstrom
#include "lock0types.h"
#include "srv0srv.h"
+/* Clustered index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX 1.0
+
+/* Basic fixed cost to build any type of index */
+#define COST_BUILD_INDEX_STATIC 0.5
+/* Dynamic cost to build any type of index; the dynamic cost is redistributed based on the page-count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC 0.5
+
+/* The sum of the two values below must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX 0.4
+#define PCT_COST_INSERT_INDEX 0.6
+
// Forward declaration
struct ib_sequence_t;
@@ -371,7 +383,10 @@ row_merge_sort(
merge_file_t* file, /*!< in/out: file containing
index entries */
row_merge_block_t* block, /*!< in/out: 3 buffers */
- int* tmpfd) /*!< in/out: temporary file handle */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ const bool update_progress, /*!< in: update progress status variable or not */
+ const float pct_progress, /*!< in: total progress percent until now */
+ const float pct_cost) /*!< in: current progress percent */
__attribute__((nonnull));
/*********************************************************************//**
Allocate a sort buffer.
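How row0merge.cc combines the new cost constants is not visible in this hunk, so the following is only an illustrative reading: the clustered-index read keeps a fixed share of the progress budget, each secondary index gets an even slice of the static build cost plus a page-count-weighted slice of the dynamic cost, and that per-index budget is then split 40/60 between merge-sort and insert. The workload numbers are made up.

#include <cstdio>

// Weights copied from the hunk above.
const double COST_READ_CLUSTERED_INDEX = 1.0;
const double COST_BUILD_INDEX_STATIC   = 0.5;
const double COST_BUILD_INDEX_DYNAMIC  = 0.5;
const double PCT_COST_MERGESORT_INDEX  = 0.4;
const double PCT_COST_INSERT_INDEX     = 0.6;

int main()
{
	// Hypothetical workload: two secondary indexes with 300 and 700 pages.
	const double pages[2] = {300.0, 700.0};
	const double total_pages = pages[0] + pages[1];
	const double total_cost = COST_READ_CLUSTERED_INDEX
		+ COST_BUILD_INDEX_STATIC + COST_BUILD_INDEX_DYNAMIC;	// 2.0

	for (int i = 0; i < 2; i++) {
		// Static share split evenly, dynamic share split by page ratio.
		double build = COST_BUILD_INDEX_STATIC / 2
			+ COST_BUILD_INDEX_DYNAMIC * (pages[i] / total_pages);
		double sort_pct   = 100.0 * build * PCT_COST_MERGESORT_INDEX / total_cost;
		double insert_pct = 100.0 * build * PCT_COST_INSERT_INDEX    / total_cost;
		printf("index %d: merge-sort %.1f%%, insert %.1f%% of total progress\n",
		       i, sort_pct, insert_pct);
	}
	return 0;
}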
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index 06c07002c2b..440001410f0 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -606,6 +606,12 @@ struct mysql_row_templ_t {
Innobase record in the current index;
not defined if template_type is
ROW_MYSQL_WHOLE_ROW */
+ ibool rec_field_is_prefix; /* is this field in a prefix index? */
+ ulint rec_prefix_field_no; /* record field, even if just a
+ prefix; same as rec_field_no when not a
+ prefix, otherwise rec_field_no is
+ ULINT_UNDEFINED but this is the true
+ field number*/
ulint clust_rec_field_no; /*!< field number of the column in an
Innobase record in the clustered index;
not defined if template_type is
@@ -707,7 +713,9 @@ struct row_prebuilt_t {
columns through a secondary index
and at least one column is not in
the secondary index, then this is
- set to TRUE */
+ set to TRUE; note that sometimes this
+ is set but we later optimize out the
+ clustered index lookup */
unsigned templ_contains_blob:1;/*!< TRUE if the template contains
a column with DATA_BLOB ==
get_innobase_type_from_mysql_type();
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index 2d90f47eefe..79a8a5bf848 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -2,6 +2,7 @@
Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -163,7 +164,11 @@ enum monitor_id_t {
MONITOR_OVLD_BUF_POOL_PAGES_FREE,
MONITOR_OVLD_PAGE_CREATED,
MONITOR_OVLD_PAGES_WRITTEN,
+ MONITOR_OVLD_INDEX_PAGES_WRITTEN,
+ MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN,
MONITOR_OVLD_PAGES_READ,
+ MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS,
+ MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED,
MONITOR_OVLD_BYTE_READ,
MONITOR_OVLD_BYTE_WRITTEN,
MONITOR_FLUSH_BATCH_SCANNED,
@@ -194,9 +199,12 @@ enum monitor_id_t {
MONITOR_LRU_BATCH_SCANNED,
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
MONITOR_LRU_BATCH_SCANNED_PER_CALL,
- MONITOR_LRU_BATCH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_COUNT,
- MONITOR_LRU_BATCH_PAGES,
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_PAGES,
MONITOR_LRU_SINGLE_FLUSH_SCANNED,
MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
@@ -304,6 +312,24 @@ enum monitor_id_t {
MONITOR_PAGE_DECOMPRESS,
MONITOR_PAD_INCREMENTS,
MONITOR_PAD_DECREMENTS,
+ /* New monitor variables for page compression */
+ MONITOR_OVLD_PAGE_COMPRESS_SAVED,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSED,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED,
+ MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR,
+
+ /* New monitor variables for page encryption */
+ MONITOR_OVLD_PAGES_ENCRYPTED,
+ MONITOR_OVLD_PAGES_DECRYPTED,
/* Index related counters */
MONITOR_MODULE_INDEX,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index d06a14a9153..2ae5f1ea13e 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -103,6 +103,41 @@ struct srv_stats_t {
a disk page */
ulint_ctr_1_t buf_pool_reads;
+ /** Number of bytes saved by page compression */
+ ulint_ctr_64_t page_compression_saved;
+ /** Number of 512Byte TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect512;
+ /** Number of 1K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect1024;
+ /** Number of 2K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect2048;
+ /** Number of 4K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect4096;
+ /** Number of 8K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect8192;
+ /** Number of 16K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect16384;
+ /** Number of 32K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect32768;
+ /* Number of index pages written */
+ ulint_ctr_64_t index_pages_written;
+ /* Number of non index pages written */
+ ulint_ctr_64_t non_index_pages_written;
+ /* Number of pages compressed with page compression */
+ ulint_ctr_64_t pages_page_compressed;
+ /* Number of TRIM operations induced by page compression */
+ ulint_ctr_64_t page_compressed_trim_op;
+ /* Number of TRIM operations saved by using actual write size knowledge */
+ ulint_ctr_64_t page_compressed_trim_op_saved;
+ /* Number of pages decompressed with page compression */
+ ulint_ctr_64_t pages_page_decompressed;
+ /* Number of page compression errors */
+ ulint_ctr_64_t pages_page_compression_error;
+ /* Number of pages encrypted */
+ ulint_ctr_64_t pages_encrypted;
+ /* Number of pages decrypted */
+ ulint_ctr_64_t pages_decrypted;
+
/** Number of data read in total (in bytes) */
ulint_ctr_1_t data_read;
@@ -138,6 +173,12 @@ struct srv_stats_t {
/** Number of system rows inserted */
ulint_ctr_64_t n_system_rows_inserted;
+
+ /** Number of times secondary index lookup triggered cluster lookup */
+ ulint_ctr_64_t n_sec_rec_cluster_reads;
+
+ /** Number of times prefix optimization avoided triggering cluster lookup */
+ ulint_ctr_64_t n_sec_rec_cluster_reads_avoided;
};
extern const char* srv_main_thread_op_info;
@@ -230,6 +271,31 @@ OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
extern my_bool srv_use_native_aio;
+
+/* Use trim operation */
+extern my_bool srv_use_trim;
+
+/* Use posix fallocate */
+#ifdef HAVE_POSIX_FALLOCATE
+extern my_bool srv_use_posix_fallocate;
+#endif
+
+/* Use atomic writes, i.e. disable the doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* Compression algorithm */
+extern ulong innodb_compression_algorithm;
+
+/* Number of flush threads */
+#define MTFLUSH_MAX_WORKER 64
+#define MTFLUSH_DEFAULT_WORKER 8
+
+/* Number of threads used for multi-threaded flush */
+extern long srv_mtflush_threads;
+
+/* If this flag is TRUE, then we will use multi threaded flush. */
+extern my_bool srv_use_mtflush;
+
#ifdef __WIN__
extern ibool srv_use_native_conditions;
#endif /* __WIN__ */
@@ -260,6 +326,10 @@ extern ulong srv_auto_extend_increment;
extern ibool srv_created_new_raw;
+/* Optimize prefix index queries to skip the clustered index lookup
+when possible. Disabled by default. */
+extern my_bool srv_prefix_index_cluster_optimization;
+
/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */
#define SRV_N_LOG_FILES_MAX 100
extern ulong srv_n_log_files;
@@ -270,6 +340,10 @@ extern ulong srv_flush_log_at_trx_commit;
extern uint srv_flush_log_at_timeout;
extern char srv_adaptive_flushing;
+#ifdef WITH_INNODB_DISALLOW_WRITES
+/* When this event is reset we do not allow any file writes to take place. */
+extern os_event_t srv_allow_writes_event;
+#endif /* WITH_INNODB_DISALLOW_WRITES */
/* If this flag is TRUE, then we will load the indexes' (and tables') metadata
even if they are marked as "corrupted". Mostly it is for DBA to process
corrupted index and table */
@@ -301,6 +375,17 @@ extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
+/* Defragmentation. The original Facebook default is 100, but that is too high. */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool srv_defragment;
+extern uint srv_defragment_n_pages;
+extern uint srv_defragment_stats_accuracy;
+extern uint srv_defragment_fill_factor_n_recs;
+extern double srv_defragment_fill_factor;
+extern uint srv_defragment_frequency;
+extern ulonglong srv_defragment_interval;
+
+extern ulong srv_idle_flush_pct;
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@@ -362,10 +447,7 @@ extern my_bool srv_stats_sample_traditional;
extern ibool srv_use_doublewrite_buf;
extern ulong srv_doublewrite_batch_size;
-extern ibool srv_use_atomic_writes;
-#ifdef HAVE_POSIX_FALLOCATE
-extern ibool srv_use_posix_fallocate;
-#endif
+extern my_bool srv_force_primary_key;
extern double srv_max_buf_pool_modified_pct;
extern ulong srv_max_purge_lag;
@@ -393,6 +475,11 @@ extern ibool srv_buf_dump_thread_active;
/* TRUE during the lifetime of the stats thread */
extern ibool srv_dict_stats_thread_active;
+/* TRUE if log scrubbing is enabled */
+extern my_bool srv_scrub_log;
+/* TRUE during the lifetime of the log scrub thread */
+extern ibool srv_log_scrub_thread_active;
+
extern ulong srv_n_spin_wait_rounds;
extern ulong srv_n_free_tickets_to_enter;
extern ulong srv_thread_sleep_delay;
@@ -427,7 +514,6 @@ extern my_bool srv_ibuf_disable_background_merge;
extern my_bool srv_purge_view_update_only_debug;
#endif /* UNIV_DEBUG */
-extern ulint srv_fatal_semaphore_wait_threshold;
#define SRV_SEMAPHORE_WAIT_EXTENSION 7200
extern ulint srv_dml_needed_delay;
@@ -457,6 +543,9 @@ extern my_bool srv_print_all_deadlocks;
extern my_bool srv_cmp_per_index_enabled;
+/* is encryption enabled */
+extern ulong srv_encrypt_tables;
+
/** Status variables to be passed to MySQL */
extern struct export_var_t export_vars;
@@ -466,6 +555,17 @@ extern srv_stats_t srv_stats;
/** Simulate compression failures. */
extern uint srv_simulate_comp_failures;
+/** Fatal semaphore wait threshold: the maximum number of seconds a
+semaphore wait may last before InnoDB treats it as fatal */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong srv_fatal_semaphore_wait_threshold;
+
+/** Enable semaphore request instrumentation */
+extern my_bool srv_instrument_semaphores;
+
+/** Buffer pool dump status frequency, in percent */
+extern ulong srv_buf_dump_status_frequency;
+
# ifdef UNIV_PFS_THREAD
/* Keys to register InnoDB threads with performance schema */
extern mysql_pfs_key_t buf_page_cleaner_thread_key;
@@ -865,12 +965,79 @@ struct export_var_t{
ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/
ulint innodb_num_open_files; /*!< fil_n_file_opened */
ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */
- ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */
+ ulint innodb_available_undo_logs; /*!< srv_available_undo_logs
+ */
+ ulint innodb_defragment_compression_failures; /*!< Number of
+ defragment re-compression
+ failures */
+
+ ulint innodb_defragment_failures; /*!< Number of defragment
+ failures*/
+ ulint innodb_defragment_count; /*!< Number of defragment
+ operations*/
+
+ ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */
+ ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage
+ of used row log buffer */
+ ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */
+
#ifdef UNIV_DEBUG
ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id
- purged view's min trx_id */
#endif /* UNIV_DEBUG */
+
+ ib_int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect512;/*!< Number of 512b TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect1024;/*!< Number of 1K TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect2048;/*!< Number of 2K TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect4096;/*!< Number of 4K byte TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect8192;/*!< Number of 8K TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect16384;/*!< Number of 16K TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect32768;/*!< Number of 32K TRIM
+ by page compression */
+ ib_int64_t innodb_index_pages_written; /*!< Number of index pages
+ written */
+ ib_int64_t innodb_non_index_pages_written; /*!< Number of non index pages
+ written */
+ ib_int64_t innodb_pages_page_compressed;/*!< Number of pages
+ compressed by page compression */
+ ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+ induced by page compression */
+ ib_int64_t innodb_page_compressed_trim_op_saved;/*!< Number of TRIM operations
+ saved by page compression */
+ ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages
+ decompressed by page
+ compression */
+ ib_int64_t innodb_pages_page_compression_error;/*!< Number of page
+ compression errors */
+ ib_int64_t innodb_pages_encrypted; /*!< Number of pages
+ encrypted */
+ ib_int64_t innodb_pages_decrypted; /*!< Number of pages
+ decrypted */
+
+ ulint innodb_sec_rec_cluster_reads; /*!< srv_sec_rec_cluster_reads */
+ ulint innodb_sec_rec_cluster_reads_avoided;/*!< srv_sec_rec_cluster_reads_avoided */
+
+ ulint innodb_encryption_rotation_pages_read_from_cache;
+ ulint innodb_encryption_rotation_pages_read_from_disk;
+ ulint innodb_encryption_rotation_pages_modified;
+ ulint innodb_encryption_rotation_pages_flushed;
+ ulint innodb_encryption_rotation_estimated_iops;
+
+ ulint innodb_scrub_page_reorganizations;
+ ulint innodb_scrub_page_splits;
+ ulint innodb_scrub_page_split_failures_underflow;
+ ulint innodb_scrub_page_split_failures_out_of_filespace;
+ ulint innodb_scrub_page_split_failures_missing_index;
+ ulint innodb_scrub_page_split_failures_unknown;
};
/** Thread slot in the thread table. */
@@ -910,5 +1077,13 @@ struct srv_slot_t{
# define srv_start_raw_disk_in_use 0
# define srv_file_per_table 1
#endif /* !UNIV_HOTBACKUP */
+#ifdef WITH_WSREP
+UNIV_INTERN
+void
+wsrep_srv_conc_cancel_wait(
+/*==================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+#endif /* WITH_WSREP */
#endif
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
index 40d502f4459..e1c19982ba5 100644
--- a/storage/innobase/include/srv0start.h
+++ b/storage/innobase/include/srv0start.h
@@ -37,7 +37,8 @@ Created 10/10/1995 Heikki Tuuri
#endif
/*********************************************************************//**
-Normalizes a directory path for Windows: converts slashes to backslashes. */
+Normalizes a directory path for Windows: converts slashes to backslashes.
+*/
UNIV_INTERN
void
srv_normalize_path_for_win(
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
index 0e735192024..6c3225b1826 100644
--- a/storage/innobase/include/sync0arr.h
+++ b/storage/innobase/include/sync0arr.h
@@ -31,7 +31,7 @@ Created 9/5/1995 Heikki Tuuri
#include "ut0mem.h"
#include "os0thread.h"
-/** Synchronization wait array cell */
+/** Synchronization cell */
struct sync_cell_t;
/** Synchronization wait array */
struct sync_array_t;
@@ -154,6 +154,16 @@ UNIV_INTERN
void
sync_array_print_innodb(void);
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+UNIV_INTERN
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n); /*!< in: index */
+
#ifndef UNIV_NONINL
#include "sync0arr.ic"
#endif
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
index b36e04f2810..d212ac17871 100644
--- a/storage/innobase/include/sync0rw.h
+++ b/storage/innobase/include/sync0rw.h
@@ -40,6 +40,9 @@ Created 9/11/1995 Heikki Tuuri
#include "sync0sync.h"
#include "os0sync.h"
+/** Enable semaphore request instrumentation */
+extern my_bool srv_instrument_semaphores;
+
/* The following undef is to prevent a name conflict with a macro
in MySQL: */
#undef rw_lock_t
@@ -159,7 +162,7 @@ defined, the rwlock are instrumented with performance schema probes. */
# endif/* UNIV_SYNC_DEBUG */
# else /* UNIV_DEBUG */
# define rw_lock_create(K, L, level) \
- rw_lock_create_func((L), __FILE__, __LINE__)
+ rw_lock_create_func((L), #L, __FILE__, __LINE__)
# endif /* UNIV_DEBUG */
/**************************************************************//**
@@ -224,7 +227,7 @@ unlocking, not the corresponding function. */
# endif/* UNIV_SYNC_DEBUG */
# else /* UNIV_DEBUG */
# define rw_lock_create(K, L, level) \
- pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__)
+ pfs_rw_lock_create_func((K), (L), #L, __FILE__, __LINE__)
# endif /* UNIV_DEBUG */
/******************************************************************
@@ -294,8 +297,8 @@ rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
#endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
const char* cfile_name, /*!< in: file name where created */
ulint cline); /*!< in: file line where created */
/******************************************************************//**
@@ -610,6 +613,10 @@ struct rw_lock_t {
#endif
ulint count_os_wait; /*!< Count of os_waits. May not be accurate */
const char* cfile_name;/*!< File name where lock created */
+ const char* lock_name; /*!< lock name */
+ os_thread_id_t thread_id;/*!< thread id */
+ const char* file_name;/*!< File name where the lock was obtained */
+ ulint line; /*!< Line where the rw-lock was locked */
/* last s-lock file/line is not guaranteed to be correct */
const char* last_s_file_name;/*!< File name where last s-locked */
const char* last_x_file_name;/*!< File name where last x-locked */
@@ -688,8 +695,8 @@ pfs_rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
#endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
const char* cfile_name, /*!< in: file name where created */
ulint cline); /*!< in: file line where created */
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
index 69251da6e35..8c4e938002a 100644
--- a/storage/innobase/include/sync0rw.ic
+++ b/storage/innobase/include/sync0rw.ic
@@ -325,6 +325,12 @@ rw_lock_s_lock_low(
lock->last_s_file_name = file_name;
lock->last_s_line = line;
+ if (srv_instrument_semaphores) {
+ lock->thread_id = os_thread_get_curr_id();
+ lock->file_name = file_name;
+ lock->line = line;
+ }
+
return(TRUE); /* locking succeeded */
}
@@ -431,6 +437,12 @@ rw_lock_x_lock_func_nowait(
rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
#endif
+ if (srv_instrument_semaphores) {
+ lock->thread_id = os_thread_get_curr_id();
+ lock->file_name = file_name;
+ lock->line = line;
+ }
+
lock->last_x_file_name = file_name;
lock->last_x_line = line;
@@ -551,8 +563,8 @@ pfs_rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
# endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
const char* cfile_name, /*!< in: file name where created */
ulint cline) /*!< in: file line where created */
{
@@ -565,8 +577,8 @@ pfs_rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
level,
# endif /* UNIV_SYNC_DEBUG */
- cmutex_name,
# endif /* UNIV_DEBUG */
+ cmutex_name,
cfile_name,
cline);
}
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index 7b00e16476b..d9c9d7cfbac 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -42,6 +42,9 @@ Created 9/5/1995 Heikki Tuuri
#include "os0sync.h"
#include "sync0arr.h"
+/** Enable semaphore request instrumentation */
+extern my_bool srv_instrument_semaphores;
+
#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
extern "C" my_bool timed_mutexes;
#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
@@ -180,7 +183,7 @@ necessary only if the memory block containing it is freed. */
# endif/* UNIV_SYNC_DEBUG */
# else
# define mutex_create(K, M, level) \
- pfs_mutex_create_func((K), (M), __FILE__, __LINE__)
+ pfs_mutex_create_func((K), (M), #M, __FILE__, __LINE__)
# endif /* UNIV_DEBUG */
# define mutex_enter(M) \
@@ -207,7 +210,7 @@ original non-instrumented functions */
# endif /* UNIV_SYNC_DEBUG */
# else /* UNIV_DEBUG */
# define mutex_create(K, M, level) \
- mutex_create_func((M), __FILE__, __LINE__)
+ mutex_create_func((M), #M, __FILE__, __LINE__)
# endif /* UNIV_DEBUG */
# define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
@@ -231,8 +234,8 @@ void
mutex_create_func(
/*==============*/
ib_mutex_t* mutex, /*!< in: pointer to memory */
-#ifdef UNIV_DEBUG
const char* cmutex_name, /*!< in: mutex name */
+#ifdef UNIV_DEBUG
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
@@ -305,8 +308,8 @@ pfs_mutex_create_func(
/*==================*/
PSI_mutex_key key, /*!< in: Performance Schema key */
ib_mutex_t* mutex, /*!< in: pointer to memory */
-# ifdef UNIV_DEBUG
const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_DEBUG
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
@@ -687,6 +690,7 @@ or row lock! */
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
+#define SYNC_STATS_DEFRAG 390
/*------------------------------------- Change buffer headers */
#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
/*------------------------------------- Change buffer tree */
@@ -762,22 +766,22 @@ struct ib_mutex_t {
UT_LIST_NODE_T(ib_mutex_t) list; /*!< All allocated mutexes are put into
a list. Pointers to the next and prev. */
#ifdef UNIV_SYNC_DEBUG
- const char* file_name; /*!< File where the mutex was locked */
- ulint line; /*!< Line where the mutex was locked */
ulint level; /*!< Level in the global latching order */
#endif /* UNIV_SYNC_DEBUG */
+
+ const char* file_name; /*!< File where the mutex was locked */
+ ulint line; /*!< Line where the mutex was locked */
const char* cfile_name;/*!< File name where mutex created */
ulint cline; /*!< Line where created */
ulong count_os_wait; /*!< count of os_wait */
+ const char* cmutex_name; /*!< mutex name */
+ os_thread_id_t thread_id; /*!< The thread id of the thread
+ which locked the mutex. */
#ifdef UNIV_DEBUG
/** Value of mutex_t::magic_n */
# define MUTEX_MAGIC_N 979585UL
-
- os_thread_id_t thread_id; /*!< The thread id of the thread
- which locked the mutex. */
ulint magic_n; /*!< MUTEX_MAGIC_N */
- const char* cmutex_name; /*!< mutex name */
ulint ib_mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */
#endif /* UNIV_DEBUG */
#ifdef UNIV_PFS_MUTEX
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
index 97ec63c0dd2..d29932b39a6 100644
--- a/storage/innobase/include/sync0sync.ic
+++ b/storage/innobase/include/sync0sync.ic
@@ -162,7 +162,7 @@ mutex_exit_func(
{
ut_ad(mutex_own(mutex));
- ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+ mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED;
#ifdef UNIV_SYNC_DEBUG
sync_thread_reset_level(mutex);
@@ -204,16 +204,24 @@ mutex_enter_func(
ulint line) /*!< in: line where locked */
{
ut_ad(mutex_validate(mutex));
+#ifndef WITH_WSREP
+ /* this assertion does not hold when a BF trx kills a trx in lock wait state */
ut_ad(!mutex_own(mutex));
+#endif /* WITH_WSREP */
/* Note that we do not peek at the value of lock_word before trying
the atomic test_and_set; we could peek, and possibly save time. */
if (!ib_mutex_test_and_set(mutex)) {
- ut_d(mutex->thread_id = os_thread_get_curr_id());
+ mutex->thread_id = os_thread_get_curr_id();
#ifdef UNIV_SYNC_DEBUG
mutex_set_debug_info(mutex, file_name, line);
#endif
+ if (srv_instrument_semaphores) {
+ mutex->file_name = file_name;
+ mutex->line = line;
+ }
+
return; /* Succeeded! */
}
@@ -321,8 +329,8 @@ pfs_mutex_create_func(
/*==================*/
mysql_pfs_key_t key, /*!< in: Performance Schema key */
ib_mutex_t* mutex, /*!< in: pointer to memory */
-# ifdef UNIV_DEBUG
const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_DEBUG
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
@@ -333,8 +341,8 @@ pfs_mutex_create_func(
mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
mutex_create_func(mutex,
-# ifdef UNIV_DEBUG
cmutex_name,
+# ifdef UNIV_DEBUG
# ifdef UNIV_SYNC_DEBUG
level,
# endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index 70f214d1ac7..9ffc8d99a7f 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -42,6 +42,9 @@ Created 3/26/1996 Heikki Tuuri
#include "read0types.h"
#include "page0types.h"
#include "ut0bh.h"
+#ifdef WITH_WSREP
+#include "trx0xa.h"
+#endif /* WITH_WSREP */
typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t;
@@ -293,6 +296,9 @@ trx_sys_update_mysql_binlog_offset(
ib_int64_t offset, /*!< in: position in that log file */
ulint field, /*!< in: offset of the MySQL log info field in
the trx sys header */
+#ifdef WITH_WSREP
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+#endif /* WITH_WSREP */
mtr_t* mtr); /*!< in: mtr */
/*****************************************************************//**
Prints to stderr the MySQL binlog offset info in the trx system header if
@@ -301,6 +307,19 @@ UNIV_INTERN
void
trx_sys_print_mysql_binlog_offset(void);
/*===================================*/
+#ifdef WITH_WSREP
+/** Update WSREP checkpoint XID in sys header. */
+void
+trx_sys_update_wsrep_checkpoint(
+ const XID* xid, /*!< in: WSREP XID */
+ trx_sysf_t* sys_header, /*!< in: sys_header */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Read WSREP checkpoint XID from sys header. */
+void
+trx_sys_read_wsrep_checkpoint(
+ XID* xid); /*!< out: WSREP XID */
+#endif /* WITH_WSREP */
/*****************************************************************//**
Prints to stderr the MySQL master log offset info in the trx system header if
the magic number shows it valid. */
@@ -529,6 +548,20 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
within that file */
#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
+#ifdef WITH_WSREP
+/* The offset to WSREP XID headers */
+#define TRX_SYS_WSREP_XID_INFO (UNIV_PAGE_SIZE - 3500)
+#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0
+#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265
+
+/* XID field: formatID, gtrid_len, bqual_len, xid_data */
+#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE)
+#define TRX_SYS_WSREP_XID_FORMAT 4
+#define TRX_SYS_WSREP_XID_GTRID_LEN 8
+#define TRX_SYS_WSREP_XID_BQUAL_LEN 12
+#define TRX_SYS_WSREP_XID_DATA 16
+#endif /* WITH_WSREP*/
+
/** Doublewrite buffer */
/* @{ */
/** The offset of the doublewrite buffer header on the trx system header page */
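For reference, the WSREP XID record defined above is a small fixed-size area near the end of the TRX_SYS page. Its size follows from the offsets alone once XIDDATASIZE is known; the sketch below assumes the usual XA value of 128 bytes (64-byte gtrid plus 64-byte bqual), which is not defined in this hunk.

#include <cstddef>

// Field offsets, relative to TRX_SYS_WSREP_XID_INFO, as defined in the hunk
// above: magic at +0, formatID at +4, gtrid_len at +8, bqual_len at +12,
// xid data at +16. XIDDATASIZE is assumed to be 128.
constexpr std::size_t XIDDATASIZE_ASSUMED    = 128;
constexpr std::size_t TRX_SYS_WSREP_XID_DATA = 16;
constexpr std::size_t TRX_SYS_WSREP_XID_LEN  = 4 + 4 + 4 + XIDDATASIZE_ASSUMED;

static_assert(TRX_SYS_WSREP_XID_LEN == 140,
	      "formatID + gtrid_len + bqual_len + xid data");
static_assert(TRX_SYS_WSREP_XID_DATA + XIDDATASIZE_ASSUMED == 144,
	      "the whole record, magic word included, spans 144 bytes");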
diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
index e097e29b551..7265a97ae25 100644
--- a/storage/innobase/include/trx0sys.ic
+++ b/storage/innobase/include/trx0sys.ic
@@ -445,7 +445,10 @@ trx_id_t
trx_sys_get_new_trx_id(void)
/*========================*/
{
+#ifndef WITH_WSREP
+ /* wsrep_fake_trx_id violates this assert */
ut_ad(mutex_own(&trx_sys->mutex));
+#endif /* WITH_WSREP */
/* VERY important: after the database is started, max_trx_id value is
divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 57b91844aca..6cb082edc67 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1022,6 +1022,10 @@ struct trx_t{
ulint total_table_lock_wait_time;
/*!< Total table lock wait time
up to this moment. */
+
+#ifdef WITH_WSREP
+ os_event_t wsrep_event; /* event waited for in srv_conc_slot */
+#endif /* WITH_WSREP */
};
/* Transaction isolation levels (trx->isolation_level) */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 3e17f65e4bc..4732ac48290 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -331,6 +332,36 @@ typedef enum innodb_file_formats_enum innodb_file_formats_t;
/** The 2-logarithm of UNIV_PAGE_SIZE: */
#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift
+#ifdef HAVE_LZO
+#define IF_LZO(A,B) A
+#else
+#define IF_LZO(A,B) B
+#endif
+
+#ifdef HAVE_LZ4
+#define IF_LZ4(A,B) A
+#else
+#define IF_LZ4(A,B) B
+#endif
+
+#ifdef HAVE_LZMA
+#define IF_LZMA(A,B) A
+#else
+#define IF_LZMA(A,B) B
+#endif
+
+#ifdef HAVE_BZIP2
+#define IF_BZIP2(A,B) A
+#else
+#define IF_BZIP2(A,B) B
+#endif
+
+#ifdef HAVE_SNAPPY
+#define IF_SNAPPY(A,B) A
+#else
+#define IF_SNAPPY(A,B) B
+#endif
+
/** The universal page size of the database */
#define UNIV_PAGE_SIZE ((ulint) srv_page_size)
@@ -344,13 +375,15 @@ and 2 bits for flags. This limits the uncompressed page size to 16k.
Even though a 16k uncompressed page can theoretically be compressed
into a larger compressed page, it is not a useful feature so we will
limit both with this same constant. */
-#define UNIV_ZIP_SIZE_SHIFT_MAX 14
+#define UNIV_ZIP_SIZE_SHIFT_MAX 15
/* Define the Min, Max, Default page sizes. */
/** Minimum Page Size Shift (power of 2) */
#define UNIV_PAGE_SIZE_SHIFT_MIN 12
+/** log2 of largest page size (1<<16 == 65536 bytes). */
/** Maximum Page Size Shift (power of 2) */
-#define UNIV_PAGE_SIZE_SHIFT_MAX 14
+#define UNIV_PAGE_SIZE_SHIFT_MAX 16
+/** log2 of default page size (1<<14 == 16384 bytes). */
/** Default Page Size Shift (power of 2) */
#define UNIV_PAGE_SIZE_SHIFT_DEF 14
/** Original 16k InnoDB Page Size Shift, in case the default changes */
@@ -451,6 +484,9 @@ typedef uint32_t ib_uint32_t;
# define IB_ID_FMT UINT64PF
+/* Type used for all log sequence number storage and arithmetics */
+typedef ib_uint64_t lsn_t;
+
#ifdef _WIN64
typedef unsigned __int64 ulint;
typedef __int64 lint;
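The IF_LZO/IF_LZ4/IF_LZMA/IF_BZIP2/IF_SNAPPY helpers added above are plain compile-time selectors: the first argument is used when the corresponding library was detected at configure time, the second otherwise. A tiny standalone illustration (the HAVE_LZ4 define and the strings are made up for the example):

#include <cstdio>

// Pretend configure detected LZ4 but not LZO.
#define HAVE_LZ4

#ifdef HAVE_LZO
#define IF_LZO(A,B) A
#else
#define IF_LZO(A,B) B
#endif

#ifdef HAVE_LZ4
#define IF_LZ4(A,B) A
#else
#define IF_LZ4(A,B) B
#endif

int main()
{
	// Expands to the first argument only when the library is available.
	const char* lz4_state = IF_LZ4("lz4: supported", "lz4: not built in");
	const char* lzo_state = IF_LZO("lzo: supported", "lzo: not built in");
	printf("%s\n%s\n", lz4_state, lzo_state);
	return 0;
}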
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
index 29fc8669ce4..796a272db59 100644
--- a/storage/innobase/include/ut0list.h
+++ b/storage/innobase/include/ut0list.h
@@ -150,6 +150,15 @@ ib_list_is_empty(
/* out: TRUE if empty else */
const ib_list_t* list); /* in: list */
+/********************************************************************
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+ const ib_list_t* list); /*!< in: list */
+
/* List. */
struct ib_list_t {
ib_list_node_t* first; /*!< first node */
diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic
index d9dcb2eac99..7a7f53adb2f 100644
--- a/storage/innobase/include/ut0list.ic
+++ b/storage/innobase/include/ut0list.ic
@@ -58,3 +58,23 @@ ib_list_is_empty(
{
return(!(list->first || list->last));
}
+
+/********************************************************************
+Get number of items on list.
+@return number of items on list */
+UNIV_INLINE
+ulint
+ib_list_len(
+/*========*/
+ const ib_list_t* list) /*!< in: list */
+{
+ ulint len = 0;
+ ib_list_node_t* node = list->first;
+
+ while(node) {
+ len++;
+ node = node->next;
+ }
+
+ return (len);
+}
diff --git a/storage/innobase/include/ut0timer.h b/storage/innobase/include/ut0timer.h
new file mode 100644
index 00000000000..f361ae79bf5
--- /dev/null
+++ b/storage/innobase/include/ut0timer.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.h
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+#ifndef ut0timer_h
+#define ut0timer_h
+
+#include "univ.i"
+#include "data0type.h"
+#include <my_rdtsc.h>
+
+/* Current timer stats */
+extern struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Function pointer to the selected timer function.
+@return current timer value */
+extern ulonglong (*ut_timer_now)(void);
+
+/**************************************************************//**
+Sets up the data required for use of the my_timer_* functions.
+Selects the best timer based on frequency and resolution.
+Points ut_timer_now() to the selected timer function and
+initializes the ut_timer struct with the info for the selected timer. */
+UNIV_INTERN
+void ut_init_timer(void);
+
+/**************************************************************//**
+Return time passed since time then, automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then); /*!< in: time where to calculate */
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in a seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+/**************************************************************//**
+Convert microseconds given as a ulonglong into native timer units
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when); /*!< in: time where to calculate */
+
+#ifndef UNIV_NONINL
+#include "ut0timer.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0timer.ic b/storage/innobase/include/ut0timer.ic
new file mode 100644
index 00000000000..027e89c6279
--- /dev/null
+++ b/storage/innobase/include/ut0timer.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ut0timer.ic
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+/**************************************************************//**
+Return the time passed since "then", automatically adjusted
+for the estimated timer overhead.
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since(
+/*===========*/
+ ulonglong then) /*!< in: time where to calculate */
+{
+ return (ut_timer_now() - then) - ut_timer.overhead;
+}
+
+/**************************************************************//**
+Get time passed since "then", and update then to now
+@return time passed since "then" */
+UNIV_INLINE
+ulonglong
+ut_timer_since_and_update(
+/*======================*/
+ ulonglong *then) /*!< in: time where to calculate */
+{
+ ulonglong now = ut_timer_now();
+ ulonglong ret = (now - (*then)) - ut_timer.overhead;
+ *then = now;
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into seconds in a double
+@return time in seconds */
+UNIV_INLINE
+double
+ut_timer_to_seconds(
+/*=================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into milliseconds in a double
+@return time in milliseconds */
+UNIV_INLINE
+double
+ut_timer_to_milliseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert native timer units in a ulonglong into microseconds in a double
+@return time in microseconds */
+UNIV_INLINE
+double
+ut_timer_to_microseconds(
+/*=====================*/
+ ulonglong when) /*!< in: time where to calculate */
+{
+ double ret = (double)(when);
+ ret *= 1000000.0;
+ ret /= (double)(ut_timer.frequency);
+ return ret;
+}
+
+/**************************************************************//**
+Convert microseconds in a double to native timer units in a ulonglong
+@return time in native timer units */
+UNIV_INLINE
+ulonglong
+ut_microseconds_to_timer(
+/*=====================*/
+ ulonglong when) /*!< in: time in microseconds */
+{
+ double ret = when;
+ ret *= (double)(ut_timer.frequency);
+ ret /= 1000000.0;
+ return (ulonglong)ret;
+}
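
The four conversion helpers above are all scalings by ut_timer.frequency (native ticks per second). As a quick sanity check of the arithmetic, here is a small self-contained sketch, not part of the patch, using a hypothetical frequency of 2 * 10^9 ticks/second in place of the real ut_timer.frequency:

#include <stdio.h>

typedef unsigned long long ulonglong;

/* hypothetical timer frequency; the real value comes from ut_timer.frequency */
static const ulonglong frequency = 2000000000ULL;

static double ticks_to_microseconds(ulonglong ticks)
{
	return (double) ticks * 1000000.0 / (double) frequency;
}

static ulonglong microseconds_to_ticks(ulonglong usec)
{
	return (ulonglong) ((double) usec * (double) frequency / 1000000.0);
}

int main(void)
{
	ulonglong t = microseconds_to_ticks(250);
	/* prints "250 us -> 500000 ticks -> 250.0 us" */
	printf("250 us -> %llu ticks -> %.1f us\n", t, ticks_to_microseconds(t));
	return 0;
}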
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 939ccee6e3e..bf8aeefcbe1 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -224,12 +224,15 @@ ut_2_power_up(
ulint n) /*!< in: number != 0 */
__attribute__((const));
+#endif /* !UNIV_INNOCHECKSUM */
+
/** Determine how many bytes (groups of 8 bits) are needed to
store the given number of bits.
@param b in: bits
@return number of bytes (octets) needed to represent b */
#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)
+#ifndef UNIV_INNOCHECKSUM
/**********************************************************//**
Returns system time. We do not specify the format of the time returned:
the only way to manipulate it is to use the function ut_difftime.
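
The UT_BITS_IN_BYTES macro above rounds a bit count up to whole octets via ((b) + 7) / 8. A tiny illustrative check, not part of the patch:

#include <assert.h>

#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)

int main(void)
{
	assert(UT_BITS_IN_BYTES(1)  == 1);  /* a single bit still occupies one byte */
	assert(UT_BITS_IN_BYTES(8)  == 1);
	assert(UT_BITS_IN_BYTES(9)  == 2);  /* the ninth bit spills into a second byte */
	assert(UT_BITS_IN_BYTES(64) == 8);
	return 0;
}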
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
index 33385ddf2d4..9906e299808 100644
--- a/storage/innobase/include/ut0wqueue.h
+++ b/storage/innobase/include/ut0wqueue.h
@@ -95,6 +95,23 @@ ib_wqueue_timedwait(
ib_wqueue_t* wq, /* in: work queue */
ib_time_t wait_in_usecs); /* in: wait time in micro seconds */
+/********************************************************************
+Return first item on work queue or NULL if queue is empty
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/********************************************************************
+Get number of items on queue.
+@return number of items on queue */
+ulint
+ib_wqueue_len(
+/*==========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+
/* Work queue. */
struct ib_wqueue_t {
ib_mutex_t mutex; /*!< mutex protecting everything */
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 7755d6d5ef1..85be4b3ad0b 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -52,6 +52,8 @@ Created 5/7/1996 Heikki Tuuri
#include <set>
#include "mysql/plugin.h"
+#include <mysql/service_wsrep.h>
+
/* Restricts the length of search we will do in the waits-for
graph of transactions */
#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
@@ -962,6 +964,9 @@ UNIV_INLINE
ibool
lock_rec_has_to_wait(
/*=================*/
+#ifdef WITH_WSREP
+ ibool for_locking, /*!< is caller locking or releasing */
+#endif /* WITH_WSREP */
const trx_t* trx, /*!< in: trx of new lock */
ulint type_mode,/*!< in: precise mode of the new lock
to set: LOCK_S or LOCK_X, possibly
@@ -1058,6 +1063,50 @@ lock_rec_has_to_wait(
return (FALSE);
}
+#ifdef WITH_WSREP
+ /* if BF thread is locking and has conflict with another BF
+ thread, we need to look at trx ordering and lock types */
+ if (for_locking &&
+ wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
+ wsrep_thd_is_BF(lock2->trx->mysql_thd, TRUE)) {
+
+ if (wsrep_debug) {
+ fprintf(stderr, "\n BF-BF lock conflict \n");
+ lock_rec_print(stderr, lock2);
+ }
+
+ if (wsrep_trx_order_before(trx->mysql_thd,
+ lock2->trx->mysql_thd) &&
+ (type_mode & LOCK_MODE_MASK) == LOCK_X &&
+ (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X)
+ {
+ /* exclusive lock conflicts are not accepted */
+ fprintf(stderr, "BF-BF X lock conflict,"
+ "type_mode: %lu supremum: %lu\n",
+ type_mode, lock_is_on_supremum);
+ fprintf(stderr, "conflicts states: my %d locked %d\n",
+ wsrep_thd_conflict_state(trx->mysql_thd, FALSE),
+ wsrep_thd_conflict_state(lock2->trx->mysql_thd, FALSE) );
+ lock_rec_print(stderr, lock2);
+ return FALSE;
+ //abort();
+ } else {
+ /* if lock2->index->n_uniq <=
+ lock2->index->n_user_defined_cols
+ operation is on uniq index
+ */
+ if (wsrep_debug) fprintf(stderr,
+ "BF conflict, modes: %lu %lu, "
+ "idx: %s-%s n_uniq %u n_user %u\n",
+ type_mode, lock2->type_mode,
+ lock2->index->name,
+ lock2->index->table_name,
+ lock2->index->n_uniq,
+ lock2->index->n_user_defined_cols);
+ return FALSE;
+ }
+ }
+#endif /* WITH_WSREP */
return(TRUE);
}
@@ -1088,7 +1137,11 @@ lock_has_to_wait(
/* If this lock request is for a supremum record
then the second bit on the lock bitmap is set */
+#ifdef WITH_WSREP
+ return(lock_rec_has_to_wait(FALSE, lock1->trx,
+#else
return(lock_rec_has_to_wait(lock1->trx,
+#endif /* WITH_WSREP */
lock1->type_mode, lock2,
lock_rec_get_nth_bit(
lock1, 1)));
@@ -1557,6 +1610,11 @@ lock_rec_has_expl(
return(NULL);
}
+#ifdef WITH_WSREP
+static
+void
+lock_rec_discard(lock_t* in_lock);
+#endif
#ifdef UNIV_DEBUG
/*********************************************************************//**
Checks if some other transaction has a lock request in the queue.
@@ -1605,6 +1663,69 @@ lock_rec_other_has_expl_req(
}
#endif /* UNIV_DEBUG */
+#ifdef WITH_WSREP
+static
+void
+wsrep_kill_victim(
+ const trx_t * const trx,
+ const lock_t *lock)
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(lock->trx));
+ my_bool bf_this = wsrep_thd_is_BF(trx->mysql_thd, FALSE);
+ my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE);
+
+ if ((bf_this && !bf_other) ||
+ (bf_this && bf_other && wsrep_trx_order_before(
+ trx->mysql_thd, lock->trx->mysql_thd))) {
+
+ if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ if (wsrep_debug) {
+ fprintf(stderr, "WSREP: BF victim waiting\n");
+ }
+ /* cannot release lock, until our lock
+ is in the queue*/
+ } else if (lock->trx != trx) {
+ if (wsrep_log_conflicts) {
+ mutex_enter(&trx_sys->mutex);
+ if (bf_this) {
+ fputs("\n*** Priority TRANSACTION:\n",
+ stderr);
+ } else {
+ fputs("\n*** Victim TRANSACTION:\n",
+ stderr);
+ }
+
+ trx_print_latched(stderr, trx, 3000);
+
+ if (bf_other) {
+ fputs("\n*** Priority TRANSACTION:\n",
+ stderr);
+ } else {
+ fputs("\n*** Victim TRANSACTION:\n",
+ stderr);
+ }
+
+ trx_print_latched(stderr, lock->trx, 3000);
+
+ mutex_exit(&trx_sys->mutex);
+
+ fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
+ stderr);
+
+ if (lock_get_type(lock) == LOCK_REC) {
+ lock_rec_print(stderr, lock);
+ } else {
+ lock_table_print(stderr, lock);
+ }
+ }
+
+ wsrep_innobase_kill_one_trx(trx->mysql_thd,
+ (const trx_t*) trx, lock->trx, TRUE);
+ }
+ }
+}
+#endif
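
wsrep_kill_victim above boils down to a small predicate over the two transactions: the caller may kill the holder of the conflicting lock only if it has brute-force (BF) priority over it, the holder is not itself waiting, and the holder is not the caller. A hedged restatement with plain booleans, illustrative only; the real code works on trx_t/lock_t, prints diagnostics, and calls wsrep_innobase_kill_one_trx:

#include <stdbool.h>

static bool should_kill_lock_holder(bool caller_is_bf, bool holder_is_bf,
                                    bool caller_ordered_first,
                                    bool holder_is_waiting, bool holder_is_caller)
{
	bool caller_has_priority = (caller_is_bf && !holder_is_bf)
		|| (caller_is_bf && holder_is_bf && caller_ordered_first);

	if (!caller_has_priority) {
		return false;
	}
	if (holder_is_waiting) {
		/* cannot release the victim's lock until our own lock is queued */
		return false;
	}
	return !holder_is_caller;
}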
/*********************************************************************//**
Checks if some other transaction has a conflicting explicit lock request
in the queue, so that we have to wait.
@@ -1633,7 +1754,15 @@ lock_rec_other_has_conflicting(
lock != NULL;
lock = lock_rec_get_next_const(heap_no, lock)) {
- if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+#ifdef WITH_WSREP
+ if (lock_rec_has_to_wait(TRUE, trx, mode, lock, is_supremum)) {
+ trx_mutex_enter(lock->trx);
+ wsrep_kill_victim(trx, lock);
+ trx_mutex_exit(lock->trx);
+#else
+ if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+#endif /* WITH_WSREP */
+
return(lock);
}
}
@@ -1814,6 +1943,28 @@ lock_number_of_rows_locked(
/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+#ifdef WITH_WSREP
+static
+void
+wsrep_print_wait_locks(
+/*============*/
+ lock_t* c_lock) /* conflicting lock to print */
+{
+ if (wsrep_debug && c_lock->trx->lock.wait_lock != c_lock) {
+ fprintf(stderr, "WSREP: c_lock != wait lock\n");
+ if (lock_get_type_low(c_lock) & LOCK_TABLE)
+ lock_table_print(stderr, c_lock);
+ else
+ lock_rec_print(stderr, c_lock);
+
+ if (lock_get_type_low(c_lock->trx->lock.wait_lock) & LOCK_TABLE)
+ lock_table_print(stderr, c_lock->trx->lock.wait_lock);
+ else
+ lock_rec_print(stderr, c_lock->trx->lock.wait_lock);
+ }
+}
+#endif /* WITH_WSREP */
+
/*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility!
@@ -1822,6 +1973,10 @@ static
lock_t*
lock_rec_create(
/*============*/
+#ifdef WITH_WSREP
+ lock_t* const c_lock, /* conflicting lock */
+ que_thr_t* thr,
+#endif
ulint type_mode,/*!< in: lock mode and wait
flag, type is ignored and
replaced by LOCK_REC */
@@ -1896,8 +2051,88 @@ lock_rec_create(
ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted);
+#ifdef WITH_WSREP
+ if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+ lock_t *hash = (lock_t *)c_lock->hash;
+ lock_t *prev = NULL;
+
+ while (hash &&
+ wsrep_thd_is_BF(((lock_t *)hash)->trx->mysql_thd, TRUE) &&
+ wsrep_trx_order_before(
+ ((lock_t *)hash)->trx->mysql_thd,
+ trx->mysql_thd)) {
+ prev = hash;
+ hash = (lock_t *)hash->hash;
+ }
+ lock->hash = hash;
+ if (prev) {
+ prev->hash = lock;
+ } else {
+ c_lock->hash = lock;
+ }
+ /*
+ * delayed conflict resolution '...kill_one_trx' was not called,
+ * if victim was waiting for some other lock
+ */
+ trx_mutex_enter(c_lock->trx);
+ if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+ if (wsrep_debug) {
+ wsrep_print_wait_locks(c_lock);
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+ lock_set_lock_and_trx_wait(lock, trx);
+ UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+
+ ut_ad(thr != NULL);
+ trx->lock.wait_thr = thr;
+ thr->state = QUE_THR_LOCK_WAIT;
+
+ /* have to release trx mutex for the duration of
+ victim lock release. This will eventually call
+ lock_grant, which wants to grant trx mutex again
+ */
+ if (caller_owns_trx_mutex) {
+ trx_mutex_exit(trx);
+ }
+ lock_cancel_waiting_and_release(
+ c_lock->trx->lock.wait_lock);
+
+ if (caller_owns_trx_mutex) {
+ trx_mutex_enter(trx);
+ }
+
+ /* trx might not wait for c_lock, but some other lock
+ does not matter if wait_lock was released above
+ */
+ if (c_lock->trx->lock.wait_lock == c_lock) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ trx_mutex_exit(c_lock->trx);
+
+ if (wsrep_debug) {
+ fprintf(
+ stderr,
+ "WSREP: c_lock canceled %llu\n",
+ (ulonglong) c_lock->trx->id);
+ }
+
+ /* have to bail out here to avoid lock_set_lock... */
+ return(lock);
+ }
+ trx_mutex_exit(c_lock->trx);
+ } else {
+ HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), lock);
+ }
+#else
HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(space, page_no), lock);
+#endif /* WITH_WSREP */
if (!caller_owns_trx_mutex) {
trx_mutex_enter(trx);
@@ -1905,7 +2140,6 @@ lock_rec_create(
ut_ad(trx_mutex_own(trx));
if (type_mode & LOCK_WAIT) {
-
lock_set_lock_and_trx_wait(lock, trx);
}
@@ -1917,7 +2151,6 @@ lock_rec_create(
MONITOR_INC(MONITOR_RECLOCK_CREATED);
MONITOR_INC(MONITOR_NUM_RECLOCK);
-
return(lock);
}
@@ -1932,6 +2165,9 @@ static
dberr_t
lock_rec_enqueue_waiting(
/*=====================*/
+#ifdef WITH_WSREP
+ lock_t* c_lock, /* conflicting lock */
+#endif
ulint type_mode,/*!< in: lock mode this
transaction is requesting:
LOCK_S or LOCK_X, possibly
@@ -1989,6 +2225,9 @@ lock_rec_enqueue_waiting(
/* Enqueue the lock request that will wait to be granted, note that
we already own the trx mutex. */
lock = lock_rec_create(
+#ifdef WITH_WSREP
+ c_lock, thr,
+#endif /* WITH_WSREP */
type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE);
/* Release the mutex to obey the latching order.
@@ -2092,7 +2331,19 @@ lock_rec_add_to_queue(
const lock_t* other_lock
= lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
block, heap_no, trx);
+#ifdef WITH_WSREP
+ /* this can potentially assert with wsrep */
+ if (wsrep_thd_is_wsrep(trx->mysql_thd)) {
+ if (wsrep_debug && other_lock) {
+ fprintf(stderr,
+ "WSREP: InnoDB assert ignored\n");
+ }
+ } else {
+ ut_a(!other_lock);
+ }
+#else
ut_a(!other_lock);
+#endif /* WITH_WSREP */
}
#endif /* UNIV_DEBUG */
@@ -2120,7 +2371,16 @@ lock_rec_add_to_queue(
if (lock_get_wait(lock)
&& lock_rec_get_nth_bit(lock, heap_no)) {
-
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+ if (wsrep_debug) {
+ fprintf(stderr,
+ "BF skipping wait: %lu\n",
+ trx->id);
+ lock_rec_print(stderr, lock);
+ }
+ } else
+#endif
goto somebody_waits;
}
}
@@ -2143,9 +2403,15 @@ lock_rec_add_to_queue(
}
somebody_waits:
- return(lock_rec_create(
+#ifdef WITH_WSREP
+ return(lock_rec_create(NULL, NULL,
type_mode, block, heap_no, index, trx,
caller_owns_trx_mutex));
+#else
+ return(lock_rec_create(
+ type_mode, block, heap_no, index, trx,
+ caller_owns_trx_mutex));
+#endif /* WITH_WSREP */
}
/** Record locking request status */
@@ -2208,9 +2474,13 @@ lock_rec_lock_fast(
if (lock == NULL) {
if (!impl) {
/* Note that we don't own the trx mutex. */
+#ifdef WITH_WSREP
+ lock = lock_rec_create(NULL, thr,
+ mode, block, heap_no, index, trx, FALSE);
+#else
lock = lock_rec_create(
mode, block, heap_no, index, trx, FALSE);
-
+#endif /* WITH_WSREP */
}
status = LOCK_REC_SUCCESS_CREATED;
} else {
@@ -2263,6 +2533,9 @@ lock_rec_lock_slow(
que_thr_t* thr) /*!< in: query thread */
{
trx_t* trx;
+#ifdef WITH_WSREP
+ lock_t* c_lock(NULL);
+#endif
dberr_t err = DB_SUCCESS;
ut_ad(lock_mutex_own());
@@ -2286,18 +2559,31 @@ lock_rec_lock_slow(
/* The trx already has a strong enough lock on rec: do
nothing */
-
+#ifdef WITH_WSREP
+ } else if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting(
+ static_cast<enum lock_mode>(mode),
+ block, heap_no, trx))) {
+#else
} else if (lock_rec_other_has_conflicting(
static_cast<enum lock_mode>(mode),
block, heap_no, trx)) {
+#endif /* WITH_WSREP */
/* If another transaction has a non-gap conflicting
request in the queue, as this transaction does not
have a lock strong enough already granted on the
record, we have to wait. */
+#ifdef WITH_WSREP
+ /* c_lock is NULL here if jump to enqueue_waiting happened
+ but it's ok because lock is not NULL in that case and c_lock
+ is not used. */
+ err = lock_rec_enqueue_waiting(c_lock,
+ mode, block, heap_no, index, thr);
+#else
err = lock_rec_enqueue_waiting(
mode, block, heap_no, index, thr);
+#endif /* WITH_WSREP */
} else if (!impl) {
/* Set the requested lock on the record, note that
@@ -2403,7 +2689,13 @@ lock_rec_has_to_wait_in_queue(
if (heap_no < lock_rec_get_n_bits(lock)
&& (p[bit_offset] & bit_mask)
&& lock_has_to_wait(wait_lock, lock)) {
-
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(wait_lock->trx->mysql_thd, FALSE) &&
+ wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE)) {
+ /* don't wait for another BF lock */
+ continue;
+ }
+#endif
return(lock);
}
}
@@ -3328,6 +3620,47 @@ lock_update_merge_left(
}
/*************************************************************//**
+Updates the lock table when a page is split and merged to
+two pages. */
+UNIV_INTERN
+void
+lock_update_split_and_merge(
+ const buf_block_t* left_block, /*!< in: left page to which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor of
+ supremum on the left page before merge*/
+ const buf_block_t* right_block) /*!< in: right page from which merged */
+{
+ const rec_t* left_next_rec;
+
+ ut_a(left_block && right_block);
+ ut_a(orig_pred);
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(
+ left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM,
+ lock_get_min_heap_no(right_block));
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN
@@ -3798,10 +4131,22 @@ lock_deadlock_select_victim(
/* The joining transaction is 'smaller',
choose it as the victim and roll it back. */
- return(ctx->start);
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) {
+ return(ctx->wait_lock->trx);
+ }
+ else
+#endif /* WITH_WSREP */
+ return(ctx->start);
}
- return(ctx->wait_lock->trx);
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(ctx->wait_lock->trx->mysql_thd, TRUE)) {
+ return(ctx->start);
+ }
+ else
+#endif /* WITH_WSREP */
+ return(ctx->wait_lock->trx);
}
/********************************************************************//**
@@ -3931,8 +4276,14 @@ lock_deadlock_search(
ctx->too_deep = TRUE;
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) {
+ return(ctx->wait_lock->trx->id);
+ }
+ else
+#endif /* WITH_WSREP */
/* Select the joining transaction as the victim. */
- return(ctx->start->id);
+ return(ctx->start->id);
} else {
/* We do not need to report autoinc locks to the upper
@@ -3973,6 +4324,11 @@ lock_deadlock_search(
size not big enough. */
ctx->too_deep = TRUE;
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE))
+ return(lock->trx->id);
+ else
+#endif /* WITH_WSREP */
return(ctx->start->id);
}
@@ -4157,9 +4513,18 @@ lock_deadlock_check_and_resolve(
ut_a(trx == ctx.start);
ut_a(victim_trx_id == trx->id);
- if (!srv_read_only_mode) {
- lock_deadlock_joining_trx_print(trx, lock);
+#ifdef WITH_WSREP
+ if (!wsrep_thd_is_BF(ctx.start->mysql_thd, TRUE))
+ {
+#endif /* WITH_WSREP */
+ if (!srv_read_only_mode) {
+ lock_deadlock_joining_trx_print(trx, lock);
+ }
+#ifdef WITH_WSREP
+ } else {
+ /* BF processor */;
}
+#endif /* WITH_WSREP */
MONITOR_INC(MONITOR_DEADLOCK);
@@ -4197,6 +4562,9 @@ UNIV_INLINE
lock_t*
lock_table_create(
/*==============*/
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< in: conflicting lock */
+#endif
dict_table_t* table, /*!< in/out: database table
in dictionary cache */
ulint type_mode,/*!< in: lock mode possibly ORed with
@@ -4242,7 +4610,59 @@ lock_table_create(
ut_ad(table->n_ref_count > 0 || !table->can_be_evicted);
UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+
+#ifdef WITH_WSREP
+ if (wsrep_thd_is_wsrep(trx->mysql_thd)) {
+ if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+ UT_LIST_INSERT_AFTER(
+ un_member.tab_lock.locks, table->locks, c_lock, lock);
+ } else {
+ UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+ }
+
+ if (c_lock) {
+ trx_mutex_enter(c_lock->trx);
+ }
+
+ if (c_lock && c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+ if (wsrep_debug) {
+ wsrep_print_wait_locks(c_lock);
+ wsrep_print_wait_locks(c_lock->trx->lock.wait_lock);
+ }
+
+ /* have to release trx mutex for the duration of
+ victim lock release. This will eventually call
+ lock_grant, which wants to grant trx mutex again
+ */
+ /* caller has trx_mutex, have to release for lock cancel */
+ trx_mutex_exit(trx);
+ lock_cancel_waiting_and_release(c_lock->trx->lock.wait_lock);
+ trx_mutex_enter(trx);
+
+ /* trx might not wait for c_lock, but some other lock
+ does not matter if wait_lock was released above
+ */
+ if (c_lock->trx->lock.wait_lock == c_lock) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ if (wsrep_debug) {
+ fprintf(stderr, "WSREP: c_lock canceled %llu\n",
+ (ulonglong) c_lock->trx->id);
+ }
+ }
+ if (c_lock) {
+ trx_mutex_exit(c_lock->trx);
+ }
+ } else {
+ UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+ }
+#else
UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+#endif /* WITH_WSREP */
if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
@@ -4399,6 +4819,9 @@ static
dberr_t
lock_table_enqueue_waiting(
/*=======================*/
+#ifdef WITH_WSREP
+ lock_t* c_lock, /*!< in: conflicting lock */
+#endif
ulint mode, /*!< in: lock mode this transaction is
requesting */
dict_table_t* table, /*!< in/out: table */
@@ -4443,7 +4866,14 @@ lock_table_enqueue_waiting(
/* Enqueue the lock request that will wait to be granted */
- lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+#ifdef WITH_WSREP
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+ return(DB_DEADLOCK);
+ }
+ lock = lock_table_create(c_lock, table, mode | LOCK_WAIT, trx);
+#else
+ lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+#endif /* WITH_WSREP */
/* Release the mutex to obey the latching order.
This is safe, because lock_deadlock_check_and_resolve()
@@ -4516,6 +4946,18 @@ lock_table_other_has_incompatible(
&& !lock_mode_compatible(lock_get_mode(lock), mode)
&& (wait || !lock_get_wait(lock))) {
+#ifdef WITH_WSREP
+ if(wsrep_thd_is_wsrep(trx->mysql_thd)) {
+ if (wsrep_debug) {
+ fprintf(stderr, "WSREP: trx %ld table lock abort\n",
+ trx->id);
+ }
+ trx_mutex_enter(lock->trx);
+ wsrep_kill_victim((trx_t *)trx, (lock_t *)lock);
+ trx_mutex_exit(lock->trx);
+ }
+#endif
+
return(lock);
}
}
@@ -4538,6 +4980,9 @@ lock_table(
enum lock_mode mode, /*!< in: lock mode */
que_thr_t* thr) /*!< in: query thread */
{
+#ifdef WITH_WSREP
+ lock_t *c_lock = NULL;
+#endif
trx_t* trx;
dberr_t err;
const lock_t* wait_for;
@@ -4565,11 +5010,19 @@ lock_table(
lock_mutex_enter();
+ DBUG_EXECUTE_IF("fatal-semaphore-timeout",
+ { os_thread_sleep(3600000000); });
+
/* We have to check if the new lock is compatible with any locks
other transactions have in the table lock queue. */
+#ifdef WITH_WSREP
+ wait_for = lock_table_other_has_incompatible(
+ trx, LOCK_WAIT, table, mode);
+#else
wait_for = lock_table_other_has_incompatible(
trx, LOCK_WAIT, table, mode);
+#endif
trx_mutex_enter(trx);
@@ -4577,9 +5030,17 @@ lock_table(
mode: this trx may have to wait */
if (wait_for != NULL) {
+#ifdef WITH_WSREP
+ err = lock_table_enqueue_waiting((ib_lock_t*)wait_for, mode | flags, table, thr);
+#else
err = lock_table_enqueue_waiting(mode | flags, table, thr);
+#endif
} else {
+#ifdef WITH_WSREP
+ lock_table_create(c_lock, table, mode | flags, trx);
+#else
lock_table_create(table, mode | flags, trx);
+#endif
ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
@@ -4617,7 +5078,11 @@ lock_table_ix_resurrect(
trx, LOCK_WAIT, table, LOCK_IX));
trx_mutex_enter(trx);
+#ifdef WITH_WSREP
+ lock_table_create(NULL, table, LOCK_IX, trx);
+#else
lock_table_create(table, LOCK_IX, trx);
+#endif
lock_mutex_exit();
trx_mutex_exit(trx);
}
@@ -5788,6 +6253,7 @@ lock_rec_queue_validate(
if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
+#ifndef WITH_WSREP
enum lock_mode mode;
if (lock_get_mode(lock) == LOCK_S) {
@@ -5796,7 +6262,8 @@ lock_rec_queue_validate(
mode = LOCK_S;
}
ut_a(!lock_rec_other_has_expl_req(
- mode, 0, 0, block, heap_no, lock->trx));
+ mode, 0, 0, block, heap_no, lock->trx));
+#endif /* WITH_WSREP */
} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
@@ -6101,6 +6568,9 @@ lock_rec_insert_check_and_lock(
dberr_t err;
ulint next_rec_heap_no;
ibool inherit_in = *inherit;
+#ifdef WITH_WSREP
+ lock_t* c_lock=NULL;
+#endif
ut_ad(block->frame == page_align(rec));
ut_ad(!dict_index_is_online_ddl(index)
@@ -6157,17 +6627,30 @@ lock_rec_insert_check_and_lock(
had to wait for their insert. Both had waiting gap type lock requests
on the successor, which produced an unnecessary deadlock. */
+#ifdef WITH_WSREP
+ if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting(
+ static_cast<enum lock_mode>(
+ LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION),
+ block, next_rec_heap_no, trx))) {
+#else
if (lock_rec_other_has_conflicting(
static_cast<enum lock_mode>(
LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION),
block, next_rec_heap_no, trx)) {
+#endif /* WITH_WSREP */
/* Note that we may get DB_SUCCESS also here! */
trx_mutex_enter(trx);
+#ifdef WITH_WSREP
+ err = lock_rec_enqueue_waiting(c_lock,
+ LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+ block, next_rec_heap_no, index, thr);
+#else
err = lock_rec_enqueue_waiting(
LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
block, next_rec_heap_no, index, thr);
+#endif /* WITH_WSREP */
trx_mutex_exit(trx);
} else {
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
new file mode 100644
index 00000000000..79f4ba35b69
--- /dev/null
+++ b/storage/innobase/log/log0crypt.cc
@@ -0,0 +1,501 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2015, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file log0crypt.cc
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu Google
+Modified Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+#include "m_string.h"
+#include "log0crypt.h"
+#include <my_crypt.h>
+
+#include "log0log.h"
+#include "srv0start.h" // for srv_start_lsn
+#include "log0recv.h" // for recv_sys
+
+#include "ha_prototypes.h" // IB_LOG_
+
+#include "my_crypt.h"
+
+#define UNENCRYPTED_KEY_VER 0
+
+/* If true, enable redo log encryption. */
+extern my_bool srv_encrypt_log;
+
+
+#include <algorithm> // std::sort
+#include <deque>
+
+/* If true, enable redo log encryption. */
+UNIV_INTERN my_bool srv_encrypt_log = FALSE;
+/*
+ Sub system type for InnoDB redo log crypto.
+ Set and used to validate crypto msg.
+*/
+static const byte redo_log_purpose_byte = 0x02;
+
+#define LOG_DEFAULT_ENCRYPTION_KEY 1
+
+/*
+ Store this many keys into each checkpoint info
+*/
+static const size_t kMaxSavedKeys = LOG_CRYPT_MAX_ENTRIES;
+
+struct crypt_info_t {
+ ib_uint64_t checkpoint_no; /*!< checkpoint no */
+ uint key_version; /*!< mysqld key version */
+ byte crypt_msg[MY_AES_BLOCK_SIZE];
+ byte crypt_key[MY_AES_BLOCK_SIZE];
+ byte crypt_nonce[MY_AES_BLOCK_SIZE];
+};
+
+static std::deque<crypt_info_t> crypt_info;
+
+/*********************************************************************//**
+Get a log block's start lsn.
+@return a log block's start lsn */
+static inline
+lsn_t
+log_block_get_start_lsn(
+/*====================*/
+ lsn_t lsn, /*!< in: checkpoint lsn */
+ ulint log_block_no) /*!< in: log block number */
+{
+ lsn_t start_lsn =
+ (lsn & (lsn_t)0xffffffff00000000ULL) |
+ (((log_block_no - 1) & (lsn_t)0x3fffffff) << 9);
+ return start_lsn;
+}
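
Since a redo log block is 512 bytes (2^9) and the block header stores only a 30-bit block number, the helper above keeps the high 32 bits of the checkpoint lsn and rebuilds the low bits as (block_no - 1) * 512. A standalone restatement of the same arithmetic with a worked value, not part of the patch:

typedef unsigned long long lsn_t;

static lsn_t block_start_lsn(lsn_t checkpoint_lsn, unsigned long block_no)
{
	return (checkpoint_lsn & 0xffffffff00000000ULL)
	       | (((lsn_t)(block_no - 1) & 0x3fffffffULL) << 9);
}

/* block_start_lsn(0x200002000ULL, 17) == 0x200000000 + 16 * 512
                                       == 0x200002000 */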
+
+static
+const crypt_info_t*
+get_crypt_info(
+/*===========*/
+ ib_uint64_t checkpoint_no)
+{
+ /* so that no one is modifying array while we search */
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ /* a log block only stores 4-bytes of checkpoint no */
+ checkpoint_no &= 0xFFFFFFFF;
+ for (size_t i = 0; i < crypt_info.size(); i++) {
+ struct crypt_info_t* it = &crypt_info[i];
+
+ if (it->checkpoint_no == checkpoint_no) {
+ return it;
+ }
+ }
+ return NULL;
+}
+
+static
+const crypt_info_t*
+get_crypt_info(
+/*===========*/
+ const byte* log_block) {
+ ib_uint64_t checkpoint_no = log_block_get_checkpoint_no(log_block);
+ return get_crypt_info(checkpoint_no);
+}
+
+/*********************************************************************//**
+Call AES CTR to encrypt/decrypt log blocks. */
+static
+Crypt_result
+log_blocks_crypt(
+/*=============*/
+ const byte* block, /*!< in: blocks before encrypt/decrypt*/
+ ulint size, /*!< in: size of block */
+ byte* dst_block, /*!< out: blocks after encrypt/decrypt */
+ bool is_encrypt) /*!< in: encrypt or decrypt*/
+{
+ byte *log_block = (byte*)block;
+ Crypt_result rc = MY_AES_OK;
+ uint dst_len;
+ byte aes_ctr_counter[MY_AES_BLOCK_SIZE];
+ lsn_t lsn = is_encrypt ? log_sys->lsn : srv_start_lsn;
+
+ const int src_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE;
+ for (ulint i = 0; i < size ; i += OS_FILE_LOG_BLOCK_SIZE) {
+ ulint log_block_no = log_block_get_hdr_no(log_block);
+ lsn_t log_block_start_lsn = log_block_get_start_lsn(
+ lsn, log_block_no);
+
+ const crypt_info_t* info = get_crypt_info(log_block);
+#ifdef DEBUG_CRYPT
+ fprintf(stderr,
+ "%s %lu chkpt: %lu key: %u lsn: %lu\n",
+ is_encrypt ? "crypt" : "decrypt",
+ log_block_no,
+ log_block_get_checkpoint_no(log_block),
+ info ? info->key_version : 0,
+ log_block_start_lsn);
+#endif
+ if (info == NULL ||
+ info->key_version == UNENCRYPTED_KEY_VER) {
+ memcpy(dst_block, log_block, OS_FILE_LOG_BLOCK_SIZE);
+ goto next;
+ }
+
+ // Assume log block header is not encrypted
+ memcpy(dst_block, log_block, LOG_BLOCK_HDR_SIZE);
+
+ // aes_ctr_counter = nonce(3-byte) + start lsn to a log block
+ // (8-byte) + lbn (4-byte) + abn
+ // (1-byte, only 5 bits are used). "+" means concatenate.
+ bzero(aes_ctr_counter, MY_AES_BLOCK_SIZE);
+ memcpy(aes_ctr_counter, info->crypt_nonce, 3);
+ mach_write_to_8(aes_ctr_counter + 3, log_block_start_lsn);
+ mach_write_to_4(aes_ctr_counter + 11, log_block_no);
+ bzero(aes_ctr_counter + 15, 1);
+
+ int rc;
+ if (is_encrypt) {
+ rc = encryption_encrypt(log_block + LOG_BLOCK_HDR_SIZE, src_len,
+ dst_block + LOG_BLOCK_HDR_SIZE, &dst_len,
+ (unsigned char*)(info->crypt_key), 16,
+ aes_ctr_counter, MY_AES_BLOCK_SIZE, 1,
+ LOG_DEFAULT_ENCRYPTION_KEY,
+ info->key_version);
+ } else {
+ rc = encryption_decrypt(log_block + LOG_BLOCK_HDR_SIZE, src_len,
+ dst_block + LOG_BLOCK_HDR_SIZE, &dst_len,
+ (unsigned char*)(info->crypt_key), 16,
+ aes_ctr_counter, MY_AES_BLOCK_SIZE, 1,
+ LOG_DEFAULT_ENCRYPTION_KEY,
+ info->key_version);
+ }
+
+ ut_a(rc == MY_AES_OK);
+ ut_a(dst_len == src_len);
+next:
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ dst_block += OS_FILE_LOG_BLOCK_SIZE;
+ }
+
+ return rc;
+}
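
The 16-byte AES-CTR counter assembled in the loop above is nonce(3 bytes) + block start lsn(8 bytes, big-endian) + block number(4 bytes, big-endian) + one zero byte. A minimal sketch of just that assembly, not part of the patch, using plain shifts in place of the big-endian mach_write_to_* helpers:

#include <stdint.h>
#include <string.h>

static void build_ctr_counter(uint8_t ctr[16], const uint8_t nonce[3],
                              uint64_t start_lsn, uint32_t block_no)
{
	memset(ctr, 0, 16);
	memcpy(ctr, nonce, 3);                         /* bytes 0..2  : nonce        */
	for (int i = 0; i < 8; i++) {                  /* bytes 3..10 : lsn, BE      */
		ctr[3 + i] = (uint8_t)(start_lsn >> (56 - 8 * i));
	}
	for (int i = 0; i < 4; i++) {                  /* bytes 11..14: block no, BE */
		ctr[11 + i] = (uint8_t)(block_no >> (24 - 8 * i));
	}
	/* byte 15 is left zero, as in the patch */
}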
+
+/*********************************************************************//**
+Generate crypt key from crypt msg.
+@return true if successful, false if not. */
+static
+bool
+init_crypt_key(
+/*===========*/
+ crypt_info_t* info) /*!< in/out: crypt info */
+{
+ if (info->key_version == UNENCRYPTED_KEY_VER) {
+ memset(info->crypt_key, 0, sizeof(info->crypt_key));
+ memset(info->crypt_msg, 0, sizeof(info->crypt_msg));
+ memset(info->crypt_nonce, 0, sizeof(info->crypt_nonce));
+ return true;
+ }
+
+ byte mysqld_key[MY_AES_BLOCK_SIZE] = {0};
+ uint keylen= sizeof(mysqld_key);
+
+ if (encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY, info->key_version, mysqld_key, &keylen))
+ {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Redo log crypto: getting mysqld crypto key "
+ "from key version failed. Reason could be that requested"
+ " key_version %u is not found or required encryption "
+ " key management is not found.", info->key_version);
+ return false;
+ }
+
+ uint dst_len;
+ int rc= my_aes_encrypt_ecb(info->crypt_msg, sizeof(info->crypt_msg), //src, srclen
+ info->crypt_key, &dst_len, //dst, &dstlen
+ (unsigned char*)&mysqld_key, sizeof(mysqld_key),
+ NULL, 0, 1);
+
+ if (rc != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) {
+ fprintf(stderr,
+ "\nInnodb redo log crypto: getting redo log crypto key "
+ "failed.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool mysort(const crypt_info_t& i,
+ const crypt_info_t& j)
+{
+ return i.checkpoint_no > j.checkpoint_no;
+}
+
+static
+bool add_crypt_info(crypt_info_t* info)
+{
+ /* so that no one is searching array while we modify it */
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (get_crypt_info(info->checkpoint_no) != NULL) {
+ // already present...
+ return true;
+ }
+
+ if (!init_crypt_key(info)) {
+ return false;
+ }
+
+ crypt_info.push_back(*info);
+
+ /* a log block only stores 4-bytes of checkpoint no */
+ crypt_info.back().checkpoint_no &= 0xFFFFFFFF;
+
+ // keep keys sorted, assuming that last added key will be used most
+ std::sort(crypt_info.begin(), crypt_info.end(), mysort);
+
+ return true;
+}
+
+/*********************************************************************//**
+Encrypt log blocks. */
+UNIV_INTERN
+Crypt_result
+log_blocks_encrypt(
+/*===============*/
+ const byte* block, /*!< in: blocks before encryption */
+ const ulint size, /*!< in: size of blocks, must be multiple of a log block */
+ byte* dst_block) /*!< out: blocks after encryption */
+{
+ return log_blocks_crypt(block, size, dst_block, true);
+}
+
+/*********************************************************************//**
+Set next checkpoint's key version to latest one, and generate current
+key. Key version 0 means no encryption. */
+UNIV_INTERN
+void
+log_crypt_set_ver_and_key(
+/*======================*/
+ ib_uint64_t next_checkpoint_no)
+{
+ crypt_info_t info;
+ info.checkpoint_no = next_checkpoint_no;
+
+ if (!srv_encrypt_log) {
+ info.key_version = UNENCRYPTED_KEY_VER;
+ } else {
+ info.key_version = encryption_key_get_latest_version(LOG_DEFAULT_ENCRYPTION_KEY);
+ }
+
+ if (info.key_version == UNENCRYPTED_KEY_VER) {
+ memset(info.crypt_msg, 0, sizeof(info.crypt_msg));
+ memset(info.crypt_nonce, 0, sizeof(info.crypt_nonce));
+ } else {
+ if (my_random_bytes(info.crypt_msg, MY_AES_BLOCK_SIZE) != MY_AES_OK) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Redo log crypto: generate "
+ "%u-byte random number as crypto msg failed.",
+ MY_AES_BLOCK_SIZE);
+ ut_error;
+ }
+
+ if (my_random_bytes(info.crypt_nonce, MY_AES_BLOCK_SIZE) != MY_AES_OK) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Redo log crypto: generate "
+ "%u-byte random number as AES_CTR nonce failed.",
+ MY_AES_BLOCK_SIZE);
+ ut_error;
+ }
+
+ }
+
+ add_crypt_info(&info);
+}
+
+/********************************************************
+Encrypt one or more log blocks before they are flushed to disk */
+UNIV_INTERN
+void
+log_encrypt_before_write(
+/*===========================*/
+ ib_uint64_t next_checkpoint_no, /*!< in: next checkpoint number */
+ byte* block, /*!< in/out: pointer to a log block */
+ const ulint size) /*!< in: size of log blocks */
+{
+ ut_ad(size % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ const crypt_info_t* info = get_crypt_info(next_checkpoint_no);
+ if (info == NULL) {
+ return;
+ }
+
+ if (info->key_version == UNENCRYPTED_KEY_VER) {
+ return;
+ }
+
+ byte* dst_frame = (byte*)malloc(size);
+
+ //encrypt log blocks content
+ Crypt_result result = log_blocks_crypt(block, size, dst_frame, true);
+
+ if (result == MY_AES_OK) {
+ ut_ad(block[0] == dst_frame[0]);
+ memcpy(block, dst_frame, size);
+ }
+ free(dst_frame);
+
+ if (unlikely(result != MY_AES_OK)) {
+ ut_error;
+ }
+}
+
+/********************************************************
+Decrypt a specified log segment after it has been read from a log file into a buffer.
+*/
+void
+log_decrypt_after_read(
+/*==========================*/
+ byte* frame, /*!< in/out: log segment */
+ const ulint size) /*!< in: log segment size */
+{
+ ut_ad(size % OS_FILE_LOG_BLOCK_SIZE == 0);
+ byte* dst_frame = (byte*)malloc(size);
+
+ // decrypt log blocks content
+ Crypt_result result = log_blocks_crypt(frame, size, dst_frame, false);
+
+ if (result == MY_AES_OK) {
+ memcpy(frame, dst_frame, size);
+ }
+ free(dst_frame);
+
+ if (unlikely(result != MY_AES_OK)) {
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Writes the crypto (version, msg and iv) info, which has been used for
+log blocks with lsn <= this checkpoint's lsn, to a log header's
+checkpoint buf. */
+UNIV_INTERN
+void
+log_crypt_write_checkpoint_buf(
+/*===========================*/
+ byte* buf) /*!< in/out: checkpoint buffer */
+{
+ byte *save = buf;
+
+ // Only write kMaxSavedKeys (sort keys to remove oldest)
+ std::sort(crypt_info.begin(), crypt_info.end(), mysort);
+ while (crypt_info.size() > kMaxSavedKeys) {
+ crypt_info.pop_back();
+ }
+
+ bool encrypted = false;
+ for (size_t i = 0; i < crypt_info.size(); i++) {
+ const crypt_info_t & it = crypt_info[i];
+ if (it.key_version != UNENCRYPTED_KEY_VER) {
+ encrypted = true;
+ break;
+ }
+ }
+
+ if (encrypted == false) {
+ // if no encryption is in use then zero out
+ // crypt data for upward/downward compatibility
+ memset(buf + LOG_CRYPT_VER, 0, LOG_CRYPT_SIZE);
+ return;
+ }
+
+ ib_uint64_t checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
+ buf += LOG_CRYPT_VER;
+
+ mach_write_to_1(buf + 0, redo_log_purpose_byte);
+ mach_write_to_1(buf + 1, crypt_info.size());
+ buf += 2;
+ for (size_t i = 0; i < crypt_info.size(); i++) {
+ struct crypt_info_t* it = &crypt_info[i];
+ mach_write_to_4(buf + 0, it->checkpoint_no);
+ mach_write_to_4(buf + 4, it->key_version);
+ memcpy(buf + 8, it->crypt_msg, MY_AES_BLOCK_SIZE);
+ memcpy(buf + 24, it->crypt_nonce, MY_AES_BLOCK_SIZE);
+ buf += LOG_CRYPT_ENTRY_SIZE;
+ }
+
+#ifdef DEBUG_CRYPT
+ fprintf(stderr, "write chk: %lu [ chk key ]: ", checkpoint_no);
+ for (size_t i = 0; i < crypt_info.size(); i++) {
+ struct crypt_info_t* it = &crypt_info[i];
+ fprintf(stderr, "[ %lu %u ] ",
+ it->checkpoint_no,
+ it->key_version);
+ }
+ fprintf(stderr, "\n");
+#else
+ (void)checkpoint_no; // unused variable
+#endif
+ ut_a((ulint)(buf - save) <= OS_FILE_LOG_BLOCK_SIZE);
+}
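
Each saved key therefore occupies 4 + 4 + 16 + 16 = 40 bytes in the checkpoint buffer (checkpoint_no, key_version, crypt_msg, crypt_nonce), after a one-byte purpose tag and a one-byte entry count at LOG_CRYPT_VER. A hedged sketch of a reader over that layout, not part of the patch; read_be32 stands in for mach_read_from_4, and the 40-byte stride assumes LOG_CRYPT_ENTRY_SIZE is exactly the packed size written above:

#include <stdint.h>
#include <string.h>

struct key_entry {
	uint32_t checkpoint_no;
	uint32_t key_version;
	uint8_t  msg[16];
	uint8_t  nonce[16];
};

static uint32_t read_be32(const uint8_t* p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16)
	     | ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* buf points at the purpose byte; returns the number of entries copied */
static size_t read_key_entries(const uint8_t* buf, struct key_entry* out, size_t max)
{
	size_t n = buf[1];            /* entry count, written after the purpose byte */
	const uint8_t* p = buf + 2;
	const size_t entry_size = 4 + 4 + 16 + 16;      /* 40 bytes per entry */

	for (size_t i = 0; i < n && i < max; i++, p += entry_size) {
		out[i].checkpoint_no = read_be32(p);
		out[i].key_version   = read_be32(p + 4);
		memcpy(out[i].msg,   p + 8,  16);
		memcpy(out[i].nonce, p + 24, 16);
	}
	return n < max ? n : max;
}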
+
+/*********************************************************************//**
+Read the crypto (version, msg and iv) info, which has been used for
+log blocks with lsn <= this checkpoint's lsn, from a log header's
+checkpoint buf. */
+UNIV_INTERN
+bool
+log_crypt_read_checkpoint_buf(
+/*===========================*/
+ const byte* buf) { /*!< in: checkpoint buffer */
+
+ buf += LOG_CRYPT_VER;
+
+ byte scheme = buf[0];
+ if (scheme != redo_log_purpose_byte) {
+ return true;
+ }
+ buf++;
+ size_t n = buf[0];
+ buf++;
+
+ for (size_t i = 0; i < n; i++) {
+ struct crypt_info_t info;
+ info.checkpoint_no = mach_read_from_4(buf + 0);
+ info.key_version = mach_read_from_4(buf + 4);
+ memcpy(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE);
+ memcpy(info.crypt_nonce, buf + 24, MY_AES_BLOCK_SIZE);
+
+ if (!add_crypt_info(&info)) {
+ return false;
+ }
+ buf += LOG_CRYPT_ENTRY_SIZE;
+ }
+
+#ifdef DEBUG_CRYPT
+ fprintf(stderr, "read [ chk key ]: ");
+ for (size_t i = 0; i < crypt_info.size(); i++) {
+ struct crypt_info_t* it = &crypt_info[i];
+ fprintf(stderr, "[ %lu %u ] ",
+ it->checkpoint_no,
+ it->key_version);
+ }
+ fprintf(stderr, "\n");
+#endif
+ return true;
+}
+
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 1850e798ed3..82c90275fb3 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Google Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -80,6 +81,10 @@ reduce the size of the log.
/* Global log system variable */
UNIV_INTERN log_t* log_sys = NULL;
+/* Next log block number to do dummy record filling if no log records written
+for a while */
+static ulint next_lbn_to_pad = 0;
+
#ifdef UNIV_PFS_RWLOCK
UNIV_INTERN mysql_pfs_key_t checkpoint_lock_key;
# ifdef UNIV_LOG_ARCHIVE
@@ -277,7 +282,7 @@ log_reserve_and_open(
log_t* log = log_sys;
ulint len_upper_limit;
#ifdef UNIV_LOG_ARCHIVE
- ulint archived_lsn_age;
+ lsn_t archived_lsn_age;
ulint dummy;
#endif /* UNIV_LOG_ARCHIVE */
#ifdef UNIV_DEBUG
@@ -531,10 +536,9 @@ function_exit:
return(lsn);
}
-#ifdef UNIV_LOG_ARCHIVE
/******************************************************//**
Pads the current log block full with dummy log records. Used in producing
-consistent archived log files. */
+consistent archived log files and scrubbing redo log. */
static
void
log_pad_current_log_block(void)
@@ -563,7 +567,6 @@ log_pad_current_log_block(void)
ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
}
-#endif /* UNIV_LOG_ARCHIVE */
/******************************************************//**
Calculates the data capacity of a log group, when the log file headers are not
@@ -944,7 +947,7 @@ log_init(void)
log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
- log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
+ log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE; // TODO(minliz): ensure various LOG_START_LSN?
MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
log_sys->lsn - log_sys->last_checkpoint_lsn);
@@ -1272,7 +1275,7 @@ log_group_file_header_flush(
(ulint) (dest_offset / UNIV_PAGE_SIZE),
(ulint) (dest_offset % UNIV_PAGE_SIZE),
OS_FILE_LOG_BLOCK_SIZE,
- buf, group);
+ buf, group, 0);
srv_stats.os_log_pending_writes.dec();
}
@@ -1397,10 +1400,13 @@ loop:
ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+ log_encrypt_before_write(log_sys->next_checkpoint_no,
+ buf, write_len);
+
fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
(ulint) (next_offset / UNIV_PAGE_SIZE),
(ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
- group);
+ group, 0);
srv_stats.os_log_pending_writes.dec();
@@ -1883,6 +1889,8 @@ log_group_checkpoint(
mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
+ log_crypt_write_checkpoint_buf(buf);
+
lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
group);
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
@@ -1966,7 +1974,7 @@ log_group_checkpoint(
write_offset / UNIV_PAGE_SIZE,
write_offset % UNIV_PAGE_SIZE,
OS_FILE_LOG_BLOCK_SIZE,
- buf, ((byte*) group + 1));
+ buf, ((byte*) group + 1), 0);
ut_ad(((ulint) group & 0x1UL) == 0);
}
@@ -2007,6 +2015,8 @@ log_reset_first_header_and_checkpoint(
mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);
mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
+ log_crypt_write_checkpoint_buf(buf);
+
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, 0);
@@ -2046,7 +2056,7 @@ log_group_read_checkpoint_info(
fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0,
field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
- OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
+ OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0);
}
/******************************************************//**
@@ -2145,7 +2155,6 @@ log_checkpoint(
}
log_sys->next_checkpoint_lsn = oldest_lsn;
-
#ifdef UNIV_DEBUG
if (log_debug_writes) {
fprintf(stderr, "Making checkpoint no "
@@ -2155,6 +2164,13 @@ log_checkpoint(
}
#endif /* UNIV_DEBUG */
+ /* generate key version and key used to encrypt future blocks,
+ *
+ * NOTE: the +1 is because next_checkpoint_no will be updated once
+ * the checkpoint info has been written and THEN blocks will be encrypted
+ * with the new key
+ */
+ log_crypt_set_ver_and_key(log_sys->next_checkpoint_no + 1);
log_groups_write_checkpoint_info();
MONITOR_INC(MONITOR_NUM_CHECKPOINT);
@@ -2340,7 +2356,9 @@ loop:
fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
(ulint) (source_offset / UNIV_PAGE_SIZE),
(ulint) (source_offset % UNIV_PAGE_SIZE),
- len, buf, NULL);
+ len, buf, NULL, 0);
+
+ log_decrypt_after_read(buf, len);
start_lsn += len;
buf += len;
@@ -2405,7 +2423,7 @@ log_group_archive_file_header_write(
dest_offset / UNIV_PAGE_SIZE,
dest_offset % UNIV_PAGE_SIZE,
2 * OS_FILE_LOG_BLOCK_SIZE,
- buf, &log_archive_io);
+ buf, &log_archive_io, 0);
}
/******************************************************//**
@@ -2441,7 +2459,7 @@ log_group_archive_completed_header_write(
dest_offset % UNIV_PAGE_SIZE,
OS_FILE_LOG_BLOCK_SIZE,
buf + LOG_FILE_ARCH_COMPLETED,
- &log_archive_io);
+ &log_archive_io, 0);
}
/******************************************************//**
@@ -2565,11 +2583,14 @@ loop:
MONITOR_INC(MONITOR_LOG_IO);
+ //TODO (jonaso): This must be dead code??
+ log_encrypt_before_write(log_sys->next_checkpoint_no, buf, len);
+
fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->archive_space_id,
(ulint) (next_offset / UNIV_PAGE_SIZE),
(ulint) (next_offset % UNIV_PAGE_SIZE),
ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
- &log_archive_io);
+ &log_archive_io, 0);
start_lsn += len;
next_offset += len;
@@ -3743,4 +3764,65 @@ log_mem_free(void)
log_sys = NULL;
}
}
+
+/** Event to wake up the log scrub thread */
+UNIV_INTERN os_event_t log_scrub_event = NULL;
+
+UNIV_INTERN ibool srv_log_scrub_thread_active = FALSE;
+
+/*****************************************************************//*
+If no log record has been written for a while, fill current log
+block with dummy records. */
+static
+void
+log_scrub()
+/*=========*/
+{
+ ulint cur_lbn = log_block_convert_lsn_to_no(log_sys->lsn);
+ if (next_lbn_to_pad == cur_lbn)
+ {
+ log_pad_current_log_block();
+ }
+ next_lbn_to_pad = log_block_convert_lsn_to_no(log_sys->lsn);
+}
+
+/* log scrubbing speed, in bytes/sec */
+UNIV_INTERN ulonglong innodb_scrub_log_speed;
+
+/*****************************************************************//**
+This is the main thread for log scrub. It waits for an event and
+when waked up fills current log block with dummy records and
+sleeps again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(log_scrub_thread)(
+/*===============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+ srv_log_scrub_thread_active = TRUE;
+
+ while(srv_shutdown_state == SRV_SHUTDOWN_NONE)
+ {
+ /* log scrubbing interval in µs. */
+ ulonglong interval = 1000*1000*512/innodb_scrub_log_speed;
+
+ os_event_wait_time(log_scrub_event, interval);
+
+ log_scrub();
+
+ os_event_reset(log_scrub_event);
+ }
+
+ srv_log_scrub_thread_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
#endif /* !UNIV_HOTBACKUP */
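
The wakeup interval in log_scrub_thread is chosen so that padding one 512-byte log block per wakeup matches the configured innodb_scrub_log_speed in bytes/second; for example, at 262144 bytes/s (256 KiB/s) the thread wakes roughly every 2 ms. A quick check of the arithmetic, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned long long speed = 262144;   /* bytes/sec; assumed setting, 256 KiB/s */
	unsigned long long interval_us = 1000ULL * 1000 * 512 / speed;
	printf("wake every %llu us\n", interval_us);   /* 512000000 / 262144 = 1953 us */
	return 0;
}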
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 9affec63252..8e16e74ba1c 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -2,6 +2,7 @@
Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,6 +49,7 @@ Created 9/20/1997 Heikki Tuuri
#include "trx0undo.h"
#include "trx0rec.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#ifndef UNIV_HOTBACKUP
# include "buf0rea.h"
# include "srv0srv.h"
@@ -347,7 +349,10 @@ DECLARE_THREAD(recv_writer_thread)(
while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
- os_thread_sleep(100000);
+ /* Wait till we get a signal to clean the LRU list.
+ Bounded by max wait time of 100ms. */
+ ib_int64_t sig_count = os_event_reset(buf_flush_event);
+ os_event_wait_time_low(buf_flush_event, 100000, sig_count);
mutex_enter(&recv_sys->writer_mutex);
@@ -690,8 +695,9 @@ recv_synchronize_groups(
recovered_lsn */
log_group_set_fields(group, recovered_lsn);
- }
+ ut_a(log_sys);
+ }
/* Copy the checkpoint info to the groups; remember that we have
incremented checkpoint_no by one, and the info will not be written
over the max checkpoint info, thus making the preservation of max
@@ -800,6 +806,10 @@ recv_find_max_checkpoint(
checkpoint_no = mach_read_from_8(
buf + LOG_CHECKPOINT_NO);
+ if (!log_crypt_read_checkpoint_buf(buf)) {
+ return DB_ERROR;
+ }
+
#ifdef UNIV_DEBUG
if (log_debug_writes) {
fprintf(stderr,
@@ -929,6 +939,12 @@ log_block_checksum_is_ok_or_old_format(
return(TRUE);
}
+ fprintf(stderr, "BROKEN: block: %lu checkpoint: %lu %.8lx %.8lx\n",
+ log_block_get_hdr_no(block),
+ log_block_get_checkpoint_no(block),
+ log_block_calc_checksum(block),
+ log_block_get_checksum(block));
+
return(FALSE);
}
@@ -1140,7 +1156,9 @@ recv_parse_or_apply_log_rec_body(
+ 0 /*FLST_PREV*/
|| offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ PAGE_HEADER + FIL_ADDR_PAGE
- + FIL_ADDR_SIZE /*FLST_NEXT*/);
+ + FIL_ADDR_SIZE /*FLST_NEXT*/
+ || offs ==
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
break;
}
}
@@ -1367,6 +1385,9 @@ recv_parse_or_apply_log_rec_body(
ptr, end_ptr, page, page_zip, index);
}
break;
+ case MLOG_FILE_WRITE_CRYPT_DATA:
+ ptr = fil_parse_write_crypt_data(ptr, end_ptr, block);
+ break;
default:
ptr = NULL;
recv_sys->found_corrupt_log = TRUE;
@@ -2078,7 +2099,7 @@ recv_apply_log_recs_for_backup(void)
error = fil_io(OS_FILE_READ, true,
recv_addr->space, zip_size,
recv_addr->page_no, 0, zip_size,
- block->page.zip.data, NULL);
+ block->page.zip.data, NULL, 0);
if (error == DB_SUCCESS
&& !buf_zip_decompress(block, TRUE)) {
exit(1);
@@ -2088,7 +2109,7 @@ recv_apply_log_recs_for_backup(void)
recv_addr->space, 0,
recv_addr->page_no, 0,
UNIV_PAGE_SIZE,
- block->frame, NULL);
+ block->frame, NULL, 0);
}
if (error != DB_SUCCESS) {
@@ -2117,13 +2138,13 @@ recv_apply_log_recs_for_backup(void)
recv_addr->space, zip_size,
recv_addr->page_no, 0,
zip_size,
- block->page.zip.data, NULL);
+ block->page.zip.data, NULL, 0);
} else {
error = fil_io(OS_FILE_WRITE, true,
recv_addr->space, 0,
recv_addr->page_no, 0,
UNIV_PAGE_SIZE,
- block->frame, NULL);
+ block->frame, NULL, 0);
}
skip_this_recv_addr:
recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
@@ -2735,6 +2756,13 @@ recv_scan_log_recs(
finished = TRUE;
+ /* Crash if we encounter a garbage log block */
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set innodb_force_recovery"
+ " to ignore this error.\n", stderr);
+ ut_error;
+ }
+
break;
}
@@ -3082,7 +3110,7 @@ recv_recovery_from_checkpoint_start_func(
fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0,
0, 0, LOG_FILE_HDR_SIZE,
- log_hdr_buf, max_cp_group);
+ log_hdr_buf, max_cp_group, 0);
if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
(byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
@@ -3113,7 +3141,7 @@ recv_recovery_from_checkpoint_start_func(
fil_io(OS_FILE_WRITE | OS_FILE_LOG, true,
max_cp_group->space_id, 0,
0, 0, OS_FILE_LOG_BLOCK_SIZE,
- log_hdr_buf, max_cp_group);
+ log_hdr_buf, max_cp_group, 0);
}
#ifdef UNIV_LOG_ARCHIVE
@@ -3132,12 +3160,10 @@ recv_recovery_from_checkpoint_start_func(
/* Start reading the log groups from the checkpoint lsn up. The
variable contiguous_lsn contains an lsn up to which the log is
known to be contiguously written to all log groups. */
-
recv_sys->parse_start_lsn = checkpoint_lsn;
recv_sys->scanned_lsn = checkpoint_lsn;
recv_sys->scanned_checkpoint_no = 0;
recv_sys->recovered_lsn = checkpoint_lsn;
-
srv_start_lsn = checkpoint_lsn;
}
@@ -3220,7 +3246,6 @@ recv_recovery_from_checkpoint_start_func(
group = UT_LIST_GET_NEXT(log_groups, group);
}
-
/* Done with startup scan. Clear the flag. */
recv_log_scan_is_startup_type = FALSE;
if (TYPE_CHECKPOINT) {
@@ -3308,6 +3333,9 @@ recv_recovery_from_checkpoint_start_func(
log_sys->next_checkpoint_lsn = checkpoint_lsn;
log_sys->next_checkpoint_no = checkpoint_no + 1;
+ /* here the checkpoint info is written without any redo logging ongoing
+ * and next_checkpoint_no is updated directly hence no +1 */
+ log_crypt_set_ver_and_key(log_sys->next_checkpoint_no);
#ifdef UNIV_LOG_ARCHIVE
log_sys->archived_lsn = archived_lsn;
@@ -3338,6 +3366,7 @@ recv_recovery_from_checkpoint_start_func(
log_sys->lsn - log_sys->last_checkpoint_lsn);
log_sys->next_checkpoint_no = checkpoint_no + 1;
+ log_crypt_set_ver_and_key(log_sys->next_checkpoint_no);
#ifdef UNIV_LOG_ARCHIVE
if (archived_lsn == LSN_MAX) {
@@ -3743,7 +3772,7 @@ ask_again:
/* Read the archive file header */
fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0,
- LOG_FILE_HDR_SIZE, buf, NULL);
+ LOG_FILE_HDR_SIZE, buf, NULL, 0);
/* Check if the archive file header is consistent */
@@ -3816,7 +3845,7 @@ ask_again:
fil_io(OS_FILE_READ | OS_FILE_LOG, true,
group->archive_space_id, read_offset / UNIV_PAGE_SIZE,
- read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
+ read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0);
ret = recv_scan_log_recs(
(buf_pool_get_n_pages()
@@ -4016,4 +4045,3 @@ byte* recv_dblwr_t::find_page(ulint space_id, ulint page_no)
return(result);
}
-
diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc
index 5335cb4c9ef..82df1df63d4 100644
--- a/storage/innobase/mtr/mtr0log.cc
+++ b/storage/innobase/mtr/mtr0log.cc
@@ -75,7 +75,7 @@ mlog_write_initial_log_record(
{
byte* log_ptr;
- ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type));
ut_ad(type > MLOG_8BYTES);
log_ptr = mlog_open(mtr, 11);
@@ -111,7 +111,7 @@ mlog_parse_initial_log_record(
}
*type = (byte)((ulint)*ptr & ~MLOG_SINGLE_REC_FLAG);
- ut_ad(*type <= MLOG_BIGGEST_TYPE);
+ ut_ad(*type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(*type));
ptr++;
@@ -150,8 +150,6 @@ mlog_parse_nbytes(
ib_uint64_t dval;
ut_a(type <= MLOG_8BYTES);
- ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
-
if (end_ptr < ptr + 2) {
return(NULL);
@@ -160,6 +158,11 @@ mlog_parse_nbytes(
offset = mach_read_from_2(ptr);
ptr += 2;
+ ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX ||
+ /* scrubbing changes page type from FIL_PAGE_INDEX to
+ * FIL_PAGE_TYPE_ALLOCATED (rest of this assertion is below) */
+ (type == MLOG_2BYTES && offset == FIL_PAGE_TYPE));
+
if (offset >= UNIV_PAGE_SIZE) {
recv_sys->found_corrupt_log = TRUE;
@@ -219,6 +222,14 @@ mlog_parse_nbytes(
}
mach_write_to_2(page + offset, val);
}
+ ut_a(!page || !page_zip ||
+ fil_page_get_type(page) != FIL_PAGE_INDEX ||
+ /* scrubbing changes page type from FIL_PAGE_INDEX to
+ * FIL_PAGE_TYPE_ALLOCATED */
+ (type == MLOG_2BYTES &&
+ offset == FIL_PAGE_TYPE &&
+ val == FIL_PAGE_TYPE_ALLOCATED));
+
break;
case MLOG_4BYTES:
if (page) {
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 869586bcd90..400aa9bff57 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -437,3 +437,36 @@ mtr_print(
}
# endif /* !UNIV_HOTBACKUP */
#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Releases a buf_page stored in an mtr memo after a
+savepoint. */
+UNIV_INTERN
+void
+mtr_release_buf_page_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ buf_block_t* block) /*!< in: block to release */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ ut_ad(dyn_array_get_data_size(memo) > savepoint);
+
+ slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint);
+
+ ut_ad(slot->object == block);
+ ut_ad(slot->type == MTR_MEMO_PAGE_S_FIX ||
+ slot->type == MTR_MEMO_PAGE_X_FIX ||
+ slot->type == MTR_MEMO_BUF_FIX);
+
+ buf_page_release((buf_block_t*) slot->object, slot->type);
+ slot->object = NULL;
+}
diff --git a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff
index 7a388552c57..98e17f3c825 100644
--- a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff
+++ b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff
@@ -1,6 +1,6 @@
---- suite/storage_engine/type_char_indexes.result 2012-07-12 19:27:42.191013570 +0400
-+++ suite/storage_engine/type_char_indexes.reject 2012-07-15 17:51:55.810034331 +0400
-@@ -135,7 +135,7 @@
+--- suite/storage_engine/type_char_indexes.result 2014-10-12 14:22:11.000000000 +0400
++++ suite/storage_engine/type_char_indexes.reject 2014-10-12 14:23:28.000000000 +0400
+@@ -137,7 +137,7 @@
r3a
EXPLAIN SELECT c,c20,v16,v128 FROM t1 WHERE v16 = 'varchar1a' OR v16 = 'varchar3a' ORDER BY v16;
id select_type table type possible_keys key key_len ref rows Extra
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index e1c98f6ace3..80525a39d1e 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
@@ -42,8 +43,16 @@ Created 10/21/1995 Heikki Tuuri
#include "srv0srv.h"
#include "srv0start.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
#include "buf0buf.h"
#include "srv0mon.h"
+#include "srv0srv.h"
+#ifdef HAVE_POSIX_FALLOCATE
+#include "unistd.h"
+#include "fcntl.h"
+#endif
#ifndef UNIV_HOTBACKUP
# include "os0sync.h"
# include "os0thread.h"
@@ -60,6 +69,38 @@ Created 10/21/1995 Heikki Tuuri
#include <libaio.h>
#endif
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
+# include <sys/ioctl.h>
+# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
+# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
+# endif
+#endif
+
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
+#include <sys/statvfs.h>
+#endif
+
+#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H)
+#include <linux/falloc.h>
+#endif
+
+#if defined(HAVE_FALLOCATE)
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE 0x01
+#endif
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+#endif
+
+#ifdef HAVE_LZO
+#include "lzo/lzo1x.h"
+#endif
+
+#ifdef HAVE_SNAPPY
+#include "snappy-c.h"
+#endif
+
/** Insert buffer segment id */
static const ulint IO_IBUF_SEGMENT = 0;
@@ -87,6 +128,12 @@ UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 64
+#ifdef WITH_INNODB_DISALLOW_WRITES
+#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
+#else
+#define WAIT_ALLOW_WRITES() do { } while (0)
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
/**********************************************************************
InnoDB AIO Implementation:
@@ -175,6 +222,16 @@ struct os_aio_slot_t{
and which can be used to identify
which pending aio operation was
completed */
+ ulint bitmap;
+
+ ulint* write_size; /*!< Actual write size initialized
+ after the first successful trim
+ operation for this page; once
+ initialized we do not trim again
+ unless the actual page size decreases. */
+
+ ulint file_block_size;/*!< file block size */
+
#ifdef WIN_ASYNC_IO
HANDLE handle; /*!< handle object we need in the
OVERLAPPED struct */
@@ -294,6 +351,68 @@ UNIV_INTERN ulint os_n_pending_writes = 0;
/** Number of pending read operations */
UNIV_INTERN ulint os_n_pending_reads = 0;
+/** After first fallocate failure we will disable os_file_trim */
+UNIV_INTERN ibool os_fallocate_failed = FALSE;
+
+/**********************************************************************//**
+Directly manipulate the allocated disk space of the file referred to by the slot,
+deallocating the byte range starting at offset and continuing for len bytes.
+Within the specified range, partial file system blocks are zeroed, and whole
+file system blocks are removed from the file. After a successful call,
+subsequent reads from this range will return zeroes.
+@return true if success, false if error */
+UNIV_INTERN
+ibool
+os_file_trim(
+/*=========*/
+ os_aio_slot_t* slot); /*!< in: slot structure */
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool on_error_silent,/*!< in: if TRUE then don't print
+ any message to the log. */
+ const char* file, /*!< in: file name */
+ const ulint line); /*!< in: line */
+
+/****************************************************************//**
+Tries to enable the atomic write feature, if available, for the specified file
+handle.
+@return TRUE if success */
+static __attribute__((warn_unused_result))
+ibool
+os_file_set_atomic_writes(
+/*======================*/
+ const char* name /*!< in: name of the file */
+ __attribute__((unused)),
+ os_file_t file /*!< in: handle to the file */
+ __attribute__((unused)))
+{
+#ifdef DFS_IOCTL_ATOMIC_WRITE_SET
+ int atomic_option = 1;
+
+ if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) {
+
+ fprintf(stderr, "InnoDB: Warning: Trying to enable atomic writes on "
+ "file %s on non-supported platform!\n", name);
+ os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__);
+ return(FALSE);
+ }
+
+ return(TRUE);
+#else
+ fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on "
+ "file %s on non-supported platform!\n", name);
+ return(FALSE);
+#endif
+}
+
+
#ifdef UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
/**********************************************************************//**
@@ -439,6 +558,19 @@ os_file_get_last_error_low(
"InnoDB: because of either a thread exit"
" or an application request.\n"
"InnoDB: Retry attempt is made.\n");
+ } else if (err == ECANCELED || err == ENOTTY) {
+ if (strerror(err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d"
+ " means '%s'.\n",
+ err, strerror(err));
+ }
+
+ if(srv_use_atomic_writes) {
+ fprintf(stderr,
+ "InnoDB: Error trying to enable atomic writes on "
+ "non-supported destination!\n");
+ }
} else {
fprintf(stderr,
"InnoDB: Some operating system error numbers"
@@ -503,6 +635,19 @@ os_file_get_last_error_low(
"InnoDB: The error means mysqld does not have"
" the access rights to\n"
"InnoDB: the directory.\n");
+ } else if (err == ECANCELED || err == ENOTTY) {
+ if (strerror(err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d"
+ " means '%s'.\n",
+ err, strerror(err));
+ }
+
+ if(srv_use_atomic_writes) {
+ fprintf(stderr,
+ "InnoDB: Error trying to enable atomic writes on "
+ "non-supported destination!\n");
+ }
} else {
if (strerror(err) != NULL) {
fprintf(stderr,
@@ -536,6 +681,9 @@ os_file_get_last_error_low(
case ENOTDIR:
case EISDIR:
return(OS_FILE_PATH_ERROR);
+ case ECANCELED:
+ case ENOTTY:
+ return(OS_FILE_OPERATION_NOT_SUPPORTED);
case EAGAIN:
if (srv_use_native_aio) {
return(OS_FILE_AIO_RESOURCES_RESERVED);
@@ -582,9 +730,11 @@ os_file_handle_error_cond_exit(
const char* operation, /*!< in: operation */
ibool should_exit, /*!< in: call exit(3) if unknown error
and this parameter is TRUE */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
any message to the log iff it is
an unknown non-fatal error */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
ulint err;
@@ -614,6 +764,9 @@ os_file_handle_error_cond_exit(
" InnoDB: Disk is full. Try to clean the disk"
" to free space.\n");
+ fprintf(stderr,
+ " InnoDB: at file %s line %ld\n", file, line);
+
os_has_said_disk_full = TRUE;
fflush(stderr);
@@ -649,6 +802,12 @@ os_file_handle_error_cond_exit(
to the log. */
if (should_exit || !on_error_silent) {
+ fprintf(stderr,
+ " InnoDB: Operation %s to file %s at line %ld\n",
+ operation, file, line);
+ }
+
+ if (should_exit || !on_error_silent) {
ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
"error " ULINTPF ".%s", name ? name : "(unknown)",
operation, err, should_exit
@@ -671,10 +830,12 @@ ibool
os_file_handle_error(
/*=================*/
const char* name, /*!< in: name of a file or NULL */
- const char* operation) /*!< in: operation */
+ const char* operation, /*!< in: operation */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
/* exit in case of unknown error */
- return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
+ return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line));
}
/****************************************************************//**
@@ -685,12 +846,14 @@ os_file_handle_error_no_exit(
/*=========================*/
const char* name, /*!< in: name of a file or NULL */
const char* operation, /*!< in: operation */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
any message to the log. */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
/* don't exit in case of unknown error */
return(os_file_handle_error_cond_exit(
- name, operation, FALSE, on_error_silent));
+ name, operation, FALSE, on_error_silent, file, line));
}
#undef USE_FILE_LOCK
@@ -766,7 +929,9 @@ os_file_create_tmpfile(void)
/*========================*/
{
FILE* file = NULL;
- int fd = innobase_mysql_tmpfile();
+ int fd;
+ WAIT_ALLOW_WRITES();
+ fd = innobase_mysql_tmpfile();
ut_ad(!srv_read_only_mode);
@@ -830,7 +995,7 @@ os_file_opendir(
if (dir == INVALID_HANDLE_VALUE) {
if (error_is_fatal) {
- os_file_handle_error(dirname, "opendir");
+ os_file_handle_error(dirname, "opendir", __FILE__, __LINE__);
}
return(NULL);
@@ -841,7 +1006,7 @@ os_file_opendir(
dir = opendir(dirname);
if (dir == NULL && error_is_fatal) {
- os_file_handle_error(dirname, "opendir");
+ os_file_handle_error(dirname, "opendir", __FILE__, __LINE__);
}
return(dir);
@@ -863,7 +1028,7 @@ os_file_closedir(
ret = FindClose(dir);
if (!ret) {
- os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__);
return(-1);
}
@@ -875,7 +1040,7 @@ os_file_closedir(
ret = closedir(dir);
if (ret) {
- os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__);
}
return(ret);
@@ -947,7 +1112,7 @@ next_file:
return(1);
} else {
- os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
+ os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__);
return(-1);
}
#else
@@ -1033,7 +1198,7 @@ next_file:
goto next_file;
}
- os_file_handle_error_no_exit(full_path, "stat", FALSE);
+ os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__);
ut_free(full_path);
@@ -1084,7 +1249,7 @@ os_file_create_directory(
&& !fail_if_exists))) {
os_file_handle_error_no_exit(
- pathname, "CreateDirectory", FALSE);
+ pathname, "CreateDirectory", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -1092,12 +1257,13 @@ os_file_create_directory(
return(TRUE);
#else
int rcode;
+ WAIT_ALLOW_WRITES();
rcode = mkdir(pathname, 0770);
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
/* failure */
- os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
+ os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -1207,7 +1373,7 @@ os_file_create_simple_func(
retry = os_file_handle_error(
name, create_mode == OS_FILE_OPEN ?
- "open" : "create");
+ "open" : "create", __FILE__, __LINE__);
} else {
*success = TRUE;
@@ -1218,6 +1384,8 @@ os_file_create_simple_func(
#else /* __WIN__ */
int create_flag;
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
@@ -1275,7 +1443,7 @@ os_file_create_simple_func(
retry = os_file_handle_error(
name,
create_mode == OS_FILE_OPEN
- ? "open" : "create");
+ ? "open" : "create", __FILE__, __LINE__);
} else {
*success = TRUE;
retry = false;
@@ -1317,9 +1485,12 @@ os_file_create_simple_no_error_handling_func(
OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes) /*! in: atomic writes table option
+ value */
{
os_file_t file;
+ atomic_writes_t awrites = (atomic_writes_t) atomic_writes;
*success = FALSE;
#ifdef __WIN__
@@ -1380,11 +1551,30 @@ os_file_create_simple_no_error_handling_func(
attributes,
NULL); // No template file
+ /* If we have a proper file handle and atomic writes should be used,
+ try to set atomic writes; if that fails when creating a new
+ table, produce an error. If atomic writes are requested for an
+ existing file, ignore the error and use traditional writes. */
+ if (file != INVALID_HANDLE_VALUE
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ if (create_mode == OS_FILE_CREATE) {
+ fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n");
+ CloseHandle(file);
+ os_file_delete_if_exists_func(name);
+ *success = FALSE;
+ file = INVALID_HANDLE_VALUE;
+ }
+ }
+
*success = (file != INVALID_HANDLE_VALUE);
#else /* __WIN__ */
int create_flag;
ut_a(name);
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
@@ -1440,6 +1630,24 @@ os_file_create_simple_no_error_handling_func(
}
#endif /* USE_FILE_LOCK */
+ /* If we have a proper file handle and atomic writes should be used,
+ try to set atomic writes; if that fails when creating a new
+ table, produce an error. If atomic writes are requested for an
+ existing file, ignore the error and use traditional writes. */
+ if (file != -1
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ if (create_mode == OS_FILE_CREATE) {
+ fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n");
+ close(file);
+ os_file_delete_if_exists_func(name);
+ *success = FALSE;
+ file = -1;
+ }
+ }
+
+
#endif /* __WIN__ */
return(file);
@@ -1524,12 +1732,15 @@ os_file_create_func(
async i/o or unbuffered i/o: look in the
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes) /*! in: atomic writes table option
+ value */
{
os_file_t file;
ibool retry;
ibool on_error_no_exit;
ibool on_error_silent;
+ atomic_writes_t awrites = (atomic_writes_t) atomic_writes;
#ifdef __WIN__
DBUG_EXECUTE_IF(
@@ -1662,9 +1873,9 @@ os_file_create_func(
if (on_error_no_exit) {
retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
+ name, operation, on_error_silent, __FILE__, __LINE__);
} else {
- retry = os_file_handle_error(name, operation);
+ retry = os_file_handle_error(name, operation, __FILE__, __LINE__);
}
} else {
*success = TRUE;
@@ -1673,9 +1884,27 @@ os_file_create_func(
} while (retry);
+ /* If we have a proper file handle and atomic writes should be used,
+ try to set atomic writes; if that fails when creating a new
+ table, produce an error. If atomic writes are requested for an
+ existing file, ignore the error and use traditional writes. */
+ if (file != INVALID_HANDLE_VALUE && type == OS_DATA_FILE
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ if (create_mode == OS_FILE_CREATE) {
+ fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n");
+ CloseHandle(file);
+ os_file_delete_if_exists_func(name);
+ *success = FALSE;
+ file = INVALID_HANDLE_VALUE;
+ }
+ }
#else /* __WIN__ */
int create_flag;
const char* mode_str = NULL;
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
? TRUE : FALSE;
@@ -1747,9 +1976,9 @@ os_file_create_func(
if (on_error_no_exit) {
retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
+ name, operation, on_error_silent, __FILE__, __LINE__);
} else {
- retry = os_file_handle_error(name, operation);
+ retry = os_file_handle_error(name, operation, __FILE__, __LINE__);
}
} else {
*success = TRUE;
@@ -1801,6 +2030,22 @@ os_file_create_func(
}
#endif /* USE_FILE_LOCK */
+ /* If we have a proper file handle and atomic writes should be used,
+ try to set atomic writes; if that fails when creating a new
+ table, produce an error. If atomic writes are requested for an
+ existing file, ignore the error and use traditional writes. */
+ if (file != -1 && type == OS_DATA_FILE
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ if (create_mode == OS_FILE_CREATE) {
+ fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n");
+ close(file);
+ os_file_delete_if_exists_func(name);
+ *success = FALSE;
+ file = -1;
+ }
+ }
#endif /* __WIN__ */
return(file);
@@ -1855,11 +2100,12 @@ loop:
goto loop;
#else
int ret;
+ WAIT_ALLOW_WRITES();
ret = unlink(name);
if (ret != 0 && errno != ENOENT) {
- os_file_handle_error_no_exit(name, "delete", FALSE);
+ os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__);
return(false);
}
@@ -1919,11 +2165,12 @@ loop:
goto loop;
#else
int ret;
+ WAIT_ALLOW_WRITES();
ret = unlink(name);
if (ret != 0) {
- os_file_handle_error_no_exit(name, "delete", FALSE);
+ os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__);
return(false);
}
@@ -1967,16 +2214,17 @@ os_file_rename_func(
return(TRUE);
}
- os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__);
return(FALSE);
#else
int ret;
+ WAIT_ALLOW_WRITES();
ret = rename(oldpath, newpath);
if (ret != 0) {
- os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -2005,7 +2253,7 @@ os_file_close_func(
return(TRUE);
}
- os_file_handle_error(NULL, "close");
+ os_file_handle_error(NULL, "close", __FILE__, __LINE__);
return(FALSE);
#else
@@ -2014,7 +2262,7 @@ os_file_close_func(
ret = close(file);
if (ret == -1) {
- os_file_handle_error(NULL, "close");
+ os_file_handle_error(NULL, "close", __FILE__, __LINE__);
return(FALSE);
}
@@ -2114,15 +2362,15 @@ os_file_set_size(
fprintf(stderr, "InnoDB: Error: preallocating file "
"space for file \'%s\' failed. Current size "
"%lu, desired size %lu\n",
- name, (long unsigned) current_size, (long unsigned) size);
- os_file_handle_error_no_exit(name, "posix_fallocate", FALSE);
+ name, current_size, size);
+ os_file_handle_error_no_exit(name, "posix_fallocate", FALSE, __FILE__, __LINE__);
+
return(FALSE);
}
return(TRUE);
}
#endif
-
/* Write up to 1 megabyte at a time. */
buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
* UNIV_PAGE_SIZE;
@@ -2149,6 +2397,7 @@ os_file_set_size(
}
ret = os_file_write(name, file, buf, current_size, n_bytes);
+
if (!ret) {
ut_free(buf2);
goto error_handling;
@@ -2196,6 +2445,7 @@ os_file_set_eof(
HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
return(SetEndOfFile(h));
#else /* __WIN__ */
+ WAIT_ALLOW_WRITES();
return(!ftruncate(fileno(file), ftell(file)));
#endif /* __WIN__ */
}
@@ -2279,7 +2529,7 @@ os_file_flush_func(
return(TRUE);
}
- os_file_handle_error(NULL, "flush");
+ os_file_handle_error(NULL, "flush", __FILE__, __LINE__);
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -2288,6 +2538,7 @@ os_file_flush_func(
return(FALSE);
#else
int ret;
+ WAIT_ALLOW_WRITES();
#if defined(HAVE_DARWIN_THREADS)
# ifndef F_FULLFSYNC
@@ -2333,7 +2584,7 @@ os_file_flush_func(
ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
- os_file_handle_error(NULL, "flush");
+ os_file_handle_error(NULL, "flush", __FILE__, __LINE__);
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -2667,7 +2918,7 @@ try_again:
#ifdef __WIN__
error_handling:
#endif
- retry = os_file_handle_error(NULL, "read");
+ retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__);
if (retry) {
goto try_again;
@@ -2798,7 +3049,7 @@ try_again:
#ifdef __WIN__
error_handling:
#endif
- retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
+ retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__);
if (retry) {
goto try_again;
@@ -2869,6 +3120,7 @@ os_file_write_func(
ut_ad(buf);
ut_ad(n > 0);
+
retry:
low = (DWORD) offset & 0xFFFFFFFF;
high = (DWORD) (offset >> 32);
@@ -3000,6 +3252,7 @@ retry:
return(FALSE);
#else
ssize_t ret;
+ WAIT_ALLOW_WRITES();
ret = os_file_pwrite(file, buf, n, offset);
@@ -3073,7 +3326,7 @@ os_file_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -3101,7 +3354,7 @@ os_file_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -3150,7 +3403,7 @@ os_file_get_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(DB_FAIL);
@@ -3203,7 +3456,7 @@ os_file_get_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(DB_FAIL);
@@ -3770,7 +4023,8 @@ os_aio_array_create(
array->slots = static_cast<os_aio_slot_t*>(
ut_malloc(n * sizeof(*array->slots)));
- memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
+ memset(array->slots, 0x0, n * sizeof(*array->slots));
+
#ifdef __WIN__
array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
#endif /* __WIN__ */
@@ -3859,8 +4113,7 @@ os_aio_array_free(
os_aio_array_t*& array) /*!< in, own: array to free */
{
#ifdef WIN_ASYNC_IO
- ulint i;
-
+ ulint i;
for (i = 0; i < array->n_slots; i++) {
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
CloseHandle(slot->handle);
@@ -4214,7 +4467,12 @@ os_aio_array_reserve_slot(
void* buf, /*!< in: buffer where to read or from which
to write */
os_offset_t offset, /*!< in: file offset */
- ulint len) /*!< in: length of the block to read or write */
+ ulint len, /*!< in: length of the block to read or write */
+ ulint* write_size)/*!< in/out: Actual write size initialized
+ after the first successful trim
+ operation for this page; once
+ initialized we do not trim again
+ unless the actual page size decreases. */
{
os_aio_slot_t* slot = NULL;
#ifdef WIN_ASYNC_IO
@@ -4301,9 +4559,15 @@ found:
slot->name = name;
slot->len = len;
slot->type = type;
- slot->buf = static_cast<byte*>(buf);
slot->offset = offset;
slot->io_already_done = FALSE;
+ slot->write_size = write_size;
+
+ if (message1) {
+ slot->file_block_size = fil_node_get_block_size(message1);
+ }
+
+ slot->buf = static_cast<byte*>(buf);
#ifdef WIN_ASYNC_IO
control = &slot->control;
@@ -4578,14 +4842,20 @@ os_aio_func(
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
- void* message2)/*!< in: message for the aio handler
+ void* message2,/*!< in: message for the aio handler
(can be used to identify a completed
aio operation); ignored if mode is
OS_AIO_SYNC */
+ ulint* write_size)/*!< in/out: Actual write size initialized
+ after the first successful trim
+ operation for this page; once
+ initialized we do not trim again
+ unless the actual page size decreases. */
{
os_aio_array_t* array;
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
+ void* buffer = NULL;
ibool retval;
BOOL ret = TRUE;
DWORD len = (DWORD) n;
@@ -4604,6 +4874,7 @@ os_aio_func(
ut_ad((n & 0xFFFFFFFFUL) == n);
#endif
+
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
@@ -4643,7 +4914,8 @@ os_aio_func(
os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
if (!ret) {
- os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
+ os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE,
+ __FILE__, __LINE__);
}
}
@@ -4693,7 +4965,8 @@ try_again:
}
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
- name, buf, offset, n);
+ name, buf, offset, n, write_size);
+
if (type == OS_FILE_READ) {
if (srv_use_native_aio) {
os_n_file_reads++;
@@ -4719,7 +4992,10 @@ try_again:
if (srv_use_native_aio) {
os_n_file_writes++;
#ifdef WIN_ASYNC_IO
- ret = WriteFile(file, buf, (DWORD) n, &len,
+
+ n = slot->len;
+ buffer = buf;
+ ret = WriteFile(file, buffer, (DWORD) n, &len,
&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
@@ -4773,7 +5049,7 @@ err_exit:
os_aio_array_free_slot(array, slot);
if (os_file_handle_error(
- name,type == OS_FILE_READ ? "aio read" : "aio write")) {
+ name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) {
goto try_again;
}
@@ -4886,9 +5162,17 @@ os_aio_windows_handle(
if (ret && len == slot->len) {
ret_val = TRUE;
- } else if (os_file_handle_error(slot->name, "Windows aio")) {
+ } else if (!ret || (len != slot->len)) {
- retry = TRUE;
+ if (!ret) {
+ if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) {
+ retry = TRUE;
+ } else {
+ ret_val = FALSE;
+ }
+ } else {
+ retry = TRUE;
+ }
} else {
ret_val = FALSE;
@@ -4914,12 +5198,11 @@ os_aio_windows_handle(
ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
- switch (slot->type) {
- case OS_FILE_WRITE:
+ switch (slot->type) {
+ case OS_FILE_WRITE:
ret = WriteFile(slot->file, slot->buf,
(DWORD) slot->len, &len,
&(slot->control));
-
break;
case OS_FILE_READ:
ret = ReadFile(slot->file, slot->buf,
@@ -4950,6 +5233,11 @@ os_aio_windows_handle(
ret_val = ret && len == slot->len;
}
+ if (slot->type == OS_FILE_WRITE && srv_use_trim && os_fallocate_failed == FALSE) {
+ // Deallocate unused blocks from file system
+ os_file_trim(slot);
+ }
+
os_aio_array_free_slot(array, slot);
return(ret_val);
@@ -5039,6 +5327,11 @@ retry:
/* We have not overstepped to next segment. */
ut_a(slot->pos < end_pos);
+ if (slot->type == OS_FILE_WRITE && srv_use_trim && os_fallocate_failed == FALSE) {
+ // Deallocate unused blocks from file system
+ os_file_trim(slot);
+ }
+
/* Mark this request as completed. The error handling
will be done in the calling function. */
os_mutex_enter(array->mutex);
@@ -5182,6 +5475,13 @@ found:
} else {
errno = -slot->ret;
+ if (slot->ret == 0) {
+ fprintf(stderr,
+ "InnoDB: Number of bytes after aio %d requested %lu\n"
+ "InnoDB: from file %s\n",
+ slot->n_bytes, slot->len, slot->name);
+ }
+
/* os_file_handle_error does tell us if we should retry
this IO. As it stands now, we don't do this retry when
reaping requests from a different context than
@@ -5189,7 +5489,7 @@ found:
windows and linux native AIO.
We should probably look into this to transparently
re-submit the IO. */
- os_file_handle_error(slot->name, "Linux aio");
+ os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__);
ret = FALSE;
}
@@ -5473,7 +5773,8 @@ consecutive_loop:
errno = 28;);
if (!ret) {
- os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
+ os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE,
+ __FILE__, __LINE__);
}
} else {
@@ -5869,4 +6170,245 @@ os_aio_all_slots_free(void)
}
#endif /* UNIV_DEBUG */
+#ifdef _WIN32
+#include <winioctl.h>
+#ifndef FSCTL_FILE_LEVEL_TRIM
+#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA)
+typedef struct _FILE_LEVEL_TRIM_RANGE {
+ DWORDLONG Offset;
+ DWORDLONG Length;
+} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE;
+
+typedef struct _FILE_LEVEL_TRIM {
+ DWORD Key;
+ DWORD NumRanges;
+ FILE_LEVEL_TRIM_RANGE Ranges[1];
+} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM;
+#endif
+#endif
+
+/**********************************************************************//**
+Directly manipulate the allocated disk space of the file referred to by the slot,
+deallocating the byte range starting at offset and continuing for len bytes.
+Within the specified range, partial file system blocks are zeroed, and whole
+file system blocks are removed from the file. After a successful call,
+subsequent reads from this range will return zeroes.
+@return true if success, false if error */
+UNIV_INTERN
+ibool
+os_file_trim(
+/*=========*/
+ os_aio_slot_t* slot) /*!< in: slot structure */
+{
+
+ size_t len = slot->len;
+ size_t trim_len = UNIV_PAGE_SIZE - len;
+ os_offset_t off = slot->offset + len;
+ size_t bsize = slot->file_block_size;
+
+ // len here should be aligned to the sector size
+ ut_ad((trim_len % bsize) == 0);
+ ut_ad((len % bsize) == 0);
+ ut_ad(bsize != 0);
+ ut_ad((off % bsize) == 0);
+
+#ifdef UNIV_TRIM_DEBUG
+ fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu off %lu bz %lu\n",
+ slot->write_size ? *slot->write_size : 0, trim_len, len, off, bsize);
+#endif
+
+ // Nothing to do if the trim length is zero, or if the stored write
+ // size is initialized and is not larger than the current write size.
+ // On the first trimmed write we set write_size to the actual bytes
+ // written, so the rest of the page is already trimmed. On following
+ // writes there is no need to trim again as long as write_size only
+ // grows, because the tail of the page is already deallocated. If the
+ // actual write size decreases we need to trim again.
+ if (trim_len == 0 ||
+ (slot->write_size &&
+ *slot->write_size > 0 &&
+ len >= *slot->write_size)) {
+
+#ifdef UNIV_PAGECOMPRESS_DEBUG
+ fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n",
+ slot->write_size ? *slot->write_size : 0, trim_len, len);
+#endif
+
+ if (slot->write_size && *slot->write_size > 0 && len >= *slot->write_size) {
+ srv_stats.page_compressed_trim_op_saved.inc();
+ }
+
+ if (slot->write_size) {
+ *slot->write_size = len;
+ }
+
+ return (TRUE);
+ }
+
+#ifdef __linux__
+#if defined(HAVE_FALLOCATE)
+ int ret = fallocate(slot->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len);
+
+ if (ret) {
+ /* After first failure do not try to trim again */
+ os_fallocate_failed = TRUE;
+ srv_use_trim = FALSE;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: fallocate call failed with error code %d.\n"
+ " InnoDB: start: %lu len: %lu payload: %lu\n"
+ " InnoDB: Disabling fallocate for now.\n", errno, off, trim_len, len);
+
+ os_file_handle_error_no_exit(slot->name,
+ " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ",
+ FALSE, __FILE__, __LINE__);
+
+ if (slot->write_size) {
+ *slot->write_size = 0;
+ }
+
+ return (FALSE);
+ } else {
+ if (slot->write_size) {
+ *slot->write_size = len;
+ }
+ }
+#else
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: fallocate not supported on this installation.\n"
+ " InnoDB: Disabling fallocate for now.\n");
+ os_fallocate_failed = TRUE;
+ srv_use_trim = FALSE;
+ if (slot->write_size) {
+ *slot->write_size = 0;
+ }
+
+#endif /* HAVE_FALLOCATE ... */
+
+#elif defined(_WIN32)
+ FILE_LEVEL_TRIM flt;
+ flt.Key = 0;
+ flt.NumRanges = 1;
+ flt.Ranges[0].Offset = off;
+ flt.Ranges[0].Length = trim_len;
+
+ BOOL ret = DeviceIoControl(slot->file, FSCTL_FILE_LEVEL_TRIM,
+ &flt, sizeof(flt), NULL, NULL, NULL, NULL);
+
+ if (!ret) {
+ /* After first failure do not try to trim again */
+ os_fallocate_failed = TRUE;
+ srv_use_trim = FALSE;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: file trim (DeviceIoControl) call failed.\n"
+ " InnoDB: start: %lu len: %lu payload: %lu\n"
+ " InnoDB: Disabling fallocate for now.\n", off, trim_len, len);
+
+ os_file_handle_error_no_exit(slot->name,
+ " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ",
+ FALSE, __FILE__, __LINE__);
+
+ if (slot->write_size) {
+ *slot->write_size = 0;
+ }
+ return (FALSE);
+ } else {
+ if (slot->write_size) {
+ *slot->write_size = len;
+ }
+ }
+#endif
+
+ switch(bsize) {
+ case 512:
+ srv_stats.page_compression_trim_sect512.add((trim_len / bsize));
+ break;
+ case 1024:
+ srv_stats.page_compression_trim_sect1024.add((trim_len / bsize));
+ break;
+ case 2048:
+ srv_stats.page_compression_trim_sect2048.add((trim_len / bsize));
+ break;
+ case 4096:
+ srv_stats.page_compression_trim_sect4096.add((trim_len / bsize));
+ break;
+ case 8192:
+ srv_stats.page_compression_trim_sect8192.add((trim_len / bsize));
+ break;
+ case 16384:
+ srv_stats.page_compression_trim_sect16384.add((trim_len / bsize));
+ break;
+ case 32768:
+ srv_stats.page_compression_trim_sect32768.add((trim_len / bsize));
+ break;
+ default:
+ break;
+ }
+
+ srv_stats.page_compressed_trim_op.inc();
+
+ return (TRUE);
+
+}
#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Try to get number of bytes per sector from file system.
+@return file block size */
+UNIV_INTERN
+ulint
+os_file_get_block_size(
+/*===================*/
+ os_file_t file, /*!< in: handle to a file */
+ const char* name) /*!< in: file name */
+{
+ ulint fblock_size = 512;
+
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
+ struct statvfs fstat;
+ int err;
+
+ err = fstatvfs(file, &fstat);
+
+ if (err != 0) {
+ fprintf(stderr, "InnoDB: Warning: fstatvfs() failed on file %s\n", name);
+ os_file_handle_error_no_exit(name, "fstatvfs()", FALSE, __FILE__, __LINE__);
+ } else {
+ fblock_size = fstat.f_bsize;
+ }
+#endif /* UNIV_LINUX */
+#ifdef __WIN__
+ {
+ DWORD SectorsPerCluster = 0;
+ DWORD BytesPerSector = 0;
+ DWORD NumberOfFreeClusters = 0;
+ DWORD TotalNumberOfClusters = 0;
+
+ /*
+ if (GetFreeSpace((LPCTSTR)name, &SectorsPerCluster, &BytesPerSector, &NumberOfFreeClusters, &TotalNumberOfClusters)) {
+ fblock_size = BytesPerSector;
+ } else {
+ fprintf(stderr, "InnoDB: Warning: GetFreeSpace() failed on file %s\n", name);
+ os_file_handle_error_no_exit(name, "GetFreeSpace()", FALSE, __FILE__, __LINE__);
+ }
+ */
+ }
+#endif /* __WIN__*/
+
+ if (fblock_size > UNIV_PAGE_SIZE/2 || fblock_size < 512) {
+ fprintf(stderr, "InnoDB: Note: File system for file %s has "
+ "file block size %lu not supported for page_size %lu\n",
+ name, fblock_size, UNIV_PAGE_SIZE);
+
+ if (fblock_size < 512) {
+ fblock_size = 512;
+ } else {
+ fblock_size = UNIV_PAGE_SIZE/2;
+ }
+
+ fprintf(stderr, "InnoDB: Note: Using file block size %ld for file %s\n",
+ fblock_size, name);
+ }
+
+ return fblock_size;
+}
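os_file_trim() above punches a hole over the unused tail of a page after a page_compressed write, so the data file keeps its logical size while the file system reclaims the blocks. A standalone sketch of that core idea, assuming Linux with fallocate() available; the function trim_page_tail() and its parameters are illustrative, and the alignment checks, statistics and fallback handling from the patch are omitted.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>

    /* Deallocate the unused tail of a page slot after 'written' bytes of a
    compressed page were stored at file offset 'page_off'. */
    static int
    trim_page_tail(int fd, off_t page_off, size_t page_size, size_t written)
    {
            size_t  trim_len = page_size - written;

            if (trim_len == 0) {
                    return(0);      /* whole page written, nothing to punch */
            }

            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          page_off + written, trim_len) != 0) {
                    /* the patch disables trimming after the first failure;
                    a caller could do the same */
                    perror("fallocate(PUNCH_HOLE)");
                    return(-1);
            }

            return(0);
    }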
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index f5f7e1299ce..97405261392 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
+ /* Page compress failed. If this happened on a
+ leaf page, put the data size into the sample
+ buffer. */
+ if (page_is_leaf(page)) {
+ ulint occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(
+ page_get_n_recs(page));
+ index->stat_defrag_data_size_sample[
+ index->stat_defrag_sample_next_slot] =
+ occupied;
+ index->stat_defrag_sample_next_slot =
+ (index->stat_defrag_sample_next_slot
+ + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
+ }
+
ut_ad(cursor->rec
== (pos > 1
? page_rec_get_nth(
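The failed-compression path above records the occupied bytes of the leaf page into a small ring buffer that the defragmentation code can later average. A sketch of that sampling scheme, using ulint as in the surrounding code; the buffer length 10 is only an assumed stand-in for STAT_DEFRAG_DATA_SIZE_N_SAMPLE.

    /* Keep the most recent N observed "occupied bytes" figures. */
    static const ulint      N_SAMPLE = 10;          /* assumed value */

    struct defrag_sampler_t {
            ulint   samples[N_SAMPLE];
            ulint   next_slot;
    };

    static void
    defrag_sample_add(defrag_sampler_t* s, ulint occupied)
    {
            s->samples[s->next_slot] = occupied;
            /* wrap around: the oldest sample is overwritten */
            s->next_slot = (s->next_slot + 1) % N_SAMPLE;
    }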
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index cb2381df48c..fca8641342c 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -1087,7 +1087,9 @@ delete_all:
last_rec = page_rec_get_prev(page_get_supremum_rec(page));
- if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) {
+ bool scrub = srv_immediate_scrub_data_uncompressed;
+ if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED) ||
+ scrub) {
rec_t* rec2 = rec;
/* Calculate the sum of sizes and the number of records */
size = 0;
@@ -1104,6 +1106,12 @@ delete_all:
size += s;
n_recs++;
+ if (scrub) {
+ /* scrub record */
+ uint recsize = rec_offs_data_size(offsets);
+ memset(rec2, 0, recsize);
+ }
+
rec2 = page_rec_get_next(rec2);
} while (!page_rec_is_supremum(rec2));
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index 68a8bb1532f..e8fd10912a7 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -87,7 +87,7 @@ UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL;
/* Whether or not to log compressed page images to avoid possible
compression algorithm changes in zlib. */
-UNIV_INTERN my_bool page_zip_log_pages = true;
+UNIV_INTERN my_bool page_zip_log_pages = false;
/* Please refer to ../include/page0zip.ic for a description of the
compressed page format. */
@@ -672,7 +672,7 @@ page_zip_dir_encode(
#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
#endif
-#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_DEF - 1
# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1"
#endif
if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
@@ -925,7 +925,7 @@ page_zip_compress_sec(
rec - REC_N_NEW_EXTRA_BYTES
- c_stream->next_in);
- if (UNIV_LIKELY(c_stream->avail_in)) {
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
UNIV_MEM_ASSERT_RW(c_stream->next_in,
c_stream->avail_in);
err = deflate(c_stream, Z_NO_FLUSH);
@@ -1020,7 +1020,7 @@ page_zip_compress_clust_ext(
c_stream->avail_in = static_cast<uInt>(
src - c_stream->next_in);
- if (UNIV_LIKELY(c_stream->avail_in)) {
+ if (UNIV_LIKELY(c_stream->avail_in != 0)) {
err = deflate(c_stream, Z_NO_FLUSH);
if (UNIV_UNLIKELY(err != Z_OK)) {
diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc
index cbed2b39eeb..5a7e1861d74 100644
--- a/storage/innobase/pars/pars0opt.cc
+++ b/storage/innobase/pars/pars0opt.cc
@@ -948,12 +948,14 @@ opt_find_all_cols(
/* Fill in the field_no fields in sym_node */
sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
- dict_table_get_first_index(index->table), sym_node->col_no);
+ dict_table_get_first_index(index->table), sym_node->col_no,
+ NULL);
if (!dict_index_is_clust(index)) {
ut_a(plan);
- col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no);
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no,
+ NULL);
if (col_pos == ULINT_UNDEFINED) {
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
index 655e5ba1324..c87e1f8e247 100644
--- a/storage/innobase/pars/pars0pars.cc
+++ b/storage/innobase/pars/pars0pars.cc
@@ -1232,7 +1232,8 @@ pars_process_assign_list(
col_sym = assign_node->col;
upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos(
- clust_index, col_sym->col_no),
+ clust_index, col_sym->col_no,
+ NULL),
clust_index, NULL);
upd_field->exp = assign_node->val;
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
index 0d7b7c16785..3ff71d5c59e 100644
--- a/storage/innobase/rem/rem0rec.cc
+++ b/storage/innobase/rem/rem0rec.cc
@@ -33,6 +33,9 @@ Created 5/30/1994 Heikki Tuuri
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "fts0fts.h"
+#ifdef WITH_WSREP
+#include <ha_prototypes.h>
+#endif /* WITH_WSREP */
/* PHYSICAL RECORD (OLD STYLE)
===========================
@@ -1961,3 +1964,134 @@ rec_get_trx_id(
}
# endif /* UNIV_DEBUG */
#endif /* !UNIV_HOTBACKUP */
+
+#ifdef WITH_WSREP
+int
+wsrep_rec_get_foreign_key(
+ byte *buf, /* out: extracted key */
+ ulint *buf_len, /* in/out: length of buf */
+ const rec_t* rec, /* in: physical record */
+ dict_index_t* index_for, /* in: index in foreign table */
+ dict_index_t* index_ref, /* in: index in referenced table */
+ ibool new_protocol) /* in: protocol > 1 */
+{
+ const byte* data;
+ ulint len;
+ ulint key_len = 0;
+ ulint i;
+ uint key_parts;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+
+ ut_ad(index_for);
+ ut_ad(index_ref);
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index_for, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ ut_ad(rec);
+
+ key_parts = dict_index_get_n_unique_in_tree(index_for);
+ for (i = 0;
+ i < key_parts &&
+ (index_for->type & DICT_CLUSTERED || i < key_parts - 1);
+ i++) {
+ dict_field_t* field_f =
+ dict_index_get_nth_field(index_for, i);
+ const dict_col_t* col_f = dict_field_get_col(field_f);
+ dict_field_t* field_r =
+ dict_index_get_nth_field(index_ref, i);
+ const dict_col_t* col_r = dict_field_get_col(field_r);
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+ if (key_len + ((len != UNIV_SQL_NULL) ? len + 1 : 1) >
+ *buf_len) {
+ fprintf (stderr,
+ "WSREP: FK key len exceeded %lu %lu %lu\n",
+ key_len, len, *buf_len);
+ goto err_out;
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ ut_a(!(col_f->prtype & DATA_NOT_NULL));
+ *buf++ = 1;
+ key_len++;
+ } else if (!new_protocol) {
+ if (!(col_r->prtype & DATA_NOT_NULL)) {
+ *buf++ = 0;
+ key_len++;
+ }
+ memcpy(buf, data, len);
+ *buf_len = wsrep_innobase_mysql_sort(
+ (int)(col_f->prtype & DATA_MYSQL_TYPE_MASK),
+ (uint)dtype_get_charset_coll(col_f->prtype),
+ buf, len, *buf_len);
+ } else { /* new protocol */
+ if (!(col_r->prtype & DATA_NOT_NULL)) {
+ *buf++ = 0;
+ key_len++;
+ }
+ switch (col_f->mtype) {
+ case DATA_INT: {
+ byte* ptr = buf+len;
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == buf) {
+ break;
+ }
+ data++;
+ }
+
+ if (!(col_f->prtype & DATA_UNSIGNED)) {
+ buf[len-1] = (byte) (buf[len-1] ^ 128);
+ }
+
+ break;
+ }
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ /* Copy the actual data */
+ ut_memcpy(buf, data, len);
+ len = wsrep_innobase_mysql_sort(
+ (int)
+ (col_f->prtype & DATA_MYSQL_TYPE_MASK),
+ (uint)
+ dtype_get_charset_coll(col_f->prtype),
+ buf, len, *buf_len);
+ break;
+ case DATA_BLOB:
+ case DATA_BINARY:
+ memcpy(buf, data, len);
+ break;
+ default:
+ break;
+ }
+
+ key_len += len;
+ buf += len;
+ }
+ }
+
+ rec_validate(rec, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ *buf_len = key_len;
+ return DB_SUCCESS;
+
+ err_out:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return DB_ERROR;
+}
+#endif // WITH_WSREP
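In the DATA_INT branch the stored column image is byte-reversed and, for signed columns, the top bit of the most significant byte is flipped back. A worked example for a signed 32-bit value of 5, relying on InnoDB storing integers big-endian with the sign bit inverted; reading the result as a little-endian image is an interpretation, not something the patch states.

    unsigned char   data[4] = { 0x80, 0x00, 0x00, 0x05 };  /* on-page image of 5 */
    unsigned char   buf[4];

    for (int i = 0; i < 4; i++) {
            buf[3 - i] = data[i];   /* reverse the byte order */
    }

    buf[3] ^= 128;                  /* signed column: undo the sign-bit inversion */

    /* buf is now 05 00 00 00, the little-endian two's-complement image of 5 */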
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 621a14d27c2..b3817e86080 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -848,7 +848,7 @@ exit:
error = row_merge_sort(psort_info->psort_common->trx,
psort_info->psort_common->dup,
- merge_file[i], block[i], &tmpfd[i]);
+ merge_file[i], block[i], &tmpfd[i], false, 0.0/* pct_progress */, 0.0/* pct_cost */);
if (error != DB_SUCCESS) {
close(tmpfd[i]);
goto func_exit;
@@ -1409,8 +1409,9 @@ row_fts_merge_insert(
fd[i] = psort_info[i].merge_file[id]->fd;
foffs[i] = 0;
- buf[i] = static_cast<unsigned char (*)[16384]>(
+ buf[i] = static_cast<unsigned char (*)[65536]>(
mem_heap_alloc(heap, sizeof *buf[i]));
+
count_diag += (int) psort_info[i].merge_file[id]->n_rec;
}
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index c513320afc1..d5f766ef51b 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -1990,7 +1990,8 @@ PageConverter::update_header(
}
mach_write_to_8(
- get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN, m_current_lsn);
+ get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ m_current_lsn);
/* Write space_id to the tablespace header, page 0. */
mach_write_to_4(
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index ac9ca7b44eb..44c9ac32d16 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -920,6 +920,14 @@ row_ins_invalidate_query_cache(
innobase_invalidate_query_cache(thr_get_trx(thr), buf, len);
mem_free(buf);
}
+#ifdef WITH_WSREP
+dberr_t wsrep_append_foreign_key(trx_t *trx,
+ dict_foreign_t* foreign,
+ const rec_t* clust_rec,
+ dict_index_t* clust_index,
+ ibool referenced,
+ ibool shared);
+#endif /* WITH_WSREP */
/*********************************************************************//**
Perform referential actions or checks when a parent row is deleted or updated
@@ -1271,7 +1279,19 @@ row_ins_foreign_check_on_constraint(
cascade->state = UPD_NODE_UPDATE_CLUSTERED;
- err = row_update_cascade_for_mysql(thr, cascade,
+#ifdef WITH_WSREP
+ err = wsrep_append_foreign_key(
+ thr_get_trx(thr),
+ foreign,
+ clust_rec,
+ clust_index,
+ FALSE, FALSE);
+ if (err != DB_SUCCESS) {
+ fprintf(stderr,
+ "WSREP: foreign key append failed: %d\n", err);
+ } else
+#endif /* WITH_WSREP */
+ err = row_update_cascade_for_mysql(thr, cascade,
foreign->foreign_table);
if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
@@ -1603,7 +1623,14 @@ run_again:
if (check_ref) {
err = DB_SUCCESS;
-
+#ifdef WITH_WSREP
+ err = wsrep_append_foreign_key(
+ thr_get_trx(thr),
+ foreign,
+ rec,
+ check_index,
+ check_ref, TRUE);
+#endif /* WITH_WSREP */
goto end_scan;
} else if (foreign->type != 0) {
/* There is an ON UPDATE or ON DELETE
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
index 74ebe159677..c615b37a99a 100644
--- a/storage/innobase/row/row0log.cc
+++ b/storage/innobase/row/row0log.cc
@@ -40,6 +40,10 @@ Created 2011-05-26 Marko Makela
#include<map>
+ulint onlineddl_rowlog_rows;
+ulint onlineddl_rowlog_pct_used;
+ulint onlineddl_pct_progress;
+
/** Table row modification operations during online table rebuild.
Delete-marked records are not copied to the rebuilt table. */
enum row_tab_op {
@@ -470,6 +474,10 @@ write_failed:
log->tail.total += size;
UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
mutex_exit(&log->mutex);
+
+ os_atomic_increment_ulint(&onlineddl_rowlog_rows, 1);
+ /* 10000 means 100.00%, 4525 means 45.25% */
+ onlineddl_rowlog_pct_used = (log->tail.total * 10000) / srv_online_max_size;
}
#ifdef UNIV_DEBUG
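The new counters keep percentages as integers scaled by 100, so two decimal places survive integer arithmetic. A small worked example with made-up sizes; only the 10000 scale factor comes from the patch.

    ulint   used_mb  = 45;                           /* row-log space used, MiB (made up) */
    ulint   limit_mb = 128;                          /* srv_online_max_size, MiB (made up) */
    ulint   pct      = (used_mb * 10000) / limit_mb; /* 3515, reported as 35.15% */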
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 284081d4b0c..c1d3e08beaa 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -23,6 +23,8 @@ New index creation routines using a merge sort
Created 12/4/2005 Jan Lindstrom
Completed by Sunny Bains and Marko Makela
*******************************************************/
+#include <my_config.h>
+#include <log.h>
#include "row0merge.h"
#include "row0ext.h"
@@ -38,6 +40,13 @@ Completed by Sunny Bains and Marko Makela
#include "row0import.h"
#include "handler0alter.h"
#include "ha_prototypes.h"
+#include "math.h" /* log() */
+
+float my_log2f(float n)
+{
+ /* log(n) / log(2) is log2. */
+ return (float)(log((double)n) / log((double)2));
+}
/* Ignore posix_fadvise() on those platforms where it does not exist */
#if defined __WIN__
@@ -870,7 +879,8 @@ row_merge_read(
#endif /* UNIV_DEBUG */
success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
- ofs, srv_sort_buf_size);
+ ofs, srv_sort_buf_size);
+
#ifdef POSIX_FADV_DONTNEED
/* Each block is read exactly once. Free up the file cache. */
posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
@@ -1281,7 +1291,8 @@ row_merge_read_clustered_index(
AUTO_INCREMENT column, or
ULINT_UNDEFINED if none is added */
ib_sequence_t& sequence,/*!< in/out: autoinc sequence */
- row_merge_block_t* block) /*!< in/out: file buffer */
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ float pct_cost) /*!< in: percent of task weight out of total alter job */
{
dict_index_t* clust_index; /* Clustered index */
mem_heap_t* row_heap; /* Heap memory to create
@@ -1302,11 +1313,21 @@ row_merge_read_clustered_index(
ibool fts_pll_sort = FALSE;
ib_int64_t sig_count = 0;
mem_heap_t* conv_heap = NULL;
+
+ float curr_progress;
+ ib_int64_t read_rows = 0;
+ ib_int64_t table_total_rows;
DBUG_ENTER("row_merge_read_clustered_index");
ut_ad((old_table == new_table) == !col_map);
ut_ad(!add_cols || col_map);
+ table_total_rows = dict_table_get_n_rows(old_table);
+ if(table_total_rows == 0) {
+ /* We don't know total row count */
+ table_total_rows = 1;
+ }
+
trx->op_info = "reading clustered index";
#ifdef FTS_INTERNAL_DIAG_PRINT
@@ -1823,6 +1844,17 @@ write_buffers:
}
mem_heap_empty(row_heap);
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ read_rows++;
+ if(read_rows % 1000 == 0) {
+ /* Update progress for each 1000 rows */
+ curr_progress = (read_rows >= table_total_rows) ?
+ pct_cost :
+ ((pct_cost * read_rows) / table_total_rows);
+ /* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = curr_progress * 100;
+ }
}
func_exit:
@@ -2289,18 +2321,37 @@ row_merge_sort(
merge_file_t* file, /*!< in/out: file containing
index entries */
row_merge_block_t* block, /*!< in/out: 3 buffers */
- int* tmpfd) /*!< in/out: temporary file handle */
+ int* tmpfd, /*!< in/out: temporary file handle
+ */
+ const bool update_progress,
+ /*!< in: update progress
+ status variable or not */
+ const float pct_progress,
+ /*!< in: total progress percent
+ until now */
+ const float pct_cost) /*!< in: current progress percent */
{
const ulint half = file->offset / 2;
ulint num_runs;
ulint cur_run = 0;
ulint* run_offset;
dberr_t error = DB_SUCCESS;
+ ulint merge_count = 0;
+ ulint total_merge_sort_count;
+ float curr_progress = 0;
+
DBUG_ENTER("row_merge_sort");
/* Record the number of merge runs we need to perform */
num_runs = file->offset;
+ /* Find the smallest number N for which 2^N is greater than or
+ equal to num_runs; N is the number of merge passes needed. */
+ total_merge_sort_count = ceil(my_log2f(num_runs));
+ if (total_merge_sort_count <= 0) {
+ total_merge_sort_count = 1;
+ }
+
/* If num_runs are less than 1, nothing to merge */
if (num_runs <= 1) {
DBUG_RETURN(error);
@@ -2318,17 +2369,28 @@ row_merge_sort(
ut_ad(file->offset > 0);
thd_progress_init(trx->mysql_thd, num_runs);
+ sql_print_information("InnoDB: Online DDL : merge-sorting has estimated %lu runs", num_runs);
/* Merge the runs until we have one big run */
do {
cur_run++;
- error = row_merge(trx, dup, file, block, tmpfd,
- &num_runs, run_offset);
-
/* Report progress of merge sort to MySQL for
show processlist progress field */
thd_progress_report(trx->mysql_thd, cur_run, num_runs);
+ sql_print_information("InnoDB: Online DDL : merge-sorting current run %lu estimated %lu runs", cur_run, num_runs);
+
+ error = row_merge(trx, dup, file, block, tmpfd,
+ &num_runs, run_offset);
+
+ if(update_progress) {
+ merge_count++;
+ curr_progress = (merge_count >= total_merge_sort_count) ?
+ pct_cost :
+ ((pct_cost * merge_count) / total_merge_sort_count);
+ /* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (pct_progress + curr_progress) * 100;
+ }
if (error != DB_SUCCESS) {
break;
@@ -2399,7 +2461,10 @@ row_merge_insert_index_tuples(
dict_index_t* index, /*!< in: index */
const dict_table_t* old_table,/*!< in: old table */
int fd, /*!< in: file descriptor */
- row_merge_block_t* block) /*!< in/out: file buffer */
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ const ib_int64_t table_total_rows, /*!< in: total rows of old table */
+ const float pct_progress, /*!< in: total progress percent until now */
+ const float pct_cost) /*!< in: current progress percent */
{
const byte* b;
mem_heap_t* heap;
@@ -2409,6 +2474,8 @@ row_merge_insert_index_tuples(
ulint foffs = 0;
ulint* offsets;
mrec_buf_t* buf;
+ ib_int64_t inserted_rows = 0;
+ float curr_progress;
DBUG_ENTER("row_merge_insert_index_tuples");
ut_ad(!srv_read_only_mode);
@@ -2585,6 +2652,19 @@ row_merge_insert_index_tuples(
mem_heap_empty(tuple_heap);
mem_heap_empty(ins_heap);
+
+ /* Increment innodb_onlineddl_pct_progress status variable */
+ inserted_rows++;
+ if(inserted_rows % 1000 == 0) {
+ /* Update progress for each 1000 rows */
+ curr_progress = (inserted_rows >= table_total_rows ||
+ table_total_rows <= 0) ?
+ pct_cost :
+ ((pct_cost * inserted_rows) / table_total_rows);
+
+ /* presenting 10.12% as 1012 integer */
+ onlineddl_pct_progress = (pct_progress + curr_progress) * 100;
+ }
}
}
@@ -3584,6 +3664,13 @@ row_merge_build_indexes(
fts_psort_t* merge_info = NULL;
ib_int64_t sig_count = 0;
bool fts_psort_initiated = false;
+
+ float total_static_cost = 0;
+ float total_dynamic_cost = 0;
+ uint total_index_blocks = 0;
+ float pct_cost=0;
+ float pct_progress=0;
+
DBUG_ENTER("row_merge_build_indexes");
ut_ad(!srv_read_only_mode);
@@ -3614,6 +3701,9 @@ row_merge_build_indexes(
merge_files[i].fd = -1;
}
+ total_static_cost = COST_BUILD_INDEX_STATIC * n_indexes + COST_READ_CLUSTERED_INDEX;
+ total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC * n_indexes;
+
for (i = 0; i < n_indexes; i++) {
if (row_merge_file_create(&merge_files[i]) < 0) {
error = DB_OUT_OF_MEMORY;
@@ -3658,6 +3748,12 @@ row_merge_build_indexes(
duplicate keys. */
innobase_rec_reset(table);
+ sql_print_information("InnoDB: Online DDL : Start");
+ sql_print_information("InnoDB: Online DDL : Start reading clustered "
+ "index of the table and creating temporary files");
+
+ pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost);
+
/* Read clustered index of the table and create files for
secondary index entries for merge sort */
@@ -3665,10 +3761,18 @@ row_merge_build_indexes(
trx, table, old_table, new_table, online, indexes,
fts_sort_idx, psort_info, merge_files, key_numbers,
n_indexes, add_cols, col_map,
- add_autoinc, sequence, block);
+ add_autoinc, sequence, block, pct_cost);
- if (error != DB_SUCCESS) {
+ pct_progress += pct_cost;
+
+ sql_print_information("InnoDB: Online DDL : End of reading "
+ "clustered index of the table and creating temporary files");
+
+ for (i = 0; i < n_indexes; i++) {
+ total_index_blocks += merge_files[i].offset;
+ }
+ if (error != DB_SUCCESS) {
goto func_exit;
}
@@ -3747,17 +3851,59 @@ wait_again:
DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
#endif
} else {
+ char buf[3 * NAME_LEN];
+ char *bufend;
row_merge_dup_t dup = {
sort_idx, table, col_map, 0};
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost * merge_files[i].offset /
+ total_index_blocks)) /
+ (total_static_cost + total_dynamic_cost)
+ * PCT_COST_MERGESORT_INDEX * 100;
+
+ bufend = innobase_convert_name(buf, sizeof buf,
+ indexes[i]->name, strlen(indexes[i]->name),
+ trx ? trx->mysql_thd : NULL,
+ FALSE);
+
+ buf[bufend - buf]='\0';
+
+ sql_print_information("InnoDB: Online DDL : Start merge-sorting"
+ " index %s (%lu / %lu), estimated cost : %2.4f",
+ buf, (i+1), n_indexes, pct_cost);
+
error = row_merge_sort(
trx, &dup, &merge_files[i],
- block, &tmpfd);
+ block, &tmpfd, true, pct_progress, pct_cost);
+
+ pct_progress += pct_cost;
+
+ sql_print_information("InnoDB: Online DDL : End of"
+ " merge-sorting index %s (%lu / %lu)",
+ buf, (i+1), n_indexes);
if (error == DB_SUCCESS) {
+ pct_cost = (COST_BUILD_INDEX_STATIC +
+ (total_dynamic_cost * merge_files[i].offset /
+ total_index_blocks)) /
+ (total_static_cost + total_dynamic_cost) *
+ PCT_COST_INSERT_INDEX * 100;
+
+ sql_print_information("InnoDB: Online DDL : Start "
+ "building index %s (%lu / %lu), estimated "
+ "cost : %2.4f", buf, (i+1),
+ n_indexes, pct_cost);
+
error = row_merge_insert_index_tuples(
trx->id, sort_idx, old_table,
- merge_files[i].fd, block);
+ merge_files[i].fd, block,
+ merge_files[i].n_rec, pct_progress, pct_cost);
+ pct_progress += pct_cost;
+
+ sql_print_information("InnoDB: Online DDL : "
+ "End of building index %s (%lu / %lu)",
+ buf, (i+1), n_indexes);
}
}
@@ -3774,11 +3920,15 @@ wait_again:
ut_ad(sort_idx->online_status
== ONLINE_INDEX_COMPLETE);
} else {
+ sql_print_information("InnoDB: Online DDL : Start applying row log");
DEBUG_SYNC_C("row_log_apply_before");
error = row_log_apply(trx, sort_idx, table);
DEBUG_SYNC_C("row_log_apply_after");
+ sql_print_information("InnoDB: Online DDL : End of applying row log");
}
+ sql_print_information("InnoDB: Online DDL : Completed");
+
if (error != DB_SUCCESS) {
trx->error_key_num = key_numbers[i];
goto func_exit;
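The progress reporting splits the ALTER into a clustered-index scan plus, per secondary index, a merge-sort phase and an insert phase; each phase's weight comes from the COST_* and PCT_COST_* constants and from the index's share of merge-file blocks. A toy calculation of those weights; the constant values below are assumptions for illustration, since the real definitions are not part of this hunk.

    /* Toy numbers only; the real COST_* / PCT_COST_* values are not shown here. */
    const float COST_READ_CLUSTERED_INDEX = 0.1f;   /* assumed */
    const float COST_BUILD_INDEX_STATIC   = 0.1f;   /* assumed */
    const float COST_BUILD_INDEX_DYNAMIC  = 0.9f;   /* assumed */
    const float PCT_COST_MERGESORT_INDEX  = 0.4f;   /* assumed */

    int   n_indexes          = 2;
    float total_static_cost  = COST_BUILD_INDEX_STATIC * n_indexes
                               + COST_READ_CLUSTERED_INDEX;          /* 0.3 */
    float total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC * n_indexes; /* 1.8 */

    /* Phase 1: clustered-index scan as a share of the whole job, in percent. */
    float pct_read = COST_READ_CLUSTERED_INDEX * 100
                     / (total_static_cost + total_dynamic_cost);     /* ~4.8 */

    /* Merge-sort share of one index owning half of all merge-file blocks. */
    float block_share = 0.5f;
    float pct_sort = (COST_BUILD_INDEX_STATIC + total_dynamic_cost * block_share)
                     / (total_static_cost + total_dynamic_cost)
                     * PCT_COST_MERGESORT_INDEX * 100;               /* ~19.0 */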
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 6184bcddcb5..cab7621a0d4 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -55,7 +55,9 @@ Created 9/17/2000 Heikki Tuuri
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
+#include "btr0defragment.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#include "ibuf0ibuf.h"
#include "fts0fts.h"
#include "fts0types.h"
@@ -3238,6 +3240,41 @@ run_again:
return(err);
}
+static
+void
+fil_wait_crypt_bg_threads(
+ dict_table_t* table)
+{
+ uint start = time(0);
+ uint last = start;
+
+ if (table->space != 0) {
+ fil_space_crypt_mark_space_closing(table->space);
+ }
+
+ while (table->n_ref_count > 0) {
+ dict_mutex_exit_for_mysql();
+ os_thread_sleep(20000);
+ dict_mutex_enter_for_mysql();
+ uint now = time(0);
+ if (now >= last + 30) {
+ fprintf(stderr,
+ "WARNING: waited %u seconds "
+ "for ref-count on table: %s space: %u\n",
+ now - start, table->name, table->space);
+ last = now;
+ }
+
+ if (now >= start + 300) {
+ fprintf(stderr,
+ "WARNING: after %u seconds, gave up waiting "
+ "for ref-count on table: %s space: %u\n",
+ now - start, table->name, table->space);
+ break;
+ }
+ }
+}
+
/*********************************************************************//**
Truncates a table for MySQL.
@return error code or DB_SUCCESS */
@@ -3907,6 +3944,8 @@ row_drop_table_for_mysql(
if (!dict_table_is_temporary(table)) {
dict_stats_recalc_pool_del(table);
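+ /* Also remove the table from the background defragmentation
+ queues so that defragmentation does not touch a dropped table. */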
+ dict_stats_defrag_pool_del(table, NULL);
+ btr_defragment_remove_table(table);
/* Remove stats for this table and all of its indexes from the
persistent storage if it exists and if there are stats for this
@@ -4024,6 +4063,9 @@ row_drop_table_for_mysql(
shouldn't have to. There should never be record locks on a table
that is going to be dropped. */
+ /* Wait for background threads to stop using the table */
+ fil_wait_crypt_bg_threads(table);
+
if (table->n_ref_count == 0) {
lock_remove_all_on_table(table, TRUE);
ut_a(table->n_rec_locks == 0);
@@ -5168,18 +5210,6 @@ end:
trx->error_state = DB_SUCCESS;
trx_rollback_to_savepoint(trx, NULL);
trx->error_state = DB_SUCCESS;
- } else {
- if (old_is_tmp && !new_is_tmp) {
- /* After ALTER TABLE the table statistics
- needs to be rebuilt. Even if we close
- table below there could be other
- transactions using this table (e.g.
- SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`),
- thus we can't remove table from dictionary cache
- here. Therefore, we initialize the
- transient statistics here. */
- dict_stats_update_transient(table);
- }
}
}
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 69c8498839e..b0e0c89b778 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -56,6 +56,7 @@ Created 12/19/1997 Heikki Tuuri
#include "row0mysql.h"
#include "read0read.h"
#include "buf0lru.h"
+#include "srv0srv.h"
#include "ha_prototypes.h"
#include "m_string.h" /* for my_sys.h */
#include "my_sys.h" /* DEBUG_SYNC_C */
@@ -2933,9 +2934,14 @@ row_sel_store_mysql_rec(
: templ->rec_field_no;
/* We should never deliver column prefixes to MySQL,
except for evaluating innobase_index_cond(). */
+ /* ...actually, we do want to do this in order to
+ support the prefix query optimization.
+
ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
== 0);
+ ...so we disable this assert. */
+
if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
rec, index, offsets,
field_no, templ)) {
@@ -3028,6 +3034,8 @@ row_sel_get_clust_rec_for_mysql(
dberr_t err;
trx_t* trx;
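+ /* Count every secondary-index lookup that needs the
+ clustered index record; the prefix optimization in
+ row_search_for_mysql() counts the lookups it avoids. */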
+ srv_stats.n_sec_rec_cluster_reads.inc();
+
*out_rec = NULL;
trx = thr_get_trx(thr);
@@ -3683,6 +3691,7 @@ row_search_for_mysql(
ulint* offsets = offsets_;
ibool table_lock_waited = FALSE;
byte* next_buf = 0;
+ ibool use_clustered_index = FALSE;
rec_offs_init(offsets_);
@@ -4706,10 +4715,68 @@ locks_ok:
}
/* Get the clustered index record if needed, if we did not do the
- search using the clustered index. */
-
- if (index != clust_index && prebuilt->need_to_access_clustered) {
+ search using the clustered index... */
+
+ use_clustered_index =
+ (index != clust_index && prebuilt->need_to_access_clustered);
+
+ if (use_clustered_index && srv_prefix_index_cluster_optimization
+ && prebuilt->n_template <= index->n_fields) {
+ /* ...but, perhaps avoid the clustered index lookup if
+ all of the following are true:
+ 1) all columns are in the secondary index
+ 2) all values for columns that are prefix-only
+ indexes are shorter than the prefix size
+ This optimization can avoid many IOs for certain schemas.
+ */
+ ibool row_contains_all_values = TRUE;
+ int i;
+ for (i = 0; i < prebuilt->n_template; i++) {
+ /* Condition (1) from above: is the field in the
+ index (prefix or not)? */
+ mysql_row_templ_t* templ =
+ prebuilt->mysql_template + i;
+ ulint secondary_index_field_no =
+ templ->rec_prefix_field_no;
+ if (secondary_index_field_no == ULINT_UNDEFINED) {
+ row_contains_all_values = FALSE;
+ break;
+ }
+ /* Condition (2) from above: if this is a
+ prefix, is this row's value size shorter
+ than the prefix? */
+ if (templ->rec_field_is_prefix) {
+ ulint record_size = rec_offs_nth_size(
+ offsets,
+ secondary_index_field_no);
+ const dict_field_t *field =
+ dict_index_get_nth_field(
+ index,
+ secondary_index_field_no);
+ ut_a(field->prefix_len > 0);
+ if (record_size >= field->prefix_len) {
+ row_contains_all_values = FALSE;
+ break;
+ }
+ }
+ }
+ /* If (1) and (2) were true for all columns above, use
+ rec_prefix_field_no instead of rec_field_no, and skip
+ the clustered lookup below. */
+ if (row_contains_all_values) {
+ for (i = 0; i < prebuilt->n_template; i++) {
+ mysql_row_templ_t* templ =
+ prebuilt->mysql_template + i;
+ templ->rec_field_no =
+ templ->rec_prefix_field_no;
+ ut_a(templ->rec_field_no != ULINT_UNDEFINED);
+ }
+ use_clustered_index = FALSE;
+ srv_stats.n_sec_rec_cluster_reads_avoided.inc();
+ }
+ }
+ if (use_clustered_index) {
requires_clust_rec:
ut_ad(index != clust_index);
/* We use a 'goto' to the preceding label if a consistent
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index 8580aa45145..7649add4b33 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -364,8 +364,15 @@ row_undo_mod_clust(
}
}
- ut_ad(rec_get_trx_id(btr_pcur_get_rec(pcur), index)
- == node->new_trx_id);
+ /**
+ * When scrubbing, records may get cleared and the
+ * transaction id is no longer present afterwards.
+ * This is safe because the record is on the free list
+ * and can be reallocated at any time after this mtr
+ * commits, which happens just below.
+ */
+ ut_ad(srv_immediate_scrub_data_uncompressed ||
+ rec_get_trx_id(btr_pcur_get_rec(pcur), index) == node->new_trx_id);
btr_pcur_commit_specify_mtr(pcur, &mtr);
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
index a8c2eaa6683..0ea4865d15f 100644
--- a/storage/innobase/row/row0upd.cc
+++ b/storage/innobase/row/row0upd.cc
@@ -53,6 +53,9 @@ Created 12/27/1996 Heikki Tuuri
#include "buf0lru.h"
#include <algorithm>
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
+
/* What kind of latch and lock can we assume when the control comes to
-------------------------------------------------------------------
an update node?
@@ -162,6 +165,52 @@ row_upd_index_is_referenced(
return(is_referenced);
}
+#ifdef WITH_WSREP
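+/*********************************************************************//**
+Checks whether the given index is the foreign (child) index of any
+foreign key constraint on its table, i.e. whether wsrep may need to run
+extra foreign key checks for updates of this index.
+@return TRUE if the index is a foreign key index */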
+static
+ibool
+wsrep_row_upd_index_is_foreign(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+ dict_foreign_t* foreign;
+ ibool froze_data_dict = FALSE;
+ ibool is_referenced = FALSE;
+
+ if (table->foreign_set.empty()) {
+
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ for (dict_foreign_set::iterator it= table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++ it)
+ {
+ foreign= *it;
+
+ if (foreign->foreign_index == index) {
+
+ is_referenced = TRUE;
+ goto func_exit;
+ }
+
+ }
+
+func_exit:
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(is_referenced);
+}
+#endif /* WITH_WSREP */
+
/*********************************************************************//**
Checks if possible foreign key constraints hold after a delete of the record
under pcur.
@@ -281,7 +330,125 @@ run_again:
}
err = DB_SUCCESS;
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+#ifdef WITH_WSREP
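+/*********************************************************************//**
+Checks the foreign key constraints for which the given index is the
+foreign (child) index after an update or delete of the record under
+pcur. NOTE: the mini-transaction is committed and restarted here, so
+the cursor position is lost.
+@return DB_SUCCESS or an error code */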
+static
+dberr_t
+wsrep_row_upd_check_foreign_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ ulint n_ext;
+ dberr_t err;
+ ibool got_s_lock = FALSE;
+ ibool opened = FALSE;
+
+ if (table->foreign_set.empty()) {
+
+ return(DB_SUCCESS);
+ }
+ trx = thr_get_trx(thr);
+
+ /* TODO: make native slave thread bail out here */
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets,
+ &n_ext, heap);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ for (dict_foreign_set::iterator it= table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++ it)
+ {
+ foreign= *it;
+
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->foreign_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+
+ if (foreign->referenced_table == NULL) {
+ foreign->referenced_table =
+ dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ opened = TRUE;
+ }
+
+ if (foreign->referenced_table) {
+ os_inc_counter(dict_sys->mutex,
+ foreign->referenced_table
+ ->n_foreign_key_checks_running);
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ if (foreign->referenced_table) {
+ os_dec_counter(dict_sys->mutex,
+ foreign->referenced_table
+ ->n_foreign_key_checks_running);
+
+ if (opened == TRUE) {
+ dict_table_close(foreign->referenced_table, TRUE, FALSE);
+ opened = FALSE;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ }
+
+ err = DB_SUCCESS;
func_exit:
if (got_s_lock) {
row_mysql_unfreeze_data_dictionary(trx);
@@ -293,6 +460,7 @@ func_exit:
return(err);
}
+#endif /* WITH_WSREP */
/*********************************************************************//**
Creates an update node for a query graph.
@@ -1667,6 +1835,9 @@ row_upd_sec_index_entry(
index = node->index;
referenced = row_upd_index_is_referenced(index, trx);
+#ifdef WITH_WSREP
+ ibool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+#endif /* WITH_WSREP */
heap = mem_heap_create(1024);
@@ -1794,6 +1965,9 @@ row_upd_sec_index_entry(
row_ins_sec_index_entry() below */
if (!rec_get_deleted_flag(
rec, dict_table_is_comp(index->table))) {
+#ifdef WITH_WSREP
+ que_node_t *parent = que_node_get_parent(node);
+#endif /* WITH_WSREP */
err = btr_cur_del_mark_set_sec_rec(
0, btr_cur, TRUE, thr, &mtr);
@@ -1811,6 +1985,37 @@ row_upd_sec_index_entry(
node, &pcur, index->table,
index, offsets, thr, &mtr);
}
+#ifdef WITH_WSREP
+ if (err == DB_SUCCESS && !referenced &&
+ !(parent && que_node_get_type(parent) ==
+ QUE_NODE_UPDATE &&
+ ((upd_node_t*)parent)->cascade_node == node) &&
+ foreign
+ ) {
+ ulint* offsets =
+ rec_get_offsets(
+ rec, index, NULL, ULINT_UNDEFINED,
+ &heap);
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_DEADLOCK:
+ if (wsrep_debug) fprintf (stderr,
+ "WSREP: sec index FK check fail for deadlock");
+ break;
+ default:
+ fprintf (stderr,
+ "WSREP: referenced FK check fail: %d",
+ (int)err);
+ break;
+ }
+ }
+#endif /* WITH_WSREP */
}
break;
}
@@ -1965,6 +2170,9 @@ row_upd_clust_rec_by_insert(
que_thr_t* thr, /*!< in: query thread */
ibool referenced,/*!< in: TRUE if index may be referenced in
a foreign key constraint */
+#ifdef WITH_WSREP
+ ibool foreign, /*!< in: TRUE if index is foreign key index */
+#endif /* WITH_WSREP */
mtr_t* mtr) /*!< in/out: mtr; gets committed here */
{
mem_heap_t* heap;
@@ -1978,6 +2186,9 @@ row_upd_clust_rec_by_insert(
rec_t* rec;
ulint* offsets = NULL;
+#ifdef WITH_WSREP
+ que_node_t *parent = que_node_get_parent(node);
+#endif /* WITH_WSREP */
ut_ad(node);
ut_ad(dict_index_is_clust(index));
@@ -2060,6 +2271,34 @@ err_exit:
goto err_exit;
}
}
+#ifdef WITH_WSREP
+ if (!referenced &&
+ !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE &&
+ ((upd_node_t*)parent)->cascade_node == node) &&
+ foreign
+ ) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_DEADLOCK:
+ if (wsrep_debug) fprintf (stderr,
+ "WSREP: insert FK check fail for deadlock");
+ break;
+ default:
+ fprintf (stderr,
+ "WSREP: referenced FK check fail: %d",
+ (int)err);
+ break;
+ }
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+#endif /* WITH_WSREP */
}
mtr_commit(mtr);
@@ -2252,11 +2491,18 @@ row_upd_del_mark_clust_rec(
ibool referenced,
/*!< in: TRUE if index may be referenced in
a foreign key constraint */
+#ifdef WITH_WSREP
+ ibool foreign,/*!< in: TRUE if index is foreign key index */
+#endif /* WITH_WSREP */
mtr_t* mtr) /*!< in: mtr; gets committed here */
{
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
dberr_t err;
+#ifdef WITH_WSREP
+ rec_t* rec;
+ que_node_t *parent = que_node_get_parent(node);
+#endif /* WITH_WSREP */
ut_ad(node);
ut_ad(dict_index_is_clust(index));
@@ -2273,8 +2519,16 @@ row_upd_del_mark_clust_rec(
/* Mark the clustered index record deleted; we do not have to check
locks, because we assume that we have an x-lock on the record */
+#ifdef WITH_WSREP
+ rec = btr_cur_get_rec(btr_cur);
+#endif /* WITH_WSREP */
+
err = btr_cur_del_mark_set_clust_rec(
+#ifdef WITH_WSREP
+ btr_cur_get_block(btr_cur), rec,
+#else
btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur),
+#endif /* WITH_WSREP */
index, offsets, thr, mtr);
if (err == DB_SUCCESS && referenced) {
/* NOTE that the following call loses the position of pcur ! */
@@ -2282,6 +2536,32 @@ row_upd_del_mark_clust_rec(
err = row_upd_check_references_constraints(
node, pcur, index->table, index, offsets, thr, mtr);
}
+#ifdef WITH_WSREP
+ if (err == DB_SUCCESS && !referenced &&
+ !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE &&
+ ((upd_node_t*)parent)->cascade_node == node) &&
+ thr_get_trx(thr) &&
+ foreign
+ ) {
+ err = wsrep_row_upd_check_foreign_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_NO_REFERENCED_ROW:
+ err = DB_SUCCESS;
+ break;
+ case DB_DEADLOCK:
+ if (wsrep_debug) fprintf (stderr,
+ "WSREP: clust rec FK check fail for deadlock");
+ break;
+ default:
+ fprintf (stderr,
+ "WSREP: clust rec referenced FK check fail: %d",
+ (int)err);
+ break;
+ }
+ }
+#endif /* WITH_WSREP */
mtr_commit(mtr);
@@ -2314,6 +2594,10 @@ row_upd_clust_step(
index = dict_table_get_first_index(node->table);
referenced = row_upd_index_is_referenced(index, thr_get_trx(thr));
+#ifdef WITH_WSREP
+ ibool foreign = wsrep_row_upd_index_is_foreign(
+ index, thr_get_trx(thr));
+#endif /* WITH_WSREP */
pcur = node->pcur;
@@ -2408,7 +2692,11 @@ row_upd_clust_step(
if (node->is_delete) {
err = row_upd_del_mark_clust_rec(
+#ifdef WITH_WSREP
+ node, index, offsets, thr, referenced, foreign, &mtr);
+#else
node, index, offsets, thr, referenced, &mtr);
+#endif /* WITH_WSREP */
if (err == DB_SUCCESS) {
node->state = UPD_NODE_UPDATE_ALL_SEC;
@@ -2453,7 +2741,11 @@ row_upd_clust_step(
externally! */
err = row_upd_clust_rec_by_insert(
+#ifdef WITH_WSREP
+ node, index, thr, referenced, foreign, &mtr);
+#else
node, index, thr, referenced, &mtr);
+#endif /* WITH_WSREP */
if (err != DB_SUCCESS) {
diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc
index dc3c0b1dd88..8942eb20080 100644
--- a/storage/innobase/srv/srv0conc.cc
+++ b/storage/innobase/srv/srv0conc.cc
@@ -41,7 +41,8 @@ Created 2011/04/18 Sunny Bains
#include "sync0sync.h"
#include "trx0trx.h"
-#include "mysql/plugin.h"
+#include <mysql/plugin.h>
+#include <mysql/service_wsrep.h>
/** Number of times a thread is allowed to enter InnoDB within the same
SQL query after it has once got the ticket. */
@@ -86,6 +87,9 @@ struct srv_conc_slot_t{
reserved may still be TRUE at that
point */
srv_conc_node_t srv_conc_queue; /*!< queue node */
+#ifdef WITH_WSREP
+ void *thd; /*!< to see priority */
+#endif
};
/** Queue of threads waiting to get in */
@@ -145,6 +149,9 @@ srv_conc_init(void)
conc_slot->event = os_event_create();
ut_a(conc_slot->event);
+#ifdef WITH_WSREP
+ conc_slot->thd = NULL;
+#endif /* WITH_WSREP */
}
#endif /* !HAVE_ATOMIC_BUILTINS */
}
@@ -202,6 +209,16 @@ srv_conc_enter_innodb_with_atomics(
for (;;) {
ulint sleep_in_us;
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_trx_is_aborting(trx->mysql_thd)) {
+ if (wsrep_debug)
+ fprintf(stderr,
+ "srv_conc_enter due to MUST_ABORT");
+ srv_conc_force_enter_innodb(trx);
+ return;
+ }
+#endif /* WITH_WSREP */
if (srv_conc.n_active < (lint) srv_thread_concurrency) {
ulint n_active;
@@ -319,6 +336,9 @@ srv_conc_exit_innodb_without_atomics(
slot = NULL;
if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+#ifdef WITH_WSREP
+ srv_conc_slot_t* wsrep_slot;
+#endif
/* Look for a slot where a thread is waiting and no other
thread has yet released the thread */
@@ -329,6 +349,19 @@ srv_conc_exit_innodb_without_atomics(
/* No op */
}
+#ifdef WITH_WSREP
+ /* Look for aborting transactions; they must be released ASAP */
+ wsrep_slot= slot;
+ while (wsrep_slot && (wsrep_slot->wait_ended == TRUE ||
+ !wsrep_trx_is_aborting(wsrep_slot->thd))) {
+ wsrep_slot = UT_LIST_GET_NEXT(srv_conc_queue, wsrep_slot);
+ }
+ if (wsrep_slot) {
+ slot = wsrep_slot;
+ if (wsrep_debug)
+ fprintf(stderr, "WSREP: releasing aborting thd\n");
+ }
+#endif
if (slot != NULL) {
slot->wait_ended = TRUE;
@@ -384,6 +417,13 @@ retry:
return;
}
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_thd_is_brute_force(trx->mysql_thd)) {
+ srv_conc_force_enter_innodb(trx);
+ return;
+ }
+#endif
/* If the transaction is not holding resources, let it sleep
for srv_thread_sleep_delay microseconds, and try again then */
@@ -450,6 +490,9 @@ retry:
/* Add to the queue */
slot->reserved = TRUE;
slot->wait_ended = FALSE;
+#ifdef WITH_WSREP
+ slot->thd = trx->mysql_thd;
+#endif
UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
@@ -457,6 +500,18 @@ retry:
srv_conc.n_waiting++;
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_trx_is_aborting(trx->mysql_thd)) {
+ os_fast_mutex_unlock(&srv_conc_mutex);
+ if (wsrep_debug)
+ fprintf(stderr, "srv_conc_enter due to MUST_ABORT");
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+ return;
+ }
+ trx->wsrep_event = slot->event;
+#endif /* WITH_WSREP */
os_fast_mutex_unlock(&srv_conc_mutex);
/* Go to wait for the event; when a thread leaves InnoDB it will
@@ -472,6 +527,9 @@ retry:
os_event_wait(slot->event);
thd_wait_end(trx->mysql_thd);
+#ifdef WITH_WSREP
+ trx->wsrep_event = NULL;
+#endif /* WITH_WSREP */
trx->op_info = "";
@@ -483,6 +541,9 @@ retry:
incremented the thread counter on behalf of this thread */
slot->reserved = FALSE;
+#ifdef WITH_WSREP
+ slot->thd = NULL;
+#endif
UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
@@ -593,5 +654,32 @@ srv_conc_get_active_threads(void)
/*==============================*/
{
return(srv_conc.n_active);
- }
+}
+
+#ifdef WITH_WSREP
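+/*********************************************************************//**
+Wakes up a transaction that is waiting in the InnoDB concurrency queue
+so that a wsrep abort is not blocked by the ticket wait. With atomic
+builtins the aborting transaction enters InnoDB by force instead, so
+nothing needs to be cancelled here. */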
+UNIV_INTERN
+void
+wsrep_srv_conc_cancel_wait(
+/*==================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* Aborting transactions will enter InnoDB by force in
+ srv_conc_enter_innodb_with_atomics(). There is no need to
+ cancel here; the thread will wake up after os_sleep and be
+ allowed to enter InnoDB.
+ */
+ if (wsrep_debug)
+ fprintf(stderr, "WSREP: conc slot cancel, no atomics\n");
+#else
+ os_fast_mutex_lock(&srv_conc_mutex);
+ if (trx->wsrep_event) {
+ if (wsrep_debug)
+ fprintf(stderr, "WSREP: conc slot cancel\n");
+ os_event_set(trx->wsrep_event);
+ }
+ os_fast_mutex_unlock(&srv_conc_mutex);
+#endif
+}
+#endif /* WITH_WSREP */
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index a0dd32c203f..0bcc876f591 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -2,6 +2,7 @@
Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2014, MariaDB Corporation
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -290,12 +291,36 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+ {"buffer_index_pages_written", "buffer",
+ "Number of index pages written (innodb_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN},
+
+ {"buffer_non_index_pages_written", "buffer",
+ "Number of non index pages written (innodb_non_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN},
+
{"buffer_pages_read", "buffer",
"Number of pages read (innodb_pages_read)",
static_cast<monitor_type_t>(
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
+ {"buffer_index_sec_rec_cluster_reads", "buffer",
+ "Number of secondary record reads triggered cluster read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS},
+
+ {"buffer_index_sec_rec_cluster_reads_avoided", "buffer",
+ "Number of secondary record reads avoided triggering cluster read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED},
+
{"buffer_data_reads", "buffer",
"Amount of data read in bytes (innodb_data_reads)",
static_cast<monitor_type_t>(
@@ -457,20 +482,36 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_LRU_BATCH_SCANNED_PER_CALL},
/* Cumulative counter for LRU batch pages flushed */
- {"buffer_LRU_batch_total_pages", "buffer",
+ {"buffer_LRU_batch_flush_total_pages", "buffer",
"Total pages flushed as part of LRU batches",
- MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT,
- MONITOR_LRU_BATCH_TOTAL_PAGE},
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE},
+
+ {"buffer_LRU_batches_flush", "buffer",
+ "Number of LRU batches",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT},
+
+ {"buffer_LRU_batch_flush_pages", "buffer",
+ "Pages queued as an LRU batch",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_PAGES},
+
+ /* Cumulative counter for LRU batch pages evicted */
+ {"buffer_LRU_batch_evict_total_pages", "buffer",
+ "Total pages evicted as part of LRU batches",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_EVICT_COUNT,
+ MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE},
- {"buffer_LRU_batches", "buffer",
+ {"buffer_LRU_batches_evict", "buffer",
"Number of LRU batches",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_COUNT},
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_COUNT},
- {"buffer_LRU_batch_pages", "buffer",
+ {"buffer_LRU_batch_evict_pages", "buffer",
"Pages queued as an LRU batch",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_PAGES},
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_EVICT_PAGES},
/* Cumulative counter for single page LRU scans */
{"buffer_LRU_single_flush_scanned", "buffer",
@@ -879,6 +920,81 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
+ {"compress_saved", "compression",
+ "Number of bytes saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED},
+
+ {"compress_trim_sect512", "compression",
+ "Number of sect-512 TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512},
+
+ {"compress_trim_sect1024", "compression",
+ "Number of sect-1024 TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024},
+
+ {"compress_trim_sect2048", "compression",
+ "Number of sect-2048 TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048},
+
+ {"compress_trim_sect4096", "compression",
+ "Number of sect-4K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096},
+
+ {"compress_trim_sect8192", "compression",
+ "Number of sect-8K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192},
+
+ {"compress_trim_sect16384", "compression",
+ "Number of sect-16K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384},
+
+ {"compress_trim_sect32768", "compression",
+ "Number of sect-32K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768},
+
+ {"compress_pages_page_compressed", "compression",
+ "Number of pages compressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED},
+
+ {"compress_page_compressed_trim_op", "compression",
+ "Number of TRIM operation performed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP},
+
+ {"compress_page_compressed_trim_op_saved", "compression",
+ "Number of TRIM operation saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED},
+
+ {"compress_pages_page_decompressed", "compression",
+ "Number of pages decompressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED},
+
+ {"compress_pages_page_compression_error", "compression",
+ "Number of page compression errors",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR},
+
+ {"compress_pages_encrypted", "compression",
+ "Number of pages encrypted",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_ENCRYPTED},
+
+ {"compress_pages_decrypted", "compression",
+ "Number of pages decrypted",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_DECRYPTED},
+
/* ========== Counters for Index ========== */
{"module_index", "index", "Index Manager",
MONITOR_MODULE,
@@ -1573,12 +1689,32 @@ srv_mon_process_existing_counter(
value = stat.n_pages_written;
break;
+ /* innodb_index_pages_written, the number of index pages written */
+ case MONITOR_OVLD_INDEX_PAGES_WRITTEN:
+ value = srv_stats.index_pages_written;
+ break;
+
+ /* innodb_non_index_pages_written, the number of non index pages written */
+ case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN:
+ value = srv_stats.non_index_pages_written;
+ break;
+
/* innodb_pages_read */
case MONITOR_OVLD_PAGES_READ:
buf_get_total_stat(&stat);
value = stat.n_pages_read;
break;
+ /* Number of times secondary index lookup triggered cluster lookup */
+ case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS:
+ value = srv_stats.n_sec_rec_cluster_reads;
+ break;
+ /* Number of times prefix optimization avoided triggering cluster
+ lookup */
+ case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED:
+ value = srv_stats.n_sec_rec_cluster_reads_avoided;
+ break;
+
/* innodb_data_reads, the total number of data reads */
case MONITOR_OVLD_BYTE_READ:
value = srv_stats.data_read;
@@ -1834,6 +1970,52 @@ srv_mon_process_existing_counter(
value = btr_cur_n_non_sea;
break;
+ case MONITOR_OVLD_PAGE_COMPRESS_SAVED:
+ value = srv_stats.page_compression_saved;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512:
+ value = srv_stats.page_compression_trim_sect512;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024:
+ value = srv_stats.page_compression_trim_sect1024;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048:
+ value = srv_stats.page_compression_trim_sect2048;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096:
+ value = srv_stats.page_compression_trim_sect4096;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192:
+ value = srv_stats.page_compression_trim_sect8192;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384:
+ value = srv_stats.page_compression_trim_sect16384;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768:
+ value = srv_stats.page_compression_trim_sect32768;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_COMPRESSED:
+ value = srv_stats.pages_page_compressed;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP:
+ value = srv_stats.page_compressed_trim_op;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED:
+ value = srv_stats.page_compressed_trim_op_saved;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED:
+ value = srv_stats.pages_page_decompressed;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR:
+ value = srv_stats.pages_page_compression_error;
+ break;
+ case MONITOR_OVLD_PAGES_ENCRYPTED:
+ value = srv_stats.pages_encrypted;
+ break;
+ case MONITOR_OVLD_PAGES_DECRYPTED:
+ value = srv_stats.pages_decrypted;
+ break;
+
default:
ut_error;
}
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index cd3bed9e2fe..df7cb656d8c 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -63,17 +63,27 @@ Created 10/8/1995 Heikki Tuuri
#include "dict0stats_bg.h" /* dict_stats_event */
#include "srv0start.h"
#include "row0mysql.h"
+#include "row0log.h"
#include "ha_prototypes.h"
#include "trx0i_s.h"
#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
#include "srv0mon.h"
#include "ut0crc32.h"
+#include "btr0defragment.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
-
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+#include "btr0scrub.h"
+
+#ifdef WITH_WSREP
+extern int wsrep_debug;
+extern int wsrep_trx_is_aborting(void *thd_ptr);
+#endif
/* The following is the maximum allowed duration of a lock wait. */
-UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
+UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT;
/* How much data manipulation language (DML) statements need to be delayed,
in microseconds, in order to reduce the lagging of the purge thread. */
@@ -86,6 +96,9 @@ UNIV_INTERN ibool srv_buf_dump_thread_active = FALSE;
UNIV_INTERN ibool srv_dict_stats_thread_active = FALSE;
+UNIV_INTERN ibool srv_log_scrub_active = FALSE;
+UNIV_INTERN my_bool srv_scrub_log = FALSE;
+
UNIV_INTERN const char* srv_main_thread_op_info = "";
/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
@@ -146,6 +159,20 @@ use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE;
+/* If this flag is TRUE, then we will use fallocate(PUNCH_HOLE)
+on the pages */
+UNIV_INTERN my_bool srv_use_trim = FALSE;
+/* If this flag is TRUE, then we will use posix_fallocate for file extension */
+UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE;
+/* If this flag is TRUE, then we disable the doublewrite buffer */
+UNIV_INTERN my_bool srv_use_atomic_writes = FALSE;
+/* Compression algorithm used when page compression is enabled */
+UNIV_INTERN ulong innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM;
+/* Number of threads used for multi-threaded flush */
+UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER;
+/* If this flag is TRUE, then we will use multi-threaded flush. */
+UNIV_INTERN my_bool srv_use_mtflush = FALSE;
+
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
pointers, because they are not available on Windows Server 2003 and
@@ -208,6 +235,10 @@ srv_printf_innodb_monitor() will request mutex acquisition
with mutex_enter(), which will wait until it gets the mutex. */
#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
+#ifdef WITH_INNODB_DISALLOW_WRITES
+UNIV_INTERN os_event_t srv_allow_writes_event;
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
/** The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte* srv_latin1_ordering;
@@ -232,6 +263,8 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
+UNIV_INTERN ulong srv_idle_flush_pct = 100;
+
/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
instead. */
UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
@@ -329,6 +362,10 @@ UNIV_INTERN ulint srv_fast_shutdown = 0;
/* Generate a innodb_status.<pid> file */
UNIV_INTERN ibool srv_innodb_status = FALSE;
+/* Optimize prefix index queries to skip the clustered index lookup when
+possible. Disabled by default. */
+UNIV_INTERN my_bool srv_prefix_index_cluster_optimization = 0;
+
/* When estimating number of different key values in an index, sample
this many index pages, there are 2 ways to calculate statistics:
* persistent stats that are calculated by ANALYZE TABLE and saved
@@ -356,11 +393,6 @@ batch flushing i.e.: LRU flushing and flush_list flushing. The rest
of the pages are used for single page flushing. */
UNIV_INTERN ulong srv_doublewrite_batch_size = 120;
-UNIV_INTERN ibool srv_use_atomic_writes = FALSE;
-#ifdef HAVE_POSIX_FALLOCATE
-UNIV_INTERN ibool srv_use_posix_fallocate = TRUE;
-#endif
-
UNIV_INTERN ulong srv_replication_delay = 0;
/*-------------------------------------------*/
@@ -393,6 +425,26 @@ static ulint srv_n_system_rows_read_old = 0;
UNIV_INTERN ulint srv_truncated_status_writes = 0;
UNIV_INTERN ulint srv_available_undo_logs = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0;
+UNIV_INTERN ib_uint64_t srv_index_pages_written = 0;
+UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0;
+UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0;
+UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0;
+UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0;
+UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0;
+
+/* Defragmentation */
+UNIV_INTERN my_bool srv_defragment = FALSE;
+UNIV_INTERN uint srv_defragment_n_pages = 7;
+UNIV_INTERN uint srv_defragment_stats_accuracy = 0;
+UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20;
+UNIV_INTERN double srv_defragment_fill_factor = 0.9;
+UNIV_INTERN uint srv_defragment_frequency =
+ SRV_DEFRAGMENT_FREQUENCY_DEFAULT;
+UNIV_INTERN ulonglong srv_defragment_interval = 0;
+
/* Set the following to 0 if you want InnoDB to write messages on
stderr on startup/shutdown. */
UNIV_INTERN ibool srv_print_verbose_log = TRUE;
@@ -401,6 +453,9 @@ UNIV_INTERN my_bool srv_print_innodb_lock_monitor = FALSE;
UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE;
UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE;
+/** If this flag is set, tables without a primary key are not allowed */
+UNIV_INTERN my_bool srv_force_primary_key = FALSE;
+
/* Array of English strings describing the current state of an
i/o handler thread */
@@ -468,6 +523,9 @@ thread ensures that we flush the log files at least once per
second. */
static time_t srv_last_log_flush_time;
+/** Enable semaphore request instrumentation */
+UNIV_INTERN my_bool srv_instrument_semaphores = FALSE;
+
/* Interval in seconds at which various tasks are performed by the
master thread when server is active. In order to balance the workload,
we should try to keep intervals such that they are not multiple of
@@ -486,6 +544,9 @@ current_time % 5 != 0. */
/** Simulate compression failures. */
UNIV_INTERN uint srv_simulate_comp_failures = 0;
+/** Buffer pool dump status frequency in percentages */
+UNIV_INTERN ulong srv_buf_dump_status_frequency = 0;
+
/** Acquire the system_mutex. */
#define srv_sys_mutex_enter() do { \
mutex_enter(&srv_sys->mutex); \
@@ -1000,6 +1061,14 @@ srv_init(void)
dict_ind_init();
srv_conc_init();
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ /* Writes have to be enabled on init or else we hang. Thus, we
+ always set the event here regardless of innobase_disallow_writes.
+ That flag will always be 0 at this point because it isn't settable
+ via my.cnf or command line arg. */
+ srv_allow_writes_event = os_event_create();
+ os_event_set(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
/* Initialize some INFORMATION SCHEMA internal structures */
trx_i_s_cache_init(trx_i_s_cache);
@@ -1369,10 +1438,14 @@ srv_export_innodb_status(void)
ulint LRU_len;
ulint free_len;
ulint flush_list_len;
+ fil_crypt_stat_t crypt_stat;
+ btr_scrub_stat_t scrub_stat;
buf_get_total_stat(&stat);
buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+ fil_crypt_total_stat(&crypt_stat);
+ btr_scrub_total_stat(&scrub_stat);
mutex_enter(&srv_innodb_monitor_mutex);
@@ -1518,6 +1591,27 @@ srv_export_innodb_status(void)
srv_truncated_status_writes;
export_vars.innodb_available_undo_logs = srv_available_undo_logs;
+ export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved;
+ export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512;
+ export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096;
+ export_vars.innodb_index_pages_written = srv_stats.index_pages_written;
+ export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written;
+ export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed;
+ export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op;
+ export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
+ export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
+ export_vars.innodb_pages_page_compression_error = srv_stats.pages_page_compression_error;
+ export_vars.innodb_pages_decrypted = srv_stats.pages_decrypted;
+ export_vars.innodb_pages_encrypted = srv_stats.pages_encrypted;
+
+ export_vars.innodb_defragment_compression_failures =
+ btr_defragment_compression_failures;
+ export_vars.innodb_defragment_failures = btr_defragment_failures;
+ export_vars.innodb_defragment_count = btr_defragment_count;
+
+ export_vars.innodb_onlineddl_rowlog_rows = onlineddl_rowlog_rows;
+ export_vars.innodb_onlineddl_rowlog_pct_used = onlineddl_rowlog_pct_used;
+ export_vars.innodb_onlineddl_pct_progress = onlineddl_pct_progress;
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
@@ -1547,6 +1641,35 @@ srv_export_innodb_status(void)
}
#endif /* UNIV_DEBUG */
+ export_vars.innodb_sec_rec_cluster_reads =
+ srv_stats.n_sec_rec_cluster_reads;
+ export_vars.innodb_sec_rec_cluster_reads_avoided =
+ srv_stats.n_sec_rec_cluster_reads_avoided;
+
+ export_vars.innodb_encryption_rotation_pages_read_from_cache =
+ crypt_stat.pages_read_from_cache;
+ export_vars.innodb_encryption_rotation_pages_read_from_disk =
+ crypt_stat.pages_read_from_disk;
+ export_vars.innodb_encryption_rotation_pages_modified =
+ crypt_stat.pages_modified;
+ export_vars.innodb_encryption_rotation_pages_flushed =
+ crypt_stat.pages_flushed;
+ export_vars.innodb_encryption_rotation_estimated_iops =
+ crypt_stat.estimated_iops;
+
+ export_vars.innodb_scrub_page_reorganizations =
+ scrub_stat.page_reorganizations;
+ export_vars.innodb_scrub_page_splits =
+ scrub_stat.page_splits;
+ export_vars.innodb_scrub_page_split_failures_underflow =
+ scrub_stat.page_split_failures_underflow;
+ export_vars.innodb_scrub_page_split_failures_out_of_filespace =
+ scrub_stat.page_split_failures_out_of_filespace;
+ export_vars.innodb_scrub_page_split_failures_missing_index =
+ scrub_stat.page_split_failures_missing_index;
+ export_vars.innodb_scrub_page_split_failures_unknown =
+ scrub_stat.page_split_failures_unknown;
+
mutex_exit(&srv_innodb_monitor_mutex);
}
@@ -1803,7 +1926,20 @@ loop:
if (sync_array_print_long_waits(&waiter, &sema)
&& sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES)
+ if (srv_allow_writes_event->is_set) {
+#endif /* WITH_WSREP */
fatal_cnt++;
+#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES)
+ } else {
+ fprintf(stderr,
+ "WSREP: avoiding InnoDB self crash due to long "
+ "semaphore wait of > %lu seconds\n"
+ "Server is processing SST donor operation, "
+ "fatal_cnt now: %lu",
+ (ulong) srv_fatal_semaphore_wait_threshold, fatal_cnt);
+ }
+#endif /* WITH_WSREP */
if (fatal_cnt > 10) {
fprintf(stderr,
@@ -1917,6 +2053,8 @@ srv_any_background_threads_are_active(void)
thread_active = "buf_dump_thread";
} else if (srv_dict_stats_thread_active) {
thread_active = "dict_stats_thread";
+ } else if (srv_scrub_log && srv_log_scrub_thread_active) {
+ thread_active = "log_scrub_thread";
}
os_event_set(srv_error_event);
@@ -1924,6 +2062,8 @@ srv_any_background_threads_are_active(void)
os_event_set(srv_buf_dump_event);
os_event_set(lock_sys->timeout_event);
os_event_set(dict_stats_event);
+ if (srv_scrub_log)
+ os_event_set(log_scrub_event);
return(thread_active);
}
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 7048a44ae97..648e9e95f19 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -3,6 +3,7 @@
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -42,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri
#include "pars0pars.h"
#include "row0ftsort.h"
#include "ut0mem.h"
+#include "ut0timer.h"
#include "mem0mem.h"
#include "data0data.h"
#include "data0type.h"
@@ -51,6 +53,7 @@ Created 2/16/1996 Heikki Tuuri
#include "os0file.h"
#include "os0thread.h"
#include "fil0fil.h"
+#include "fil0crypt.h"
#include "fsp0fsp.h"
#include "rem0rec.h"
#include "mtr0mtr.h"
@@ -66,12 +69,15 @@ Created 2/16/1996 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "srv0srv.h"
+#include "btr0defragment.h"
+
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
# include "os0proc.h"
# include "sync0sync.h"
# include "buf0flu.h"
# include "buf0rea.h"
+# include "buf0mtflu.h"
# include "dict0boot.h"
# include "dict0load.h"
# include "dict0stats_bg.h"
@@ -94,6 +100,7 @@ Created 2/16/1996 Heikki Tuuri
# include "os0sync.h"
# include "zlib.h"
# include "ut0crc32.h"
+# include "btr0scrub.h"
/** Log sequence number immediately after startup */
UNIV_INTERN lsn_t srv_start_lsn;
@@ -129,7 +136,11 @@ static os_file_t files[1000];
/** io_handler_thread parameters for thread identification */
static ulint n[SRV_MAX_N_IO_THREADS + 6];
/** io_handler_thread identifiers, 32 is the maximum number of purge threads */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32];
+/** 6 is the ? */
+#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32)
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER];
+/* Thread context data for multi-threaded flush */
+void *mtflush_ctx=NULL;
/** Thead handles */
static os_thread_t thread_handles[SRV_MAX_N_IO_THREADS + 6 + 32];
@@ -486,7 +497,8 @@ DECLARE_THREAD(io_handler_thread)(
segment = *((ulint*) arg);
#ifdef UNIV_DEBUG_THREAD_CREATION
- fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Io handler thread %lu starts, id %lu\n", segment,
os_thread_pf(os_thread_get_curr_id()));
#endif
@@ -544,7 +556,7 @@ create_log_file(
*file = os_file_create(
innodb_file_log_key, name,
OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
- OS_LOG_FILE, &ret);
+ OS_LOG_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name);
@@ -657,7 +669,8 @@ create_log_files(
fil_space_create(
logfilename, SRV_LOG_SPACE_FIRST_ID,
fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
- FIL_LOG);
+ FIL_LOG,
+ NULL /* no encryption yet */);
ut_a(fil_validate());
logfile0 = fil_node_create(
@@ -751,7 +764,7 @@ open_log_file(
*file = os_file_create(innodb_file_log_key, name,
OS_FILE_OPEN, OS_FILE_AIO,
- OS_LOG_FILE, &ret);
+ OS_LOG_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name);
return(DB_ERROR);
@@ -795,6 +808,7 @@ open_or_create_data_files(
ulint space;
ulint rounded_size_pages;
char name[10000];
+ fil_space_crypt_t* crypt_data;
if (srv_n_data_files >= 1000) {
@@ -842,7 +856,7 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key, name, OS_FILE_CREATE,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (srv_read_only_mode) {
@@ -885,7 +899,7 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key, name, OS_FILE_OPEN_RAW,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR,
@@ -900,7 +914,7 @@ open_or_create_data_files(
#ifdef UNIV_LOG_ARCHIVE
min_arch_log_no, max_arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- min_flushed_lsn, max_flushed_lsn);
+ min_flushed_lsn, max_flushed_lsn, NULL);
/* If first page is valid, don't overwrite DB.
It prevents overwriting DB when mysql_install_db
@@ -936,17 +950,17 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN_RAW,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
} else if (i == 0) {
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN_RETRY,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
} else {
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN, OS_FILE_NORMAL,
- OS_DATA_FILE, &ret);
+ OS_DATA_FILE, &ret, FALSE);
}
if (!ret) {
@@ -1031,7 +1045,7 @@ check_first_page:
#ifdef UNIV_LOG_ARCHIVE
min_arch_log_no, max_arch_log_no,
#endif /* UNIV_LOG_ARCHIVE */
- min_flushed_lsn, max_flushed_lsn);
+ min_flushed_lsn, max_flushed_lsn, &crypt_data);
if (check_msg) {
@@ -1125,6 +1139,8 @@ check_first_page:
}
*sum_of_new_sizes += srv_data_file_sizes[i];
+
+ crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
}
ret = os_file_close(files[i]);
@@ -1132,7 +1148,9 @@ check_first_page:
if (i == 0) {
flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
- fil_space_create(name, 0, flags, FIL_TABLESPACE);
+ fil_space_create(name, 0, flags, FIL_TABLESPACE,
+ crypt_data);
+ crypt_data = NULL;
}
ut_a(fil_validate());
@@ -1166,7 +1184,7 @@ srv_undo_tablespace_create(
innodb_file_data_key,
name,
srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (srv_read_only_mode && ret) {
ib_logf(IB_LOG_LEVEL_INFO,
@@ -1253,7 +1271,8 @@ srv_undo_tablespace_open(
| OS_FILE_ON_ERROR_SILENT,
OS_FILE_NORMAL,
OS_DATA_FILE,
- &ret);
+ &ret,
+ FALSE);
/* If the file open was successful then load the tablespace. */
@@ -1277,7 +1296,8 @@ srv_undo_tablespace_open(
/* Set the compressed page size to 0 (non-compressed) */
flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
- fil_space_create(name, space, flags, FIL_TABLESPACE);
+ fil_space_create(name, space, flags, FIL_TABLESPACE,
+ NULL /* no encryption */);
ut_a(fil_validate());
@@ -1555,6 +1575,9 @@ innobase_start_or_create_for_mysql(void)
size_t dirnamelen;
bool sys_datafiles_created = false;
+ /* This should be initialized early */
+ ut_init_timer();
+
if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
srv_read_only_mode = true;
}
@@ -1616,53 +1639,45 @@ innobase_start_or_create_for_mysql(void)
}
#ifdef UNIV_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!");
#endif
#ifdef UNIV_IBUF_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!");
# ifdef UNIV_IBUF_COUNT_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
+ ib_logf(IB_LOG_LEVEL_INFO,
" InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on "
- "!!!!!!!!!\n");
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n");
+ "!!!!!!!!!");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG");
# endif
#endif
#ifdef UNIV_BLOB_DEBUG
- fprintf(stderr,
+ ib_logf(IB_LOG_LEVEL_INFO,
"InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
- "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+ "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG");
#endif /* UNIV_BLOB_DEBUG */
#ifdef UNIV_SYNC_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!");
#endif
#ifdef UNIV_SEARCH_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!");
#endif
#ifdef UNIV_LOG_LSN_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!");
#endif /* UNIV_LOG_LSN_DEBUG */
#ifdef UNIV_MEM_DEBUG
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!");
#endif
if (srv_use_sys_malloc) {
@@ -2263,7 +2278,8 @@ innobase_start_or_create_for_mysql(void)
fil_space_create(logfilename,
SRV_LOG_SPACE_FIRST_ID,
fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
- FIL_LOG);
+ FIL_LOG,
+ NULL /* no encryption yet */);
ut_a(fil_validate());
@@ -2319,6 +2335,11 @@ files_checked:
dict_stats_thread_init();
}
+ if (!srv_read_only_mode && srv_scrub_log) {
+ /* TODO(minliz): have/use log_scrub_thread_init() instead? */
+ log_scrub_event = os_event_create();
+ }
+
trx_sys_file_format_init();
trx_sys_create();
@@ -2754,6 +2775,20 @@ files_checked:
}
if (!srv_read_only_mode) {
+
+ if (srv_use_mtflush) {
+ /* Start multi-threaded flush threads */
+ mtflush_ctx = buf_mtflu_handler_init(
+ srv_mtflush_threads,
+ srv_buf_pool_instances);
+
+ /* Set up the thread ids */
+ buf_mtflu_set_thread_ids(
+ srv_mtflush_threads,
+ mtflush_ctx,
+ (thread_ids + 6 + 32));
+ }
+
buf_flush_page_cleaner_thread_handle = os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
buf_flush_page_cleaner_thread_started = true;
}
@@ -2889,6 +2924,15 @@ files_checked:
(ulong) srv_force_recovery);
}
+ if (!srv_read_only_mode) {
+ /*
+ Create a checkpoint before logging anything new, so that
+ the current encryption key in use is definitely logged
+ before any log blocks encrypted with that key.
+ */
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+ }
+
if (srv_force_recovery == 0) {
/* In the insert buffer we may have even bigger tablespace
id's, because we may have dropped those tablespaces, but
@@ -2909,8 +2953,21 @@ files_checked:
/* Create the thread that will optimize the FTS sub-system. */
fts_optimize_init();
+
+ /* Init data for datafile scrub threads */
+ btr_scrub_init();
+
+ /* Create thread(s) that handles key rotation */
+ fil_crypt_threads_init();
+
+ /* Create the log scrub thread */
+ if (srv_scrub_log)
+ os_thread_create(log_scrub_thread, NULL, NULL);
}
+ /* Initialize online defragmentation. */
+ btr_defragment_init();
+
srv_was_started = TRUE;
return(DB_SUCCESS);
@@ -2971,6 +3028,9 @@ innobase_shutdown_for_mysql(void)
fts_optimize_start_shutdown();
fts_optimize_end();
+
+ /* Shutdown key rotation threads */
+ fil_crypt_threads_end();
}
/* 1. Flush the buffer pool to disk, write the current lsn to
@@ -3021,6 +3081,13 @@ innobase_shutdown_for_mysql(void)
logs_empty_and_mark_files_at_shutdown() and should have
already quit or is quitting right now. */
+
+ if (srv_use_mtflush) {
+ /* g. Exit the multi threaded flush threads */
+
+ buf_mtflu_io_thread_exit();
+ }
+
os_mutex_enter(os_sync_mutex);
if (os_thread_count == 0) {
@@ -3072,6 +3139,18 @@ innobase_shutdown_for_mysql(void)
if (!srv_read_only_mode) {
dict_stats_thread_deinit();
+ if (srv_scrub_log) {
+ /* TODO(minliz): have/use log_scrub_thread_deinit() instead? */
+ os_event_free(log_scrub_event);
+ log_scrub_event = NULL;
+ }
+ }
+
+ if (!srv_read_only_mode) {
+ fil_crypt_threads_cleanup();
+
+ /* Cleanup data for datafile scrubbing */
+ btr_scrub_cleanup();
}
#ifdef __WIN__
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
index c7163695a3f..d464515a228 100644
--- a/storage/innobase/sync/sync0arr.cc
+++ b/storage/innobase/sync/sync0arr.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2015, MariaDB Corporation. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -30,11 +31,26 @@ The wait array used in synchronization primitives
Created 9/5/1995 Heikki Tuuri
*******************************************************/
+#include "univ.i"
+
#include "sync0arr.h"
#ifdef UNIV_NONINL
#include "sync0arr.ic"
#endif
+#include <mysqld_error.h>
+#include <mysql/plugin.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <sql_acl.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "i_s.h"
+#include <sql_plugin.h>
+#include <innodb_priv.h>
+
#include "sync0sync.h"
#include "sync0rw.h"
#include "os0sync.h"
@@ -115,7 +131,6 @@ for an event allocated for the array without owning the
protecting mutex (depending on the case: OS or database mutex), but
all changes (set or reset) to the state of the event must be made
while owning the mutex. */
-
/** Synchronization array */
struct sync_array_t {
ulint n_reserved; /*!< number of currently reserved
@@ -168,7 +183,6 @@ sync_array_detect_deadlock(
/*****************************************************************//**
Gets the nth cell in array.
@return cell */
-static
sync_cell_t*
sync_array_get_nth_cell(
/*====================*/
@@ -486,16 +500,13 @@ sync_array_cell_print(
if (mutex) {
fprintf(file,
"Mutex at %p created file %s line %lu, lock var %lu\n"
-#ifdef UNIV_SYNC_DEBUG
- "Last time reserved in file %s line %lu, "
-#endif /* UNIV_SYNC_DEBUG */
+ "Last time reserved by thread %lu in file %s line %lu, "
"waiters flag %lu\n",
(void*) mutex, innobase_basename(mutex->cfile_name),
(ulong) mutex->cline,
(ulong) mutex->lock_word,
-#ifdef UNIV_SYNC_DEBUG
+ mutex->thread_id,
mutex->file_name, (ulong) mutex->line,
-#endif /* UNIV_SYNC_DEBUG */
(ulong) mutex->waiters);
}
@@ -507,15 +518,17 @@ sync_array_cell_print(
: type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
: "S-lock on", file);
- rwlock = cell->old_wait_rw_lock;
+ rwlock = (rw_lock_t*)cell->old_wait_rw_lock;
if (rwlock) {
fprintf(file,
" RW-latch at %p created in file %s line %lu\n",
(void*) rwlock, innobase_basename(rwlock->cfile_name),
(ulong) rwlock->cline);
+
writer = rw_lock_get_writer(rwlock);
- if (writer != RW_LOCK_NOT_LOCKED) {
+
+ if (writer && writer != RW_LOCK_NOT_LOCKED) {
fprintf(file,
"a writer (thread id %lu) has"
" reserved it in mode %s",
@@ -538,6 +551,11 @@ sync_array_cell_print(
(ulong) rwlock->last_s_line,
rwlock->last_x_file_name,
(ulong) rwlock->last_x_line);
+
+ fprintf(file,
+ "Holder thread %lu file %s line %lu\n",
+ rwlock->thread_id, rwlock->file_name, rwlock->line);
+
}
} else {
ut_error;
@@ -1282,3 +1300,153 @@ sync_array_print_innodb(void)
fputs("InnoDB: Semaphore wait debug output ended:\n", stderr);
}
+
+/**********************************************************************//**
+Get the number of items in the sync array. */
+UNIV_INTERN
+ulint
+sync_arr_get_n_items(void)
+/*======================*/
+{
+ sync_array_t* sync_arr = sync_array_get();
+ return (ulint) sync_arr->n_cells;
+}
+
+/******************************************************************//**
+Get the specified item from the sync array if it is reserved, and set
+the given pointer to the reserved cell.
+@return true if the item is reserved, false otherwise */
+UNIV_INTERN
+ibool
+sync_arr_get_item(
+/*==============*/
+ ulint i, /*!< in: requested item */
+ sync_cell_t **cell) /*!< out: cell contents if item
+ reserved */
+{
+ sync_array_t* sync_arr;
+ sync_cell_t* wait_cell;
+ void* wait_object;
+ ibool found = FALSE;
+
+ sync_arr = sync_array_get();
+ wait_cell = sync_array_get_nth_cell(sync_arr, i);
+
+ if (wait_cell) {
+ wait_object = wait_cell->wait_object;
+
+ if (wait_object != NULL && wait_cell->waiting) {
+ found = TRUE;
+ *cell = wait_cell;
+ }
+ }
+
+ return found;
+}
+
+/*******************************************************************//**
+Populate the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table:
+loop through each item in the sync array and extract the column
+information for every reserved cell.
+@return 0 on success */
+UNIV_INTERN
+int
+sync_arr_fill_sys_semphore_waits_table(
+/*===================================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ Field** fields;
+ ulint n_items;
+
+ DBUG_ENTER("i_s_sys_semaphore_waits_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ fields = tables->table->field;
+ n_items = sync_arr_get_n_items();
+ ulint type;
+
+ for (ulint i = 0; i < n_items; i++) {
+ sync_cell_t* cell = NULL;
+ if (sync_arr_get_item(i, &cell)) {
+ ib_mutex_t* mutex;
+ type = cell->request_type;
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_THREAD_ID], (longlong)os_thread_pf(cell->thread)));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_FILE], innobase_basename(cell->file)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LINE], cell->line));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_TIME], (longlong)difftime(time(NULL), cell->reservation_time)));
+
+ if (type == SYNC_MUTEX) {
+ mutex = static_cast<ib_mutex_t*>(cell->old_wait_mutex);
+
+ if (mutex) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], mutex->cmutex_name));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT], (longlong)mutex));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "MUTEX"));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID], (longlong)mutex->thread_id));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(mutex->file_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE], mutex->line));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_CREATED_FILE], innobase_basename(mutex->cfile_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_CREATED_LINE], mutex->cline));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], (longlong)mutex->waiters));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], (longlong)mutex->lock_word));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(mutex->file_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE], mutex->line));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT], mutex->count_os_wait));
+ }
+ } else if (type == RW_LOCK_EX
+ || type == RW_LOCK_WAIT_EX
+ || type == RW_LOCK_SHARED) {
+ rw_lock_t* rwlock = NULL;
+
+ rwlock = static_cast<rw_lock_t*>(cell->old_wait_rw_lock);
+
+ if (rwlock) {
+ ulint writer = rw_lock_get_writer(rwlock);
+
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAIT_OBJECT], (longlong)rwlock));
+ if (type == RW_LOCK_EX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_EX"));
+ } else if (type == RW_LOCK_WAIT_EX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_WAIT_EX"));
+ } else if (type == RW_LOCK_SHARED) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_WAIT_TYPE], "RW_LOCK_SHARED"));
+ }
+
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_OBJECT_NAME], rwlock->lock_name));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WRITER_THREAD], (longlong)os_thread_pf(rwlock->writer_thread)));
+
+ if (writer == RW_LOCK_EX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_EX"));
+ } else if (writer == RW_LOCK_WAIT_EX) {
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_RESERVATION_MODE], "RW_LOCK_WAIT_EX"));
+ }
+
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID], (longlong)rwlock->thread_id));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_HOLDER_FILE], innobase_basename(rwlock->file_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_HOLDER_LINE], rwlock->line));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_READERS], rw_lock_get_reader_count(rwlock)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_WAITERS_FLAG], (longlong)rwlock->waiters));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LOCK_WORD], (longlong)rwlock->lock_word));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_READER_FILE], innobase_basename(rwlock->last_s_file_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LAST_READER_LINE], rwlock->last_s_line));
+ OK(field_store_string(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE], innobase_basename(rwlock->last_x_file_name)));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE], rwlock->last_x_line));
+ OK(field_store_ulint(fields[SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT], rwlock->count_os_wait));
+ }
+ }
+ }
+
+ OK(schema_table_store_record(thd, tables->table));
+ }
+ }
+
+ DBUG_RETURN(0);
+}
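
A minimal sketch of how a monitoring hook could walk the wait array with the
two accessors added above (illustration only, not part of the patch; it
assumes the same headers that sync0arr.cc itself includes):

	/* Print every reserved wait-array cell to stderr. */
	static
	void
	print_reserved_cells(void)
	{
		ulint	n_items = sync_arr_get_n_items();

		for (ulint i = 0; i < n_items; i++) {
			sync_cell_t*	cell = NULL;

			if (!sync_arr_get_item(i, &cell)) {
				continue;	/* cell not reserved */
			}

			fprintf(stderr,
				"thread %lu waits at %s:%lu"
				" for %lu seconds\n",
				(ulong) os_thread_pf(cell->thread),
				innobase_basename(cell->file),
				(ulong) cell->line,
				(ulong) difftime(time(NULL),
						 cell->reservation_time));
		}
	}
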
diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc
index e77c7a9b396..8919716ff9c 100644
--- a/storage/innobase/sync/sync0rw.cc
+++ b/storage/innobase/sync/sync0rw.cc
@@ -209,8 +209,8 @@ rw_lock_create_func(
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
- const char* cmutex_name, /*!< in: mutex name */
#endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
const char* cfile_name, /*!< in: file name where created */
ulint cline) /*!< in: file line where created */
{
@@ -223,8 +223,7 @@ rw_lock_create_func(
lock->mutex.cfile_name = cfile_name;
lock->mutex.cline = cline;
-
- ut_d(lock->mutex.cmutex_name = cmutex_name);
+ lock->mutex.lock_name = cmutex_name;
ut_d(lock->mutex.ib_mutex_type = 1);
#else /* INNODB_RW_LOCKS_USE_ATOMICS */
# ifdef UNIV_DEBUG
@@ -253,8 +252,10 @@ rw_lock_create_func(
lock->cfile_name = cfile_name;
lock->cline = (unsigned int) cline;
-
+ lock->lock_name = cmutex_name;
lock->count_os_wait = 0;
+ lock->file_name = "not yet reserved";
+ lock->line = 0;
lock->last_s_file_name = "not yet reserved";
lock->last_x_file_name = "not yet reserved";
lock->last_s_line = 0;
@@ -516,6 +517,12 @@ rw_lock_x_lock_wait(
file_name, line);
#endif
+ if (srv_instrument_semaphores) {
+ lock->thread_id = os_thread_get_curr_id();
+ lock->file_name = file_name;
+ lock->line = line;
+ }
+
sync_array_wait_event(sync_arr, index);
#ifdef UNIV_SYNC_DEBUG
rw_lock_remove_debug_info(
@@ -590,6 +597,13 @@ rw_lock_x_lock_low(
#ifdef UNIV_SYNC_DEBUG
rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, line);
#endif
+
+ if (srv_instrument_semaphores) {
+ lock->thread_id = os_thread_get_curr_id();
+ lock->file_name = file_name;
+ lock->line = line;
+ }
+
lock->last_x_file_name = file_name;
lock->last_x_line = (unsigned int) line;
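
The srv_instrument_semaphores branches added here (and the matching ones for
mutexes in sync0sync.cc below) all record the same three facts about the
current holder. A minimal sketch of that pattern factored into a helper; the
helper name is hypothetical and not part of the patch:

	/* Record who acquired the latch and where, so that
	sync_array_cell_print() and INNODB_SYS_SEMAPHORE_WAITS
	can report the current holder. */
	static inline
	void
	rw_lock_record_holder(
		rw_lock_t*	lock,		/*!< in/out: latch being acquired */
		const char*	file_name,	/*!< in: acquirer's file name */
		ulint		line)		/*!< in: acquirer's line number */
	{
		if (srv_instrument_semaphores) {
			lock->thread_id = os_thread_get_curr_id();
			lock->file_name = file_name;
			lock->line = line;
		}
	}
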
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
index e0f132574c9..2e1737da3ec 100644
--- a/storage/innobase/sync/sync0sync.cc
+++ b/storage/innobase/sync/sync0sync.cc
@@ -261,8 +261,8 @@ void
mutex_create_func(
/*==============*/
ib_mutex_t* mutex, /*!< in: pointer to memory */
-#ifdef UNIV_DEBUG
const char* cmutex_name, /*!< in: mutex name */
+#ifdef UNIV_DEBUG
# ifdef UNIV_SYNC_DEBUG
ulint level, /*!< in: level */
# endif /* UNIV_SYNC_DEBUG */
@@ -281,14 +281,16 @@ mutex_create_func(
#ifdef UNIV_DEBUG
mutex->magic_n = MUTEX_MAGIC_N;
#endif /* UNIV_DEBUG */
-#ifdef UNIV_SYNC_DEBUG
+
mutex->line = 0;
mutex->file_name = "not yet reserved";
+#ifdef UNIV_SYNC_DEBUG
mutex->level = level;
#endif /* UNIV_SYNC_DEBUG */
mutex->cfile_name = cfile_name;
mutex->cline = cline;
mutex->count_os_wait = 0;
+ mutex->cmutex_name = cmutex_name;
/* Check that lock_word is aligned; this is important on Intel */
ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0);
@@ -394,11 +396,15 @@ mutex_enter_nowait_func(
if (!ib_mutex_test_and_set(mutex)) {
- ut_d(mutex->thread_id = os_thread_get_curr_id());
+ mutex->thread_id = os_thread_get_curr_id();
#ifdef UNIV_SYNC_DEBUG
mutex_set_debug_info(mutex, file_name, line);
+#else
+ if (srv_instrument_semaphores) {
+ mutex->file_name = file_name;
+ mutex->line = line;
+ }
#endif
-
return(0); /* Succeeded! */
}
@@ -516,10 +522,15 @@ spin_loop:
if (ib_mutex_test_and_set(mutex) == 0) {
/* Succeeded! */
- ut_d(mutex->thread_id = os_thread_get_curr_id());
+ mutex->thread_id = os_thread_get_curr_id();
#ifdef UNIV_SYNC_DEBUG
mutex_set_debug_info(mutex, file_name, line);
#endif
+ if (srv_instrument_semaphores) {
+ mutex->file_name = file_name;
+ mutex->line = line;
+ }
+
return;
}
@@ -559,10 +570,14 @@ spin_loop:
sync_array_free_cell(sync_arr, index);
- ut_d(mutex->thread_id = os_thread_get_curr_id());
+ mutex->thread_id = os_thread_get_curr_id();
#ifdef UNIV_SYNC_DEBUG
mutex_set_debug_info(mutex, file_name, line);
#endif
+ if (srv_instrument_semaphores) {
+ mutex->file_name = file_name;
+ mutex->line = line;
+ }
return;
@@ -1157,6 +1172,7 @@ sync_thread_add_level(
case SYNC_IBUF_MUTEX:
case SYNC_INDEX_ONLINE_LOG:
case SYNC_STATS_AUTO_RECALC:
+ case SYNC_STATS_DEFRAG:
if (!sync_thread_levels_g(array, level, TRUE)) {
fprintf(stderr,
"InnoDB: sync_thread_levels_g(array, %lu)"
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
index 11ad7fe4afd..fa3fe0904b8 100644
--- a/storage/innobase/trx/trx0rec.cc
+++ b/storage/innobase/trx/trx0rec.cc
@@ -781,7 +781,8 @@ trx_undo_page_report_modify(
}
pos = dict_index_get_nth_col_pos(index,
- col_no);
+ col_no,
+ NULL);
ptr += mach_write_compressed(ptr, pos);
/* Save the old value of field */
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
index 5eb3cef46c1..81df8062c54 100644
--- a/storage/innobase/trx/trx0sys.cc
+++ b/storage/innobase/trx/trx0sys.cc
@@ -44,6 +44,8 @@ Created 3/26/1996 Heikki Tuuri
#include "os0file.h"
#include "read0read.h"
+#include <mysql/service_wsrep.h>
+
/** The file format tag structure with id and name. */
struct file_format_t {
ulint id; /*!< id of the file format */
@@ -174,7 +176,12 @@ trx_sys_flush_max_trx_id(void)
mtr_t mtr;
trx_sysf_t* sys_header;
+#ifndef WITH_WSREP
+ /* The assertion below (copied from trx_sys_get_new_trx_id())
+ is violated by wsrep_fake_trx_id, so it is skipped under
+ WITH_WSREP. */
ut_ad(mutex_own(&trx_sys->mutex));
+#endif /* WITH_WSREP */
if (!srv_read_only_mode) {
mtr_start(&mtr);
@@ -202,9 +209,14 @@ trx_sys_update_mysql_binlog_offset(
ib_int64_t offset, /*!< in: position in that log file */
ulint field, /*!< in: offset of the MySQL log info field in
the trx sys header */
+#ifdef WITH_WSREP
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+#endif /* WITH_WSREP */
mtr_t* mtr) /*!< in: mtr */
{
+#ifndef WITH_WSREP
trx_sysf_t* sys_header;
+#endif /* !WITH_WSREP */
if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
@@ -213,7 +225,9 @@ trx_sys_update_mysql_binlog_offset(
return;
}
+#ifndef WITH_WSREP
sys_header = trx_sysf_get(mtr);
+#endif /* !WITH_WSREP */
if (mach_read_from_4(sys_header + field
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
@@ -300,6 +314,124 @@ trx_sys_print_mysql_binlog_offset(void)
mtr_commit(&mtr);
}
+#ifdef WITH_WSREP
+
+#ifdef UNIV_DEBUG
+static long long trx_sys_cur_xid_seqno = -1;
+static unsigned char trx_sys_cur_xid_uuid[16];
+
+long long read_wsrep_xid_seqno(const XID* xid)
+{
+ long long seqno;
+ memcpy(&seqno, xid->data + 24, sizeof(long long));
+ return seqno;
+}
+
+void read_wsrep_xid_uuid(const XID* xid, unsigned char* buf)
+{
+ memcpy(buf, xid->data + 8, 16);
+}
+
+#endif /* UNIV_DEBUG */
+
+void
+trx_sys_update_wsrep_checkpoint(
+ const XID* xid, /*!< in: transaction XID */
+ trx_sysf_t* sys_header, /*!< in: sys_header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+ {
+ /* Check that seqno is monotonically increasing */
+ unsigned char xid_uuid[16];
+ long long xid_seqno = read_wsrep_xid_seqno(xid);
+ read_wsrep_xid_uuid(xid, xid_uuid);
+ if (!memcmp(xid_uuid, trx_sys_cur_xid_uuid, 8))
+ {
+ ut_ad(xid_seqno > trx_sys_cur_xid_seqno);
+ trx_sys_cur_xid_seqno = xid_seqno;
+ }
+ else
+ {
+ memcpy(trx_sys_cur_xid_uuid, xid_uuid, 16);
+ }
+ trx_sys_cur_xid_seqno = xid_seqno;
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(xid && mtr);
+ ut_a(xid->formatID == -1 || wsrep_is_wsrep_xid(xid));
+
+ if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD)
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD,
+ TRX_SYS_WSREP_XID_MAGIC_N,
+ MLOG_4BYTES, mtr);
+ }
+
+ mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_FORMAT,
+ (int)xid->formatID,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_GTRID_LEN,
+ (int)xid->gtrid_length,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_BQUAL_LEN,
+ (int)xid->bqual_length,
+ MLOG_4BYTES, mtr);
+ mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_DATA,
+ (const unsigned char*) xid->data,
+ XIDDATASIZE, mtr);
+
+}
+
+void
+trx_sys_read_wsrep_checkpoint(XID* xid)
+/*===================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+ ulint magic;
+
+ ut_ad(xid);
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD))
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ memset(xid, 0, sizeof(*xid));
+ xid->formatID = -1;
+ trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr);
+ mtr_commit(&mtr);
+ return;
+ }
+
+ xid->formatID = (int)mach_read_from_4(
+ sys_header
+ + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT);
+ xid->gtrid_length = (int)mach_read_from_4(
+ sys_header
+ + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN);
+ xid->bqual_length = (int)mach_read_from_4(
+ sys_header
+ + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN);
+ ut_memcpy(xid->data,
+ sys_header + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_DATA,
+ XIDDATASIZE);
+
+ mtr_commit(&mtr);
+}
+
+#endif /* WITH_WSREP */
+
/*****************************************************************//**
Prints to stderr the MySQL master log offset info in the trx system header if
the magic number shows it valid. */
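
For reference, the UNIV_DEBUG helpers in the wsrep hunk above assume a fixed
layout inside xid->data: the cluster UUID at byte offset 8 (16 bytes) and the
64-bit sequence number at offset 24. A minimal standalone sketch of decoding
both fields under that assumption (struct and function names are illustrative
only, not part of the patch):

	#include <cstring>

	struct wsrep_xid_fields {
		unsigned char	uuid[16];	/* from xid->data + 8  */
		long long	seqno;		/* from xid->data + 24 */
	};

	/* Decode the fields the checkpoint code depends on from a raw
	XID data buffer (XIDDATASIZE bytes long). */
	static wsrep_xid_fields
	decode_wsrep_xid(const unsigned char* data)
	{
		wsrep_xid_fields	f;

		memcpy(f.uuid, data + 8, sizeof(f.uuid));
		memcpy(&f.seqno, data + 24, sizeof(f.seqno));
		return(f);
	}
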
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index f072cc6e9c4..7f3cfa22255 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -29,6 +29,8 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0trx.ic"
#endif
+#include <mysql/service_wsrep.h>
+
#include "trx0undo.h"
#include "trx0rseg.h"
#include "log0log.h"
@@ -162,6 +164,9 @@ trx_create(void)
trx->lock.table_locks = ib_vector_create(
heap_alloc, sizeof(void**), 32);
+#ifdef WITH_WSREP
+ trx->wsrep_event = NULL;
+#endif /* WITH_WSREP */
return(trx);
}
@@ -856,6 +861,11 @@ trx_start_low(
srv_undo_logs, srv_undo_tablespaces);
}
+#ifdef WITH_WSREP
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
+#endif /* WITH_WSREP */
+
/* The initial value for trx->no: TRX_ID_MAX is used in
read_view_open_now: */
@@ -970,6 +980,9 @@ trx_write_serialisation_history(
trx_t* trx, /*!< in/out: transaction */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
+#ifdef WITH_WSREP
+ trx_sysf_t* sys_header;
+#endif /* WITH_WSREP */
trx_rseg_t* rseg;
rseg = trx->rseg;
@@ -1016,6 +1029,15 @@ trx_write_serialisation_history(
MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+#ifdef WITH_WSREP
+ sys_header = trx_sysf_get(mtr);
+ /* Update latest MySQL wsrep XID in trx sys header. */
+ if (wsrep_is_wsrep_xid(&trx->xid))
+ {
+ trx_sys_update_wsrep_checkpoint(&trx->xid, sys_header, mtr);
+ }
+#endif /* WITH_WSREP */
+
/* Update the latest MySQL binlog name and offset info
in trx sys header if MySQL binlogging is on or the database
server is a MySQL replication slave */
@@ -1026,7 +1048,11 @@ trx_write_serialisation_history(
trx_sys_update_mysql_binlog_offset(
trx->mysql_log_file_name,
trx->mysql_log_offset,
- TRX_SYS_MYSQL_LOG_INFO, mtr);
+ TRX_SYS_MYSQL_LOG_INFO,
+#ifdef WITH_WSREP
+ sys_header,
+#endif /* WITH_WSREP */
+ mtr);
trx->mysql_log_file_name = NULL;
}
@@ -1320,6 +1346,11 @@ trx_commit_in_memory(
ut_ad(!trx->in_ro_trx_list);
ut_ad(!trx->in_rw_trx_list);
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd)) {
+ trx->lock.was_chosen_as_deadlock_victim = FALSE;
+ }
+#endif
trx->dict_operation = TRX_DICT_OP_NONE;
trx->error_state = DB_SUCCESS;
@@ -1504,6 +1535,10 @@ trx_commit_or_rollback_prepare(
switch (trx->state) {
case TRX_STATE_NOT_STARTED:
+#ifdef WITH_WSREP
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+#endif /* WITH_WSREP */
trx_start_low(trx);
/* fall through */
case TRX_STATE_ACTIVE:
diff --git a/storage/innobase/ut/ut0timer.cc b/storage/innobase/ut/ut0timer.cc
new file mode 100644
index 00000000000..85292cce28c
--- /dev/null
+++ b/storage/innobase/ut/ut0timer.cc
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
+Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0timer.cc
+Timer routines
+
+Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
+modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
+*************************************************************************/
+
+#include "data0type.h"
+#include <my_rdtsc.h>
+#include <ut0timer.h>
+
+/**************************************************************//**
+Default no-op timer, used until ut_init_timer() selects a real one.
+@return 0 */
+static
+ulonglong
+ut_timer_none(void)
+/*===============*/
+{
+ return 0;
+}
+
+/**************************************************************//**
+Function pointer to the selected timer function.
+@return current timer value */
+ulonglong (*ut_timer_now)(void) = &ut_timer_none;
+
+struct my_timer_unit_info ut_timer;
+
+/**************************************************************//**
+Sets up the data required for use of the ut_timer_* functions.
+Selects the best timer by high frequency and tight resolution.
+Points ut_timer_now() to the selected timer function and
+initializes the ut_timer struct with the info for the selected timer. */
+UNIV_INTERN
+void
+ut_init_timer(void)
+/*===============*/
+{
+ MY_TIMER_INFO all_timer_info;
+ my_timer_init(&all_timer_info);
+
+ if (all_timer_info.cycles.frequency > 1000000 &&
+ all_timer_info.cycles.resolution == 1) {
+ ut_timer = all_timer_info.cycles;
+ ut_timer_now = &my_timer_cycles;
+ } else if (all_timer_info.nanoseconds.frequency > 1000000 &&
+ all_timer_info.nanoseconds.resolution == 1) {
+ ut_timer = all_timer_info.nanoseconds;
+ ut_timer_now = &my_timer_nanoseconds;
+ } else if (all_timer_info.microseconds.frequency >= 1000000 &&
+ all_timer_info.microseconds.resolution == 1) {
+ ut_timer = all_timer_info.microseconds;
+ ut_timer_now = &my_timer_microseconds;
+
+ } else if (all_timer_info.milliseconds.frequency >= 1000 &&
+ all_timer_info.milliseconds.resolution == 1) {
+ ut_timer = all_timer_info.milliseconds;
+ ut_timer_now = &my_timer_milliseconds;
+ } else if (all_timer_info.ticks.frequency >= 1000 &&
+ /* Will probably be false */
+ all_timer_info.ticks.resolution == 1) {
+ ut_timer = all_timer_info.ticks;
+ ut_timer_now = &my_timer_ticks;
+ } else {
+ /* None are acceptable, so leave it as "None", and fill in struct */
+ ut_timer.frequency = 1; /* Avoid div-by-zero */
+ ut_timer.overhead = 0; /* Since it doesn't do anything */
+ ut_timer.resolution = 10; /* Another sign it's bad */
+ ut_timer.routine = 0; /* None */
+ }
+}
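
The selected timer is meant to be consumed by taking two readings of
ut_timer_now() and scaling the difference by ut_timer.frequency (ticks per
second; it falls back to 1 when no usable timer is found). A minimal usage
sketch, assuming the ut0timer.h declarations above; do_timed_work() is a
placeholder and not part of the patch:

	#include <ut0timer.h>

	/* Placeholder for the code being measured. */
	static void do_timed_work(void) {}

	static double
	measure_seconds(void)
	{
		ulonglong	start;
		ulonglong	stop;

		ut_init_timer();	/* choose the best available timer */

		start = ut_timer_now();
		do_timed_work();
		stop = ut_timer_now();

		/* ut_timer.frequency is never zero (it falls back to 1),
		so the division is safe even without a real timer. */
		return((double)(stop - start) / (double) ut_timer.frequency);
	}
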
diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc
index d1ba36b3b00..1607e535a94 100644
--- a/storage/innobase/ut/ut0wqueue.cc
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -162,6 +162,38 @@ ib_wqueue_timedwait(
}
/********************************************************************
+Return the first item on the work queue, or NULL if the queue is empty.
+@return work item or NULL */
+void*
+ib_wqueue_nowait(
+/*=============*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ib_list_node_t* node = NULL;
+
+ mutex_enter(&wq->mutex);
+
+ if (!ib_list_is_empty(wq->items)) {
+ node = ib_list_get_first(wq->items);
+
+ if (node) {
+ ib_list_remove(wq->items, node);
+
+ }
+ }
+
+ /* We must reset the event when the list
+ gets emptied. */
+ if (ib_list_is_empty(wq->items)) {
+ os_event_reset(wq->event);
+ }
+
+ mutex_exit(&wq->mutex);
+
+ return (node ? node->data : NULL);
+}
+
+/********************************************************************
Check if queue is empty. */
ibool
@@ -173,3 +205,20 @@ ib_wqueue_is_empty(
{
return(ib_list_is_empty(wq->items));
}
+
+/********************************************************************
+Get the number of items in the queue.
+@return number of items on queue */
+ulint
+ib_wqueue_len(
+/*==========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ulint len = 0;
+
+ mutex_enter(&wq->mutex);
+ len = ib_list_len(wq->items);
+ mutex_exit(&wq->mutex);
+
+ return(len);
+}
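
ib_wqueue_nowait() and ib_wqueue_len() give consumers a non-blocking
alternative to the waiting calls: drain whatever is currently queued and
return instead of sleeping on the queue event. A minimal consumer sketch;
process_item() is a placeholder and not part of the patch, and wq is assumed
to be a queue created elsewhere with ib_wqueue_create():

	#include "ut0wqueue.h"

	/* Placeholder for real work-item handling. */
	static void process_item(void* item) { (void) item; }

	/* Drain everything currently on the queue without blocking.
	@return number of items handled */
	static
	ulint
	drain_queue(
		ib_wqueue_t*	wq)	/*!< in: work queue to drain */
	{
		ulint	n_processed = 0;
		void*	item;

		/* ib_wqueue_len() only returns a snapshot taken under the
		queue mutex, so loop on ib_wqueue_nowait() rather than
		trusting the length. */
		while ((item = ib_wqueue_nowait(wq)) != NULL) {
			process_item(item);
			n_processed++;
		}

		return(n_processed);
	}
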