summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--SConstruct1
-rw-r--r--build_posix/Make.base1
-rw-r--r--build_win/filelist.win1
-rw-r--r--dist/api_data.py2
-rw-r--r--dist/filelist1
-rw-r--r--dist/flags.py1
-rw-r--r--dist/s_string.ok2
-rw-r--r--examples/c/ex_all.c4
-rw-r--r--lang/java/java_doc.i1
-rw-r--r--src/btree/bt_handle.c25
-rw-r--r--src/btree/bt_rebalance.c473
-rw-r--r--src/config/config_def.c34
-rw-r--r--src/conn/conn_api.c1
-rw-r--r--src/include/btree.h12
-rw-r--r--src/include/config.h39
-rw-r--r--src/include/extern.h2
-rw-r--r--src/include/flags.h21
-rw-r--r--src/include/wiredtiger.in26
-rw-r--r--src/session/session_api.c27
-rw-r--r--src/txn/txn_ckpt.c5
-rw-r--r--src/utilities/util.h1
-rw-r--r--src/utilities/util_main.c5
-rw-r--r--src/utilities/util_rebalance.c63
-rw-r--r--test/format/Makefile.am2
-rw-r--r--test/format/config.c2
-rw-r--r--test/format/config.h6
-rw-r--r--test/format/format.h5
-rw-r--r--test/format/rebalance.c84
-rw-r--r--test/format/t.c5
-rw-r--r--test/format/wts.c26
30 files changed, 807 insertions, 71 deletions
diff --git a/SConstruct b/SConstruct
index 6a2b0497d15..914a7a137e1 100644
--- a/SConstruct
+++ b/SConstruct
@@ -290,6 +290,7 @@ wtbin = env.Program("wt", [
"src/utilities/util_misc.c",
"src/utilities/util_printlog.c",
"src/utilities/util_read.c",
+ "src/utilities/util_rebalance.c",
"src/utilities/util_rename.c",
"src/utilities/util_salvage.c",
"src/utilities/util_stat.c",
diff --git a/build_posix/Make.base b/build_posix/Make.base
index 3037d70528a..4efbe3f76c3 100644
--- a/build_posix/Make.base
+++ b/build_posix/Make.base
@@ -31,6 +31,7 @@ wt_SOURCES =\
src/utilities/util_misc.c \
src/utilities/util_printlog.c \
src/utilities/util_read.c \
+ src/utilities/util_rebalance.c \
src/utilities/util_rename.c \
src/utilities/util_salvage.c \
src/utilities/util_stat.c \
diff --git a/build_win/filelist.win b/build_win/filelist.win
index af6ddf98da9..9a7c26217ec 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -31,6 +31,7 @@ src/btree/bt_misc.c
src/btree/bt_ovfl.c
src/btree/bt_page.c
src/btree/bt_read.c
+src/btree/bt_rebalance.c
src/btree/bt_ret.c
src/btree/bt_slvg.c
src/btree/bt_split.c
diff --git a/dist/api_data.py b/dist/api_data.py
index ff6d3f3ccb5..40dabedd6f7 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -564,6 +564,7 @@ connection_runtime_config = [
'mutex',
'overflow',
'read',
+ 'rebalance',
'reconcile',
'recovery',
'salvage',
@@ -892,6 +893,7 @@ methods = {
type='list'),
]),
+'WT_SESSION.rebalance' : Method([]),
'WT_SESSION.rename' : Method([]),
'WT_SESSION.reset' : Method([]),
'WT_SESSION.salvage' : Method([
diff --git a/dist/filelist b/dist/filelist
index 52af87c2a68..830e58fcc25 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -31,6 +31,7 @@ src/btree/bt_misc.c
src/btree/bt_ovfl.c
src/btree/bt_page.c
src/btree/bt_read.c
+src/btree/bt_rebalance.c
src/btree/bt_ret.c
src/btree/bt_slvg.c
src/btree/bt_split.c
diff --git a/dist/flags.py b/dist/flags.py
index 7d237dd39a4..84261555ce1 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -70,6 +70,7 @@ flags = {
'VERB_MUTEX',
'VERB_OVERFLOW',
'VERB_READ',
+ 'VERB_REBALANCE',
'VERB_RECONCILE',
'VERB_RECOVERY',
'VERB_SALVAGE',
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 81393664de2..352fcb1a4ad 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -254,6 +254,7 @@ RNG
RPC
RUNDIR
Radu
+Rebalance
RedHat
Redistributions
Resize
@@ -857,6 +858,7 @@ readlock
readonly
readunlock
realloc
+rebalance
rebalancing
recno
recnos
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 213e058d4cc..22a2afdb345 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -673,6 +673,10 @@ session_ops(WT_SESSION *session)
ret = session->compact(session, "table:mytable", NULL);
/*! [Compact a table] */
+ /*! [Rebalance a table] */
+ ret = session->rebalance(session, "table:old", NULL);
+ /*! [Rebalance a table] */
+
/*! [Rename a table] */
ret = session->rename(session, "table:old", "table:new", NULL);
/*! [Rename a table] */
diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i
index 17317ab875b..450cb1d5ab2 100644
--- a/lang/java/java_doc.i
+++ b/lang/java/java_doc.i
@@ -36,6 +36,7 @@ COPYDOC(__wt_session, WT_SESSION, drop)
COPYDOC(__wt_session, WT_SESSION, join)
COPYDOC(__wt_session, WT_SESSION, log_flush)
COPYDOC(__wt_session, WT_SESSION, log_printf)
+COPYDOC(__wt_session, WT_SESSION, rebalance)
COPYDOC(__wt_session, WT_SESSION, rename)
COPYDOC(__wt_session, WT_SESSION, reset)
COPYDOC(__wt_session, WT_SESSION, salvage)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 9935d92677c..1b5dbcb21e7 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -105,14 +105,23 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
WT_ERR(__wt_btree_tree_open(
session, root_addr, root_addr_size));
- /* Warm the cache, if possible. */
- WT_WITH_PAGE_INDEX(session,
- ret = __btree_preload(session));
- WT_ERR(ret);
-
- /* Get the last record number in a column-store file. */
- if (btree->type != BTREE_ROW)
- WT_ERR(__btree_get_last_recno(session));
+ /*
+ * Rebalance uses the cache, but only wants the root
+ * page, nothing else.
+ */
+ if (!F_ISSET(btree, WT_BTREE_REBALANCE)) {
+ /* Warm the cache, if possible. */
+ WT_WITH_PAGE_INDEX(session,
+ ret = __btree_preload(session));
+ WT_ERR(ret);
+
+ /*
+ * Get the last record number in a column-store
+ * file.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
}
}
diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c
new file mode 100644
index 00000000000..9e7d6ac2fad
--- /dev/null
+++ b/src/btree/bt_rebalance.c
@@ -0,0 +1,473 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Shared rebalance information: the state threaded through one rebalance of a
+ * tree. The walk functions fill in the leaf and free lists; the new root page
+ * is then built from the leaf list and the free-list blocks are released.
+ */
+typedef struct {
+ WT_REF **leaf; /* List of leaf page references found */
+ uint64_t leaf_next; /* Next unused slot (count of entries) */
+ size_t leaf_allocated; /* Allocated bytes */
+
+ WT_ADDR *fl; /* List of block addresses to free */
+ uint64_t fl_next; /* Next unused slot (count of entries) */
+ size_t fl_allocated; /* Allocated bytes */
+
+ WT_PAGE *root; /* Created root page, NULL once installed */
+
+ uint8_t type; /* Internal page type, copied from the root */
+
+ /* Progress is reported once per this many internal pages walked. */
+#define WT_REBALANCE_PROGRESS_INTERVAL 100
+ uint64_t progress; /* Progress counter */
+
+ WT_ITEM *tmp1; /* Scratch: address strings, leaf page reads */
+ WT_ITEM *tmp2; /* Scratch: printable keys */
+} WT_RSTUFF;
+
+/*
+ * __rebalance_discard --
+ *	Release the leaf-reference list and the free list, along with any
+ *	entries they still hold.
+ */
+static void
+__rebalance_discard(WT_SESSION_IMPL *session, WT_RSTUFF *rs)
+{
+	/* Discard from the tail, leaving each counter at zero when done. */
+	for (; rs->leaf_next > 0; --rs->leaf_next) {
+		__wt_free_ref(session,
+		    rs->leaf[rs->leaf_next - 1], rs->type, false);
+	}
+	__wt_free(session, rs->leaf);
+
+	/* Free-list entries own only their copied address cookie. */
+	for (; rs->fl_next > 0; --rs->fl_next) {
+		__wt_free(session, rs->fl[rs->fl_next - 1].addr);
+	}
+	__wt_free(session, rs->fl);
+}
+
+/*
+ * __rebalance_leaf_append --
+ *	Append a reference for a leaf page to the list of leaf pages. The
+ *	reference gets its own copy of the page address and either a copy of
+ *	the row-store key or, when a record number is supplied, that record
+ *	number.
+ */
+static int
+__rebalance_leaf_append(WT_SESSION_IMPL *session,
+    const uint8_t *key, size_t key_len,
+    uint64_t recno,
+    const uint8_t *addr, size_t addr_len, u_int addr_type, WT_RSTUFF *rs)
+{
+	WT_ADDR *ref_addr;
+	WT_REF *ref;
+
+	WT_RET(__wt_verbose(session, WT_VERB_REBALANCE,
+	    "rebalance leaf-list append %s, %s",
+	    __wt_buf_set_printable(session, key, key_len, rs->tmp2),
+	    __wt_addr_string(session, addr, addr_len, rs->tmp1)));
+
+	/*
+	 * Grow the list and link the new reference in before filling it out,
+	 * so a later failure leaves it on the list __rebalance_discard walks.
+	 */
+	WT_RET(__wt_realloc_def(
+	    session, &rs->leaf_allocated, rs->leaf_next + 1, &rs->leaf));
+	WT_RET(__wt_calloc_one(session, &ref));
+	rs->leaf[rs->leaf_next++] = ref;
+
+	/* The page is on disk only: no in-memory page, no parent yet. */
+	ref->state = WT_REF_DISK;
+	ref->pindex_hint = 0;
+	ref->home = NULL;
+	ref->page = NULL;
+
+	/* Attach the address structure first, then fill in its copy. */
+	WT_RET(__wt_calloc_one(session, &ref_addr));
+	ref->addr = ref_addr;
+	WT_RET(__wt_strndup(session, addr, addr_len, &ref_addr->addr));
+	ref_addr->size = (uint8_t)addr_len;
+	ref_addr->type = (uint8_t)addr_type;
+
+	/* WT_RECNO_OOB marks a row-store page, which carries a key. */
+	if (recno != WT_RECNO_OOB) {
+		ref->key.recno = recno;
+	} else {
+		WT_RET(__wt_row_ikey(session, 0, key, key_len, ref));
+	}
+
+	ref->page_del = NULL;
+	return (0);
+}
+
+/*
+ * __rebalance_fl_append --
+ *	Append a copy of a block address to the list of blocks to be freed.
+ */
+static int
+__rebalance_fl_append(WT_SESSION_IMPL *session,
+    const uint8_t *addr, size_t addr_len, WT_RSTUFF *rs)
+{
+	WT_ADDR *entry;
+
+	/* Make room for, and claim, the next free-list slot. */
+	WT_RET(__wt_realloc_def(
+	    session, &rs->fl_allocated, rs->fl_next + 1, &rs->fl));
+	entry = &rs->fl[rs->fl_next];
+	++rs->fl_next;
+
+	/* The slot owns a private copy of the address cookie. */
+	WT_RET(__wt_strndup(session, addr, addr_len, &entry->addr));
+	entry->size = (uint8_t)addr_len;
+	entry->type = 0;
+
+	return (0);
+}
+
+/*
+ * __rebalance_internal --
+ *	Build a single in-memory internal page referencing every leaf page
+ *	found by the walk, and record it as the candidate root.
+ */
+static int
+__rebalance_internal(WT_SESSION_IMPL *session, WT_RSTUFF *rs)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_PAGE *root;
+	WT_PAGE_INDEX *pindex;
+	uint64_t slot;
+
+	btree = S2BT(session);
+
+	/*
+	 * The number of elements on a page is a 4B quantity, and a tree could
+	 * technically hold more leaf pages than that: refuse rather than
+	 * truncate the count.
+	 */
+	if (rs->leaf_next > UINT32_MAX)
+		WT_RET_MSG(session, ENOTSUP,
+		    "too many leaf pages to rebalance, %" PRIu64 " pages "
+		    "exceeds the maximum of %" PRIu32,
+		    rs->leaf_next, UINT32_MAX);
+
+	/*
+	 * Allocate an internal page of the tree's type (row- or column-store)
+	 * with one slot per leaf page, parent it to the tree's root reference
+	 * and mark it modified.
+	 */
+	WT_RET(__wt_page_alloc(session, rs->type,
+	    rs->type == WT_PAGE_COL_INT ? 1 : 0, rs->leaf_next, false, &root));
+	root->pg_intl_parent_ref = &btree->root;
+	WT_ERR(__wt_page_modify_init(session, root));
+	__wt_page_modify_set(session, root);
+
+	/*
+	 * Move each leaf reference into the page index. The list slot is
+	 * cleared so the reference has a single owner.
+	 */
+	pindex = WT_INTL_INDEX_GET_SAFE(root);
+	for (slot = 0; slot < rs->leaf_next; ++slot) {
+		rs->leaf[slot]->home = root;
+		pindex->index[slot] = rs->leaf[slot];
+		rs->leaf[slot] = NULL;
+	}
+
+	rs->root = root;
+	return (0);
+
+err:	__wt_page_out(session, &root);
+	return (ret);
+}
+
+/*
+ * __rebalance_free_original --
+ *	Free every block on the free list: the tracked internal pages and
+ *	overflow keys of the original tree.
+ */
+static int
+__rebalance_free_original(WT_SESSION_IMPL *session, WT_RSTUFF *rs)
+{
+	const uint8_t *cookie;
+	size_t cookie_size;
+	uint64_t slot;
+
+	slot = 0;
+	while (slot < rs->fl_next) {
+		cookie = rs->fl[slot].addr;
+		cookie_size = rs->fl[slot].size;
+		++slot;
+
+		WT_RET(__wt_verbose(session, WT_VERB_REBALANCE,
+		    "rebalance discarding %s",
+		    __wt_addr_string(
+		    session, cookie, cookie_size, rs->tmp1)));
+
+		/* Stop at the first block that cannot be freed. */
+		WT_RET(__wt_btree_block_free(session, cookie, cookie_size));
+	}
+	return (0);
+}
+
+/*
+ * __rebalance_col_walk --
+ * Walk a column-store page and its descendants. Every leaf page found is
+ * appended to the leaf list with its record number, and the address of
+ * every internal page below this one is appended to the free list.
+ * Called recursively, once per internal page image.
+ */
+static int
+__rebalance_col_walk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_RSTUFF *rs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint32_t i;
+
+ btree = S2BT(session);
+
+ /* Each level of the recursion reads child pages into its own buffer. */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Report progress periodically. */
+ if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0)
+ WT_ERR(__wt_progress(session, NULL, rs->progress));
+
+ /*
+ * Walk the page's cells. Only address cells (WT_CELL_ADDR_XXX items)
+ * are handled here, any other cell type is treated as illegal. A leaf
+ * page's record number is taken from the address cell's value.
+ */
+ WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ switch (unpack.type) {
+ case WT_CELL_ADDR_INT:
+ /* An internal page: read it and recursively walk it. */
+ WT_ERR(__wt_bt_read(
+ session, buf, unpack.data, unpack.size));
+ WT_ERR(__rebalance_col_walk(session, buf->data, rs));
+ /* The child has been walked, queue its block to be freed. */
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append internal page: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+ break;
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ /* A leaf page: no key, unpack.v is passed as the record number. */
+ WT_ERR(__rebalance_leaf_append(session,
+ NULL, 0, unpack.v, unpack.data, unpack.size,
+ unpack.type == WT_CELL_ADDR_LEAF ?
+ WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(session, &buf);
+ return (ret);
+}
+
+/*
+ * __rebalance_row_leaf_key --
+ *	Copy the first key of the row-store leaf page at the given address
+ *	into the caller's buffer.
+ */
+static int
+__rebalance_row_leaf_key(WT_SESSION_IMPL *session,
+    const uint8_t *addr, size_t addr_len, WT_ITEM *key, WT_RSTUFF *rs)
+{
+	WT_DECL_RET;
+	WT_ITEM *image;
+	WT_PAGE *leaf;
+
+	/*
+	 * Leaf pages are relatively complex (Huffman encoding, prefix
+	 * compression, and so on), so rather than decode the disk image by
+	 * hand, read the block into the shared scratch buffer and build the
+	 * in-memory page from it.
+	 */
+	image = rs->tmp1;
+	WT_RET(__wt_bt_read(session, image, addr, addr_len));
+	WT_RET(__wt_page_inmem(session, NULL, image->data, 0, 0, &leaf));
+
+	/* Copy out the first key, then discard the page whatever happened. */
+	ret = __wt_row_leaf_key_copy(session, leaf, &leaf->pg_row_d[0], key);
+	__wt_page_out(session, &leaf);
+	return (ret);
+}
+
+/*
+ * __rebalance_row_walk --
+ * Walk a row-store page and its descendants. Every leaf page found is
+ * appended to the leaf list with a key, and the addresses of internal
+ * pages and of overflow keys on internal pages are appended to the free
+ * list. Called recursively, once per internal page image.
+ */
+static int
+__rebalance_row_walk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_RSTUFF *rs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK key, unpack;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(leafkey);
+ WT_DECL_RET;
+ size_t len;
+ uint32_t i;
+ bool first_cell;
+ const void *p;
+
+ btree = S2BT(session);
+ WT_CLEAR(key); /* [-Werror=maybe-uninitialized] */
+
+ /*
+ * Each level of the recursion has its own buffers: one for child page
+ * images, one for instantiated leaf keys.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_scr_alloc(session, 0, &leafkey));
+
+ /* Report progress periodically. */
+ if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0)
+ WT_ERR(__wt_progress(session, NULL, rs->progress));
+
+ /*
+ * Walk the page, instantiating keys: the page contains sorted key and
+ * location cookie pairs. Keys are on-page/overflow items and location
+ * cookies are WT_CELL_ADDR_XXX items.
+ *
+ * The most recent key cell is remembered in "key" for the address cell
+ * that follows it. The first_cell flag stays true until a deleted or
+ * leaf address cell has been processed.
+ */
+ first_cell = true;
+ WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ switch (unpack.type) {
+ case WT_CELL_KEY:
+ key = unpack;
+ break;
+ case WT_CELL_KEY_OVFL:
+ /*
+ * Any overflow key that references an internal page is
+ * no longer of any use.
+ *
+ * We could potentially use the same overflow key being
+ * freed here for the internal page we're creating, but
+ * that's more work to get reconciliation to understand
+ * and overflow keys are (well, should be), uncommon.
+ */
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append overflow key: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+
+ key = unpack;
+ break;
+ case WT_CELL_ADDR_DEL:
+ /*
+ * A deleted leaf page: we're rebalancing this tree,
+ * which means no transaction can be active in it,
+ * which means no deleted leaf page is interesting,
+ * ignore it.
+ */
+ first_cell = false;
+ break;
+ case WT_CELL_ADDR_INT:
+ /* An internal page: read it and recursively walk it. */
+ WT_ERR(__wt_bt_read(
+ session, buf, unpack.data, unpack.size));
+ WT_ERR(__rebalance_row_walk(session, buf->data, rs));
+ /* The child has been walked, queue its block to be freed. */
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append internal page: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+ break;
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ /*
+ * A leaf page.
+ * If the internal page key is an overflow, instantiate
+ * it and use it.
+ * Else, we can't trust the 0th key on an internal page
+ * (we generally try not to instantiate them during
+ * reconciliation because it saves space), so we have to
+ * get it from the underlying leaf page.
+ * Else, we can use the internal page's key as is, it's
+ * sufficient for the page.
+ */
+ if (key.type == WT_CELL_KEY_OVFL) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_INT, &key, leafkey));
+ p = leafkey->data;
+ len = leafkey->size;
+ } else if (first_cell) {
+ WT_ERR(__rebalance_row_leaf_key(session,
+ unpack.data, unpack.size, leafkey, rs));
+ p = leafkey->data;
+ len = leafkey->size;
+ } else {
+ p = key.data;
+ len = key.size;
+ }
+ /* WT_RECNO_OOB as the record number selects the keyed form. */
+ WT_ERR(__rebalance_leaf_append(session,
+ p, len, WT_RECNO_OOB, unpack.data, unpack.size,
+ unpack.type == WT_CELL_ADDR_LEAF ?
+ WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs));
+
+ first_cell = false;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(session, &buf);
+ __wt_scr_free(session, &leafkey);
+ return (ret);
+}
+
+/*
+ * __wt_bt_rebalance --
+ *	Rebalance the last checkpoint in the file: walk the on-disk tree below
+ *	the root collecting every leaf page, build a single new internal root
+ *	page referencing them, free the original internal pages and overflow
+ *	keys, and install the new root in place of the old one.
+ *
+ *	The cfg argument is unused. Returns 0 on success, otherwise the error
+ *	from the first step that failed; on failure the btree's root page is
+ *	left as it was.
+ */
+int
+__wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_RSTUFF *rs, _rstuff;
+	bool evict_reset;
+
+	WT_UNUSED(cfg);
+
+	btree = S2BT(session);
+
+	/*
+	 * The flag is tested on the shared exit path, which can be reached
+	 * before exclusive access is requested: it must start out false.
+	 */
+	evict_reset = false;
+
+	/*
+	 * Rebalance walks disk images, not in-memory pages. If the root has
+	 * no disk image (presumably a tree that has never been written),
+	 * there is nothing to walk and nothing to do.
+	 */
+	if (btree->root.page->dsk == NULL)
+		return (0);
+
+	WT_CLEAR(_rstuff);
+	rs = &_rstuff;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp1));
+	WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp2));
+
+	/* Set the internal page tree type. */
+	rs->type = btree->root.page->type;
+
+	/*
+	 * Get exclusive access to the file. (Not required, the only page in the
+	 * cache is the root page, and that cannot be evicted; however, this way
+	 * eviction ignores the tree entirely.)
+	 */
+	WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
+
+	/* Recursively walk the tree. */
+	switch (rs->type) {
+	case WT_PAGE_ROW_INT:
+		WT_ERR(
+		    __rebalance_row_walk(session, btree->root.page->dsk, rs));
+		break;
+	case WT_PAGE_COL_INT:
+		WT_ERR(
+		    __rebalance_col_walk(session, btree->root.page->dsk, rs));
+		break;
+	WT_ILLEGAL_VALUE_ERR(session);
+	}
+
+	/* Build a new root page. */
+	WT_ERR(__rebalance_internal(session, rs));
+
+	/*
+	 * Schedule the free of the original blocks (they shouldn't actually be
+	 * freed until the next checkpoint completes).
+	 */
+	WT_ERR(__rebalance_free_original(session, rs));
+
+	/*
+	 * Swap the old root page for our newly built root page, writing the new
+	 * root page as part of a checkpoint will finish the rebalance.
+	 */
+	__wt_page_out(session, &btree->root.page);
+	btree->root.page = rs->root;
+	rs->root = NULL;
+
+err:	/*
+	 * Release exclusive eviction access if we acquired it, on success and
+	 * failure alike.
+	 */
+	if (evict_reset)
+		__wt_evict_file_exclusive_off(session);
+
+	/* Discard any leftover root page we created. */
+	if (rs->root != NULL) {
+		__wt_page_modify_clear(session, rs->root);
+		__wt_page_out(session, &rs->root);
+	}
+
+	/* Discard any leftover leaf and internal page information. */
+	__rebalance_discard(session, rs);
+
+	__wt_scr_free(session, &rs->tmp1);
+	__wt_scr_free(session, &rs->tmp2);
+
+	return (ret);
+}
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 9d12e953498..71131e567fd 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -151,9 +151,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -563,9 +563,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -643,9 +643,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -718,9 +718,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -793,9 +793,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -926,6 +926,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"skip_sort_check=0,statistics=,target=",
confchk_WT_SESSION_open_cursor, 12
},
+ { "WT_SESSION.rebalance",
+ "",
+ NULL, 0
+ },
{ "WT_SESSION.reconfigure",
"isolation=read-committed",
confchk_WT_SESSION_reconfigure, 1
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index bd14e1bf4fd..aa8cf8a3ec9 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1605,6 +1605,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "mutex", WT_VERB_MUTEX },
{ "overflow", WT_VERB_OVERFLOW },
{ "read", WT_VERB_READ },
+ { "rebalance", WT_VERB_REBALANCE },
{ "reconcile", WT_VERB_RECONCILE },
{ "recovery", WT_VERB_RECOVERY },
{ "salvage", WT_VERB_SALVAGE },
diff --git a/src/include/btree.h b/src/include/btree.h
index a1d8e395cfc..04cb561cb32 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -153,16 +153,18 @@ struct __wt_btree {
#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */
#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */
#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */
-#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */
+#define WT_BTREE_REBALANCE 0x04000 /* Handle is for rebalance */
+#define WT_BTREE_SALVAGE 0x08000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x10000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x20000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x40000 /* Handle is for verify */
uint32_t flags;
};
/* Flags that make a btree handle special (not for normal use). */
#define WT_BTREE_SPECIAL_FLAGS \
- (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+ (WT_BTREE_BULK | WT_BTREE_REBALANCE | \
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
/*
* WT_SALVAGE_COOKIE --
diff --git a/src/include/config.h b/src/include/config.h
index e836abaccba..a960ab2e8d6 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -72,25 +72,26 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20
#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21
#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 23
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 24
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 25
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 26
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 27
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 28
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 29
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 30
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 31
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 32
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 33
-#define WT_CONFIG_ENTRY_colgroup_meta 34
-#define WT_CONFIG_ENTRY_file_meta 35
-#define WT_CONFIG_ENTRY_index_meta 36
-#define WT_CONFIG_ENTRY_table_meta 37
-#define WT_CONFIG_ENTRY_wiredtiger_open 38
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 39
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 40
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 41
+#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 23
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 24
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 25
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 26
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 27
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 28
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 29
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 30
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 31
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 32
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 34
+#define WT_CONFIG_ENTRY_colgroup_meta 35
+#define WT_CONFIG_ENTRY_file_meta 36
+#define WT_CONFIG_ENTRY_index_meta 37
+#define WT_CONFIG_ENTRY_table_meta 38
+#define WT_CONFIG_ENTRY_wiredtiger_open 39
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 40
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 41
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 42
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/extern.h b/src/include/extern.h
index 948aba20997..70eedb64373 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -139,6 +139,7 @@ extern const char *__wt_page_type_string(u_int type);
extern const char *__wt_cell_type_string(uint8_t type);
extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
+extern const char *__wt_buf_set_printable( WT_SESSION_IMPL *session, const void *p, size_t size, WT_ITEM *buf);
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
@@ -151,6 +152,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
, const char *file, int line
#endif
);
+extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
diff --git a/src/include/flags.h b/src/include/flags.h
index bafff92fbc0..e9d29b8ea0e 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -85,16 +85,17 @@
#define WT_VERB_MUTEX 0x00000800
#define WT_VERB_OVERFLOW 0x00001000
#define WT_VERB_READ 0x00002000
-#define WT_VERB_RECONCILE 0x00004000
-#define WT_VERB_RECOVERY 0x00008000
-#define WT_VERB_SALVAGE 0x00010000
-#define WT_VERB_SHARED_CACHE 0x00020000
-#define WT_VERB_SPLIT 0x00040000
-#define WT_VERB_TEMPORARY 0x00080000
-#define WT_VERB_TRANSACTION 0x00100000
-#define WT_VERB_VERIFY 0x00200000
-#define WT_VERB_VERSION 0x00400000
-#define WT_VERB_WRITE 0x00800000
+#define WT_VERB_REBALANCE 0x00004000
+#define WT_VERB_RECONCILE 0x00008000
+#define WT_VERB_RECOVERY 0x00010000
+#define WT_VERB_SALVAGE 0x00020000
+#define WT_VERB_SHARED_CACHE 0x00040000
+#define WT_VERB_SPLIT 0x00080000
+#define WT_VERB_TEMPORARY 0x00100000
+#define WT_VERB_TRANSACTION 0x00200000
+#define WT_VERB_VERIFY 0x00400000
+#define WT_VERB_VERSION 0x00800000
+#define WT_VERB_WRITE 0x01000000
#define WT_VISIBILITY_ERR 0x00000010
/*
* flags section: END
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index bdd8bb65910..fcd78d572a4 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1329,6 +1329,19 @@ struct __wt_session {
int __F(log_printf)(WT_SESSION *session, const char *fmt, ...);
/*!
+ * Rebalance a table.
+ *
+ * @snippet ex_all.c Rebalance a table
+ *
+ * @param session the session handle
+ * @param uri the current URI of the object, such as \c "table:mytable"
+ * @configempty{WT_SESSION.rebalance, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(rebalance)(
+ WT_SESSION *session, const char *uri, const char *config);
+
+ /*!
* Rename an object.
*
* @snippet ex_all.c Rename a table
@@ -1920,9 +1933,10 @@ struct __wt_connection {
* "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
* "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c
* "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c
- * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
+ * empty.}
* @configend
* @errors
*/
@@ -2405,9 +2419,9 @@ struct __wt_connection {
* values chosen from the following options: \c "api"\, \c "block"\, \c
* "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
* \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c
- * "overflow"\, \c "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c
- * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
diff --git a/src/session/session_api.c b/src/session/session_api.c
index f0d0f26db54..da24f40cfb0 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -554,6 +554,32 @@ err: API_END_RET(session, ret);
}
/*
+ * __session_rebalance --
+ * WT_SESSION->rebalance method. Runs __wt_bt_rebalance, through the
+ * schema worker, on the object named by the URI. Fails with ENOTSUP if
+ * the connection is in-memory.
+ */
+static int
+__session_rebalance(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /* NOTE(review): cfg, used below, is presumably declared by this macro. */
+ SESSION_API_CALL(session, rebalance, config, cfg);
+
+ /* Rebalance is not supported on in-memory connections. */
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
+ /*
+ * Block out checkpoints to avoid spurious EBUSY errors.
+ *
+ * The handle is requested exclusively and flagged as a rebalance
+ * handle; no checkpoint function is passed to the worker.
+ */
+ WT_WITH_CHECKPOINT_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_schema_worker(session, uri, __wt_bt_rebalance,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_REBALANCE)));
+
+ /* NOTE(review): presumably remaps WT_NOTFOUND for the caller, confirm. */
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
* __session_rename --
* WT_SESSION->rename method.
*/
@@ -1287,6 +1313,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_join,
__session_log_flush,
__session_log_printf,
+ __session_rebalance,
__session_rename,
__session_reset,
__session_salvage,
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index bc1537ca878..fd1cea5564d 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -1037,12 +1037,13 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
"for a bulk-loaded file");
fake_ckpt = true;
goto fake;
+ case WT_BTREE_REBALANCE:
case WT_BTREE_SALVAGE:
case WT_BTREE_UPGRADE:
case WT_BTREE_VERIFY:
WT_ERR_MSG(session, EINVAL,
- "checkpoints are blocked during salvage, upgrade "
- "or verify operations");
+ "checkpoints are blocked during rebalance, "
+ "salvage, upgrade or verify operations");
}
/*
diff --git a/src/utilities/util.h b/src/utilities/util.h
index 08d0537956f..b9399c33e78 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -42,6 +42,7 @@ char *util_name(WT_SESSION *, const char *, const char *);
int util_printlog(WT_SESSION *, int, char *[]);
int util_read(WT_SESSION *, int, char *[]);
int util_read_line(WT_SESSION *, ULINE *, bool, bool *);
+int util_rebalance(WT_SESSION *, int, char *[]);
int util_rename(WT_SESSION *, int, char *[]);
int util_salvage(WT_SESSION *, int, char *[]);
int util_stat(WT_SESSION *, int, char *[]);
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 3b7187bd0de..8e0bb8df448 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -159,6 +159,8 @@ main(int argc, char *argv[])
case 'r':
if (strcmp(command, "read") == 0)
func = util_read;
+ else if (strcmp(command, "rebalance") == 0)
+ func = util_rebalance;
else if (strcmp(command, "rename") == 0)
func = util_rename;
break;
@@ -259,9 +261,10 @@ usage(void)
"\t" "dump\t dump an object\n"
"\t" "list\t list database objects\n"
"\t" "load\t load an object\n"
- "\t" "loadtext\t load an object from a text file\n"
+ "\t" "loadtext load an object from a text file\n"
"\t" "printlog display the database log\n"
"\t" "read\t read values from an object\n"
+ "\t" "rebalance rebalance an object\n"
"\t" "rename\t rename an object\n"
"\t" "salvage\t salvage a file\n"
"\t" "stat\t display statistics for an object\n"
diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c
new file mode 100644
index 00000000000..671afc19cdc
--- /dev/null
+++ b/src/utilities/util_rebalance.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_rebalance --
+ *	Implement the wt utility's "rebalance" command: rebalance one object.
+ *	Accepts no options and exactly one argument, the object's name.
+ *	Returns 0 on success, 1 on a usage error or a failed rebalance.
+ */
+int
+util_rebalance(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	char *uri;
+
+	/* The option string is empty, any option at all is a usage error. */
+	if (__wt_getopt(progname, argc, argv, "") != EOF)
+		return (usage());
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* The remaining argument is the object name. */
+	if (argc != 1)
+		return (usage());
+
+	/* The returned name is owned here and released before returning. */
+	uri = util_name(session, *argv, "table");
+	if (uri == NULL)
+		return (1);
+
+	ret = session->rebalance(session, uri, NULL);
+	if (ret == 0) {
+		/* Verbose configures a progress counter, move to next line. */
+		if (verbose)
+			printf("\n");
+	} else {
+		fprintf(stderr, "%s: rebalance(%s): %s\n",
+		    progname, uri, session->strerror(session, ret));
+		ret = 1;	/* Callers only ever see 0 or 1. */
+	}
+
+	free(uri);
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the rebalance command's usage message to stderr; returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s rebalance uri\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/test/format/Makefile.am b/test/format/Makefile.am
index 17cbd53222b..8a2e2b49e4b 100644
--- a/test/format/Makefile.am
+++ b/test/format/Makefile.am
@@ -9,7 +9,7 @@ noinst_PROGRAMS = t
noinst_SCRIPTS = s_dumpcmp
t_SOURCES =\
config.h format.h backup.c bulk.c compact.c config.c lrt.c ops.c \
- salvage.c t.c util.c wts.c
+ rebalance.c salvage.c t.c util.c wts.c
if HAVE_BERKELEY_DB
t_SOURCES += bdb.c
diff --git a/test/format/config.c b/test/format/config.c
index 7aa4575efbd..8651aa9d086 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -331,6 +331,8 @@ config_in_memory(void)
g.c_compression = 0;
if (!config_is_perm("logging"))
g.c_logging = 0;
+ if (!config_is_perm("rebalance"))
+ g.c_rebalance = 0;
if (!config_is_perm("salvage"))
g.c_salvage = 0;
if (!config_is_perm("verify"))
diff --git a/test/format/config.h b/test/format/config.h
index a259217690c..6f97af66719 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -263,8 +263,12 @@ static CONFIG c[] = {
"the number of runs",
C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_runs, NULL },
+ { "rebalance",
+ "rebalance testing", /* 100% */
+ C_BOOL, 100, 1, 0, &g.c_rebalance, NULL },
+
{ "salvage",
- "verify integrity via salvage", /* 100% */
+ "salvage testing", /* 100% */
C_BOOL, 100, 1, 0, &g.c_salvage, NULL },
{ "split_pct",
diff --git a/test/format/format.h b/test/format/format.h
index 88300c28292..e15de473e59 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -122,6 +122,8 @@ typedef struct {
char *helium_mount; /* Helium volume */
+ char *wiredtiger_open_config; /* Database open config */
+
#ifdef HAVE_BERKELEY_DB
void *bdb; /* BDB comparison handle */
void *dbc; /* BDB cursor handle */
@@ -216,6 +218,7 @@ typedef struct {
uint32_t c_reverse;
uint32_t c_rows;
uint32_t c_runs;
+ uint32_t c_rebalance;
uint32_t c_salvage;
uint32_t c_split_pct;
uint32_t c_statistics;
@@ -328,6 +331,8 @@ void wts_load(void);
void wts_open(const char *, int, WT_CONNECTION **);
void wts_ops(int);
void wts_read_scan(void);
+void wts_rebalance(void);
+void wts_reopen(void);
void wts_salvage(void);
void wts_stats(void);
void wts_verify(const char *);
diff --git a/test/format/rebalance.c b/test/format/rebalance.c
new file mode 100644
index 00000000000..477368c36db
--- /dev/null
+++ b/test/format/rebalance.c
@@ -0,0 +1,84 @@
+/*-
+ * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "format.h"
+
+/*
+ * wts_rebalance --
+ *	Rebalance the object and confirm its contents did not change: dump the
+ *	object with the wt utility, rebalance and verify it, dump it again and
+ *	compare the two dumps. Every failure is reported through die. Does
+ *	nothing unless the rebalance configuration value is set.
+ */
+void
+wts_rebalance(void)
+{
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	int ret;
+	char cmd[1024];
+
+	if (g.c_rebalance == 0)
+		return;
+
+	track("rebalance", 0ULL, NULL);
+
+	/* Dump the current object, the baseline for the final comparison. */
+	(void)snprintf(cmd, sizeof(cmd),
+	    "../../wt -h %s dump -f %s/rebalance.orig %s",
+	    g.home, g.home, g.uri);
+	if ((ret = system(cmd)) != 0)
+		die(ret, "command failed: %s", cmd);
+
+	/* Rebalance, then verify the object. */
+	wts_reopen();
+	conn = g.wts_conn;
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die(ret, "connection.open_session");
+	if (g.logging != 0)
+		(void)g.wt_api->msg_printf(g.wt_api, session,
+		    "=============== rebalance start ===============");
+
+	/* The format has one conversion for its one argument, the URI. */
+	if ((ret = session->rebalance(session, g.uri, NULL)) != 0)
+		die(ret, "session.rebalance: %s", g.uri);
+
+	if (g.logging != 0)
+		(void)g.wt_api->msg_printf(g.wt_api, session,
+		    "=============== rebalance stop ===============");
+	if ((ret = session->close(session, NULL)) != 0)
+		die(ret, "session.close");
+
+	wts_verify("post-rebalance verify");
+	wts_close();
+
+	/* Dump the rebalanced object, once the connection is closed again. */
+	(void)snprintf(cmd, sizeof(cmd),
+	    "../../wt -h %s dump -f %s/rebalance.new %s",
+	    g.home, g.home, g.uri);
+	if ((ret = system(cmd)) != 0)
+		die(ret, "command failed: %s", cmd);
+
+	/* Compare the old/new versions of the object. */
+	(void)snprintf(cmd, sizeof(cmd),
+	    "cmp %s/rebalance.orig %s/rebalance.new > /dev/null",
+	    g.home, g.home);
+	if ((ret = system(cmd)) != 0)
+		die(ret, "command failed: %s", cmd);
+}
diff --git a/test/format/t.c b/test/format/t.c
index 603706e0ba1..9687c3c6dba 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -238,6 +238,11 @@ main(int argc, char *argv[])
wts_close();
/*
+ * Rebalance testing.
+ */
+ wts_rebalance();
+
+ /*
* If single-threaded, we can dump and compare the WiredTiger
* and Berkeley DB data sets.
*/
diff --git a/test/format/wts.c b/test/format/wts.c
index c1ea469d7b5..98cb2daa7b9 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -132,12 +132,16 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
{
WT_CONNECTION *conn;
int ret;
- char config[4096], *end, *p;
+ char *config, *end, *p;
*connp = NULL;
- p = config;
- end = config + sizeof(config);
+#define WIREDTIGER_OPEN_CONFIG_LEN (4 * 1024)
+ if ((g.wiredtiger_open_config =
+ calloc(WIREDTIGER_OPEN_CONFIG_LEN, sizeof(char))) == NULL)
+ die(ENOMEM, "calloc");
+ config = p = g.wiredtiger_open_config;
+ end = config + WIREDTIGER_OPEN_CONFIG_LEN;
p += snprintf(p, REMAIN(p, end),
"create,checkpoint_sync=false,cache_size=%" PRIu32 "MB",
@@ -273,6 +277,20 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
}
/*
+ * wts_reopen --
+ *	Re-open a connection to a WiredTiger database, using the configuration
+ *	string saved in g.wiredtiger_open_config. Reports failure through die.
+ *
+ *	wts_close clears both g.wts_conn and g.wt_api, and callers go on to use
+ *	g.wt_api (for example, to print log messages), so both are restored.
+ */
+void
+wts_reopen(void)
+{
+	WT_CONNECTION *conn;
+	int ret;
+
+	if ((ret = wiredtiger_open(g.home,
+	    &event_handler, g.wiredtiger_open_config, &conn)) != 0)
+		die(ret, "wiredtiger_open: %s", g.home);
+
+	/* Publish the extension API along with the connection handle. */
+	g.wt_api = conn->get_extension_api(conn);
+	g.wts_conn = conn;
+}
+
+/*
* wts_create --
* Create the underlying store.
*/
@@ -452,6 +470,8 @@ wts_close(void)
if ((ret = conn->close(conn, config)) != 0)
die(ret, "connection.close");
+ g.wts_conn = NULL;
+ g.wt_api = NULL;
}
void