summaryrefslogtreecommitdiff
path: root/ft
diff options
context:
space:
mode:
authorJohn Esmet <john.esmet@gmail.com>2014-05-22 19:21:43 -0400
committerJohn Esmet <john.esmet@gmail.com>2014-05-25 12:42:52 -0400
commitce63b1dddab0219a8f3144f68370203c2fbba1ce (patch)
treefcdc3eeae06646be61cb9a4d3ed980746b156729 /ft
parentfa28ddaa030d36ece130f5340bd7d073699a73d0 (diff)
downloadmariadb-git-ce63b1dddab0219a8f3144f68370203c2fbba1ce.tar.gz
fixes #158 Use promotion to record the blocknum of the rightmost
non-root leaf node in each FT. When the FT detects a rightmost insertion pattern, it attempts to do inserts and unique checks directly into the rightmost leaf node, greatly optimizing sequential insert speed.
Diffstat (limited to 'ft')
-rw-r--r--ft/ft-cachetable-wrappers.cc22
-rw-r--r--ft/ft-cachetable-wrappers.h3
-rw-r--r--ft/ft-flusher.cc25
-rw-r--r--ft/ft-internal.h23
-rw-r--r--ft/ft-ops.cc352
-rw-r--r--ft/ft-ops.h3
-rw-r--r--ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc183
-rw-r--r--ft/tests/test_rightmost_leaf_split_merge.cc212
8 files changed, 797 insertions, 26 deletions
diff --git a/ft/ft-cachetable-wrappers.cc b/ft/ft-cachetable-wrappers.cc
index 1f3aa3e0baa..91a0040b02e 100644
--- a/ft/ft-cachetable-wrappers.cc
+++ b/ft/ft-cachetable-wrappers.cc
@@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node)
);
assert(r==0);
}
+
+void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b)
+// Effect: Swap the blocknum, fullhash, and PAIR for for a and b
+// Requires: Both nodes are pinned
+{
+ BLOCKNUM tmp_blocknum = a->thisnodename;
+ uint32_t tmp_fullhash = a->fullhash;
+ PAIR tmp_pair = a->ct_pair;
+
+ a->thisnodename = b->thisnodename;
+ a->fullhash = b->fullhash;
+ a->ct_pair = b->ct_pair;
+
+ b->thisnodename = tmp_blocknum;
+ b->fullhash = tmp_fullhash;
+ b->ct_pair = tmp_pair;
+
+ // A and B swapped pair pointers, but we still have to swap
+ // the actual pair values (ie: the FTNODEs they represent)
+ // in the cachetable.
+ toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair);
+}
diff --git a/ft/ft-cachetable-wrappers.h b/ft/ft-cachetable-wrappers.h
index 9a56f4ff220..dc84d7f006b 100644
--- a/ft/ft-cachetable-wrappers.h
+++ b/ft/ft-cachetable-wrappers.h
@@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai
void toku_unpin_ftnode(FT h, FTNODE node);
void toku_unpin_ftnode_read_only(FT ft, FTNODE node);
+// Effect: Swaps pair values of two pinned nodes
+void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb);
+
#endif
diff --git a/ft/ft-flusher.cc b/ft/ft-flusher.cc
index 0fe556aec0f..dc4096a7993 100644
--- a/ft/ft-flusher.cc
+++ b/ft/ft-flusher.cc
@@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node)
*/
static void
handle_split_of_child(
+ FT ft,
FTNODE node,
int childnum,
FTNODE childa,
@@ -607,8 +608,20 @@ handle_split_of_child(
paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
+ // We never set the rightmost blocknum to be the root.
+ // Instead, we wait for the root to split and let promotion initialize the rightmost
+ // blocknum to be the first non-root leaf node on the right extreme to recieve an insert.
+ invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
+ if (childa->thisnodename.b == ft->rightmost_blocknum.b) {
+ // The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values
+ // with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum
+ // constant, the same the way we keep the root blocknum constant.
+ toku_ftnode_swap_pair_values(childa, childb);
+ BP_BLOCKNUM(node, childnum) = childa->thisnodename;
+ }
+
BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
- BP_WORKDONE(node, childnum+1) = 0;
+ BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL;
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
@@ -1071,7 +1084,7 @@ ft_split_child(
ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
}
// printf("%s:%d child did split\n", __FILE__, __LINE__);
- handle_split_of_child (node, childnum, nodea, nodeb, &splitk);
+ handle_split_of_child (h, node, childnum, nodea, nodeb, &splitk);
// for test
call_flusher_thread_callback(flt_flush_during_split);
@@ -1489,6 +1502,14 @@ ft_merge_child(
&node->childkeys[childnuma+1],
(node->n_children-childnumb)*sizeof(node->childkeys[0]));
REALLOC_N(node->n_children-1, node->childkeys);
+
+ // Handle a merge of the rightmost leaf node.
+ if (did_merge && childb->thisnodename.b == h->rightmost_blocknum.b) {
+ invariant(childb->thisnodename.b != h->h->root_blocknum.b);
+ toku_ftnode_swap_pair_values(childa, childb);
+ BP_BLOCKNUM(node, childnuma) = childa->thisnodename;
+ }
+
paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
childa->dirty = 1; // just to make sure
childb->dirty = 1; // just to make sure
diff --git a/ft/ft-internal.h b/ft/ft-internal.h
index 42d27638330..f182a4f6aed 100644
--- a/ft/ft-internal.h
+++ b/ft/ft-internal.h
@@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 };
enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 };
enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };
+// We optimize for a sequential insert pattern if 100 consecutive injections
+// happen into the rightmost leaf node due to promotion.
+enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 };
+
//
// Field in ftnode_fetch_extra that tells the
// partial fetch callback what piece of the node
@@ -572,6 +576,22 @@ struct ft {
// is this ft a blackhole? if so, all messages are dropped.
bool blackhole;
+
+ // The blocknum of the rightmost leaf node in the tree. Stays constant through splits
+ // and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values())
+ //
+ // This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back.
+ // We initialize it when promotion inserts into a non-root leaf node on the right extreme.
+ // We use the blocktable lock to protect the initialize transition, though it's not really
+ // necessary since all threads should be setting it to the same value. We maintain that invariant
+ // on first initialization, see ft_set_or_verify_rightmost_blocknum()
+ BLOCKNUM rightmost_blocknum;
+
+ // sequential access pattern heuristic
+ // - when promotion pushes a message directly into the rightmost leaf, the score goes up.
+ // - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf
+ // - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0
+ uint32_t seqinsert_score;
};
// Allocate a DB struct off the stack and only set its comparison
@@ -1186,6 +1206,9 @@ typedef enum {
FT_PRO_NUM_DIDNT_WANT_PROMOTE,
FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
+ FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS,
+ FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS,
+ FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,
FT_STATUS_NUM_ROWS
} ft_status_entry;
diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc
index ab7de1a0a2c..9521a9228a7 100644
--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
@@ -367,6 +367,9 @@ status_init(void)
STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
+ STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, nullptr, PARCOUNT, "promotion: succeeded in using the rightmost leaf shortcut", TOKU_ENGINE_STATUS);
+ STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (out-of-bounds)", TOKU_ENGINE_STATUS);
+ STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (child reactive)", TOKU_ENGINE_STATUS);
ft_status.initialized = true;
}
@@ -1643,12 +1646,10 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
BLOCKNUM old_blocknum = oldroot->thisnodename;
uint32_t old_fullhash = oldroot->fullhash;
- PAIR old_pair = oldroot->ct_pair;
int new_height = oldroot->height+1;
uint32_t new_fullhash;
BLOCKNUM new_blocknum;
- PAIR new_pair = NULL;
cachetable_put_empty_node_with_dep_nodes(
ft,
@@ -1658,7 +1659,6 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
&new_fullhash,
&newroot
);
- new_pair = newroot->ct_pair;
assert(newroot);
assert(new_height > 0);
@@ -1670,22 +1670,18 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
ft->h->layout_version,
ft->h->flags
);
+ newroot->fullhash = new_fullhash;
MSN msna = oldroot->max_msn_applied_to_node_on_disk;
newroot->max_msn_applied_to_node_on_disk = msna;
BP_STATE(newroot,0) = PT_AVAIL;
newroot->dirty = 1;
- // now do the "switcheroo"
- BP_BLOCKNUM(newroot,0) = new_blocknum;
- newroot->thisnodename = old_blocknum;
- newroot->fullhash = old_fullhash;
- newroot->ct_pair = old_pair;
-
- oldroot->thisnodename = new_blocknum;
- oldroot->fullhash = new_fullhash;
- oldroot->ct_pair = new_pair;
-
- toku_cachetable_swap_pair_values(old_pair, new_pair);
+ // Set the first child to have the new blocknum,
+ // and then swap newroot with oldroot. The new root
+ // will inherit the hash/blocknum/pair from oldroot,
+ // keeping the root blocknum constant.
+ BP_BLOCKNUM(newroot, 0) = new_blocknum;
+ toku_ftnode_swap_pair_values(newroot, oldroot);
toku_ft_split_child(
ft,
@@ -2774,6 +2770,16 @@ static void inject_message_in_locked_node(
// verify that msn of latest message was captured in root node
paranoid_invariant(msg->msn.msn == node->max_msn_applied_to_node_on_disk.msn);
+ if (node->thisnodename.b == ft->rightmost_blocknum.b) {
+ if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) {
+ // we promoted to the rightmost leaf node and the seqinsert score has not yet saturated.
+ toku_sync_fetch_and_add(&ft->seqinsert_score, 1);
+ }
+ } else if (ft->seqinsert_score != 0) {
+ // we promoted to something other than the rightmost leaf node and the score should reset
+ ft->seqinsert_score = 0;
+ }
+
// if we call toku_ft_flush_some_child, then that function unpins the root
// otherwise, we unpin ourselves
if (node->height > 0 && toku_ft_nonleaf_is_gorged(node, ft->h->nodesize)) {
@@ -2930,6 +2936,21 @@ static inline bool should_inject_in_node(seqinsert_loc loc, int height, int dept
return (height == 0 || (loc == NEITHER_EXTREME && (height <= 1 || depth >= 2)));
}
+static void ft_set_or_verify_rightmost_blocknum(FT ft, BLOCKNUM b)
+// Given: 'b', the _definitive_ and constant rightmost blocknum of 'ft'
+{
+ if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) {
+ toku_ft_lock(ft);
+ if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) {
+ ft->rightmost_blocknum = b;
+ }
+ toku_ft_unlock(ft);
+ }
+ // The rightmost blocknum only transitions from RESERVED_BLOCKNUM_NULL to non-null.
+ // If it's already set, verify that the stored value is consistent with 'b'
+ invariant(ft->rightmost_blocknum.b == b.b);
+}
+
static void push_something_in_subtree(
FT ft,
FTNODE subtree_root,
@@ -2977,6 +2998,14 @@ static void push_something_in_subtree(
default:
STATUS_INC(FT_PRO_NUM_INJECT_DEPTH_GT3, 1); break;
}
+ // If the target node is a non-root leaf node on the right extreme,
+ // set the rightmost blocknum. We know there are no messages above us
+ // because promotion would not chose to inject directly into this leaf
+ // otherwise. We explicitly skip the root node because then we don't have
+ // to worry about changing the rightmost blocknum when the root splits.
+ if (subtree_root->height == 0 && loc == RIGHT_EXTREME && subtree_root->thisnodename.b != ft->h->root_blocknum.b) {
+ ft_set_or_verify_rightmost_blocknum(ft, subtree_root->thisnodename);
+ }
inject_message_in_locked_node(ft, subtree_root, target_childnum, msg, flow_deltas, gc_info);
} else {
int r;
@@ -3247,7 +3276,260 @@ void toku_ft_root_put_msg(
}
}
-// Effect: Insert the key-val pair into ft.
+static int ft_compare_keys(FT ft, const DBT *a, const DBT *b)
+// Effect: Compare two keys using the given fractal tree's comparator/descriptor
+{
+ FAKE_DB(db, &ft->cmp_descriptor);
+ return ft->compare_fun(&db, a, b);
+}
+
+static LEAFENTRY bn_get_le_and_key(BASEMENTNODE bn, int idx, DBT *key)
+// Effect: Gets the i'th leafentry from the given basement node and
+// fill its key in *key
+// Requires: The i'th leafentry exists.
+{
+ LEAFENTRY le;
+ uint32_t le_len;
+ void *le_key;
+ int r = bn->data_buffer.fetch_klpair(idx, &le, &le_len, &le_key);
+ invariant_zero(r);
+ toku_fill_dbt(key, le_key, le_len);
+ return le;
+}
+
+static LEAFENTRY ft_leaf_leftmost_le_and_key(FTNODE leaf, DBT *leftmost_key)
+// Effect: If a leftmost key exists in the given leaf, toku_fill_dbt()
+// the key into *leftmost_key
+// Requires: Leaf is fully in memory and pinned for read or write.
+// Return: leafentry if it exists, nullptr otherwise
+{
+ for (int i = 0; i < leaf->n_children; i++) {
+ BASEMENTNODE bn = BLB(leaf, i);
+ if (bn->data_buffer.num_klpairs() > 0) {
+ // Get the first (leftmost) leafentry and its key
+ return bn_get_le_and_key(bn, 0, leftmost_key);
+ }
+ }
+ return nullptr;
+}
+
+static LEAFENTRY ft_leaf_rightmost_le_and_key(FTNODE leaf, DBT *rightmost_key)
+// Effect: If a rightmost key exists in the given leaf, toku_fill_dbt()
+// the key into *rightmost_key
+// Requires: Leaf is fully in memory and pinned for read or write.
+// Return: leafentry if it exists, nullptr otherwise
+{
+ for (int i = leaf->n_children - 1; i >= 0; i--) {
+ BASEMENTNODE bn = BLB(leaf, i);
+ size_t num_les = bn->data_buffer.num_klpairs();
+ if (num_les > 0) {
+ // Get the last (rightmost) leafentry and its key
+ return bn_get_le_and_key(bn, num_les - 1, rightmost_key);
+ }
+ }
+ return nullptr;
+}
+
+static int ft_leaf_get_relative_key_pos(FT ft, FTNODE leaf, const DBT *key, bool *nondeleted_key_found, int *target_childnum)
+// Effect: Determines what the relative position of the given key is with
+// respect to a leaf node, and if it exists.
+// Requires: Leaf is fully in memory and pinned for read or write.
+// Requires: target_childnum is non-null
+// Return: < 0 if key is less than the leftmost key in the leaf OR the relative position is unknown, for any reason.
+// 0 if key is in the bounds [leftmost_key, rightmost_key] for this leaf or the leaf is empty
+// > 0 if key is greater than the rightmost key in the leaf
+// *nondeleted_key_found is set (if non-null) if the target key was found and is not deleted, unmodified otherwise
+// *target_childnum is set to the child that (does or would) contain the key, if calculated, unmodified otherwise
+{
+ DBT rightmost_key;
+ LEAFENTRY rightmost_le = ft_leaf_rightmost_le_and_key(leaf, &rightmost_key);
+ if (rightmost_le == nullptr) {
+ // If we can't get a rightmost key then the leaf is empty.
+ // In such a case, we don't have any information about what keys would be in this leaf.
+ // We have to assume the leaf node that would contain this key is to the left.
+ return -1;
+ }
+ // We have a rightmost leafentry, so it must exist in some child node
+ invariant(leaf->n_children > 0);
+
+ int relative_pos = 0;
+ int c = ft_compare_keys(ft, key, &rightmost_key);
+ if (c > 0) {
+ relative_pos = 1;
+ *target_childnum = leaf->n_children - 1;
+ } else if (c == 0) {
+ if (nondeleted_key_found != nullptr && !le_latest_is_del(rightmost_le)) {
+ *nondeleted_key_found = true;
+ }
+ relative_pos = 0;
+ *target_childnum = leaf->n_children - 1;
+ } else {
+ // The key is less than the rightmost. It may still be in bounds if it's >= the leftmost.
+ DBT leftmost_key;
+ LEAFENTRY leftmost_le = ft_leaf_leftmost_le_and_key(leaf, &leftmost_key);
+ invariant_notnull(leftmost_le); // Must exist because a rightmost exists
+ c = ft_compare_keys(ft, key, &leftmost_key);
+ if (c > 0) {
+ if (nondeleted_key_found != nullptr) {
+ // The caller wants to know if a nondeleted key can be found.
+ LEAFENTRY target_le;
+ int childnum = toku_ftnode_which_child(leaf, key, &ft->cmp_descriptor, ft->compare_fun);
+ BASEMENTNODE bn = BLB(leaf, childnum);
+ struct msg_leafval_heaviside_extra extra = { ft->compare_fun, &ft->cmp_descriptor, key };
+ int r = bn->data_buffer.find_zero<decltype(extra), toku_msg_leafval_heaviside>(
+ extra,
+ &target_le,
+ nullptr, nullptr, nullptr
+ );
+ *target_childnum = childnum;
+ if (r == 0 && !le_latest_is_del(leftmost_le)) {
+ *nondeleted_key_found = true;
+ }
+ }
+ relative_pos = 0;
+ } else if (c == 0) {
+ if (nondeleted_key_found != nullptr && !le_latest_is_del(leftmost_le)) {
+ *nondeleted_key_found = true;
+ }
+ relative_pos = 0;
+ *target_childnum = 0;
+ } else {
+ relative_pos = -1;
+ }
+ }
+
+ return relative_pos;
+}
+
+static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val,
+ XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info);
+static int getf_nothing(ITEMLEN, bytevec, ITEMLEN, bytevec, void *, bool);
+
+static int ft_maybe_insert_into_rightmost_leaf(FT ft, DBT *key, DBT *val, XIDS message_xids, enum ft_msg_type type,
+ txn_gc_info *gc_info, bool unique)
+// Effect: Pins the rightmost leaf node and attempts to do an insert.
+// There are three reasons why we may not succeed.
+// - The rightmost leaf is too full and needs a split.
+// - The key to insert is not within the provable bounds of this leaf node.
+// - The key is within bounds, but it already exists.
+// Return: 0 if this function did insert, DB_KEYEXIST if a unique key constraint exists and
+// some nondeleted leafentry with the same key exists
+// < 0 if this function did not insert, for a reason other than DB_KEYEXIST.
+// Note: Treat this function as a possible, but not necessary, optimization for insert.
+// Rationale: We want O(1) insertions down the rightmost path of the tree.
+{
+ int r = -1;
+
+ uint32_t rightmost_fullhash;
+ BLOCKNUM rightmost_blocknum = ft->rightmost_blocknum;
+ FTNODE rightmost_leaf = nullptr;
+
+ // Don't do the optimization if our heurstic suggests that
+ // insertion pattern is not sequential.
+ if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) {
+ goto cleanup;
+ }
+
+ // We know the seqinsert score is high enough that we should
+ // attemp to directly insert into the right most leaf. Because
+ // the score is non-zero, the rightmost blocknum must have been
+ // set. See inject_message_in_locked_node(), which only increases
+ // the score if the target node blocknum == rightmost_blocknum
+ invariant(rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
+
+ // Pin the rightmost leaf with a write lock.
+ rightmost_fullhash = toku_cachetable_hash(ft->cf, rightmost_blocknum);
+ struct ftnode_fetch_extra bfe;
+ fill_bfe_for_full_read(&bfe, ft);
+ toku_pin_ftnode(ft, rightmost_blocknum, rightmost_fullhash, &bfe, PL_WRITE_CHEAP, &rightmost_leaf, true);
+
+ // The rightmost blocknum never chances once it is initialized to something
+ // other than null. Verify that the pinned node has the correct blocknum.
+ invariant(rightmost_leaf->thisnodename.b == rightmost_blocknum.b);
+
+ // If the rightmost leaf is reactive, bail out out and let the normal promotion pass
+ // take care of it. This also ensures that if any of our ancestors are reactive,
+ // they'll be taken care of too.
+ if (get_leaf_reactivity(rightmost_leaf, ft->h->nodesize) != RE_STABLE) {
+ STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, 1);
+ goto cleanup;
+ }
+
+ // The groundwork has been laid for an insertion directly into the rightmost
+ // leaf node. We know that it is pinned for write, fully in memory, has
+ // no messages above it, and is not reactive.
+ //
+ // Now, two more things must be true for this insertion to actually happen:
+ // 1. The key to insert is within the bounds of this leafnode, or to the right.
+ // 2. If there is a uniqueness constraint, it passes.
+ bool nondeleted_key_found;
+ int relative_pos;
+ int target_childnum;
+
+ nondeleted_key_found = false;
+ target_childnum = -1;
+ relative_pos = ft_leaf_get_relative_key_pos(ft, rightmost_leaf, key,
+ unique ? &nondeleted_key_found : nullptr,
+ &target_childnum);
+ if (relative_pos >= 0) {
+ STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, 1);
+ if (unique && nondeleted_key_found) {
+ r = DB_KEYEXIST;
+ } else {
+ ft_insert_directly_into_leaf(ft, rightmost_leaf, target_childnum,
+ key, val, message_xids, type, gc_info);
+ r = 0;
+ }
+ } else {
+ STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, 1);
+ r = -1;
+ }
+
+cleanup:
+ // If we did the insert, the rightmost leaf was unpinned for us.
+ if (r != 0 && rightmost_leaf != nullptr) {
+ toku_unpin_ftnode(ft, rightmost_leaf);
+ }
+
+ return r;
+}
+
+static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type);
+
+int toku_ft_insert_unique(FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool do_logging) {
+// Effect: Insert a unique key-val pair into the fractal tree.
+// Return: 0 on success, DB_KEYEXIST if the overwrite constraint failed
+ XIDS message_xids = txn != nullptr ? toku_txn_get_xids(txn) : xids_get_root_xids();
+
+ TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
+ txn_manager_state txn_state_for_gc(txn_manager);
+
+ TXNID oldest_referenced_xid_estimate = toku_ft_get_oldest_referenced_xid_estimate(ft_h);
+ txn_gc_info gc_info(&txn_state_for_gc,
+ oldest_referenced_xid_estimate,
+ // no messages above us, we can implicitly promote uxrs based on this xid
+ oldest_referenced_xid_estimate,
+ true);
+ int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, true);
+ if (r != 0 && r != DB_KEYEXIST) {
+ // Default to a regular unique check + insert algorithm if we couldn't
+ // do it based on the rightmost leaf alone.
+ int lookup_r = toku_ft_lookup(ft_h, key, getf_nothing, nullptr);
+ if (lookup_r == DB_NOTFOUND) {
+ toku_ft_send_insert(ft_h, key, val, message_xids, FT_INSERT, &gc_info);
+ r = 0;
+ } else {
+ r = DB_KEYEXIST;
+ }
+ }
+
+ if (r == 0) {
+ ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, FT_INSERT);
+ }
+ return r;
+}
+
+// Effect: Insert the key-val pair into an ft.
void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) {
toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT);
}
@@ -3373,32 +3655,38 @@ TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h) {
return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE;
}
-void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) {
- paranoid_invariant(type==FT_INSERT || type==FT_INSERT_NO_OVERWRITE);
- XIDS message_xids = xids_get_root_xids(); //By default use committed messages
+static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type) {
+ paranoid_invariant(type == FT_INSERT || type == FT_INSERT_NO_OVERWRITE);
+
+ //By default use committed messages
TXNID_PAIR xid = toku_txn_get_txnid(txn);
if (txn) {
BYTESTRING keybs = {key->size, (char *) key->data};
- toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs);
- toku_txn_maybe_note_ft(txn, ft_h->ft);
- message_xids = toku_txn_get_xids(txn);
+ toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft->cf), &keybs);
+ toku_txn_maybe_note_ft(txn, ft);
}
TOKULOGGER logger = toku_txn_logger(txn);
if (do_logging && logger) {
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
BYTESTRING valbs = {.len=val->size, .data=(char *) val->data};
if (type == FT_INSERT) {
- toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs);
+ toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
}
else {
- toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs);
+ toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
}
}
+}
+
+void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) {
+ ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, type);
LSN treelsn;
if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
// do nothing
} else {
+ XIDS message_xids = txn ? toku_txn_get_xids(txn) : xids_get_root_xids();
+
TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
txn_manager_state txn_state_for_gc(txn_manager);
@@ -3408,10 +3696,26 @@ void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool
// no messages above us, we can implicitly promote uxrs based on this xid
oldest_referenced_xid_estimate,
txn != nullptr ? !txn->for_recovery : false);
- toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info);
+ int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, false);
+ if (r != 0) {
+ toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info);
+ }
}
}
+static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val,
+ XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info)
+// Effect: Insert directly into a leaf node a fractal tree. Does not do any logging.
+// Requires: Leaf is fully in memory and pinned for write.
+// Requires: If this insertion were to happen through the root node, the promotion
+// algorithm would have selected the given leaf node as the point of injection.
+// That means this function relies on the current implementation of promotion.
+{
+ FT_MSG_S ftcmd = { type, ZERO_MSN, message_xids, .u = { .id = { key, val } } };
+ size_t flow_deltas[] = { 0, 0 };
+ inject_message_in_locked_node(ft, leaf, target_childnum, &ftcmd, flow_deltas, gc_info);
+}
+
static void
ft_send_update_msg(FT_HANDLE ft_h, FT_MSG_S *msg, TOKUTXN txn) {
msg->xids = (txn
diff --git a/ft/ft-ops.h b/ft/ft-ops.h
index b482d2b8206..cfa6ba20f6f 100644
--- a/ft/ft-ops.h
+++ b/ft/ft-ops.h
@@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void
// Effect: Insert a key and data pair into an ft
void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn);
+// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists
+int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging);
+
// Effect: Optimize the ft
void toku_ft_optimize (FT_HANDLE ft_h);
diff --git a/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc b/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc
new file mode 100644
index 00000000000..100e5153636
--- /dev/null
+++ b/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc
@@ -0,0 +1,183 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation, and provided that the
+ following conditions are met:
+
+ * Redistributions of source code must retain this COPYING
+ CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+ DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+ PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+ GRANT (below).
+
+ * Redistributions in binary form must reproduce this COPYING
+ CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+ DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+ PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+ GRANT (below) in the documentation and/or other materials
+ provided with the distribution.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+COPYRIGHT NOTICE:
+
+ TokuDB, Tokutek Fractal Tree Indexing Library.
+ Copyright (C) 2007-2014 Tokutek, Inc.
+
+DISCLAIMER:
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+UNIVERSITY PATENT NOTICE:
+
+ The technology is licensed by the Massachusetts Institute of
+ Technology, Rutgers State University of New Jersey, and the Research
+ Foundation of State University of New York at Stony Brook under
+ United States of America Serial No. 11/760379 and to the patents
+ and/or patent applications resulting from it.
+
+PATENT MARKING NOTICE:
+
+ This software is covered by US Patent No. 8,185,551.
+ This software is covered by US Patent No. 8,489,638.
+
+PATENT RIGHTS GRANT:
+
+ "THIS IMPLEMENTATION" means the copyrightable works distributed by
+ Tokutek as part of the Fractal Tree project.
+
+ "PATENT CLAIMS" means the claims of patents that are owned or
+ licensable by Tokutek, both currently or in the future; and that in
+ the absence of this license would be infringed by THIS
+ IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
+
+ "PATENT CHALLENGE" shall mean a challenge to the validity,
+ patentability, enforceability and/or non-infringement of any of the
+ PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
+
+ Tokutek hereby grants to you, for the term and geographical scope of
+ the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
+ irrevocable (except as stated in this section) patent license to
+ make, have made, use, offer to sell, sell, import, transfer, and
+ otherwise run, modify, and propagate the contents of THIS
+ IMPLEMENTATION, where such license applies only to the PATENT
+ CLAIMS. This grant does not include claims that would be infringed
+ only as a consequence of further modifications of THIS
+ IMPLEMENTATION. If you or your agent or licensee institute or order
+ or agree to the institution of patent litigation against any entity
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
+ THIS IMPLEMENTATION constitutes direct or contributory patent
+ infringement, or inducement of patent infringement, then any rights
+ granted to you under this License shall terminate as of the date
+ such litigation is filed. If you or your agent or exclusive
+ licensee institute or order or agree to the institution of a PATENT
+ CHALLENGE, then Tokutek may terminate any rights granted to you
+ under this License.
+*/
+
+#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
+
+#include "test.h"
+
+#include <ft/ybt.h>
+#include <ft/ft-cachetable-wrappers.h>
+
+// Each FT maintains a sequential insert heuristic to determine if its
+// worth trying to insert directly into a well-known rightmost leaf node.
+//
+// The heuristic is only maintained when a rightmost leaf node is known.
+//
+// This test verifies that sequential inserts increase the seqinsert score
+// and that a single non-sequential insert resets the score.
+
+static void test_seqinsert_heuristic(void) {
+ int r = 0;
+ char name[TOKU_PATH_MAX + 1];
+ toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
+ toku_os_recursive_delete(TOKU_TEST_FILENAME);
+ r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
+
+ FT_HANDLE ft_handle;
+ CACHETABLE ct;
+ toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
+ r = toku_open_ft_handle(name, 1, &ft_handle,
+ 4*1024*1024, 64*1024,
+ TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
+ toku_builtin_compare_fun); CKERR(r);
+ FT ft = ft_handle->ft;
+
+ int k;
+ DBT key, val;
+ const int val_size = 1024 * 1024;
+ char *XMALLOC_N(val_size, val_buf);
+ memset(val_buf, 'x', val_size);
+ toku_fill_dbt(&val, val_buf, val_size);
+
+ // Insert many rows sequentially. This is enough data to:
+ // - force the root to split (the righmost leaf will then be known)
+ // - raise the seqinsert score high enough to enable direct rightmost injections
+ const int rows_to_insert = 200;
+ for (int i = 0; i < rows_to_insert; i++) {
+ k = toku_htonl(i);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ }
+ invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
+ invariant(ft->seqinsert_score == FT_SEQINSERT_SCORE_THRESHOLD);
+
+ // Insert on the left extreme. The seq insert score is high enough
+ // that we will attempt to insert into the rightmost leaf. We won't
+ // be successful because key 0 won't be in the bounds of the rightmost leaf.
+ // This failure should reset the seqinsert score back to 0.
+ k = toku_htonl(0);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ invariant(ft->seqinsert_score == 0);
+
+ // Insert in the middle. The score should not go up.
+ k = toku_htonl(rows_to_insert / 2);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ invariant(ft->seqinsert_score == 0);
+
+ // Insert on the right extreme. The score should go up.
+ k = toku_htonl(rows_to_insert);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ invariant(ft->seqinsert_score == 1);
+
+ // Insert again on the right extreme again, the score should go up.
+ k = toku_htonl(rows_to_insert + 1);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ invariant(ft->seqinsert_score == 2);
+
+ // Insert close to, but not at, the right extreme. The score should reset.
+ // -- the magic number 4 derives from the fact that vals are 1mb and nodes are 4mb
+ k = toku_htonl(rows_to_insert - 4);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ invariant(ft->seqinsert_score == 0);
+
+ toku_free(val_buf);
+ toku_ft_handle_close(ft_handle);
+ toku_cachetable_close(&ct);
+ toku_os_recursive_delete(TOKU_TEST_FILENAME);
+}
+
+int test_main(int argc, const char *argv[]) {
+ default_parse_args(argc, argv);
+ test_seqinsert_heuristic();
+ return 0;
+}
diff --git a/ft/tests/test_rightmost_leaf_split_merge.cc b/ft/tests/test_rightmost_leaf_split_merge.cc
new file mode 100644
index 00000000000..517fc277fd3
--- /dev/null
+++ b/ft/tests/test_rightmost_leaf_split_merge.cc
@@ -0,0 +1,212 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation, and provided that the
+ following conditions are met:
+
+ * Redistributions of source code must retain this COPYING
+ CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+ DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+ PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+ GRANT (below).
+
+ * Redistributions in binary form must reproduce this COPYING
+ CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+ DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+ PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+ GRANT (below) in the documentation and/or other materials
+ provided with the distribution.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+COPYRIGHT NOTICE:
+
+ TokuDB, Tokutek Fractal Tree Indexing Library.
+ Copyright (C) 2007-2014 Tokutek, Inc.
+
+DISCLAIMER:
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+UNIVERSITY PATENT NOTICE:
+
+ The technology is licensed by the Massachusetts Institute of
+ Technology, Rutgers State University of New Jersey, and the Research
+ Foundation of State University of New York at Stony Brook under
+ United States of America Serial No. 11/760379 and to the patents
+ and/or patent applications resulting from it.
+
+PATENT MARKING NOTICE:
+
+ This software is covered by US Patent No. 8,185,551.
+ This software is covered by US Patent No. 8,489,638.
+
+PATENT RIGHTS GRANT:
+
+ "THIS IMPLEMENTATION" means the copyrightable works distributed by
+ Tokutek as part of the Fractal Tree project.
+
+ "PATENT CLAIMS" means the claims of patents that are owned or
+ licensable by Tokutek, both currently or in the future; and that in
+ the absence of this license would be infringed by THIS
+ IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
+
+ "PATENT CHALLENGE" shall mean a challenge to the validity,
+ patentability, enforceability and/or non-infringement of any of the
+ PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
+
+ Tokutek hereby grants to you, for the term and geographical scope of
+ the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
+ irrevocable (except as stated in this section) patent license to
+ make, have made, use, offer to sell, sell, import, transfer, and
+ otherwise run, modify, and propagate the contents of THIS
+ IMPLEMENTATION, where such license applies only to the PATENT
+ CLAIMS. This grant does not include claims that would be infringed
+ only as a consequence of further modifications of THIS
+ IMPLEMENTATION. If you or your agent or licensee institute or order
+ or agree to the institution of patent litigation against any entity
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
+ THIS IMPLEMENTATION constitutes direct or contributory patent
+ infringement, or inducement of patent infringement, then any rights
+ granted to you under this License shall terminate as of the date
+ such litigation is filed. If you or your agent or exclusive
+ licensee institute or order or agree to the institution of a PATENT
+ CHALLENGE, then Tokutek may terminate any rights granted to you
+ under this License.
+*/
+
+#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
+
+#include "test.h"
+
+#include <ft/ybt.h>
+#include <ft/ft-cachetable-wrappers.h>
+
+// Promotion tracks the rightmost blocknum in the FT when a message
+// is successfully promoted to a non-root leaf node on the right extreme.
+//
+// This test verifies that a split or merge of the rightmost leaf properly
+// maintains the rightmost blocknum (which is constant - the pair's swap values,
+// like the root blocknum).
+
+static void test_split_merge(void) {
+ int r = 0;
+ char name[TOKU_PATH_MAX + 1];
+ toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
+ toku_os_recursive_delete(TOKU_TEST_FILENAME);
+ r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
+
+ FT_HANDLE ft_handle;
+ CACHETABLE ct;
+ toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
+ r = toku_open_ft_handle(name, 1, &ft_handle,
+ 4*1024*1024, 64*1024,
+ TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
+ toku_builtin_compare_fun); CKERR(r);
+
+ // We have a root blocknum, but no rightmost blocknum yet.
+ FT ft = ft_handle->ft;
+ invariant(ft->h->root_blocknum.b != RESERVED_BLOCKNUM_NULL);
+ invariant(ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL);
+
+ int k;
+ DBT key, val;
+ const int val_size = 1 * 1024 * 1024;
+ char *XMALLOC_N(val_size, val_buf);
+ memset(val_buf, 'x', val_size);
+ toku_fill_dbt(&val, val_buf, val_size);
+
+ // Insert 16 rows (should induce a few splits)
+ const int rows_to_insert = 16;
+ for (int i = 0; i < rows_to_insert; i++) {
+ k = toku_htonl(i);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_insert(ft_handle, &key, &val, NULL);
+ }
+
+ // rightmost blocknum should be set, because the root split and promotion
+ // did a rightmost insertion directly into the rightmost leaf, lazily
+ // initializing the rightmost blocknum.
+ invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
+
+ BLOCKNUM root_blocknum = ft->h->root_blocknum;
+ FTNODE root_node;
+ struct ftnode_fetch_extra bfe;
+ fill_bfe_for_full_read(&bfe, ft);
+ toku_pin_ftnode(ft, root_blocknum,
+ toku_cachetable_hash(ft->cf, ft->h->root_blocknum),
+ &bfe, PL_WRITE_EXPENSIVE, &root_node, true);
+ // root blocknum should be consistent
+ invariant(root_node->thisnodename.b == ft->h->root_blocknum.b);
+ // root should have split at least once, and it should now be at height 1
+ invariant(root_node->n_children > 1);
+ invariant(root_node->height == 1);
+ // rightmost blocknum should no longer be the root, since the root split
+ invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
+ // the right child should have the rightmost blocknum
+ invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
+
+ BLOCKNUM rightmost_blocknum_before_merge = ft->rightmost_blocknum;
+ const int num_children_before_merge = root_node->n_children;
+
+ // delete the last 6 rows.
+ // - 1mb each, so 6mb deleted
+ // - should be enough to delete the entire rightmost leaf + some of its neighbor
+ const int rows_to_delete = 6;
+ toku_unpin_ftnode(ft, root_node);
+ for (int i = 0; i < rows_to_delete; i++) {
+ k = toku_htonl(rows_to_insert - i);
+ toku_fill_dbt(&key, &k, sizeof(k));
+ toku_ft_delete(ft_handle, &key, NULL);
+ }
+ toku_pin_ftnode(ft, root_blocknum,
+ toku_cachetable_hash(ft->cf, root_blocknum),
+ &bfe, PL_WRITE_EXPENSIVE, &root_node, true);
+
+ // - rightmost leaf should be fusible after those deletes (which were promoted directly to the leaf)
+ FTNODE rightmost_leaf;
+ toku_pin_ftnode(ft, rightmost_blocknum_before_merge,
+ toku_cachetable_hash(ft->cf, rightmost_blocknum_before_merge),
+ &bfe, PL_WRITE_EXPENSIVE, &rightmost_leaf, true);
+ invariant(get_node_reactivity(ft, rightmost_leaf) == RE_FUSIBLE);
+ toku_unpin_ftnode(ft, rightmost_leaf);
+
+ // - merge the rightmost child now that it's fusible
+ toku_ft_merge_child(ft, root_node, root_node->n_children - 1);
+ toku_pin_ftnode(ft, root_blocknum,
+ toku_cachetable_hash(ft->cf, root_blocknum),
+ &bfe, PL_WRITE_EXPENSIVE, &root_node, true);
+
+ // the merge should have worked, and the root should still be at height 1
+ invariant(root_node->n_children < num_children_before_merge);
+ invariant(root_node->height == 1);
+ // the rightmost child of the root has the rightmost blocknum
+ invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
+ // the value for rightmost blocknum itself should not have changed
+ // (we keep it constant, like the root blocknum)
+ invariant(rightmost_blocknum_before_merge.b == ft->rightmost_blocknum.b);
+
+ toku_unpin_ftnode(ft, root_node);
+
+ toku_free(val_buf);
+ toku_ft_handle_close(ft_handle);
+ toku_cachetable_close(&ct);
+ toku_os_recursive_delete(TOKU_TEST_FILENAME);
+}
+
+int test_main(int argc, const char *argv[]) {
+ default_parse_args(argc, argv);
+ test_split_merge();
+ return 0;
+}