Diffstat (limited to 'src/third_party/wiredtiger/src/lsm/lsm_cursor.c')
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c  3067
1 file changed, 1486 insertions(+), 1581 deletions(-)
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 0667008485d..de9b35cdc17 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -8,12 +8,12 @@
#include "wt_internal.h"
-#define WT_FORALL_CURSORS(clsm, c, i) \
- for ((i) = (clsm)->nchunks; (i) > 0;) \
- if (((c) = (clsm)->chunks[--(i)]->cursor) != NULL)
+#define WT_FORALL_CURSORS(clsm, c, i) \
+ for ((i) = (clsm)->nchunks; (i) > 0;) \
+ if (((c) = (clsm)->chunks[--(i)]->cursor) != NULL)
-#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
- __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &(cmp))
+#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
+ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &(cmp))
static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *);
static int __clsm_open_cursors(WT_CURSOR_LSM *, bool, u_int, uint32_t);
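WT_FORALL_CURSORS is the workhorse of this file: it walks the chunk array from newest to oldest and skips slots whose cursor is not open. Below is a minimal standalone reconstruction of that pattern, using toy stand-in types rather than WiredTiger's, runnable on its own:

#include <stdio.h>

/* Toy stand-ins for WT_CURSOR_LSM and its chunk slots; illustration only. */
struct toy_chunk {
    const char *cursor;
};
struct toy_clsm {
    struct toy_chunk *chunks[3];
    unsigned int nchunks;
};

#define TOY_FORALL_CURSORS(clsm, c, i)    \
    for ((i) = (clsm)->nchunks; (i) > 0;) \
        if (((c) = (clsm)->chunks[--(i)]->cursor) != NULL)

int
main(void)
{
    struct toy_chunk c0 = {"oldest"}, c1 = {NULL}, c2 = {"primary"};
    struct toy_clsm clsm = {{&c0, &c1, &c2}, 3};
    const char *c;
    unsigned int i;

    /* Visits slot 2 then slot 0; the NULL slot 1 falls through silently. */
    TOY_FORALL_CURSORS(&clsm, c, i)
        printf("slot %u: %s\n", i, c);
    return (0);
}

Because the loop body becomes the if statement's consequent, a NULL cursor slot simply falls through to the next iteration.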
@@ -22,1677 +22,1584 @@ static int __clsm_search_near(WT_CURSOR *cursor, int *exactp);
/*
* __wt_clsm_request_switch --
- * Request an LSM tree switch for a cursor operation.
+ * Request an LSM tree switch for a cursor operation.
*/
int
__wt_clsm_request_switch(WT_CURSOR_LSM *clsm)
{
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree;
- WT_SESSION_IMPL *session;
-
- lsm_tree = clsm->lsm_tree;
- session = (WT_SESSION_IMPL *)clsm->iface.session;
-
- if (!lsm_tree->need_switch) {
- /*
- * Check that we are up-to-date: don't set the switch if the
- * tree has changed since we last opened cursors: that can lead
- * to switching multiple times when only one switch is
- * required, creating very small chunks.
- */
- __wt_lsm_tree_readlock(session, lsm_tree);
- if (lsm_tree->nchunks == 0 ||
- (clsm->dsk_gen == lsm_tree->dsk_gen &&
- !lsm_tree->need_switch)) {
- lsm_tree->need_switch = true;
- ret = __wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
- }
- __wt_lsm_tree_readunlock(session, lsm_tree);
- }
-
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ if (!lsm_tree->need_switch) {
+ /*
+ * Check that we are up-to-date: don't set the switch if the tree has changed since we last
+ * opened cursors; that can lead to switching multiple times when only one switch is
+ * required, creating very small chunks.
+ */
+ __wt_lsm_tree_readlock(session, lsm_tree);
+ if (lsm_tree->nchunks == 0 ||
+ (clsm->dsk_gen == lsm_tree->dsk_gen && !lsm_tree->need_switch)) {
+ lsm_tree->need_switch = true;
+ ret = __wt_lsm_manager_push_entry(session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
+ }
+ __wt_lsm_tree_readunlock(session, lsm_tree);
+ }
+
+ return (ret);
}
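The function is a double-checked request: a cheap unlocked test of need_switch, then a re-check of both the flag and the cursor's disk generation under the tree's read lock, so that at most one up-to-date caller queues a WT_LSM_WORK_SWITCH unit. A minimal sketch of the same shape, assuming a POSIX rwlock and toy globals in place of the tree:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy globals standing in for the LSM tree and cursor state. */
static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool need_switch = false;
static unsigned int tree_dsk_gen = 7; /* lsm_tree->dsk_gen */
static unsigned int my_dsk_gen = 7;   /* clsm->dsk_gen */

static void
request_switch(void)
{
    if (need_switch) /* cheap unlocked check */
        return;

    pthread_rwlock_rdlock(&tree_lock);
    /* Re-check under the lock: only an up-to-date caller queues work. */
    if (my_dsk_gen == tree_dsk_gen && !need_switch) {
        need_switch = true;
        printf("queued one switch work unit\n");
    }
    pthread_rwlock_unlock(&tree_lock);
}

int
main(void)
{
    request_switch();
    request_switch(); /* no-op: the flag is already set */
    return (0);
}

The flag only ever transitions from false to true here, which is presumably why the source can set it while holding only the read lock.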
/*
* __wt_clsm_await_switch --
- * Wait for a switch to have completed in the LSM tree
+ * Wait for a switch to have completed in the LSM tree
*/
int
__wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
{
- WT_LSM_TREE *lsm_tree;
- WT_SESSION_IMPL *session;
- int waited;
-
- lsm_tree = clsm->lsm_tree;
- session = (WT_SESSION_IMPL *)clsm->iface.session;
-
- /*
- * If there is no primary chunk, or a chunk has overflowed the hard
- * limit, which either means a worker thread has fallen behind or there
- * has just been a user-level checkpoint, wait until the tree changes.
- *
- * We used to switch chunks in the application thread here, but that is
- * problematic because there is a transaction in progress and it could
- * roll back, leaving the metadata inconsistent.
- */
- for (waited = 0;
- lsm_tree->nchunks == 0 ||
- clsm->dsk_gen == lsm_tree->dsk_gen;
- ++waited) {
- if (waited % WT_THOUSAND == 0)
- WT_RET(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
- __wt_sleep(0, 10);
- }
- return (0);
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int waited;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /*
+ * If there is no primary chunk, or a chunk has overflowed the hard
+ * limit, which either means a worker thread has fallen behind or there
+ * has just been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread here, but that is
+ * problematic because there is a transaction in progress and it could
+ * roll back, leaving the metadata inconsistent.
+ */
+ for (waited = 0; lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen; ++waited) {
+ if (waited % WT_THOUSAND == 0)
+ WT_RET(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ return (0);
}
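Rather than blocking on a condition variable, the loop polls: every WT_THOUSAND spins it re-queues the switch request, in case the earlier one was consumed without effect, and otherwise sleeps ten microseconds via __wt_sleep(0, 10). A self-contained model of that shape, with fake generation counters standing in for dsk_gen:

#include <stdio.h>
#include <unistd.h>

#define TOY_THOUSAND 1000

static unsigned int tree_dsk_gen = 0; /* lsm_tree->dsk_gen */
static unsigned int my_dsk_gen = 0;   /* clsm->dsk_gen */

/* Stand-in for pushing a WT_LSM_WORK_SWITCH unit to the worker queue. */
static void
push_switch_work(void)
{
    printf("(re)queue switch request\n");
    ++tree_dsk_gen; /* pretend a worker performs the switch at once */
}

int
main(void)
{
    int waited;

    /* Re-issue the request every TOY_THOUSAND polls so a lost request
     * cannot hang us; sleep between polls instead of busy-waiting. */
    for (waited = 0; tree_dsk_gen == my_dsk_gen; ++waited) {
        if (waited % TOY_THOUSAND == 0)
            push_switch_work();
        usleep(10); /* __wt_sleep(0, 10): ten microseconds */
    }
    printf("switch observed after %d poll(s)\n", waited);
    return (0);
}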
/*
* __clsm_enter_update --
- * Make sure an LSM cursor is ready to perform an update.
+ * Make sure an LSM cursor is ready to perform an update.
*/
static int
__clsm_enter_update(WT_CURSOR_LSM *clsm)
{
- WT_CURSOR *primary;
- WT_LSM_CHUNK *primary_chunk;
- WT_LSM_TREE *lsm_tree;
- WT_SESSION_IMPL *session;
- bool hard_limit, have_primary, ovfl;
-
- lsm_tree = clsm->lsm_tree;
- session = (WT_SESSION_IMPL *)clsm->iface.session;
-
- if (clsm->nchunks == 0) {
- primary = NULL;
- have_primary = false;
- } else {
- primary = clsm->chunks[clsm->nchunks - 1]->cursor;
- primary_chunk = clsm->primary_chunk;
- WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID));
- have_primary = (primary != NULL && primary_chunk != NULL &&
- (primary_chunk->switch_txn == WT_TXN_NONE ||
- WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
- }
-
- /*
- * In LSM there are multiple btrees active at one time. The tree
- * switch code needs to use btree API methods, and it wants to
- * operate on the btree for the primary chunk. Set that up now.
- *
- * If the primary chunk has grown too large, set a flag so the worker
- * thread will switch when it gets a chance to avoid introducing high
- * latency into application threads. Don't do this indefinitely: if a
- * chunk grows twice as large as the configured size, block until it
- * can be switched.
- */
- hard_limit = lsm_tree->need_switch;
-
- if (have_primary) {
- WT_ENTER_PAGE_INDEX(session);
- WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
- ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
- 2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
- WT_LEAVE_PAGE_INDEX(session);
-
- /* If there was no overflow, we're done. */
- if (!ovfl)
- return (0);
- }
-
- /* Request a switch. */
- WT_RET(__wt_clsm_request_switch(clsm));
-
- /* If we only overflowed the soft limit, we're done. */
- if (have_primary && !hard_limit)
- return (0);
-
- WT_RET(__wt_clsm_await_switch(clsm));
-
- return (0);
+ WT_CURSOR *primary;
+ WT_LSM_CHUNK *primary_chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ bool hard_limit, have_primary, ovfl;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ if (clsm->nchunks == 0) {
+ primary = NULL;
+ have_primary = false;
+ } else {
+ primary = clsm->chunks[clsm->nchunks - 1]->cursor;
+ primary_chunk = clsm->primary_chunk;
+ WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID));
+ have_primary = (primary != NULL && primary_chunk != NULL &&
+ (primary_chunk->switch_txn == WT_TXN_NONE ||
+ WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
+ }
+
+ /*
+ * In LSM there are multiple btrees active at one time. The tree
+ * switch code needs to use btree API methods, and it wants to
+ * operate on the btree for the primary chunk. Set that up now.
+ *
+ * If the primary chunk has grown too large, set a flag so the worker
+ * thread will switch when it gets a chance to avoid introducing high
+ * latency into application threads. Don't do this indefinitely: if a
+ * chunk grows twice as large as the configured size, block until it
+ * can be switched.
+ */
+ hard_limit = lsm_tree->need_switch;
+
+ if (have_primary) {
+ WT_ENTER_PAGE_INDEX(session);
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_lsm_over_size(
+ session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /* If there was no overflow, we're done. */
+ if (!ovfl)
+ return (0);
+ }
+
+ /* Request a switch. */
+ WT_RET(__wt_clsm_request_switch(clsm));
+
+ /* If we only overflowed the soft limit, we're done. */
+ if (have_primary && !hard_limit)
+ return (0);
+
+ WT_RET(__wt_clsm_await_switch(clsm));
+
+ return (0);
}
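The interplay of the two limits is easiest to read as a policy: below chunk_size nothing happens; past it the thread queues a switch and keeps going; past twice chunk_size, once a switch is already owed, the thread blocks until the switch completes. A toy model with made-up sizes (in the source, __wt_btree_lsm_over_size performs the actual size check against the btree):

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Toy policy: past chunk_size, queue a switch and continue; past
 * 2 * chunk_size (the hard limit, in force once a switch is owed),
 * the application thread must block until the switch completes.
 */
static void
enter_update(uint64_t primary_bytes, uint64_t chunk_size, bool hard_limit)
{
    uint64_t limit = hard_limit ? 2 * chunk_size : chunk_size;

    if (primary_bytes <= limit)
        printf("%8" PRIu64 " bytes: proceed\n", primary_bytes);
    else if (!hard_limit)
        printf("%8" PRIu64 " bytes: request switch, proceed\n", primary_bytes);
    else
        printf("%8" PRIu64 " bytes: request switch, block\n", primary_bytes);
}

int
main(void)
{
    enter_update(1000, 4096, false); /* under the soft limit */
    enter_update(5000, 4096, false); /* soft overflow: async switch */
    enter_update(9000, 4096, true);  /* hard overflow: stall the writer */
    return (0);
}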
/*
* __clsm_enter --
- * Start an operation on an LSM cursor, update if the tree has changed.
+ * Start an operation on an LSM cursor, update if the tree has changed.
*/
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
{
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree;
- WT_SESSION_IMPL *session;
- WT_TXN *txn;
- uint64_t i, pinned_id , switch_txn;
-
- lsm_tree = clsm->lsm_tree;
- session = (WT_SESSION_IMPL *)clsm->iface.session;
- txn = &session->txn;
-
- /* Merge cursors never update. */
- if (F_ISSET(clsm, WT_CLSM_MERGE))
- return (0);
-
- if (reset) {
- WT_ASSERT(session, !F_ISSET(&clsm->iface,
- WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
- WT_RET(__clsm_reset_cursors(clsm, NULL));
- }
-
- for (;;) {
- /* Check if the cursor looks up-to-date. */
- if (clsm->dsk_gen != lsm_tree->dsk_gen &&
- lsm_tree->nchunks != 0)
- goto open;
-
- /* Update the maximum transaction ID in the primary chunk. */
- if (update) {
- /*
- * Ensure that there is a transaction snapshot active.
- */
- WT_RET(__wt_txn_autocommit_check(session));
- WT_RET(__wt_txn_id_check(session));
-
- WT_RET(__clsm_enter_update(clsm));
- /*
- * Switching the tree will update the generation before
- * updating the switch transaction. We test the
- * transaction in clsm_enter_update. Now test the
- * disk generation to avoid races.
- */
- if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
- goto open;
-
- if (txn->isolation == WT_ISO_SNAPSHOT)
- __wt_txn_cursor_op(session);
-
- /*
- * Figure out how many updates are required for
- * snapshot isolation.
- *
- * This is not a normal visibility check on the maximum
- * transaction ID in each chunk: any transaction ID
- * that overlaps with our snapshot is a potential
- * conflict.
- *
- * Note that the pinned ID is correct here: it tracks
- * concurrent transactions excluding special
- * transactions such as checkpoint (which we can't
- * conflict with because checkpoint only writes the
- * metadata, which is not an LSM tree).
- */
- clsm->nupdates = 1;
- if (txn->isolation == WT_ISO_SNAPSHOT &&
- F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
- WT_ASSERT(session,
- F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
- pinned_id =
- WT_SESSION_TXN_STATE(session)->pinned_id;
- for (i = clsm->nchunks - 2;
- clsm->nupdates < clsm->nchunks;
- clsm->nupdates++, i--) {
- switch_txn =
- clsm->chunks[i]->switch_txn;
- if (WT_TXNID_LT(switch_txn, pinned_id))
- break;
- WT_ASSERT(session,
- !__wt_txn_visible_all(
- session, switch_txn, WT_TS_NONE));
- }
- }
- }
-
- /*
- * Stop when we are up-to-date, as long as this is:
- * - a snapshot isolation update and the cursor is set up for
- * that;
- * - an update operation with a primary chunk, or
- * - a read operation and the cursor is open for reading.
- */
- if ((!update ||
- txn->isolation != WT_ISO_SNAPSHOT ||
- F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
- ((update && clsm->primary_chunk != NULL) ||
- (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
- break;
-
-open: WT_WITH_SCHEMA_LOCK(session,
- ret = __clsm_open_cursors(clsm, update, 0, 0));
- WT_RET(ret);
- }
-
- if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
- /*
- * Opening this LSM cursor has opened a number of btree
- * cursors, ensure other code doesn't think this is the first
- * cursor in a session.
- */
- ++session->ncursors;
- WT_RET(__cursor_enter(session));
- F_SET(clsm, WT_CLSM_ACTIVE);
- }
-
- return (0);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+ uint64_t i, pinned_id, switch_txn;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+ txn = &session->txn;
+
+ /* Merge cursors never update. */
+ if (F_ISSET(clsm, WT_CLSM_MERGE))
+ return (0);
+
+ if (reset) {
+ WT_ASSERT(session, !F_ISSET(&clsm->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
+ WT_RET(__clsm_reset_cursors(clsm, NULL));
+ }
+
+ for (;;) {
+ /* Check if the cursor looks up-to-date. */
+ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0)
+ goto open;
+
+ /* Update the maximum transaction ID in the primary chunk. */
+ if (update) {
+ /*
+ * Ensure that there is a transaction snapshot active.
+ */
+ WT_RET(__wt_txn_autocommit_check(session));
+ WT_RET(__wt_txn_id_check(session));
+
+ WT_RET(__clsm_enter_update(clsm));
+ /*
+ * Switching the tree will update the generation before updating the switch transaction.
+ * We test the transaction in clsm_enter_update. Now test the disk generation to avoid
+ * races.
+ */
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ if (txn->isolation == WT_ISO_SNAPSHOT)
+ __wt_txn_cursor_op(session);
+
+ /*
+ * Figure out how many updates are required for
+ * snapshot isolation.
+ *
+ * This is not a normal visibility check on the maximum
+ * transaction ID in each chunk: any transaction ID
+ * that overlaps with our snapshot is a potential
+ * conflict.
+ *
+ * Note that the pinned ID is correct here: it tracks
+ * concurrent transactions excluding special
+ * transactions such as checkpoint (which we can't
+ * conflict with because checkpoint only writes the
+ * metadata, which is not an LSM tree).
+ */
+ clsm->nupdates = 1;
+ if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
+ pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
+ for (i = clsm->nchunks - 2; clsm->nupdates < clsm->nchunks; clsm->nupdates++, i--) {
+ switch_txn = clsm->chunks[i]->switch_txn;
+ if (WT_TXNID_LT(switch_txn, pinned_id))
+ break;
+ WT_ASSERT(session, !__wt_txn_visible_all(session, switch_txn, WT_TS_NONE));
+ }
+ }
+ }
+
+ /*
+ * Stop when we are up-to-date, as long as this is:
+ * - a snapshot isolation update and the cursor is set up for
+ * that;
+ * - an update operation with a primary chunk, or
+ * - a read operation and the cursor is open for reading.
+ */
+ if ((!update || txn->isolation != WT_ISO_SNAPSHOT ||
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
+ ((update && clsm->primary_chunk != NULL) ||
+ (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
+ break;
+
+open:
+ WT_WITH_SCHEMA_LOCK(session, ret = __clsm_open_cursors(clsm, update, 0, 0));
+ WT_RET(ret);
+ }
+
+ if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ /*
+ * Opening this LSM cursor has opened a number of btree cursors, ensure other code doesn't
+ * think this is the first cursor in a session.
+ */
+ ++session->ncursors;
+ WT_RET(__cursor_enter(session));
+ F_SET(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
}
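The nupdates computation is the subtle part of __clsm_enter: under snapshot isolation an update must be conflict-checked not just in the primary but in every older chunk whose switch transaction still overlaps the snapshot. Reduced to plain arrays with hypothetical transaction IDs, the loop looks like this:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* switch_txn per chunk, oldest first; hypothetical IDs. */
    uint64_t switch_txn[] = {10, 25, 40, 55};
    uint64_t pinned_id = 30; /* oldest ID a concurrent snapshot can hold */
    size_t nchunks = sizeof(switch_txn) / sizeof(switch_txn[0]);
    size_t nupdates, i;

    /*
     * The primary (last chunk) always takes the update; older chunks
     * need conflict checks only while their switch transaction has not
     * fallen behind the pinned ID, i.e. while switch_txn >= pinned_id.
     */
    nupdates = 1;
    for (i = nchunks - 2; nupdates < nchunks; nupdates++, i--)
        if (switch_txn[i] < pinned_id) /* WT_TXNID_LT */
            break;

    printf("conflict-check the newest %zu of %zu chunks\n", nupdates, nchunks);
    return (0);
}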
/*
* __clsm_leave --
- * Finish an operation on an LSM cursor.
+ * Finish an operation on an LSM cursor.
*/
static void
__clsm_leave(WT_CURSOR_LSM *clsm)
{
- WT_SESSION_IMPL *session;
+ WT_SESSION_IMPL *session;
- session = (WT_SESSION_IMPL *)clsm->iface.session;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
- if (F_ISSET(clsm, WT_CLSM_ACTIVE)) {
- --session->ncursors;
- __cursor_leave(session);
- F_CLR(clsm, WT_CLSM_ACTIVE);
- }
+ if (F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ --session->ncursors;
+ __cursor_leave(session);
+ F_CLR(clsm, WT_CLSM_ACTIVE);
+ }
}
/*
- * We need a tombstone to mark deleted records, and we use the special
- * value below for that purpose. We use two 0x14 (Device Control 4) bytes to
- * minimize the likelihood of colliding with an application-chosen encoding
- * byte, if the application uses two leading DC4 byte for some reason, we'll do
- * a wasted data copy each time a new value is inserted into the object.
+ * We need a tombstone to mark deleted records, and we use the special value below for that purpose.
+ * We use two 0x14 (Device Control 4) bytes to minimize the likelihood of colliding with an
+ * application-chosen encoding byte. If the application uses two leading DC4 bytes for some reason,
+ * we'll do a wasted data copy each time a new value is inserted into the object.
*/
-static const WT_ITEM __tombstone = { "\x14\x14", 2, NULL, 0, 0 };
+static const WT_ITEM __tombstone = {"\x14\x14", 2, NULL, 0, 0};
/*
* __clsm_deleted --
- * Check whether the current value is a tombstone.
+ * Check whether the current value is a tombstone.
*/
static inline bool
__clsm_deleted(WT_CURSOR_LSM *clsm, const WT_ITEM *item)
{
- return (!F_ISSET(clsm, WT_CLSM_MINOR_MERGE) &&
- item->size == __tombstone.size &&
- memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
+ return (!F_ISSET(clsm, WT_CLSM_MINOR_MERGE) && item->size == __tombstone.size &&
+ memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
}
/*
* __clsm_deleted_encode --
- * Encode values that are in the encoded name space.
+ * Encode values that are in the encoded name space.
*/
static inline int
-__clsm_deleted_encode(WT_SESSION_IMPL *session,
- const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
+__clsm_deleted_encode(
+ WT_SESSION_IMPL *session, const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
{
- WT_ITEM *tmp;
-
- /*
- * If value requires encoding, get a scratch buffer of the right size
- * and create a copy of the data with the first byte of the tombstone
- * appended.
- */
- if (value->size >= __tombstone.size &&
- memcmp(value->data, __tombstone.data, __tombstone.size) == 0) {
- WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
- tmp = *tmpp;
-
- memcpy(tmp->mem, value->data, value->size);
- memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1);
- final_value->data = tmp->mem;
- final_value->size = value->size + 1;
- } else {
- final_value->data = value->data;
- final_value->size = value->size;
- }
-
- return (0);
+ WT_ITEM *tmp;
+
+ /*
+ * If value requires encoding, get a scratch buffer of the right size and create a copy of the
+ * data with the first byte of the tombstone appended.
+ */
+ if (value->size >= __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0) {
+ WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
+ tmp = *tmpp;
+
+ memcpy(tmp->mem, value->data, value->size);
+ memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1);
+ final_value->data = tmp->mem;
+ final_value->size = value->size + 1;
+ } else {
+ final_value->data = value->data;
+ final_value->size = value->size;
+ }
+
+ return (0);
}
/*
* __clsm_deleted_decode --
- * Decode values that start with the tombstone.
+ * Decode values that start with the tombstone.
*/
static inline void
__clsm_deleted_decode(WT_CURSOR_LSM *clsm, WT_ITEM *value)
{
- /*
- * Take care with this check: when an LSM cursor is used for a merge,
- * and/or to create a Bloom filter, it is valid to return the tombstone
- * value.
- */
- if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
- value->size > __tombstone.size &&
- memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
- --value->size;
+ /*
+ * Take care with this check: when an LSM cursor is used for a merge, and/or to create a Bloom
+ * filter, it is valid to return the tombstone value.
+ */
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) && value->size > __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
+ --value->size;
}
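Together the three helpers form a small escaping scheme: __clsm_deleted recognizes the exact two-byte tombstone, __clsm_deleted_encode escapes user values that would collide with it by appending one more 0x14, and __clsm_deleted_decode strips that byte on the way out. A self-contained round trip of the same scheme over plain buffers (toy code, not WT_ITEMs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char tombstone[] = {0x14, 0x14};

/* A stored value is a delete marker iff it is exactly the tombstone. */
static int
is_deleted(const char *buf, size_t len)
{
    return (len == sizeof(tombstone) && memcmp(buf, tombstone, sizeof(tombstone)) == 0);
}

/*
 * Escape on insert: a real value beginning with the tombstone bytes gets
 * one extra 0x14 appended so it can never be mistaken for a delete.
 * Returns a malloc'd copy either way.
 */
static char *
encode(const char *val, size_t len, size_t *outlenp)
{
    int collides = len >= sizeof(tombstone) && memcmp(val, tombstone, sizeof(tombstone)) == 0;
    char *out = malloc(len + 1);

    memcpy(out, val, len);
    if (collides)
        out[len] = 0x14;
    *outlenp = len + (collides ? 1 : 0);
    return (out);
}

/* Unescape on read: strip the appended byte again. */
static void
decode(const char *buf, size_t *lenp)
{
    if (*lenp > sizeof(tombstone) && memcmp(buf, tombstone, sizeof(tombstone)) == 0)
        --*lenp;
}

int
main(void)
{
    const char awkward[] = {0x14, 0x14, 'x'}; /* begins with the tombstone */
    size_t len;
    char *enc = encode(awkward, sizeof(awkward), &len);

    printf("stored as %zu bytes (value was %zu)\n", len, sizeof(awkward));
    decode(enc, &len);
    printf("read back as %zu bytes, deleted? %d\n", len, is_deleted(enc, len));
    free(enc);
    return (0);
}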
/*
* __clsm_close_cursors --
- * Close any btree cursors that are not needed.
+ * Close any btree cursors that are not needed.
*/
static int
-__clsm_close_cursors(
- WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int start, u_int end)
+__clsm_close_cursors(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int start, u_int end)
{
- WT_BLOOM *bloom;
- WT_CURSOR *c;
- u_int i;
-
- __wt_verbose(session, WT_VERB_LSM,
- "LSM closing cursor session(%p):clsm(%p), start: %u, end: %u",
- (void *)session, (void *)clsm, start, end);
-
- if (clsm->chunks == NULL || clsm->nchunks == 0)
- return (0);
-
- /*
- * Walk the cursors, closing any we don't need. Note that the exit
- * condition here is special, don't use WT_FORALL_CURSORS, and be
- * careful with unsigned integer wrapping.
- */
- for (i = start; i < end; i++) {
- if ((c = (clsm)->chunks[i]->cursor) != NULL) {
- clsm->chunks[i]->cursor = NULL;
- WT_RET(c->close(c));
- }
- if ((bloom = clsm->chunks[i]->bloom) != NULL) {
- clsm->chunks[i]->bloom = NULL;
- WT_RET(__wt_bloom_close(bloom));
- }
- }
-
- return (0);
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ u_int i;
+
+ __wt_verbose(session, WT_VERB_LSM,
+ "LSM closing cursor session(%p):clsm(%p), start: %u, end: %u", (void *)session, (void *)clsm,
+ start, end);
+
+ if (clsm->chunks == NULL || clsm->nchunks == 0)
+ return (0);
+
+ /*
+ * Walk the cursors, closing any we don't need. Note that the exit condition here is special,
+ * don't use WT_FORALL_CURSORS, and be careful with unsigned integer wrapping.
+ */
+ for (i = start; i < end; i++) {
+ if ((c = (clsm)->chunks[i]->cursor) != NULL) {
+ clsm->chunks[i]->cursor = NULL;
+ WT_RET(c->close(c));
+ }
+ if ((bloom = clsm->chunks[i]->bloom) != NULL) {
+ clsm->chunks[i]->bloom = NULL;
+ WT_RET(__wt_bloom_close(bloom));
+ }
+ }
+
+ return (0);
}
/*
* __clsm_resize_chunks --
- * Allocates an array of unit objects for each chunk.
+ * Allocates an array of unit objects for each chunk.
*/
static int
-__clsm_resize_chunks(
- WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int nchunks)
+__clsm_resize_chunks(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, u_int nchunks)
{
- WT_LSM_CURSOR_CHUNK *chunk;
-
- /* Don't allocate more iterators if we don't need them. */
- if (clsm->chunks_count >= nchunks)
- return (0);
-
- WT_RET(__wt_realloc_def(session, &clsm->chunks_alloc, nchunks,
- &clsm->chunks));
- for (; clsm->chunks_count < nchunks; clsm->chunks_count++) {
- WT_RET(__wt_calloc_one(session, &chunk));
- clsm->chunks[clsm->chunks_count] = chunk;
- }
- return (0);
+ WT_LSM_CURSOR_CHUNK *chunk;
+
+ /* Don't allocate more iterators if we don't need them. */
+ if (clsm->chunks_count >= nchunks)
+ return (0);
+
+ WT_RET(__wt_realloc_def(session, &clsm->chunks_alloc, nchunks, &clsm->chunks));
+ for (; clsm->chunks_count < nchunks; clsm->chunks_count++) {
+ WT_RET(__wt_calloc_one(session, &chunk));
+ clsm->chunks[clsm->chunks_count] = chunk;
+ }
+ return (0);
}
/*
* __clsm_free_chunks --
- * Allocates an array of unit objects for each chunk.
+ *     Free the array of unit objects allocated for the chunks.
*/
static void
__clsm_free_chunks(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm)
{
- size_t i;
+ size_t i;
- for (i = 0; i < clsm->chunks_count; i++)
- __wt_free(session, clsm->chunks[i]);
+ for (i = 0; i < clsm->chunks_count; i++)
+ __wt_free(session, clsm->chunks[i]);
- __wt_free(session, clsm->chunks);
+ __wt_free(session, clsm->chunks);
}
/*
* __clsm_open_cursors --
- * Open cursors for the current set of files.
+ * Open cursors for the current set of files.
*/
static int
-__clsm_open_cursors(
- WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
+__clsm_open_cursors(WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
- WT_BTREE *btree;
- WT_CURSOR *c, *cursor, *primary;
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- WT_SESSION_IMPL *session;
- WT_TXN *txn;
- uint64_t saved_gen;
- u_int close_range_end, close_range_start;
- u_int i, nchunks, ngood, nupdates;
- const char *checkpoint, *ckpt_cfg[3];
- bool locked;
-
- c = &clsm->iface;
- cursor = NULL;
- session = (WT_SESSION_IMPL *)c->session;
- txn = &session->txn;
- chunk = NULL;
- locked = false;
- lsm_tree = clsm->lsm_tree;
-
- /*
- * Ensure that any snapshot update has cursors on the right set of
- * chunks to guarantee visibility is correct.
- */
- if (update && txn->isolation == WT_ISO_SNAPSHOT)
- F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
-
- /*
- * Query operations need a full set of cursors. Overwrite cursors
- * do queries in service of updates.
- */
- if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
- F_SET(clsm, WT_CLSM_OPEN_READ);
-
- if (lsm_tree->nchunks == 0)
- return (0);
-
- ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
- ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
- ckpt_cfg[2] = NULL;
-
- /*
- * If the key is pointing to memory that is pinned by a chunk
- * cursor, take a copy before closing cursors.
- */
- if (F_ISSET(c, WT_CURSTD_KEY_INT))
- WT_ERR(__cursor_needkey(c));
-
- F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
-
- __wt_lsm_tree_readlock(session, lsm_tree);
- locked = true;
-
- /* Merge cursors have already figured out how many chunks they need. */
-retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
- nchunks = clsm->nchunks;
- ngood = 0;
- WT_ERR(__clsm_resize_chunks(session, clsm, nchunks));
- /*
- * We may have raced with another merge completing. Check that
- * we're starting at the right offset in the chunk array.
- */
- if (start_chunk >= lsm_tree->nchunks ||
- lsm_tree->chunk[start_chunk]->id != start_id) {
- for (start_chunk = 0;
- start_chunk < lsm_tree->nchunks;
- start_chunk++) {
- chunk = lsm_tree->chunk[start_chunk];
- if (chunk->id == start_id)
- break;
- }
- /* We have to find the start chunk: merge locked it. */
- WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
- }
- } else {
- nchunks = lsm_tree->nchunks;
- WT_ERR(__clsm_resize_chunks(session, clsm, nchunks));
-
- /*
- * If we are only opening the cursor for updates, only open the
- * primary chunk, plus any other chunks that might be required
- * to detect snapshot isolation conflicts.
- */
- if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
- ngood = nupdates = 0;
- else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
- /*
- * Keep going until all updates in the next
- * chunk are globally visible. Copy the maximum
- * transaction IDs into the cursor as we go.
- */
- for (ngood = nchunks - 1, nupdates = 1; ngood > 0;
- ngood--, nupdates++) {
- chunk = lsm_tree->chunk[ngood - 1];
- clsm->chunks[ngood - 1]->switch_txn =
- chunk->switch_txn;
- if (__wt_lsm_chunk_visible_all(session, chunk))
- break;
- }
- } else {
- nupdates = 1;
- ngood = nchunks - 1;
- }
-
- /* Check how many cursors are already open. */
- for (; ngood < clsm->nchunks && ngood < nchunks; ngood++) {
- chunk = lsm_tree->chunk[ngood];
- cursor = clsm->chunks[ngood]->cursor;
-
- /* If the cursor isn't open yet, we're done. */
- if (cursor == NULL)
- break;
-
- /* Easy case: the URIs don't match. */
- if (strcmp(cursor->uri, chunk->uri) != 0)
- break;
-
- /*
- * Make sure the checkpoint config matches when not
- * using a custom data source.
- */
- if (lsm_tree->custom_generation == 0 ||
- chunk->generation < lsm_tree->custom_generation) {
- checkpoint = ((WT_CURSOR_BTREE *)cursor)->
- btree->dhandle->checkpoint;
- if (checkpoint == NULL &&
- F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
- !chunk->empty)
- break;
- }
-
- /* Make sure the Bloom config matches. */
- if (clsm->chunks[ngood]->bloom == NULL &&
- F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
- break;
- }
-
- /* Spurious generation bump? */
- if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
- clsm->dsk_gen = lsm_tree->dsk_gen;
- goto err;
- }
-
- /*
- * Close any cursors we no longer need.
- *
- * Drop the LSM tree lock while we do this: if the cache is
- * full, we may block while closing a cursor. Save the
- * generation number and retry if it has changed under us.
- */
- if (clsm->chunks != NULL && ngood < clsm->nchunks) {
- close_range_start = ngood;
- close_range_end = clsm->nchunks;
- } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) {
- close_range_start = 0;
- close_range_end = WT_MIN(nchunks, clsm->nchunks);
- if (close_range_end > nupdates)
- close_range_end -= nupdates;
- else
- close_range_end = 0;
- WT_ASSERT(session, ngood >= close_range_end);
- } else {
- close_range_end = 0;
- close_range_start = 0;
- }
- if (close_range_end > close_range_start) {
- saved_gen = lsm_tree->dsk_gen;
- locked = false;
- __wt_lsm_tree_readunlock(session, lsm_tree);
- WT_ERR(__clsm_close_cursors(session,
- clsm, close_range_start, close_range_end));
- __wt_lsm_tree_readlock(session, lsm_tree);
- locked = true;
- if (lsm_tree->dsk_gen != saved_gen)
- goto retry;
- }
-
- /* Detach from our old primary. */
- clsm->primary_chunk = NULL;
- clsm->current = NULL;
- }
-
- WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
- clsm->nchunks = nchunks;
-
- /* Open the cursors for chunks that have changed. */
- __wt_verbose(session, WT_VERB_LSM,
- "LSM opening cursor session(%p):clsm(%p)%s, chunks: %u, good: %u",
- (void *)session, (void *)clsm,
- update ? ", update" : "", nchunks, ngood);
- for (i = ngood; i != nchunks; i++) {
- chunk = lsm_tree->chunk[i + start_chunk];
- /* Copy the maximum transaction ID. */
- if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
- clsm->chunks[i]->switch_txn = chunk->switch_txn;
-
- /*
- * Read from the checkpoint if the file has been written.
- * Once all cursors switch, the in-memory tree can be evicted.
- */
- WT_ASSERT(session, clsm->chunks[i]->cursor == NULL);
- ret = __wt_open_cursor(session, chunk->uri, c,
- (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
- ckpt_cfg : NULL, &clsm->chunks[i]->cursor);
-
- /*
- * XXX kludge: we may have an empty chunk where no checkpoint
- * was written. If so, try to open the ordinary handle on that
- * chunk instead.
- */
- if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
- ret = __wt_open_cursor(session,
- chunk->uri, c, NULL, &clsm->chunks[i]->cursor);
- if (ret == 0)
- chunk->empty = 1;
- }
- WT_ERR(ret);
-
- /*
- * Setup all cursors other than the primary to only do conflict
- * checks on insert operations. This allows us to execute
- * inserts on non-primary chunks as a way of checking for
- * write conflicts with concurrent updates.
- */
- if (i != nchunks - 1)
- clsm->chunks[i]->cursor->insert =
- __wt_curfile_insert_check;
-
- if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
- F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
- WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
- lsm_tree->bloom_bit_count,
- lsm_tree->bloom_hash_count,
- c, &clsm->chunks[i]->bloom));
-
- /* Child cursors always use overwrite and raw mode. */
- F_SET(clsm->chunks[i]->cursor,
- WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
- }
-
- /* Setup the count values for each chunk in the chunks */
- for (i = 0; i != clsm->nchunks; i++)
- clsm->chunks[i]->count =
- lsm_tree->chunk[i + start_chunk]->count;
-
- /* The last chunk is our new primary. */
- if (chunk != NULL &&
- !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
- chunk->switch_txn == WT_TXN_NONE) {
- primary = clsm->chunks[clsm->nchunks - 1]->cursor;
- btree = ((WT_CURSOR_BTREE *)primary)->btree;
-
- /*
- * If the primary is not yet set as the primary, do that now.
- * Note that eviction was configured off when the underlying
- * object was created, which is what we want, leave it alone.
- *
- * We don't have to worry about races here: every thread that
- * modifies the tree will have to come through here, at worse
- * we set the flag repeatedly. We don't use a WT_BTREE handle
- * flag, however, we could race doing the read-modify-write of
- * the flags field.
- *
- * If something caused the chunk to be closed and reopened
- * since it was created, we can no longer use it as a primary
- * chunk and we need to force a switch. We detect the tree was
- * created when it was opened by checking the "original" flag.
- */
- if (!btree->lsm_primary && btree->original)
- btree->lsm_primary = true;
- if (btree->lsm_primary)
- clsm->primary_chunk = chunk;
- }
-
- clsm->dsk_gen = lsm_tree->dsk_gen;
+ WT_BTREE *btree;
+ WT_CURSOR *c, *cursor, *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+ uint64_t saved_gen;
+ u_int close_range_end, close_range_start;
+ u_int i, nchunks, ngood, nupdates;
+ const char *checkpoint, *ckpt_cfg[3];
+ bool locked;
+
+ c = &clsm->iface;
+ cursor = NULL;
+ session = (WT_SESSION_IMPL *)c->session;
+ txn = &session->txn;
+ chunk = NULL;
+ locked = false;
+ lsm_tree = clsm->lsm_tree;
+
+ /*
+ * Ensure that any snapshot update has cursors on the right set of chunks to guarantee
+ * visibility is correct.
+ */
+ if (update && txn->isolation == WT_ISO_SNAPSHOT)
+ F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
+
+ /*
+ * Query operations need a full set of cursors. Overwrite cursors do queries in service of
+ * updates.
+ */
+ if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
+ F_SET(clsm, WT_CLSM_OPEN_READ);
+
+ if (lsm_tree->nchunks == 0)
+ return (0);
+
+ ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
+ ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
+ ckpt_cfg[2] = NULL;
+
+ /*
+ * If the key is pointing to memory that is pinned by a chunk cursor, take a copy before closing
+ * cursors.
+ */
+ if (F_ISSET(c, WT_CURSTD_KEY_INT))
+ WT_ERR(__cursor_needkey(c));
+
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ __wt_lsm_tree_readlock(session, lsm_tree);
+ locked = true;
+
+/* Merge cursors have already figured out how many chunks they need. */
+retry:
+ if (F_ISSET(clsm, WT_CLSM_MERGE)) {
+ nchunks = clsm->nchunks;
+ ngood = 0;
+ WT_ERR(__clsm_resize_chunks(session, clsm, nchunks));
+ /*
+ * We may have raced with another merge completing. Check that we're starting at the right
+ * offset in the chunk array.
+ */
+ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) {
+ for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) {
+ chunk = lsm_tree->chunk[start_chunk];
+ if (chunk->id == start_id)
+ break;
+ }
+ /* We have to find the start chunk: merge locked it. */
+ WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
+ }
+ } else {
+ nchunks = lsm_tree->nchunks;
+ WT_ERR(__clsm_resize_chunks(session, clsm, nchunks));
+
+ /*
+ * If we are only opening the cursor for updates, only open the primary chunk, plus any
+ * other chunks that might be required to detect snapshot isolation conflicts.
+ */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
+ ngood = nupdates = 0;
+ else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ /*
+ * Keep going until all updates in the next chunk are globally visible. Copy the maximum
+ * transaction IDs into the cursor as we go.
+ */
+ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) {
+ chunk = lsm_tree->chunk[ngood - 1];
+ clsm->chunks[ngood - 1]->switch_txn = chunk->switch_txn;
+ if (__wt_lsm_chunk_visible_all(session, chunk))
+ break;
+ }
+ } else {
+ nupdates = 1;
+ ngood = nchunks - 1;
+ }
+
+ /* Check how many cursors are already open. */
+ for (; ngood < clsm->nchunks && ngood < nchunks; ngood++) {
+ chunk = lsm_tree->chunk[ngood];
+ cursor = clsm->chunks[ngood]->cursor;
+
+ /* If the cursor isn't open yet, we're done. */
+ if (cursor == NULL)
+ break;
+
+ /* Easy case: the URIs don't match. */
+ if (strcmp(cursor->uri, chunk->uri) != 0)
+ break;
+
+ /*
+ * Make sure the checkpoint config matches when not using a custom data source.
+ */
+ if (lsm_tree->custom_generation == 0 ||
+ chunk->generation < lsm_tree->custom_generation) {
+ checkpoint = ((WT_CURSOR_BTREE *)cursor)->btree->dhandle->checkpoint;
+ if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty)
+ break;
+ }
+
+ /* Make sure the Bloom config matches. */
+ if (clsm->chunks[ngood]->bloom == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ break;
+ }
+
+ /* Spurious generation bump? */
+ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+ goto err;
+ }
+
+ /*
+ * Close any cursors we no longer need.
+ *
+ * Drop the LSM tree lock while we do this: if the cache is
+ * full, we may block while closing a cursor. Save the
+ * generation number and retry if it has changed under us.
+ */
+ if (clsm->chunks != NULL && ngood < clsm->nchunks) {
+ close_range_start = ngood;
+ close_range_end = clsm->nchunks;
+ } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) {
+ close_range_start = 0;
+ close_range_end = WT_MIN(nchunks, clsm->nchunks);
+ if (close_range_end > nupdates)
+ close_range_end -= nupdates;
+ else
+ close_range_end = 0;
+ WT_ASSERT(session, ngood >= close_range_end);
+ } else {
+ close_range_end = 0;
+ close_range_start = 0;
+ }
+ if (close_range_end > close_range_start) {
+ saved_gen = lsm_tree->dsk_gen;
+ locked = false;
+ __wt_lsm_tree_readunlock(session, lsm_tree);
+ WT_ERR(__clsm_close_cursors(session, clsm, close_range_start, close_range_end));
+ __wt_lsm_tree_readlock(session, lsm_tree);
+ locked = true;
+ if (lsm_tree->dsk_gen != saved_gen)
+ goto retry;
+ }
+
+ /* Detach from our old primary. */
+ clsm->primary_chunk = NULL;
+ clsm->current = NULL;
+ }
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ clsm->nchunks = nchunks;
+
+ /* Open the cursors for chunks that have changed. */
+ __wt_verbose(session, WT_VERB_LSM,
+ "LSM opening cursor session(%p):clsm(%p)%s, chunks: %u, good: %u", (void *)session,
+ (void *)clsm, update ? ", update" : "", nchunks, ngood);
+ for (i = ngood; i != nchunks; i++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+ /* Copy the maximum transaction ID. */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ clsm->chunks[i]->switch_txn = chunk->switch_txn;
+
+ /*
+ * Read from the checkpoint if the file has been written. Once all cursors switch, the
+ * in-memory tree can be evicted.
+ */
+ WT_ASSERT(session, clsm->chunks[i]->cursor == NULL);
+ ret = __wt_open_cursor(session, chunk->uri, c,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL,
+ &clsm->chunks[i]->cursor);
+
+ /*
+ * XXX kludge: we may have an empty chunk where no checkpoint was written. If so, try to
+ * open the ordinary handle on that chunk instead.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ ret = __wt_open_cursor(session, chunk->uri, c, NULL, &clsm->chunks[i]->cursor);
+ if (ret == 0)
+ chunk->empty = 1;
+ }
+ WT_ERR(ret);
+
+ /*
+         * Set up all cursors other than the primary to do conflict checks only on insert operations.
+ * This allows us to execute inserts on non-primary chunks as a way of checking for write
+ * conflicts with concurrent updates.
+ */
+ if (i != nchunks - 1)
+ clsm->chunks[i]->cursor->insert = __wt_curfile_insert_check;
+
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count, c, &clsm->chunks[i]->bloom));
+
+ /* Child cursors always use overwrite and raw mode. */
+ F_SET(clsm->chunks[i]->cursor, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
+ }
+
+    /* Set up the count values for each chunk in the chunk array. */
+ for (i = 0; i != clsm->nchunks; i++)
+ clsm->chunks[i]->count = lsm_tree->chunk[i + start_chunk]->count;
+
+ /* The last chunk is our new primary. */
+ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) {
+ primary = clsm->chunks[clsm->nchunks - 1]->cursor;
+ btree = ((WT_CURSOR_BTREE *)primary)->btree;
+
+ /*
+         * If the primary is not yet set as the primary, do that now.
+         * Note that eviction was configured off when the underlying
+         * object was created, which is what we want; leave it alone.
+         *
+         * We don't have to worry about races here: every thread that
+         * modifies the tree will have to come through here; at worst
+         * we set the flag repeatedly. We don't use a WT_BTREE handle
+         * flag, however, because we could race doing the read-modify-write
+         * of the flags field.
+ *
+ * If something caused the chunk to be closed and reopened
+ * since it was created, we can no longer use it as a primary
+ * chunk and we need to force a switch. We detect the tree was
+ * created when it was opened by checking the "original" flag.
+ */
+ if (!btree->lsm_primary && btree->original)
+ btree->lsm_primary = true;
+ if (btree->lsm_primary)
+ clsm->primary_chunk = chunk;
+ }
+
+ clsm->dsk_gen = lsm_tree->dsk_gen;
err:
#ifdef HAVE_DIAGNOSTIC
- /* Check that all cursors are open as expected. */
- if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
- for (i = 0; i != clsm->nchunks; i++) {
- cursor = clsm->chunks[i]->cursor;
- chunk = lsm_tree->chunk[i + start_chunk];
-
- /* Make sure the first cursor is open. */
- WT_ASSERT(session, cursor != NULL);
-
- /* Easy case: the URIs should match. */
- WT_ASSERT(
- session, strcmp(cursor->uri, chunk->uri) == 0);
-
- /*
- * Make sure the checkpoint config matches when not
- * using a custom data source.
- */
- if (lsm_tree->custom_generation == 0 ||
- chunk->generation < lsm_tree->custom_generation) {
- checkpoint = ((WT_CURSOR_BTREE *)cursor)->
- btree->dhandle->checkpoint;
- WT_ASSERT(session,
- (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
- !chunk->empty) ?
- checkpoint != NULL : checkpoint == NULL);
- }
-
- /* Make sure the Bloom config matches. */
- WT_ASSERT(session,
- (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
- !F_ISSET(clsm, WT_CLSM_MERGE)) ?
- clsm->chunks[i]->bloom != NULL :
- clsm->chunks[i]->bloom == NULL);
- }
- }
+ /* Check that all cursors are open as expected. */
+ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
+ for (i = 0; i != clsm->nchunks; i++) {
+ cursor = clsm->chunks[i]->cursor;
+ chunk = lsm_tree->chunk[i + start_chunk];
+
+ /* Make sure the first cursor is open. */
+ WT_ASSERT(session, cursor != NULL);
+
+ /* Easy case: the URIs should match. */
+ WT_ASSERT(session, strcmp(cursor->uri, chunk->uri) == 0);
+
+ /*
+ * Make sure the checkpoint config matches when not using a custom data source.
+ */
+ if (lsm_tree->custom_generation == 0 ||
+ chunk->generation < lsm_tree->custom_generation) {
+ checkpoint = ((WT_CURSOR_BTREE *)cursor)->btree->dhandle->checkpoint;
+ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
+ checkpoint != NULL :
+ checkpoint == NULL);
+ }
+
+ /* Make sure the Bloom config matches. */
+ WT_ASSERT(
+ session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ?
+ clsm->chunks[i]->bloom != NULL :
+ clsm->chunks[i]->bloom == NULL);
+ }
+ }
#endif
- if (locked)
- __wt_lsm_tree_readunlock(session, lsm_tree);
- return (ret);
+ if (locked)
+ __wt_lsm_tree_readunlock(session, lsm_tree);
+ return (ret);
}
/*
* __wt_clsm_init_merge --
- * Initialize an LSM cursor for a merge.
+ * Initialize an LSM cursor for a merge.
*/
int
-__wt_clsm_init_merge(
- WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
+__wt_clsm_init_merge(WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
- clsm = (WT_CURSOR_LSM *)cursor;
- session = (WT_SESSION_IMPL *)cursor->session;
+ clsm = (WT_CURSOR_LSM *)cursor;
+ session = (WT_SESSION_IMPL *)cursor->session;
- F_SET(clsm, WT_CLSM_MERGE);
- if (start_chunk != 0)
- F_SET(clsm, WT_CLSM_MINOR_MERGE);
- clsm->nchunks = nchunks;
+ F_SET(clsm, WT_CLSM_MERGE);
+ if (start_chunk != 0)
+ F_SET(clsm, WT_CLSM_MINOR_MERGE);
+ clsm->nchunks = nchunks;
- WT_WITH_SCHEMA_LOCK(session,
- ret = __clsm_open_cursors(clsm, false, start_chunk, start_id));
- return (ret);
+ WT_WITH_SCHEMA_LOCK(session, ret = __clsm_open_cursors(clsm, false, start_chunk, start_id));
+ return (ret);
}
/*
* __clsm_get_current --
- * Find the smallest / largest of the cursors and copy its key/value.
+ * Find the smallest / largest of the cursors and copy its key/value.
*/
static int
-__clsm_get_current(WT_SESSION_IMPL *session,
- WT_CURSOR_LSM *clsm, bool smallest, bool *deletedp)
+__clsm_get_current(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, bool smallest, bool *deletedp)
{
- WT_CURSOR *c, *current;
- u_int i;
- int cmp;
- bool multiple;
-
- current = NULL;
- multiple = false;
-
- WT_FORALL_CURSORS(clsm, c, i) {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (current == NULL) {
- current = c;
- continue;
- }
- WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
- if (smallest ? cmp < 0 : cmp > 0) {
- current = c;
- multiple = false;
- } else if (cmp == 0)
- multiple = true;
- }
-
- c = &clsm->iface;
- if ((clsm->current = current) == NULL) {
- F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- return (WT_NOTFOUND);
- }
-
- if (multiple)
- F_SET(clsm, WT_CLSM_MULTIPLE);
- else
- F_CLR(clsm, WT_CLSM_MULTIPLE);
-
- WT_RET(current->get_key(current, &c->key));
- WT_RET(current->get_value(current, &c->value));
-
- F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if ((*deletedp = __clsm_deleted(clsm, &c->value)) == false)
- F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-
- return (0);
+ WT_CURSOR *c, *current;
+ u_int i;
+ int cmp;
+ bool multiple;
+
+ current = NULL;
+ multiple = false;
+
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (current == NULL) {
+ current = c;
+ continue;
+ }
+ WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
+ if (smallest ? cmp < 0 : cmp > 0) {
+ current = c;
+ multiple = false;
+ } else if (cmp == 0)
+ multiple = true;
+ }
+
+ c = &clsm->iface;
+ if ((clsm->current = current) == NULL) {
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ return (WT_NOTFOUND);
+ }
+
+ if (multiple)
+ F_SET(clsm, WT_CLSM_MULTIPLE);
+ else
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+
+ WT_RET(current->get_key(current, &c->key));
+ WT_RET(current->get_value(current, &c->value));
+
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((*deletedp = __clsm_deleted(clsm, &c->value)) == false)
+ F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ return (0);
}
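__clsm_get_current is a k-way minimum (or maximum) scan with duplicate detection; the WT_CLSM_MULTIPLE flag it maintains is what later tells next and prev to advance every cursor sitting on the same key. The same scan over a toy array of optional string keys, with strcmp standing in for the tree's collator:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* Keys the positioned chunk cursors sit on; NULL = not positioned. */
    const char *keys[] = {"m", NULL, "f", "f", "q"};
    const char *current = NULL;
    int multiple = 0;
    size_t i;

    for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
        if (keys[i] == NULL)
            continue;
        if (current == NULL || strcmp(keys[i], current) < 0) {
            current = keys[i];
            multiple = 0;
        } else if (strcmp(keys[i], current) == 0)
            multiple = 1; /* same key in more than one chunk */
    }
    printf("smallest key %s, multiple=%d\n", current, multiple);
    return (0);
}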
/*
* __clsm_compare --
- * WT_CURSOR->compare implementation for the LSM cursor type.
+ * WT_CURSOR->compare implementation for the LSM cursor type.
*/
static int
__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
- WT_CURSOR_LSM *alsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
+ WT_CURSOR_LSM *alsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
- /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
- alsm = (WT_CURSOR_LSM *)a;
- CURSOR_API_CALL(a, session, compare, NULL);
+ /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
+ alsm = (WT_CURSOR_LSM *)a;
+ CURSOR_API_CALL(a, session, compare, NULL);
- /*
- * Confirm both cursors refer to the same source and have keys, then
- * compare the keys.
- */
- if (strcmp(a->uri, b->uri) != 0)
- WT_ERR_MSG(session, EINVAL,
- "comparison method cursors must reference the same object");
+ /*
+ * Confirm both cursors refer to the same source and have keys, then compare the keys.
+ */
+ if (strcmp(a->uri, b->uri) != 0)
+ WT_ERR_MSG(session, EINVAL, "comparison method cursors must reference the same object");
- WT_ERR(__cursor_needkey(a));
- WT_ERR(__cursor_needkey(b));
+ WT_ERR(__cursor_needkey(a));
+ WT_ERR(__cursor_needkey(b));
- WT_ERR(__wt_compare(
- session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp));
+ WT_ERR(__wt_compare(session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp));
-err: API_END_RET(session, ret);
+err:
+ API_END_RET(session, ret);
}
/*
* __clsm_position_chunk --
- * Position a chunk cursor.
+ * Position a chunk cursor.
*/
static int
-__clsm_position_chunk(
- WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp)
+__clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp)
{
- WT_CURSOR *cursor;
- WT_SESSION_IMPL *session;
-
- cursor = &clsm->iface;
- session = (WT_SESSION_IMPL *)cursor->session;
-
- c->set_key(c, &cursor->key);
- WT_RET(c->search_near(c, cmpp));
-
- while (forward ? *cmpp < 0 : *cmpp > 0) {
- WT_RET(forward ? c->next(c) : c->prev(c));
-
- /*
- * With higher isolation levels, where we have stable reads,
- * we're done: the cursor is now positioned as expected.
- *
- * With read-uncommitted isolation, a new record could have
- * appeared in between the search and stepping forward / back.
- * In that case, keep going until we see a key in the expected
- * range.
- */
- if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED)
- return (0);
-
- WT_RET(WT_LSM_CURCMP(session,
- clsm->lsm_tree, c, cursor, *cmpp));
- }
-
- return (0);
+ WT_CURSOR *cursor;
+ WT_SESSION_IMPL *session;
+
+ cursor = &clsm->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ c->set_key(c, &cursor->key);
+ WT_RET(c->search_near(c, cmpp));
+
+ while (forward ? *cmpp < 0 : *cmpp > 0) {
+ WT_RET(forward ? c->next(c) : c->prev(c));
+
+ /*
+ * With higher isolation levels, where we have stable reads,
+ * we're done: the cursor is now positioned as expected.
+ *
+ * With read-uncommitted isolation, a new record could have
+ * appeared in between the search and stepping forward / back.
+ * In that case, keep going until we see a key in the expected
+ * range.
+ */
+ if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED)
+ return (0);
+
+ WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, cursor, *cmpp));
+ }
+
+ return (0);
}
/*
* __clsm_next --
- * WT_CURSOR->next method for the LSM cursor type.
+ * WT_CURSOR->next method for the LSM cursor type.
*/
static int
__clsm_next(WT_CURSOR *cursor)
{
- WT_CURSOR *c;
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp;
- bool deleted;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_API_CALL(cursor, session, next, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, false, false));
-
- /* If we aren't positioned for a forward scan, get started. */
- if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
- WT_FORALL_CURSORS(clsm, c, i) {
- if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
- WT_ERR(c->reset(c));
- ret = c->next(c);
- } else if (c != clsm->current && (ret =
- __clsm_position_chunk(clsm, c, true, &cmp)) == 0 &&
- cmp == 0 && clsm->current == NULL)
- clsm->current = c;
- WT_ERR_NOTFOUND_OK(ret);
- }
- F_SET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_MULTIPLE);
- F_CLR(clsm, WT_CLSM_ITERATE_PREV);
-
- /* We just positioned *at* the key, now move. */
- if (clsm->current != NULL)
- goto retry;
- } else {
+ WT_CURSOR *c;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp;
+ bool deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, false, false));
+
+ /* If we aren't positioned for a forward scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->next(c);
+ } else if (c != clsm->current &&
+ (ret = __clsm_position_chunk(clsm, c, true, &cmp)) == 0 && cmp == 0 &&
+ clsm->current == NULL)
+ clsm->current = c;
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_MULTIPLE);
+ F_CLR(clsm, WT_CLSM_ITERATE_PREV);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
retry:
- /*
- * If there are multiple cursors on that key, move them
- * forward.
- */
- if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
- WT_FORALL_CURSORS(clsm, c, i) {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (c != clsm->current) {
- WT_ERR(WT_LSM_CURCMP(session,
- clsm->lsm_tree, c, clsm->current,
- cmp));
- if (cmp == 0)
- WT_ERR_NOTFOUND_OK(c->next(c));
- }
- }
- }
-
- /* Move the smallest cursor forward. */
- c = clsm->current;
- WT_ERR_NOTFOUND_OK(c->next(c));
- }
-
- /* Find the cursor(s) with the smallest key. */
- if ((ret = __clsm_get_current(session, clsm, true, &deleted)) == 0 &&
- deleted)
- goto retry;
-
-err: __clsm_leave(clsm);
- if (ret == 0)
- __clsm_deleted_decode(clsm, &cursor->value);
- API_END_RET(session, ret);
+ /*
+ * If there are multiple cursors on that key, move them forward.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (c != clsm->current) {
+ WT_ERR(WT_LSM_CURCMP(session, clsm->lsm_tree, c, clsm->current, cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+ }
+ }
+
+ /* Move the smallest cursor forward. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+
+ /* Find the cursor(s) with the smallest key. */
+ if ((ret = __clsm_get_current(session, clsm, true, &deleted)) == 0 && deleted)
+ goto retry;
+
+err:
+ __clsm_leave(clsm);
+ if (ret == 0)
+ __clsm_deleted_decode(clsm, &cursor->value);
+ API_END_RET(session, ret);
}
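Stripped of the cursor plumbing, the forward scan is a k-way merge in which the newest chunk wins ties, duplicates in older chunks are stepped past, and tombstones are swallowed by the retry loop. A two-chunk toy version with hypothetical data, where "-" stands in for the tombstone:

#include <stdio.h>
#include <string.h>

/* Two sorted "chunks"; newer entries shadow older ones for the same key. */
static const char *new_keys[] = {"a", "c", NULL}, *new_vals[] = {"A2", "-", NULL};
static const char *old_keys[] = {"a", "b", "c", NULL}, *old_vals[] = {"A1", "B1", "C1", NULL};

int
main(void)
{
    size_t ni = 0, oi = 0;

    while (new_keys[ni] != NULL || old_keys[oi] != NULL) {
        int cmp;
        const char *key, *val;

        if (new_keys[ni] == NULL)
            cmp = 1;
        else if (old_keys[oi] == NULL)
            cmp = -1;
        else
            cmp = strcmp(new_keys[ni], old_keys[oi]);

        if (cmp <= 0) { /* newest chunk wins ties */
            key = new_keys[ni];
            val = new_vals[ni];
            ni++;
            if (cmp == 0)
                oi++; /* advance the shadowed duplicate too */
        } else {
            key = old_keys[oi];
            val = old_vals[oi];
            oi++;
        }
        if (strcmp(val, "-") == 0)
            continue; /* tombstone: the key was deleted, skip it */
        printf("%s=%s\n", key, val);
    }
    return (0);
}

It prints a=A2 and b=B1: the newer chunk's value shadows the older one for "a", and the tombstone in the newer chunk hides "c" entirely.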
/*
* __clsm_random_chunk --
- * Pick a chunk at random, weighted by the size of all chunks. Weighting
- * proportional to documents avoids biasing towards small chunks. Then return
- * the cursor on the chunk we have picked.
+ * Pick a chunk at random, weighted by the size of all chunks. Weighting proportional to
+ * documents avoids biasing towards small chunks. Then return the cursor on the chunk we have
+ * picked.
*/
static int
-__clsm_random_chunk(WT_SESSION_IMPL *session,
- WT_CURSOR_LSM *clsm, WT_CURSOR **cursor)
+__clsm_random_chunk(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, WT_CURSOR **cursor)
{
- uint64_t checked_docs, i, rand_doc, total_docs;
-
- /*
- * If the tree is empty we cannot do a random lookup, so return a
- * WT_NOTFOUND.
- */
- if (clsm->nchunks == 0)
- return (WT_NOTFOUND);
- for (total_docs = i = 0; i < clsm->nchunks; i++) {
- total_docs += clsm->chunks[i]->count;
- }
- if (total_docs == 0)
- return (WT_NOTFOUND);
-
- rand_doc = __wt_random(&session->rnd) % total_docs;
-
- for (checked_docs = i = 0; i < clsm->nchunks; i++) {
- checked_docs += clsm->chunks[i]->count;
- if (rand_doc <= checked_docs) {
- *cursor = clsm->chunks[i]->cursor;
- break;
- }
- }
- return (0);
+ uint64_t checked_docs, i, rand_doc, total_docs;
+
+ /*
+     * If the tree is empty, we cannot do a random lookup, so return WT_NOTFOUND.
+ */
+ if (clsm->nchunks == 0)
+ return (WT_NOTFOUND);
+ for (total_docs = i = 0; i < clsm->nchunks; i++) {
+ total_docs += clsm->chunks[i]->count;
+ }
+ if (total_docs == 0)
+ return (WT_NOTFOUND);
+
+ rand_doc = __wt_random(&session->rnd) % total_docs;
+
+ for (checked_docs = i = 0; i < clsm->nchunks; i++) {
+ checked_docs += clsm->chunks[i]->count;
+ if (rand_doc <= checked_docs) {
+ *cursor = clsm->chunks[i]->cursor;
+ break;
+ }
+ }
+ return (0);
}
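The selection is weighted sampling over prefix sums: draw a document index in [0, total_docs) and pick the first chunk whose cumulative count exceeds it. A standalone sketch with toy counts (rand() stands in for __wt_random; the sketch uses a strict comparison so that every index maps to exactly one chunk):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    /* Documents per chunk: bigger chunks should be picked more often. */
    uint64_t counts[3] = {100, 10, 1};
    uint64_t total = 0, rand_doc, seen;
    int picks[3] = {0, 0, 0};
    size_t i;
    int trial;

    for (i = 0; i < 3; i++)
        total += counts[i];

    for (trial = 0; trial < 10000; trial++) {
        rand_doc = (uint64_t)rand() % total; /* __wt_random in the source */
        for (seen = 0, i = 0; i < 3; i++) {
            seen += counts[i];
            if (rand_doc < seen) { /* strict <: one chunk per index */
                picks[i]++;
                break;
            }
        }
    }
    for (i = 0; i < 3; i++)
        printf("chunk %zu picked %d times\n", i, picks[i]);
    return (0);
}

The picks come out roughly in the 100:10:1 ratio of the counts, which is the point: sampling by document avoids biasing towards small chunks.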
/*
* __clsm_next_random --
- * WT_CURSOR->next method for the LSM cursor type when configured with
- * next_random.
+ * WT_CURSOR->next method for the LSM cursor type when configured with next_random.
*/
static int
__clsm_next_random(WT_CURSOR *cursor)
{
- WT_CURSOR *c;
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- int exact;
-
- c = NULL;
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_API_CALL(cursor, session, next, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, false, false));
-
- for (;;) {
- WT_ERR(__clsm_random_chunk(session, clsm, &c));
- /*
- * This call to next_random on the chunk can potentially end in
- * WT_NOTFOUND if the chunk we picked is empty. We want to retry
- * in that case.
- */
- ret = __wt_curfile_next_random(c);
- if (ret == WT_NOTFOUND)
- continue;
-
- WT_ERR(ret);
- F_SET(cursor, WT_CURSTD_KEY_INT);
- WT_ERR(c->get_key(c, &cursor->key));
- /*
- * Search near the current key to resolve any tombstones
- * and position to a valid document. If we see a
- * WT_NOTFOUND here that is valid, as the tree has no
- * documents visible to us.
- */
- WT_ERR(__clsm_search_near(cursor, &exact));
- break;
- }
-
- /* We have found a valid doc. Set that we are now positioned */
- if (0) {
-err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- }
- __clsm_leave(clsm);
- API_END_RET(session, ret);
+ WT_CURSOR *c;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int exact;
+
+ c = NULL;
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, false, false));
+
+ for (;;) {
+ WT_ERR(__clsm_random_chunk(session, clsm, &c));
+ /*
+ * This call to next_random on the chunk can potentially end in WT_NOTFOUND if the chunk we
+ * picked is empty. We want to retry in that case.
+ */
+ ret = __wt_curfile_next_random(c);
+ if (ret == WT_NOTFOUND)
+ continue;
+
+ WT_ERR(ret);
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ WT_ERR(c->get_key(c, &cursor->key));
+ /*
+         * Search near the current key to resolve any tombstones and position to a valid document.
+         * If we see WT_NOTFOUND here, that is valid: the tree has no documents visible to us.
+ */
+ WT_ERR(__clsm_search_near(cursor, &exact));
+ break;
+ }
+
+    /* We have found a valid doc; the cursor is now positioned. */
+ if (0) {
+err:
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ }
+ __clsm_leave(clsm);
+ API_END_RET(session, ret);
}
/*
* __clsm_prev --
- * WT_CURSOR->prev method for the LSM cursor type.
+ * WT_CURSOR->prev method for the LSM cursor type.
*/
static int
__clsm_prev(WT_CURSOR *cursor)
{
- WT_CURSOR *c;
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp;
- bool deleted;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_API_CALL(cursor, session, prev, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, false, false));
-
- /* If we aren't positioned for a reverse scan, get started. */
- if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
- WT_FORALL_CURSORS(clsm, c, i) {
- if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
- WT_ERR(c->reset(c));
- ret = c->prev(c);
- } else if (c != clsm->current && (ret =
- __clsm_position_chunk(clsm, c, false, &cmp)) == 0 &&
- cmp == 0 && clsm->current == NULL)
- clsm->current = c;
- WT_ERR_NOTFOUND_OK(ret);
- }
- F_SET(clsm, WT_CLSM_ITERATE_PREV | WT_CLSM_MULTIPLE);
- F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
-
- /* We just positioned *at* the key, now move. */
- if (clsm->current != NULL)
- goto retry;
- } else {
+ WT_CURSOR *c;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp;
+ bool deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, false, false));
+
+ /* If we aren't positioned for a reverse scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->prev(c);
+ } else if (c != clsm->current &&
+ (ret = __clsm_position_chunk(clsm, c, false, &cmp)) == 0 && cmp == 0 &&
+ clsm->current == NULL)
+ clsm->current = c;
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_PREV | WT_CLSM_MULTIPLE);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
retry:
- /*
- * If there are multiple cursors on that key, move them
- * backwards.
- */
- if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
- WT_FORALL_CURSORS(clsm, c, i) {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (c != clsm->current) {
- WT_ERR(WT_LSM_CURCMP(session,
- clsm->lsm_tree, c, clsm->current,
- cmp));
- if (cmp == 0)
- WT_ERR_NOTFOUND_OK(c->prev(c));
- }
- }
- }
-
- /* Move the largest cursor backwards. */
- c = clsm->current;
- WT_ERR_NOTFOUND_OK(c->prev(c));
- }
-
- /* Find the cursor(s) with the largest key. */
- if ((ret = __clsm_get_current(session, clsm, false, &deleted)) == 0 &&
- deleted)
- goto retry;
-
-err: __clsm_leave(clsm);
- if (ret == 0)
- __clsm_deleted_decode(clsm, &cursor->value);
- API_END_RET(session, ret);
+ /*
+ * If there are multiple cursors on that key, move them backwards.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (c != clsm->current) {
+ WT_ERR(WT_LSM_CURCMP(session, clsm->lsm_tree, c, clsm->current, cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+ }
+ }
+
+ /* Move the largest cursor backwards. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+
+ /* Find the cursor(s) with the largest key. */
+ if ((ret = __clsm_get_current(session, clsm, false, &deleted)) == 0 && deleted)
+ goto retry;
+
+err:
+ __clsm_leave(clsm);
+ if (ret == 0)
+ __clsm_deleted_decode(clsm, &cursor->value);
+ API_END_RET(session, ret);
}
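Reverse iteration is a k-way merge across the per-chunk cursors: every positioned cursor is parked on some key, duplicates of the key just returned are stepped past, and __clsm_get_current then selects the largest remaining key, with the newest chunk winning ties. A minimal sketch of that selection step over plain string keys; names and types are illustrative:

    #include <string.h>

    /*
     * Given each chunk's current key (NULL once a chunk is exhausted), return
     * the slot holding the largest key, or -1 when all are exhausted. Slots
     * are ordered oldest to newest; scanning from the highest slot with a
     * strict comparison makes the newest chunk win ties, mirroring LSM
     * visibility.
     */
    static int
    pick_largest(char **cur_key, int nslots)
    {
        int best, i;

        for (best = -1, i = nslots - 1; i >= 0; i--) {
            if (cur_key[i] == NULL)
                continue;
            if (best == -1 || strcmp(cur_key[i], cur_key[best]) > 0)
                best = i;
        }
        return (best);
    }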
/*
* __clsm_reset_cursors --
- * Reset any positioned chunk cursors.
- *
- * If the skip parameter is non-NULL, that cursor is about to be used, so
- * there is no need to reset it.
+ * Reset any positioned chunk cursors. If the skip parameter is non-NULL, that cursor is about
+ * to be used, so there is no need to reset it.
*/
static int
__clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip)
{
- WT_CURSOR *c;
- WT_DECL_RET;
- u_int i;
-
- /* Fast path if the cursor is not positioned. */
- if ((clsm->current == NULL || clsm->current == skip) &&
- !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV))
- return (0);
-
- WT_FORALL_CURSORS(clsm, c, i) {
- if (c == skip)
- continue;
- if (F_ISSET(c, WT_CURSTD_KEY_INT))
- WT_TRET(c->reset(c));
- }
-
- clsm->current = NULL;
- F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
-
- return (ret);
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ u_int i;
+
+ /* Fast path if the cursor is not positioned. */
+ if ((clsm->current == NULL || clsm->current == skip) &&
+ !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV))
+ return (0);
+
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ if (c == skip)
+ continue;
+ if (F_ISSET(c, WT_CURSTD_KEY_INT))
+ WT_TRET(c->reset(c));
+ }
+
+ clsm->current = NULL;
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ return (ret);
}
/*
* __clsm_reset --
- * WT_CURSOR->reset method for the LSM cursor type.
+ * WT_CURSOR->reset method for the LSM cursor type.
*/
static int
__clsm_reset(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
- /*
- * Don't use the normal __clsm_enter path: that is wasted work when all
- * we want to do is give up our position.
- */
- clsm = (WT_CURSOR_LSM *)cursor;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL);
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when all we want to do is give up
+ * our position.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- WT_TRET(__clsm_reset_cursors(clsm, NULL));
+ WT_TRET(__clsm_reset_cursors(clsm, NULL));
- /* In case we were left positioned, clear that. */
- __clsm_leave(clsm);
+ /* In case we were left positioned, clear that. */
+ __clsm_leave(clsm);
-err: API_END_RET(session, ret);
+err:
+ API_END_RET(session, ret);
}
/*
* __clsm_lookup --
- * Position an LSM cursor.
+ * Position an LSM cursor.
*/
static int
__clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
{
- WT_BLOOM *bloom;
- WT_BLOOM_HASH bhash;
- WT_CURSOR *c, *cursor;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- bool have_hash;
-
- c = NULL;
- cursor = &clsm->iface;
- have_hash = false;
- session = (WT_SESSION_IMPL *)cursor->session;
-
- WT_FORALL_CURSORS(clsm, c, i) {
- /* If there is a Bloom filter, see if we can skip the read. */
- bloom = NULL;
- if ((bloom = clsm->chunks[i]->bloom) != NULL) {
- if (!have_hash) {
- __wt_bloom_hash(bloom, &cursor->key, &bhash);
- have_hash = true;
- }
-
- ret = __wt_bloom_hash_get(bloom, &bhash);
- if (ret == WT_NOTFOUND) {
- WT_LSM_TREE_STAT_INCR(
- session, clsm->lsm_tree->bloom_miss);
- continue;
- }
- if (ret == 0)
- WT_LSM_TREE_STAT_INCR(
- session, clsm->lsm_tree->bloom_hit);
- WT_ERR(ret);
- }
- c->set_key(c, &cursor->key);
- if ((ret = c->search(c)) == 0) {
- WT_ERR(c->get_key(c, &cursor->key));
- WT_ERR(c->get_value(c, value));
- if (__clsm_deleted(clsm, value))
- ret = WT_NOTFOUND;
- goto done;
- }
- WT_ERR_NOTFOUND_OK(ret);
- F_CLR(c, WT_CURSTD_KEY_SET);
- /* Update stats: the active chunk can't have a bloom filter. */
- if (bloom != NULL)
- WT_LSM_TREE_STAT_INCR(session,
- clsm->lsm_tree->bloom_false_positive);
- else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
- WT_LSM_TREE_STAT_INCR(session,
- clsm->lsm_tree->lsm_lookup_no_bloom);
- }
- WT_ERR(WT_NOTFOUND);
+ WT_BLOOM *bloom;
+ WT_BLOOM_HASH bhash;
+ WT_CURSOR *c, *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ bool have_hash;
+
+ c = NULL;
+ cursor = &clsm->iface;
+ have_hash = false;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ /* If there is a Bloom filter, see if we can skip the read. */
+ bloom = NULL;
+ if ((bloom = clsm->chunks[i]->bloom) != NULL) {
+ if (!have_hash) {
+ __wt_bloom_hash(bloom, &cursor->key, &bhash);
+ have_hash = true;
+ }
+
+ ret = __wt_bloom_hash_get(bloom, &bhash);
+ if (ret == WT_NOTFOUND) {
+ WT_LSM_TREE_STAT_INCR(session, clsm->lsm_tree->bloom_miss);
+ continue;
+ }
+ if (ret == 0)
+ WT_LSM_TREE_STAT_INCR(session, clsm->lsm_tree->bloom_hit);
+ WT_ERR(ret);
+ }
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search(c)) == 0) {
+ WT_ERR(c->get_key(c, &cursor->key));
+ WT_ERR(c->get_value(c, value));
+ if (__clsm_deleted(clsm, value))
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ /* Update stats: the active chunk can't have a bloom filter. */
+ if (bloom != NULL)
+ WT_LSM_TREE_STAT_INCR(session, clsm->lsm_tree->bloom_false_positive);
+ else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
+ WT_LSM_TREE_STAT_INCR(session, clsm->lsm_tree->lsm_lookup_no_bloom);
+ }
+ WT_ERR(WT_NOTFOUND);
done:
-err: if (ret == 0) {
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- F_SET(cursor, WT_CURSTD_KEY_INT);
- clsm->current = c;
- if (value == &cursor->value)
- F_SET(cursor, WT_CURSTD_VALUE_INT);
- } else if (c != NULL)
- WT_TRET(c->reset(c));
-
- return (ret);
+err:
+ if (ret == 0) {
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ clsm->current = c;
+ if (value == &cursor->value)
+ F_SET(cursor, WT_CURSTD_VALUE_INT);
+ } else if (c != NULL)
+ WT_TRET(c->reset(c));
+
+ return (ret);
}
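The structure above is the classic Bloom-guarded point read: hash the search key once, probe each chunk's filter, and pay for a real search only in chunks the filter cannot rule out; a filter "yes" can still be a false positive, which the stats above track. A compact sketch under simplifying assumptions (one record per chunk, a stand-in filter predicate; none of these names are WiredTiger's):

    #include <stdbool.h>
    #include <stddef.h>
    #include <string.h>

    struct chunk {
        const void *bloom;       /* NULL when the chunk has no filter */
        const char *key, *value; /* single-record chunk, for illustration */
    };

    /* Stand-in for a real filter probe: false means "definitely absent". */
    static bool
    bloom_may_contain(const void *bloom, const char *key)
    {
        (void)bloom;
        (void)key;
        return (true); /* a real filter hashes the key and tests its bits */
    }

    /* Returns 0 on hit, 1 on miss; filters only skip work, never change the
     * answer: a "maybe" still requires a real search. */
    static int
    guarded_lookup(const struct chunk *chunks, int nslots, const char *key,
        const char **valuep)
    {
        int i;

        for (i = nslots - 1; i >= 0; i--) { /* newest chunk first */
            if (chunks[i].bloom != NULL &&
                !bloom_may_contain(chunks[i].bloom, key))
                continue; /* definite miss: skip the read entirely */
            if (chunks[i].key != NULL && strcmp(chunks[i].key, key) == 0) {
                *valuep = chunks[i].value; /* may still be a tombstone */
                return (0);
            }
        }
        return (1);
    }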
/*
* __clsm_search --
- * WT_CURSOR->search method for the LSM cursor type.
+ * WT_CURSOR->search method for the LSM cursor type.
*/
static int
__clsm_search(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
- clsm = (WT_CURSOR_LSM *)cursor;
+ clsm = (WT_CURSOR_LSM *)cursor;
- CURSOR_API_CALL(cursor, session, search, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, true, false));
- F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ WT_ERR(__cursor_needkey(cursor));
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, true, false));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
- ret = __clsm_lookup(clsm, &cursor->value);
+ ret = __clsm_lookup(clsm, &cursor->value);
-err: __clsm_leave(clsm);
- if (ret == 0)
- __clsm_deleted_decode(clsm, &cursor->value);
- API_END_RET(session, ret);
+err:
+ __clsm_leave(clsm);
+ if (ret == 0)
+ __clsm_deleted_decode(clsm, &cursor->value);
+ API_END_RET(session, ret);
}
/*
* __clsm_search_near --
- * WT_CURSOR->search_near method for the LSM cursor type.
+ * WT_CURSOR->search_near method for the LSM cursor type.
*/
static int
__clsm_search_near(WT_CURSOR *cursor, int *exactp)
{
- WT_CURSOR *c, *closest;
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp, exact;
- bool deleted;
-
- closest = NULL;
- clsm = (WT_CURSOR_LSM *)cursor;
- exact = 0;
-
- CURSOR_API_CALL(cursor, session, search_near, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, true, false));
- F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
-
- /*
- * search_near is somewhat fiddly: we can't just use a nearby key from
- * the in-memory chunk because there could be a closer key on disk.
- *
- * As we search down the chunks, we stop as soon as we find an exact
- * match. Otherwise, we maintain the smallest cursor larger than the
- * search key and the largest cursor smaller than the search key. At
- * the end, we prefer the larger cursor, but if no record is larger,
- * position on the last record in the tree.
- */
- WT_FORALL_CURSORS(clsm, c, i) {
- c->set_key(c, &cursor->key);
- if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
- ret = 0;
- continue;
- }
- if (ret != 0)
- goto err;
-
- /* Do we have an exact match? */
- if (cmp == 0) {
- closest = c;
- exact = 1;
- break;
- }
-
- /*
- * Prefer larger cursors. There are two reasons: (1) we expect
- * prefix searches to be a common case (as in our own indices);
- * and (2) we need a way to unambiguously know we have the
- * "closest" result.
- */
- if (cmp < 0) {
- if ((ret = c->next(c)) == WT_NOTFOUND) {
- ret = 0;
- continue;
- }
- if (ret != 0)
- goto err;
- }
-
- /*
- * We are trying to find the smallest cursor greater than the
- * search key.
- */
- if (closest == NULL)
- closest = c;
- else {
- WT_ERR(WT_LSM_CURCMP(session,
- clsm->lsm_tree, c, closest, cmp));
- if (cmp < 0)
- closest = c;
- }
- }
-
- /*
- * At this point, we either have an exact match, or closest is the
- * smallest cursor larger than the search key, or it is NULL if the
- * search key is larger than any record in the tree.
- */
- cmp = exact ? 0 : 1;
-
- /*
- * If we land on a deleted item, try going forwards or backwards to
- * find one that isn't deleted. If the whole tree is empty, we'll
- * end up with WT_NOTFOUND, as expected.
- */
- if (closest == NULL)
- deleted = true;
- else {
- WT_ERR(closest->get_key(closest, &cursor->key));
- WT_ERR(closest->get_value(closest, &cursor->value));
- clsm->current = closest;
- closest = NULL;
- deleted = __clsm_deleted(clsm, &cursor->value);
- if (!deleted)
- __clsm_deleted_decode(clsm, &cursor->value);
- else {
- /*
- * We have a key pointing at memory that is
- * pinned by the current chunk cursor. In the
- * unlikely event that we have to reopen cursors
- * to move to the next record, make sure the cursor
- * flags are set so a copy is made before the current
- * chunk cursor releases its position.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET);
- F_SET(cursor, WT_CURSTD_KEY_INT);
- /*
- * We call __clsm_next here as we want to advance
- * forward. If we are a random LSM cursor calling next
- * on the cursor will not advance as we intend.
- */
- if ((ret = __clsm_next(cursor)) == 0) {
- cmp = 1;
- deleted = false;
- }
- }
- WT_ERR_NOTFOUND_OK(ret);
- }
- if (deleted) {
- clsm->current = NULL;
- /*
- * We call prev directly here as cursor->prev may be "invalid"
- * if this is a random cursor.
- */
- WT_ERR(__clsm_prev(cursor));
- cmp = -1;
- }
- *exactp = cmp;
-
-err: __clsm_leave(clsm);
- if (closest != NULL)
- WT_TRET(closest->reset(closest));
-
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if (ret == 0) {
- F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- } else
- clsm->current = NULL;
-
- API_END_RET(session, ret);
+ WT_CURSOR *c, *closest;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp, exact;
+ bool deleted;
+
+ closest = NULL;
+ clsm = (WT_CURSOR_LSM *)cursor;
+ exact = 0;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ WT_ERR(__cursor_needkey(cursor));
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, true, false));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ /*
+ * search_near is somewhat fiddly: we can't just use a nearby key from
+ * the in-memory chunk because there could be a closer key on disk.
+ *
+ * As we search down the chunks, we stop as soon as we find an exact
+ * match. Otherwise, we maintain the smallest cursor larger than the
+ * search key and the largest cursor smaller than the search key. At
+ * the end, we prefer the larger cursor, but if no record is larger,
+ * position on the last record in the tree.
+ */
+ WT_FORALL_CURSORS(clsm, c, i)
+ {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Do we have an exact match? */
+ if (cmp == 0) {
+ closest = c;
+ exact = 1;
+ break;
+ }
+
+ /*
+ * Prefer larger cursors. There are two reasons: (1) we expect
+ * prefix searches to be a common case (as in our own indices);
+ * and (2) we need a way to unambiguously know we have the
+ * "closest" result.
+ */
+ if (cmp < 0) {
+ if ((ret = c->next(c)) == WT_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * We are trying to find the smallest cursor greater than the search key.
+ */
+ if (closest == NULL)
+ closest = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session, clsm->lsm_tree, c, closest, cmp));
+ if (cmp < 0)
+ closest = c;
+ }
+ }
+
+ /*
+ * At this point, we either have an exact match, or closest is the smallest cursor larger than
+ * the search key, or it is NULL if the search key is larger than any record in the tree.
+ */
+ cmp = exact ? 0 : 1;
+
+ /*
+ * If we land on a deleted item, try going forwards or backwards to find one that isn't deleted.
+ * If the whole tree is empty, we'll end up with WT_NOTFOUND, as expected.
+ */
+ if (closest == NULL)
+ deleted = true;
+ else {
+ WT_ERR(closest->get_key(closest, &cursor->key));
+ WT_ERR(closest->get_value(closest, &cursor->value));
+ clsm->current = closest;
+ closest = NULL;
+ deleted = __clsm_deleted(clsm, &cursor->value);
+ if (!deleted)
+ __clsm_deleted_decode(clsm, &cursor->value);
+ else {
+ /*
+ * We have a key pointing at memory that is pinned by the current chunk cursor. In the
+ * unlikely event that we have to reopen cursors to move to the next record, make sure
+ * the cursor flags are set so a copy is made before the current chunk cursor releases
+ * its position.
+ */
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ /*
+ * We call __clsm_next here as we want to advance forward. If we are a random LSM cursor,
+ * calling next on the cursor will not advance as we intend.
+ */
+ if ((ret = __clsm_next(cursor)) == 0) {
+ cmp = 1;
+ deleted = false;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ if (deleted) {
+ clsm->current = NULL;
+ /*
+ * We call prev directly here as cursor->prev may be "invalid" if this is a random cursor.
+ */
+ WT_ERR(__clsm_prev(cursor));
+ cmp = -1;
+ }
+ *exactp = cmp;
+
+err:
+ __clsm_leave(clsm);
+ if (closest != NULL)
+ WT_TRET(closest->reset(closest));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ } else
+ clsm->current = NULL;
+
+ API_END_RET(session, ret);
}
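The exactp result follows the standard WT_CURSOR->search_near contract: zero for an exact match, positive when the cursor landed on a larger key, negative when it landed on a smaller one. A hedged usage sketch, assuming an open cursor c on a string-keyed table:

    int exact, ret;

    c->set_key(c, "prefix-");
    if ((ret = c->search_near(c, &exact)) == 0) {
        if (exact >= 0) {
            /* Positioned at or after the search key: the natural start of
             * a forward prefix scan, the case the code above favors. */
        } else {
            /* Positioned before the search key: nothing >= the key
             * exists in the tree. */
        }
    }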
/*
* __clsm_put --
- * Put an entry into the in-memory tree, trigger a file switch if
- * necessary.
+ * Put an entry into the in-memory tree, trigger a file switch if necessary.
*/
static inline int
-__clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm,
- const WT_ITEM *key, const WT_ITEM *value, bool position, bool reserve)
+__clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, const WT_ITEM *key, const WT_ITEM *value,
+ bool position, bool reserve)
{
- WT_CURSOR *c, *primary;
- WT_LSM_TREE *lsm_tree;
- u_int i, slot;
- int (*func)(WT_CURSOR *);
-
- lsm_tree = clsm->lsm_tree;
-
- WT_ASSERT(session,
- F_ISSET(&session->txn, WT_TXN_HAS_ID) &&
- clsm->primary_chunk != NULL &&
- (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
- WT_TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
-
- /*
- * Clear the existing cursor position. Don't clear the primary cursor:
- * we're about to use it anyway.
- */
- primary = clsm->chunks[clsm->nchunks - 1]->cursor;
- WT_RET(__clsm_reset_cursors(clsm, primary));
-
- /* If necessary, set the position for future scans. */
- if (position)
- clsm->current = primary;
-
- for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) {
- /* Check if we need to keep updating old chunks. */
- if (i > 0 && __wt_txn_visible(
- session, clsm->chunks[slot]->switch_txn, WT_TS_NONE)) {
- clsm->nupdates = i;
- break;
- }
-
- c = clsm->chunks[slot]->cursor;
- c->set_key(c, key);
- func = c->insert;
- if (i == 0 && position)
- func = reserve ? c->reserve : c->update;
- if (func != c->reserve)
- c->set_value(c, value);
- WT_RET(func(c));
- }
-
- /*
- * Update the record count. It is in a shared structure, but it's only
- * approximate, so don't worry about protecting access.
- *
- * Throttle if necessary. Every 100 update operations on each cursor,
- * check if throttling is required. Don't rely only on the shared
- * counter because it can race, and because for some workloads, there
- * may not be enough records per chunk to get effective throttling.
- */
- if ((++clsm->primary_chunk->count % 100 == 0 ||
- ++clsm->update_count >= 100) &&
- lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
- clsm->update_count = 0;
- WT_LSM_TREE_STAT_INCRV(session,
- lsm_tree->lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
- WT_STAT_CONN_INCRV(session,
- lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
- WT_LSM_TREE_STAT_INCRV(session,
- lsm_tree->lsm_merge_throttle, lsm_tree->merge_throttle);
- WT_STAT_CONN_INCRV(session,
- lsm_merge_throttle, lsm_tree->merge_throttle);
- __wt_sleep(0,
- lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
- }
-
- return (0);
+ WT_CURSOR *c, *primary;
+ WT_LSM_TREE *lsm_tree;
+ u_int i, slot;
+ int (*func)(WT_CURSOR *);
+
+ lsm_tree = clsm->lsm_tree;
+
+ WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID) && clsm->primary_chunk != NULL &&
+ (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
+ WT_TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
+
+ /*
+ * Clear the existing cursor position. Don't clear the primary cursor: we're about to use it
+ * anyway.
+ */
+ primary = clsm->chunks[clsm->nchunks - 1]->cursor;
+ WT_RET(__clsm_reset_cursors(clsm, primary));
+
+ /* If necessary, set the position for future scans. */
+ if (position)
+ clsm->current = primary;
+
+ for (i = 0, slot = clsm->nchunks - 1; i < clsm->nupdates; i++, slot--) {
+ /* Check if we need to keep updating old chunks. */
+ if (i > 0 && __wt_txn_visible(session, clsm->chunks[slot]->switch_txn, WT_TS_NONE)) {
+ clsm->nupdates = i;
+ break;
+ }
+
+ c = clsm->chunks[slot]->cursor;
+ c->set_key(c, key);
+ func = c->insert;
+ if (i == 0 && position)
+ func = reserve ? c->reserve : c->update;
+ if (func != c->reserve)
+ c->set_value(c, value);
+ WT_RET(func(c));
+ }
+
+ /*
+ * Update the record count. It is in a shared structure, but it's only
+ * approximate, so don't worry about protecting access.
+ *
+ * Throttle if necessary. Every 100 update operations on each cursor,
+ * check if throttling is required. Don't rely only on the shared
+ * counter because it can race, and because for some workloads, there
+ * may not be enough records per chunk to get effective throttling.
+ */
+ if ((++clsm->primary_chunk->count % 100 == 0 || ++clsm->update_count >= 100) &&
+ lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+ clsm->update_count = 0;
+ WT_LSM_TREE_STAT_INCRV(session, lsm_tree->lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
+ WT_STAT_CONN_INCRV(session, lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
+ WT_LSM_TREE_STAT_INCRV(session, lsm_tree->lsm_merge_throttle, lsm_tree->merge_throttle);
+ WT_STAT_CONN_INCRV(session, lsm_merge_throttle, lsm_tree->merge_throttle);
+ __wt_sleep(0, lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+ }
+
+ return (0);
}
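The throttling test is intentionally sampled from two directions: the shared per-chunk count trips on every 100th increment, and a private per-cursor counter trips at 100 regardless, so a racy shared counter or a workload with few records per chunk still gets throttled. A minimal sketch of the same sampling pattern, with illustrative names:

    #include <stdbool.h>
    #include <stdint.h>

    struct writer {
        uint64_t update_count; /* private to one cursor, never races */
    };

    /*
     * True roughly once per 100 updates. The shared counter is incremented
     * without synchronization, so the private counter acts as a backstop.
     */
    static bool
    throttle_check_due(struct writer *w, uint64_t *shared_count)
    {
        if (++*shared_count % 100 == 0 || ++w->update_count >= 100) {
            w->update_count = 0;
            return (true);
        }
        return (false);
    }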
/*
* __clsm_insert --
- * WT_CURSOR->insert method for the LSM cursor type.
+ * WT_CURSOR->insert method for the LSM cursor type.
*/
static int
__clsm_insert(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, insert);
- WT_ERR(__cursor_needkey(cursor));
- WT_ERR(__cursor_needvalue(cursor));
- WT_ERR(__clsm_enter(clsm, false, true));
-
- /*
- * It isn't necessary to copy the key out after the lookup in this
- * case because any non-failed lookup results in an error, and a
- * failed lookup leaves the original key intact.
- */
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
- if (ret == 0)
- ret = WT_DUPLICATE_KEY;
- goto err;
- }
-
- WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
- WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, false, false));
-
- /*
- * WT_CURSOR.insert doesn't leave the cursor positioned, and the
- * application may want to free the memory used to configure the
- * insert; don't read that memory again (matching the underlying
- * file object cursor insert semantics).
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-
-err: __wt_scr_free(session, &buf);
- __clsm_leave(clsm);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert);
+ WT_ERR(__cursor_needkey(cursor));
+ WT_ERR(__cursor_needvalue(cursor));
+ WT_ERR(__clsm_enter(clsm, false, true));
+
+ /*
+ * It isn't necessary to copy the key out after the lookup in this case because any non-failed
+ * lookup results in an error, and a failed lookup leaves the original key intact.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+
+ WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
+ WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, false, false));
+
+ /*
+ * WT_CURSOR.insert doesn't leave the cursor positioned, and the application may want to free
+ * the memory used to configure the insert; don't read that memory again (matching the
+ * underlying file object cursor insert semantics).
+ */
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err:
+ __wt_scr_free(session, &buf);
+ __clsm_leave(clsm);
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
}
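For callers, the duplicate check surfaces through the standard overwrite configuration: with overwrite=false, insert on an existing key fails with WT_DUPLICATE_KEY. A hedged usage sketch, assuming an open WT_SESSION *s and the placeholder URI used earlier:

    WT_CURSOR *c;
    int ret;

    ret = s->open_cursor(s, "lsm:example", NULL, "overwrite=false", &c);
    if (ret == 0) {
        c->set_key(c, "key");
        c->set_value(c, "value");
        if ((ret = c->insert(c)) == WT_DUPLICATE_KEY) {
            /* The key already exists; as the comment above notes, insert
             * does not leave the cursor positioned either way. */
        }
    }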
/*
* __clsm_update --
- * WT_CURSOR->update method for the LSM cursor type.
+ * WT_CURSOR->update method for the LSM cursor type.
*/
static int
__clsm_update(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, update);
- WT_ERR(__cursor_needkey(cursor));
- WT_ERR(__cursor_needvalue(cursor));
- WT_ERR(__clsm_enter(clsm, false, true));
-
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
- WT_ERR(__clsm_lookup(clsm, &value));
- /*
- * Copy the key out, since the insert resets non-primary chunk
- * cursors which our lookup may have landed on.
- */
- WT_ERR(__cursor_needkey(cursor));
- }
- WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
- WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, true, false));
-
- /*
- * Set the cursor to reference the internal key/value of the positioned
- * cursor.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- WT_ITEM_SET(cursor->key, clsm->current->key);
- WT_ITEM_SET(cursor->value, clsm->current->value);
- WT_ASSERT(session,
- F_MASK(clsm->current, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
- WT_ASSERT(session,
- F_MASK(clsm->current, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);
- F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-
-err: __wt_scr_free(session, &buf);
- __clsm_leave(clsm);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update);
+ WT_ERR(__cursor_needkey(cursor));
+ WT_ERR(__cursor_needvalue(cursor));
+ WT_ERR(__clsm_enter(clsm, false, true));
+
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ WT_ERR(__clsm_lookup(clsm, &value));
+ /*
+ * Copy the key out, since the insert resets non-primary chunk cursors which our lookup may
+ * have landed on.
+ */
+ WT_ERR(__cursor_needkey(cursor));
+ }
+ WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
+ WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, true, false));
+
+ /*
+ * Set the cursor to reference the internal key/value of the positioned cursor.
+ */
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_ITEM_SET(cursor->key, clsm->current->key);
+ WT_ITEM_SET(cursor->value, clsm->current->value);
+ WT_ASSERT(session, F_MASK(clsm->current, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
+ WT_ASSERT(session, F_MASK(clsm->current, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:
+ __wt_scr_free(session, &buf);
+ __clsm_leave(clsm);
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
}
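Unlike insert, a successful update leaves the LSM cursor positioned, referencing the internal key and value exactly as the assertions above check. A hedged usage sketch, assuming an open cursor c:

    WT_CURSOR *c;
    const char *v;
    int ret;

    c->set_key(c, "key");
    c->set_value(c, "new-value");
    if ((ret = c->update(c)) == 0)
        ret = c->get_value(c, &v); /* valid: update left the cursor positioned */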
/*
* __clsm_remove --
- * WT_CURSOR->remove method for the LSM cursor type.
+ * WT_CURSOR->remove method for the LSM cursor type.
*/
static int
__clsm_remove(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
- bool positioned;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- /* Check if the cursor is positioned. */
- positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT);
-
- CURSOR_REMOVE_API_CALL(cursor, session, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__clsm_enter(clsm, false, true));
-
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
- WT_ERR(__clsm_lookup(clsm, &value));
- /*
- * Copy the key out, since the insert resets non-primary chunk
- * cursors which our lookup may have landed on.
- */
- WT_ERR(__cursor_needkey(cursor));
- }
- WT_ERR(__clsm_put(
- session, clsm, &cursor->key, &__tombstone, positioned, false));
-
- /*
- * If the cursor was positioned, it stays positioned with a key but no
- * no value, otherwise, there's no position, key or value. This isn't
- * just cosmetic, without a reset, iteration on this cursor won't start
- * at the beginning/end of the table.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if (positioned)
- F_SET(cursor, WT_CURSTD_KEY_INT);
- else
- WT_TRET(cursor->reset(cursor));
-
-err: __clsm_leave(clsm);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+ bool positioned;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ /* Check if the cursor is positioned. */
+ positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT);
+
+ CURSOR_REMOVE_API_CALL(cursor, session, NULL);
+ WT_ERR(__cursor_needkey(cursor));
+ __cursor_novalue(cursor);
+ WT_ERR(__clsm_enter(clsm, false, true));
+
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ WT_ERR(__clsm_lookup(clsm, &value));
+ /*
+ * Copy the key out, since the insert resets non-primary chunk cursors which our lookup may
+ * have landed on.
+ */
+ WT_ERR(__cursor_needkey(cursor));
+ }
+ WT_ERR(__clsm_put(session, clsm, &cursor->key, &__tombstone, positioned, false));
+
+ /*
+ * If the cursor was positioned, it stays positioned with a key but no value; otherwise, there's
+ * no position, key or value. This isn't just cosmetic: without a reset, iteration on this
+ * cursor won't start at the beginning/end of the table.
+ */
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (positioned)
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ else
+ WT_TRET(cursor->reset(cursor));
+
+err:
+ __clsm_leave(clsm);
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
}
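An LSM delete is a write of the reserved __tombstone value into the primary chunk, and the positioned/unpositioned distinction above is visible to applications: a remove after a successful search keeps the key, so iteration can continue from that point. A hedged usage sketch, assuming an open cursor c:

    WT_CURSOR *c;
    int ret;

    c->set_key(c, "key");
    if ((ret = c->search(c)) == 0 && (ret = c->remove(c)) == 0) {
        /* Positioned remove: the key survives (with no value), so this
         * next continues from the removed key, not the table start. */
        ret = c->next(c);
    }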
/*
@@ -1702,182 +1609,180 @@ err: __clsm_leave(clsm);
* __clsm_reserve --
- WT_CURSOR->reserve method for the LSM cursor type.
+ WT_CURSOR->reserve method for the LSM cursor type.
*/
static int
__clsm_reserve(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- clsm = (WT_CURSOR_LSM *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, reserve);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__wt_txn_context_check(session, true));
- WT_ERR(__clsm_enter(clsm, false, true));
-
- WT_ERR(__clsm_lookup(clsm, &value));
- /*
- * Copy the key out, since the insert resets non-primary chunk cursors
- * which our lookup may have landed on.
- */
- WT_ERR(__cursor_needkey(cursor));
- ret = __clsm_put(session, clsm, &cursor->key, NULL, true, true);
-
-err: __clsm_leave(clsm);
- CURSOR_UPDATE_API_END(session, ret);
-
- /*
- * The application might do a WT_CURSOR.get_value call when we return,
- * so we need a value and the underlying functions didn't set one up.
- * For various reasons, those functions may not have done a search and
- * any previous value in the cursor might race with WT_CURSOR.reserve
- * (and in cases like LSM, the reserve never encountered the original
- * key). For simplicity, repeat the search here.
- */
- return (ret == 0 ? cursor->search(cursor) : ret);
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, reserve);
+ WT_ERR(__cursor_needkey(cursor));
+ __cursor_novalue(cursor);
+ WT_ERR(__wt_txn_context_check(session, true));
+ WT_ERR(__clsm_enter(clsm, false, true));
+
+ WT_ERR(__clsm_lookup(clsm, &value));
+ /*
+ * Copy the key out, since the insert resets non-primary chunk cursors which our lookup may have
+ * landed on.
+ */
+ WT_ERR(__cursor_needkey(cursor));
+ ret = __clsm_put(session, clsm, &cursor->key, NULL, true, true);
+
+err:
+ __clsm_leave(clsm);
+ CURSOR_UPDATE_API_END(session, ret);
+
+ /*
+ * The application might do a WT_CURSOR.get_value call when we return,
+ * so we need a value and the underlying functions didn't set one up.
+ * For various reasons, those functions may not have done a search and
+ * any previous value in the cursor might race with WT_CURSOR.reserve
+ * (and in cases like LSM, the reserve never encountered the original
+ * key). For simplicity, repeat the search here.
+ */
+ return (ret == 0 ? cursor->search(cursor) : ret);
}
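reserve is only meaningful inside an explicit transaction, which is what the __wt_txn_context_check call enforces: it write-locks the key without changing it, and the final re-search above guarantees that get_value works afterwards. A hedged usage sketch, assuming an open WT_SESSION *s and cursor c:

    WT_CURSOR *c;
    const char *v;
    int ret;

    ret = s->begin_transaction(s, NULL);
    if (ret == 0) {
        c->set_key(c, "key");
        /* Lock the record against concurrent updates, read it, decide. */
        if ((ret = c->reserve(c)) == 0)
            ret = c->get_value(c, &v);
        if (ret == 0)
            ret = s->commit_transaction(s, NULL);
        else
            (void)s->rollback_transaction(s, NULL);
    }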
/*
* __wt_clsm_close --
- * WT_CURSOR->close method for the LSM cursor type.
+ * WT_CURSOR->close method for the LSM cursor type.
*/
int
__wt_clsm_close(WT_CURSOR *cursor)
{
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
-
- /*
- * Don't use the normal __clsm_enter path: that is wasted work when
- * closing, and the cursor may never have been used.
- */
- clsm = (WT_CURSOR_LSM *)cursor;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when closing, and the cursor may
+ * never have been used.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
err:
- WT_TRET(__clsm_close_cursors(session, clsm, 0, clsm->nchunks));
- __clsm_free_chunks(session, clsm);
+ WT_TRET(__clsm_close_cursors(session, clsm, 0, clsm->nchunks));
+ __clsm_free_chunks(session, clsm);
- /* In case we were somehow left positioned, clear that. */
- __clsm_leave(clsm);
+ /* In case we were somehow left positioned, clear that. */
+ __clsm_leave(clsm);
- if (clsm->lsm_tree != NULL)
- __wt_lsm_tree_release(session, clsm->lsm_tree);
- __wt_cursor_close(cursor);
+ if (clsm->lsm_tree != NULL)
+ __wt_lsm_tree_release(session, clsm->lsm_tree);
+ __wt_cursor_close(cursor);
- API_END_RET(session, ret);
+ API_END_RET(session, ret);
}
/*
* __wt_clsm_open --
- * WT_SESSION->open_cursor method for LSM cursors.
+ * WT_SESSION->open_cursor method for LSM cursors.
*/
int
-__wt_clsm_open(WT_SESSION_IMPL *session,
- const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+__wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[],
+ WT_CURSOR **cursorp)
{
- WT_CONFIG_ITEM cval;
- WT_CURSOR_STATIC_INIT(iface,
- __wt_cursor_get_key, /* get-key */
- __wt_cursor_get_value, /* get-value */
- __wt_cursor_set_key, /* set-key */
- __wt_cursor_set_value, /* set-value */
- __clsm_compare, /* compare */
- __wt_cursor_equals, /* equals */
- __clsm_next, /* next */
- __clsm_prev, /* prev */
- __clsm_reset, /* reset */
- __clsm_search, /* search */
- __clsm_search_near, /* search-near */
- __clsm_insert, /* insert */
- __wt_cursor_modify_notsup, /* modify */
- __clsm_update, /* update */
- __clsm_remove, /* remove */
- __clsm_reserve, /* reserve */
- __wt_cursor_reconfigure, /* reconfigure */
- __wt_cursor_notsup, /* cache */
- __wt_cursor_reopen_notsup, /* reopen */
- __wt_clsm_close); /* close */
- WT_CURSOR *cursor;
- WT_CURSOR_LSM *clsm;
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree;
- bool bulk;
-
- WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
-
- clsm = NULL;
- cursor = NULL;
- lsm_tree = NULL;
-
- if (!WT_PREFIX_MATCH(uri, "lsm:"))
- return (__wt_unexpected_object_type(session, uri, "lsm:"));
-
- WT_RET(__wt_inmem_unsupported_op(session, "LSM trees"));
-
- WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
- if (cval.len != 0)
- WT_RET_MSG(session, EINVAL,
- "LSM does not support opening by checkpoint");
-
- WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
- bulk = cval.val != 0;
-
- /* Get the LSM tree. */
- ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree);
-
- /*
- * Check whether the exclusive open for a bulk load succeeded, and
- * if it did ensure that it's safe to bulk load into the tree.
- */
- if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1)))
- WT_ERR_MSG(session, EINVAL,
- "bulk-load is only supported on newly created LSM trees");
- /* Flag any errors from the tree get. */
- WT_ERR(ret);
-
- /* Make sure we have exclusive access if and only if we want it */
- WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL);
-
- WT_ERR(__wt_calloc_one(session, &clsm));
- cursor = (WT_CURSOR *)clsm;
- *cursor = iface;
- cursor->session = (WT_SESSION *)session;
- WT_ERR(__wt_strdup(session, lsm_tree->name, &cursor->uri));
- cursor->key_format = lsm_tree->key_format;
- cursor->value_format = lsm_tree->value_format;
-
- clsm->lsm_tree = lsm_tree;
- lsm_tree = NULL;
-
- /*
- * The tree's dsk_gen starts at one, so starting the cursor on zero
- * will force a call into open_cursors on the first operation.
- */
- clsm->dsk_gen = 0;
-
- /* If the next_random option is set, configure a random cursor */
- WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
- if (cval.val != 0) {
- __wt_cursor_set_notsup(cursor);
- cursor->next = __clsm_next_random;
- }
-
- WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
-
- if (bulk)
- WT_ERR(__wt_clsm_open_bulk(clsm, cfg));
-
- if (0) {
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __clsm_compare, /* compare */
+ __wt_cursor_equals, /* equals */
+ __clsm_next, /* next */
+ __clsm_prev, /* prev */
+ __clsm_reset, /* reset */
+ __clsm_search, /* search */
+ __clsm_search_near, /* search-near */
+ __clsm_insert, /* insert */
+ __wt_cursor_modify_notsup, /* modify */
+ __clsm_update, /* update */
+ __clsm_remove, /* remove */
+ __clsm_reserve, /* reserve */
+ __wt_cursor_reconfigure, /* reconfigure */
+ __wt_cursor_notsup, /* cache */
+ __wt_cursor_reopen_notsup, /* reopen */
+ __wt_clsm_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ bool bulk;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
+
+ clsm = NULL;
+ cursor = NULL;
+ lsm_tree = NULL;
+
+ if (!WT_PREFIX_MATCH(uri, "lsm:"))
+ return (__wt_unexpected_object_type(session, uri, "lsm:"));
+
+ WT_RET(__wt_inmem_unsupported_op(session, "LSM trees"));
+
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0)
+ WT_RET_MSG(session, EINVAL, "LSM does not support opening by checkpoint");
+
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ bulk = cval.val != 0;
+
+ /* Get the LSM tree. */
+ ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree);
+
+ /*
+ * Check whether the exclusive open for a bulk load succeeded, and if it did ensure that it's
+ * safe to bulk load into the tree.
+ */
+ if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1)))
+ WT_ERR_MSG(session, EINVAL, "bulk-load is only supported on newly created LSM trees");
+ /* Flag any errors from the tree get. */
+ WT_ERR(ret);
+
+ /* Make sure we have exclusive access if and only if we want it */
+ WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL);
+
+ WT_ERR(__wt_calloc_one(session, &clsm));
+ cursor = (WT_CURSOR *)clsm;
+ *cursor = iface;
+ cursor->session = (WT_SESSION *)session;
+ WT_ERR(__wt_strdup(session, lsm_tree->name, &cursor->uri));
+ cursor->key_format = lsm_tree->key_format;
+ cursor->value_format = lsm_tree->value_format;
+
+ clsm->lsm_tree = lsm_tree;
+ lsm_tree = NULL;
+
+ /*
+ * The tree's dsk_gen starts at one, so starting the cursor on zero will force a call into
+ * open_cursors on the first operation.
+ */
+ clsm->dsk_gen = 0;
+
+ /* If the next_random option is set, configure a random cursor */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __clsm_next_random;
+ }
+
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+
+ if (bulk)
+ WT_ERR(__wt_clsm_open_bulk(clsm, cfg));
+
+ if (0) {
err:
- if (clsm != NULL)
- WT_TRET(__wt_clsm_close(cursor));
- else if (lsm_tree != NULL)
- __wt_lsm_tree_release(session, lsm_tree);
+ if (clsm != NULL)
+ WT_TRET(__wt_clsm_close(cursor));
+ else if (lsm_tree != NULL)
+ __wt_lsm_tree_release(session, lsm_tree);
- *cursorp = NULL;
- }
+ *cursorp = NULL;
+ }
- return (ret);
+ return (ret);
}
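End to end, an LSM tree is created and opened through the ordinary schema-layer entry points with an "lsm:" URI; bulk and next_random are the only cursor configurations given special handling above. A hedged, self-contained sketch (the "WT_HOME" directory and "lsm:example" name are placeholders; error handling is collapsed for brevity):

    #include <stdlib.h>
    #include <wiredtiger.h>

    int
    main(void)
    {
        WT_CONNECTION *conn;
        WT_CURSOR *c;
        WT_SESSION *s;
        int ret;

        if ((ret = wiredtiger_open("WT_HOME", NULL, "create", &conn)) != 0)
            return (EXIT_FAILURE);
        ret = conn->open_session(conn, NULL, NULL, &s);
        if (ret == 0)
            ret = s->create(s, "lsm:example", "key_format=S,value_format=S");
        if (ret == 0)
            ret = s->open_cursor(s, "lsm:example", NULL, NULL, &c);
        if (ret == 0) {
            c->set_key(c, "key");
            c->set_value(c, "value");
            ret = c->insert(c);
        }
        (void)conn->close(conn, NULL); /* closes open sessions and cursors */
        return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }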