summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_curprev.c
diff options
context:
space:
mode:
authorEliot Horowitz <eliot@10gen.com>2014-11-04 15:46:40 -0500
committerEliot Horowitz <eliot@10gen.com>2014-11-05 11:21:19 -0500
commit5ca2daf551a2c631a5f573cb054406f5d49fbef5 (patch)
treeb0a23d34ffdb376bac0b79ed17b5619cfc0d9b47 /src/third_party/wiredtiger/src/btree/bt_curprev.c
parent017704acdfc7517efadb3fab167bba06c025c01a (diff)
downloadmongo-5ca2daf551a2c631a5f573cb054406f5d49fbef5.tar.gz
SERVER-15953: add wiredtiger to third_party
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_curprev.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c560
1 files changed, 560 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
new file mode 100644
index 00000000000..8de784d1f1d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Walking backwards through skip lists.
+ *
+ * The skip list stack is an array of pointers set up by a search. It points
+ * to the position a node should go in the skip list. In other words, the skip
+ * list search stack always points *after* the search item (that is, into the
+ * search item's next array).
+ *
+ * Helper macros to go from a stack pointer at level i, pointing into a next
+ * array, back to the insert node containing that next array.
+ */
+#undef PREV_ITEM
+#define PREV_ITEM(ins_head, insp, i) \
+ (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : \
+ (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))
+
+#undef PREV_INS
+#define PREV_INS(cbt, i) \
+ PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))
+
+/*
+ * __cursor_skip_prev --
+ * Move back one position in a skip list stack (aka "finger").
+ */
+static inline int
+__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *current, *ins;
+ WT_ITEM key;
+ WT_SESSION_IMPL *session;
+ int i;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+restart:
+ /*
+ * If the search stack does not point at the current item, fill it in
+ * with a search.
+ */
+ while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
+ if (cbt->btree->type == BTREE_ROW) {
+ key.data = WT_INSERT_KEY(current);
+ key.size = WT_INSERT_KEY_SIZE(current);
+ WT_RET(__wt_search_insert(session, cbt, &key));
+ } else
+ cbt->ins = __col_insert_search(cbt->ins_head,
+ cbt->ins_stack, cbt->next_stack,
+ WT_INSERT_RECNO(current));
+ }
+
+ /*
+ * Find the first node up the search stack that does not move.
+ *
+ * The depth of the current item must be at least this level, since we
+ * see it in that many levels of the stack.
+ *
+ * !!! Watch these loops carefully: they all rely on the value of i,
+ * and the exit conditions to end up with the right values are
+ * non-trivial.
+ */
+ ins = NULL; /* -Wconditional-uninitialized */
+ for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
+ if ((ins = PREV_INS(cbt, i + 1)) != current)
+ break;
+
+ /*
+ * Find a starting point for the new search. That is either at the
+ * non-moving node if we found a valid node, or the beginning of the
+ * next list down that is not the current node.
+ *
+ * Since it is the beginning of a list, and we know the current node is
+ * has a skip depth at least this high, any node we find must sort
+ * before the current node.
+ */
+ if (ins == NULL || ins == current)
+ for (; i >= 0; i--) {
+ cbt->ins_stack[i] = NULL;
+ cbt->next_stack[i] = NULL;
+ ins = cbt->ins_head->head[i];
+ if (ins != NULL && ins != current)
+ break;
+ }
+
+ /* Walk any remaining levels until just before the current node. */
+ while (i >= 0) {
+ /*
+ * If we get to the end of a list without finding the current
+ * item, we must have raced with an insert. Restart the search.
+ */
+ if (ins == NULL) {
+ cbt->ins_stack[0] = NULL;
+ cbt->next_stack[0] = NULL;
+ goto restart;
+ }
+ if (ins->next[i] != current) /* Stay at this level */
+ ins = ins->next[i];
+ else { /* Drop down a level */
+ cbt->ins_stack[i] = &ins->next[i];
+ cbt->next_stack[i] = ins->next[i];
+ --i;
+ }
+ }
+
+ /* If we found a previous node, the next one must be current. */
+ if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
+ goto restart;
+
+ cbt->ins = PREV_INS(cbt, 0);
+ return (0);
+}
+
+/*
+ * __cursor_fix_append_prev --
+ * Return the previous fixed-length entry on the append list.
+ */
+static inline int
+__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else {
+ /*
+ * Handle the special case of leading implicit records, that is,
+ * there aren't any records in the tree not on the append list,
+ * and the first record on the append list isn't record 1.
+ *
+ * The "right" place to handle this is probably in our caller.
+ * The high-level cursor-previous routine would:
+ * -- call this routine to walk the append list
+ * -- call the routine to walk the standard page items
+ * -- call the tree walk routine looking for a previous page
+ * Each of them returns WT_NOTFOUND, at which point our caller
+ * checks the cursor record number, and if it's larger than 1,
+ * returns the implicit records. Instead, I'm trying to detect
+ * the case here, mostly because I don't want to put that code
+ * into our caller. Anyway, if this code breaks for any reason,
+ * that's the way I'd go.
+ *
+ * If we're not pointing to a WT_INSERT entry, or we can't find
+ * a WT_INSERT record that precedes our record name-space, check
+ * if there are any records on the page. If there aren't, then
+ * we're in the magic zone, keep going until we get to a record
+ * number of 1.
+ */
+ if (cbt->ins != NULL &&
+ cbt->recno <= WT_INSERT_RECNO(cbt->ins))
+ WT_RET(__cursor_skip_prev(cbt));
+ if (cbt->ins == NULL &&
+ (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * This code looks different from the cursor-next code. The append
+ * list appears on the last page of the tree and contains the last
+ * records in the tree. If we're iterating through the tree, starting
+ * at the last record in the tree, by definition we're starting a new
+ * iteration and we set the record number to the last record found in
+ * the tree. Otherwise, decrement the record.
+ */
+ if (newpage)
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ else
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+ * created records written by reconciliation are deleted and so can be
+ * never seen by a read.
+ */
+ if (cbt->ins == NULL ||
+ cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0;
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_fix_prev --
+ * Move to the previous, fixed-length column-store item.
+ */
+static inline int
+__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ btree = S2BT(session);
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ if (cbt->recno == page->pg_fix_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL;
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_prev --
+ * Return the previous variable-length entry on the append list.
+ */
+static inline int
+__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_page;
+ }
+
+ for (;;) {
+ WT_RET(__cursor_skip_prev(cbt));
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_prev --
+ * Move to the previous, variable-length column-store item.
+ */
+static inline int
+__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page: if (cbt->recno < page->pg_var_recno)
+ return (WT_NOTFOUND);
+
+ /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip;
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_prev --
+ * Move to the previous row-store item.
+ */
+static inline int
+__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ *
+ * New page configuration.
+ */
+ if (newpage) {
+ /*
+ * If we haven't instantiated keys on this page, do so, else it
+ * is a very, very slow traversal.
+ */
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ WT_RET(__wt_row_leaf_keys(session, page));
+
+ if (page->pg_row_entries == 0)
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ else
+ cbt->ins_head =
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
+ goto new_insert;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list. Maintain the reference
+ * to the current insert element in case we switch to a cursor
+ * next movement.
+ */
+ if (cbt->ins != NULL)
+ WT_RET(__cursor_skip_prev(cbt));
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the beginning of the page. */
+ if (cbt->row_iteration_slot == 1)
+ return (WT_NOTFOUND);
+ --cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = cbt->row_iteration_slot == 1 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1;
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_prev --
+ * Move to the previous record in the tree.
+ */
+int
+__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
+ __wt_btcur_iterate_setup(cbt, 0);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the previous page, until we reach the start
+ * of the file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ newpage = 1;
+ }
+ if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_prev(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND)
+ break;
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}