summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/col_srch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/col_srch.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c199
1 files changed, 199 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
new file mode 100644
index 00000000000..e4083e2282f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -0,0 +1,199 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_col_search --
+ * Search a column-store tree for a specific record-based key.
+ */
+int
+__wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t base, indx, limit;
+ int depth;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ if (page->type != WT_PAGE_COL_INT)
+ break;
+
+ WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ base = pindex->entries;
+ descent = pindex->index[base - 1];
+
+ /* Fast path appends. */
+ if (recno >= descent->key.recno)
+ goto descend;
+
+ /* Binary search of internal pages. */
+ for (base = 0,
+ limit = pindex->entries - 1; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+
+ if (recno == descent->key.recno)
+ break;
+ if (recno < descent->key.recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+descend: /*
+ * Reference the slot used for next step down the tree.
+ *
+ * Base is the smallest index greater than recno and may be the
+ * (last + 1) index. The slot for descent is the one before
+ * base.
+ */
+ if (recno != descent->key.recno) {
+ /*
+ * We don't have to correct for base == 0 because the
+ * only way for base to be 0 is if recno is the page's
+ * starting recno.
+ */
+ WT_ASSERT(session, base > 0);
+ descent = pindex->index[base - 1];
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ page = current->page;
+ cbt->ref = current;
+ cbt->recno = recno;
+ cbt->compare = 0;
+
+ /*
+ * Set the on-page slot to an impossible value larger than any possible
+ * slot (it's used to interpret the search function's return after the
+ * search returns an insert list for a page that has no entries).
+ */
+ cbt->slot = UINT32_MAX;
+
+ /*
+ * Search the leaf page. We do not check in the search path for a
+ * record greater than the maximum record in the tree; in that case,
+ * we arrive here with a record that's impossibly large for the page.
+ */
+ if (page->type == WT_PAGE_COL_FIX) {
+ if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
+ cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
+ goto past_end;
+ } else
+ ins_head = WT_COL_UPDATE_SINGLE(page);
+ } else
+ if ((cip = __col_var_search(page, recno)) == NULL) {
+ cbt->recno = __col_var_last_recno(page);
+ goto past_end;
+ } else {
+ cbt->slot = WT_COL_SLOT(page, cip);
+ ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ }
+
+ /*
+ * We have a match on the page, check for an update. Check the page's
+ * update list (fixed-length), or slot's update list (variable-length)
+ * for a better match. The only better match we can find is an exact
+ * match, otherwise the existing match on the page is the one we want.
+ * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
+ * until we know we have a useful entry.
+ */
+ if ((ins = __col_insert_search(
+ ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
+ if (recno == WT_INSERT_RECNO(ins)) {
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+ }
+ return (0);
+
+past_end:
+ /*
+ * A record past the end of the page's standard information. Check the
+ * append list; by definition, any record on the append list is closer
+ * than the last record on the page, so it's a better choice for return.
+ * This is a rarely used path: we normally find exact matches, because
+ * column-store files are dense, but in this case the caller searched
+ * past the end of the table.
+ *
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the position when we do the work.
+ */
+ cbt->ins_head = WT_COL_APPEND(page);
+ if (recno == UINT64_MAX)
+ cbt->ins = NULL;
+ else
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
+ if (cbt->ins == NULL)
+ cbt->compare = -1;
+ else {
+ cbt->recno = WT_INSERT_RECNO(cbt->ins);
+ if (recno == cbt->recno)
+ cbt->compare = 0;
+ else if (recno < cbt->recno)
+ cbt->compare = 1;
+ else
+ cbt->compare = -1;
+ }
+
+ /*
+ * Note if the record is past the maximum record in the tree, the cursor
+ * search functions need to know for fixed-length column-stores because
+ * appended records implicitly create any skipped records, and cursor
+ * search functions have to handle that case.
+ */
+ if (cbt->compare == -1)
+ F_SET(cbt, WT_CBT_MAX_RECORD);
+ return (0);
+}