summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/col_srch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/col_srch.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c122
1 files changed, 98 insertions, 24 deletions
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index e9fa570f97b..cb5a227495f 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,12 +9,60 @@
#include "wt_internal.h"
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ */
+ if (recno < leaf->key.recno) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ *
+ * !!!
+ * Check that "indx + 1" is a valid page-index entry first, because it
+ * also checks that "indx" is a valid page-index entry, and we have to
+ * do that latter check before looking at the indx slot of the array
+ * for a match to leaf (in other words, our page hint might be wrong).
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
+ if (recno >= pindex->index[indx + 1]->key.recno) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+
+ return (0);
+}
+
+/*
* __wt_col_search --
* Search a column-store tree for a specific record-based key.
*/
int
__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+ uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
+ uint64_t recno;
uint32_t base, indx, limit;
int depth;
@@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session,
__cursor_pos_clear(cbt);
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * When appending a new record, the search record number will be an
+ * out-of-band value, search for the largest key in the table instead.
+ */
+ if ((recno = search_recno) == WT_RECNO_OOB)
+ recno = UINT64_MAX;
+
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ WT_ASSERT(session, search_recno != WT_RECNO_OOB);
+
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, recno, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -103,7 +182,8 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ if ((ret = __wt_page_swap(
+ session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
@@ -120,7 +200,17 @@ leaf_only:
page = current->page;
cbt->ref = current;
cbt->recno = recno;
- cbt->compare = 0;
+
+ /*
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the record number and position when we
+ * do the work.
+ */
+ if (search_recno == WT_RECNO_OOB) {
+ cbt->compare = -1;
+ return (0);
+ }
/*
* Set the on-page slot to an impossible value larger than any possible
@@ -142,6 +232,7 @@ leaf_only:
* that's impossibly large for the page. We do have additional setup to
* do in that case, the record may be appended to the page.
*/
+ cbt->compare = 0;
if (page->type == WT_PAGE_COL_FIX) {
if (recno < page->pg_fix_recno) {
cbt->compare = 1;
@@ -190,18 +281,10 @@ past_end:
* This is a rarely used path: we normally find exact matches, because
* column-store files are dense, but in this case the caller searched
* past the end of the table.
- *
- * Don't bother searching if the caller is appending a new record where
- * we'll allocate the record number; we're not going to find a match by
- * definition, and we figure out the position when we do the work.
*/
cbt->ins_head = WT_COL_APPEND(page);
- if (recno == UINT64_MAX)
- cbt->ins = NULL;
- else
- cbt->ins = __col_insert_search(
- cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
- if (cbt->ins == NULL)
+ if ((cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
cbt->compare = -1;
else {
cbt->recno = WT_INSERT_RECNO(cbt->ins);
@@ -212,14 +295,5 @@ past_end:
else
cbt->compare = -1;
}
-
- /*
- * Note if the record is past the maximum record in the tree, the cursor
- * search functions need to know for fixed-length column-stores because
- * appended records implicitly create any skipped records, and cursor
- * search functions have to handle that case.
- */
- if (cbt->compare == -1)
- F_SET(cbt, WT_CBT_MAX_RECORD);
return (0);
}