summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/col_srch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/col_srch.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c516
1 files changed, 251 insertions, 265 deletions
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index 6b040557cfc..f202dbd7f7b 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -10,308 +10,294 @@
/*
* __check_leaf_key_range --
- * Check the search key is in the leaf page's key range.
+ * Check the search key is in the leaf page's key range.
*/
static inline int
-__check_leaf_key_range(WT_SESSION_IMPL *session,
- uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+__check_leaf_key_range(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
- WT_PAGE_INDEX *pindex;
- uint32_t indx;
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
- /*
- * There are reasons we can't do the fast checks, and we continue with
- * the leaf page search in those cases, only skipping the complete leaf
- * page search if we know it's not going to work.
- */
- cbt->compare = 0;
+ /*
+ * There are reasons we can't do the fast checks, and we continue with the leaf page search in
+ * those cases, only skipping the complete leaf page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
- /*
- * Check if the search key is smaller than the parent's starting key for
- * this page.
- */
- if (recno < leaf->ref_recno) {
- cbt->compare = 1; /* page keys > search key */
- return (0);
- }
+ /*
+ * Check if the search key is smaller than the parent's starting key for this page.
+ */
+ if (recno < leaf->ref_recno) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
- /*
- * Check if the search key is greater than or equal to the starting key
- * for the parent's next page.
- *
- * !!!
- * Check that "indx + 1" is a valid page-index entry first, because it
- * also checks that "indx" is a valid page-index entry, and we have to
- * do that latter check before looking at the indx slot of the array
- * for a match to leaf (in other words, our page hint might be wrong).
- */
- WT_INTL_INDEX_GET(session, leaf->home, pindex);
- indx = leaf->pindex_hint;
- if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
- if (recno >= pindex->index[indx + 1]->ref_recno) {
- cbt->compare = -1; /* page keys < search key */
- return (0);
- }
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ *
+ * !!!
+ * Check that "indx + 1" is a valid page-index entry first, because it
+ * also checks that "indx" is a valid page-index entry, and we have to
+ * do that latter check before looking at the indx slot of the array
+ * for a match to leaf (in other words, our page hint might be wrong).
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
+ if (recno >= pindex->index[indx + 1]->ref_recno) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
- return (0);
+ return (0);
}
/*
* __wt_col_search --
- * Search a column-store tree for a specific record-based key.
+ * Search a column-store tree for a specific record-based key.
*/
int
-__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore)
+__wt_col_search(
+ WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore)
{
- WT_BTREE *btree;
- WT_COL *cip;
- WT_DECL_RET;
- WT_INSERT *ins;
- WT_INSERT_HEAD *ins_head;
- WT_PAGE *page;
- WT_PAGE_INDEX *pindex, *parent_pindex;
- WT_REF *current, *descent;
- uint64_t recno;
- uint32_t base, indx, limit, read_flags;
- int depth;
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex, *parent_pindex;
+ WT_REF *current, *descent;
+ uint64_t recno;
+ uint32_t base, indx, limit, read_flags;
+ int depth;
- btree = S2BT(session);
- current = NULL;
+ btree = S2BT(session);
+ current = NULL;
- __cursor_pos_clear(cbt);
+ __cursor_pos_clear(cbt);
- /*
- * When appending a new record, the search record number will be an
- * out-of-band value, search for the largest key in the table instead.
- */
- if ((recno = search_recno) == WT_RECNO_OOB)
- recno = UINT64_MAX;
+ /*
+ * When appending a new record, the search record number will be an out-of-band value, search
+ * for the largest key in the table instead.
+ */
+ if ((recno = search_recno) == WT_RECNO_OOB)
+ recno = UINT64_MAX;
- /*
- * We may be searching only a single leaf page, not the full tree. In
- * the normal case where we are searching a tree, check the page's
- * parent keys before doing the full search, it's faster when the
- * cursor is being re-positioned. Skip this if the page is being
- * re-instantiated in memory.
- */
- if (leaf != NULL) {
- WT_ASSERT(session, search_recno != WT_RECNO_OOB);
+ /*
+ * We may be searching only a single leaf page, not the full tree. In the normal case where we
+ * are searching a tree, check the page's parent keys before doing the full search, it's faster
+ * when the cursor is being re-positioned. Skip this if the page is being re-instantiated in
+ * memory.
+ */
+ if (leaf != NULL) {
+ WT_ASSERT(session, search_recno != WT_RECNO_OOB);
- if (!restore) {
- WT_RET(__check_leaf_key_range(
- session, recno, leaf, cbt));
- if (cbt->compare != 0) {
- /*
- * !!!
- * WT_CURSOR.search_near uses the slot value to
- * decide if there was an on-page match.
- */
- cbt->slot = 0;
- return (0);
- }
- }
+ if (!restore) {
+ WT_RET(__check_leaf_key_range(session, recno, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
- current = leaf;
- goto leaf_only;
- }
+ current = leaf;
+ goto leaf_only;
+ }
- if (0) {
+ if (0) {
restart:
- /*
- * Discard the currently held page and restart the search from
- * the root.
- */
- WT_RET(__wt_page_release(session, current, 0));
- }
+ /*
+ * Discard the currently held page and restart the search from the root.
+ */
+ WT_RET(__wt_page_release(session, current, 0));
+ }
- /* Search the internal pages of the tree. */
- current = &btree->root;
- for (depth = 2, pindex = NULL;; ++depth) {
- parent_pindex = pindex;
- page = current->page;
- if (page->type != WT_PAGE_COL_INT)
- break;
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (depth = 2, pindex = NULL;; ++depth) {
+ parent_pindex = pindex;
+ page = current->page;
+ if (page->type != WT_PAGE_COL_INT)
+ break;
- WT_INTL_INDEX_GET(session, page, pindex);
- base = pindex->entries;
- descent = pindex->index[base - 1];
+ WT_INTL_INDEX_GET(session, page, pindex);
+ base = pindex->entries;
+ descent = pindex->index[base - 1];
- /* Fast path appends. */
- if (recno >= descent->ref_recno) {
- /*
- * If on the last slot (the key is larger than any key
- * on the page), check for an internal page split race.
- */
- if (__wt_split_descent_race(
- session, current, parent_pindex))
- goto restart;
+ /* Fast path appends. */
+ if (recno >= descent->ref_recno) {
+ /*
+ * If on the last slot (the key is larger than any key on the page), check for an
+ * internal page split race.
+ */
+ if (__wt_split_descent_race(session, current, parent_pindex))
+ goto restart;
- goto descend;
- }
+ goto descend;
+ }
- /* Binary search of internal pages. */
- for (base = 0,
- limit = pindex->entries - 1; limit != 0; limit >>= 1) {
- indx = base + (limit >> 1);
- descent = pindex->index[indx];
+ /* Binary search of internal pages. */
+ for (base = 0, limit = pindex->entries - 1; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
- if (recno == descent->ref_recno)
- break;
- if (recno < descent->ref_recno)
- continue;
- base = indx + 1;
- --limit;
- }
+ if (recno == descent->ref_recno)
+ break;
+ if (recno < descent->ref_recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
descend:
- /*
- * Reference the slot used for next step down the tree.
- *
- * Base is the smallest index greater than recno and may be the
- * (last + 1) index. The slot for descent is the one before
- * base.
- */
- if (recno != descent->ref_recno) {
- /*
- * We don't have to correct for base == 0 because the
- * only way for base to be 0 is if recno is the page's
- * starting recno.
- */
- WT_ASSERT(session, base > 0);
- descent = pindex->index[base - 1];
- }
+ /*
+ * Reference the slot used for next step down the tree.
+ *
+ * Base is the smallest index greater than recno and may be the
+ * (last + 1) index. The slot for descent is the one before
+ * base.
+ */
+ if (recno != descent->ref_recno) {
+ /*
+ * We don't have to correct for base == 0 because the only way for base to be 0 is if
+ * recno is the page's starting recno.
+ */
+ WT_ASSERT(session, base > 0);
+ descent = pindex->index[base - 1];
+ }
- /* Encourage races. */
- WT_DIAGNOSTIC_YIELD;
+ /* Encourage races. */
+ WT_DIAGNOSTIC_YIELD;
- /*
- * Swap the current page for the child page. If the page splits
- * while we're retrieving it, restart the search at the root.
- * We cannot restart in the "current" page; for example, if a
- * thread is appending to the tree, the page it's waiting for
- * did an insert-split into the parent, then the parent split
- * into its parent, the name space we are searching for may have
- * moved above the current page in the tree.
- *
- * On other error, simply return, the swap call ensures we're
- * holding nothing on failure.
- */
- read_flags = WT_READ_RESTART_OK;
- if (F_ISSET(cbt, WT_CBT_READ_ONCE))
- FLD_SET(read_flags, WT_READ_WONT_NEED);
- if ((ret = __wt_page_swap(session,
- current, descent, read_flags)) == 0) {
- current = descent;
- continue;
- }
- if (ret == WT_RESTART)
- goto restart;
- return (ret);
- }
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search at the root.
+ * We cannot restart in the "current" page; for example, if a
+ * thread is appending to the tree, the page it's waiting for
+ * did an insert-split into the parent, then the parent split
+ * into its parent, the name space we are searching for may have
+ * moved above the current page in the tree.
+ *
+ * On other error, simply return, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ read_flags = WT_READ_RESTART_OK;
+ if (F_ISSET(cbt, WT_CBT_READ_ONCE))
+ FLD_SET(read_flags, WT_READ_WONT_NEED);
+ if ((ret = __wt_page_swap(session, current, descent, read_flags)) == 0) {
+ current = descent;
+ continue;
+ }
+ if (ret == WT_RESTART)
+ goto restart;
+ return (ret);
+ }
- /* Track how deep the tree gets. */
- if (depth > btree->maximum_depth)
- btree->maximum_depth = depth;
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
leaf_only:
- page = current->page;
- cbt->ref = current;
+ page = current->page;
+ cbt->ref = current;
- /*
- * Don't bother searching if the caller is appending a new record where
- * we'll allocate the record number; we're not going to find a match by
- * definition, and we figure out the record number and position when we
- * do the work.
- */
- if (search_recno == WT_RECNO_OOB) {
- cbt->compare = -1;
- return (0);
- }
+ /*
+ * Don't bother searching if the caller is appending a new record where we'll allocate the
+ * record number; we're not going to find a match by definition, and we figure out the record
+ * number and position when we do the work.
+ */
+ if (search_recno == WT_RECNO_OOB) {
+ cbt->compare = -1;
+ return (0);
+ }
- /*
- * Search the leaf page.
- *
- * Search after a page is pinned does a search of the pinned page before
- * doing a full tree search, in which case we might be searching for a
- * record logically before the page. Return failure, and there's nothing
- * else to do, the record isn't going to be on this page.
- *
- * We don't check inside the search path for a record greater than the
- * maximum record in the tree; in that case, we get here with a record
- * that's impossibly large for the page. We do have additional setup to
- * do in that case, the record may be appended to the page.
- */
- if (page->type == WT_PAGE_COL_FIX) {
- if (recno < current->ref_recno) {
- cbt->recno = current->ref_recno;
- cbt->compare = 1;
- return (0);
- }
- if (recno >= current->ref_recno + page->entries) {
- cbt->recno = current->ref_recno + page->entries;
- goto past_end;
- } else {
- cbt->recno = recno;
- cbt->compare = 0;
- ins_head = WT_COL_UPDATE_SINGLE(page);
- }
- } else {
- if (recno < current->ref_recno) {
- cbt->recno = current->ref_recno;
- cbt->slot = 0;
- cbt->compare = 1;
- return (0);
- }
- if ((cip = __col_var_search(current, recno, NULL)) == NULL) {
- cbt->recno = __col_var_last_recno(current);
- cbt->slot = page->entries == 0 ? 0 : page->entries - 1;
- goto past_end;
- } else {
- cbt->recno = recno;
- cbt->slot = WT_COL_SLOT(page, cip);
- cbt->compare = 0;
- ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
- F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH);
- }
- }
+ /*
+ * Search the leaf page.
+ *
+ * Search after a page is pinned does a search of the pinned page before
+ * doing a full tree search, in which case we might be searching for a
+ * record logically before the page. Return failure, and there's nothing
+ * else to do, the record isn't going to be on this page.
+ *
+ * We don't check inside the search path for a record greater than the
+ * maximum record in the tree; in that case, we get here with a record
+ * that's impossibly large for the page. We do have additional setup to
+ * do in that case, the record may be appended to the page.
+ */
+ if (page->type == WT_PAGE_COL_FIX) {
+ if (recno < current->ref_recno) {
+ cbt->recno = current->ref_recno;
+ cbt->compare = 1;
+ return (0);
+ }
+ if (recno >= current->ref_recno + page->entries) {
+ cbt->recno = current->ref_recno + page->entries;
+ goto past_end;
+ } else {
+ cbt->recno = recno;
+ cbt->compare = 0;
+ ins_head = WT_COL_UPDATE_SINGLE(page);
+ }
+ } else {
+ if (recno < current->ref_recno) {
+ cbt->recno = current->ref_recno;
+ cbt->slot = 0;
+ cbt->compare = 1;
+ return (0);
+ }
+ if ((cip = __col_var_search(current, recno, NULL)) == NULL) {
+ cbt->recno = __col_var_last_recno(current);
+ cbt->slot = page->entries == 0 ? 0 : page->entries - 1;
+ goto past_end;
+ } else {
+ cbt->recno = recno;
+ cbt->slot = WT_COL_SLOT(page, cip);
+ cbt->compare = 0;
+ ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH);
+ }
+ }
- /*
- * We have a match on the page, check for an update. Check the page's
- * update list (fixed-length), or slot's update list (variable-length)
- * for a better match. The only better match we can find is an exact
- * match, otherwise the existing match on the page is the one we want.
- * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
- * until we know we have a useful entry.
- */
- if ((ins = __col_insert_search(
- ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
- if (recno == WT_INSERT_RECNO(ins)) {
- cbt->ins_head = ins_head;
- cbt->ins = ins;
- }
- return (0);
+ /*
+ * We have a match on the page, check for an update. Check the page's update list
+ * (fixed-length), or slot's update list (variable-length) for a better match. The only better
+ * match we can find is an exact match, otherwise the existing match on the page is the one we
+ * want. For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair until we know we
+ * have a useful entry.
+ */
+ if ((ins = __col_insert_search(ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
+ if (recno == WT_INSERT_RECNO(ins)) {
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+ }
+ return (0);
past_end:
- /*
- * A record past the end of the page's standard information. Check the
- * append list; by definition, any record on the append list is closer
- * than the last record on the page, so it's a better choice for return.
- * This is a rarely used path: we normally find exact matches, because
- * column-store files are dense, but in this case the caller searched
- * past the end of the table.
- */
- cbt->ins_head = WT_COL_APPEND(page);
- if ((cbt->ins = __col_insert_search(
- cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
- cbt->compare = -1;
- else {
- cbt->recno = WT_INSERT_RECNO(cbt->ins);
- if (recno == cbt->recno)
- cbt->compare = 0;
- else if (recno < cbt->recno)
- cbt->compare = 1;
- else
- cbt->compare = -1;
- }
- return (0);
+ /*
+ * A record past the end of the page's standard information. Check the append list; by
+ * definition, any record on the append list is closer than the last record on the page, so it's
+ * a better choice for return. This is a rarely used path: we normally find exact matches,
+ * because column-store files are dense, but in this case the caller searched past the end of
+ * the table.
+ */
+ cbt->ins_head = WT_COL_APPEND(page);
+ if ((cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) ==
+ NULL)
+ cbt->compare = -1;
+ else {
+ cbt->recno = WT_INSERT_RECNO(cbt->ins);
+ if (recno == cbt->recno)
+ cbt->compare = 0;
+ else if (recno < cbt->recno)
+ cbt->compare = 1;
+ else
+ cbt->compare = -1;
+ }
+ return (0);
}