diff options
author | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-08-28 13:24:21 +0000 |
---|---|---|
committer | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-08-28 13:24:21 +0000 |
commit | c259c5bc57cf091bbe2d2fa9055de8372bfb8ced (patch) | |
tree | 77676b131c0b08f5ed289221ae66453ba899dea7 /src/btree/bt_curnext.c | |
parent | 31ed5ed7260f81d3f320ab5c4a03bc434f03c6a7 (diff) | |
download | mongo-c259c5bc57cf091bbe2d2fa9055de8372bfb8ced.tar.gz |
Intermediate commit of new cursor code. Both prev/next cursor motions
work with row-store files now. I re-wrote the "get the next/prev page"
code to do latching UP the tree, replacing the previous calls to the
existing walk-the-tree code, and I changed the current cursor position
to be the item that was just returned, not the next item: we're going
to need that when we try and switch cursor directions, that is, if we
do a next followed by a prev, or a prev followed by a next. I'm sure
that's not working yet, though.
--HG--
extra : rebase_source : 784f64497b8b9214d9e7b7225b1b35619a87143f
Diffstat (limited to 'src/btree/bt_curnext.c')
-rw-r--r-- | src/btree/bt_curnext.c | 259 |
1 files changed, 182 insertions, 77 deletions
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index aa28a9233bc..c5a0e35f791 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -160,7 +160,7 @@ __next_var(WT_CURSOR_BTREE *cbt, * Move to the next row-store item. */ static inline int -__next_row(WT_CURSOR_BTREE *cbt, WT_BUF *key, WT_BUF *value, int skip) +__next_row(WT_CURSOR_BTREE *cbt, WT_BUF *key, WT_BUF *value, int newpage) { WT_CELL *cell; WT_IKEY *ikey; @@ -171,42 +171,64 @@ __next_row(WT_CURSOR_BTREE *cbt, WT_BUF *key, WT_BUF *value, int skip) session = (WT_SESSION_IMPL *)cbt->iface.session; - /* - * Return the entry referenced by the cursor, and then move to the - * next entry. - */ - for (;; skip = 0) { + /* New page configuration. */ + if (newpage) { + cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page); + cbt->ins = WT_SKIP_FIRST(cbt->ins_head); + cbt->slot = 0; + F_SET(cbt, WT_CBT_RET_SLOT | WT_CBT_RET_INSERT); + } + + /* Move to the next entry and return the item. */ + for (;; newpage = 0) { /* Continue traversing any insert list. */ - if ((ins = cbt->ins) != NULL) { + if (cbt->ins != NULL) { + /* + * If it's not the first entry on the page, move to the + * next entry in the insert list. + */ + if (F_ISSET(cbt, WT_CBT_RET_INSERT)) + F_CLR(cbt, WT_CBT_RET_INSERT); + else + cbt->ins = WT_SKIP_NEXT(cbt->ins); ++cbt->ins_entry_cnt; - cbt->ins = WT_SKIP_NEXT(ins); - - upd = ins->upd; - if (skip || WT_UPDATE_DELETED_ISSET(upd)) - continue; - key->data = WT_INSERT_KEY(ins); - key->size = WT_INSERT_KEY_SIZE(ins); - value->data = WT_UPDATE_DATA(upd); - value->size = upd->size; - return (0); + if ((ins = cbt->ins) != NULL) { + upd = ins->upd; + if (WT_UPDATE_DELETED_ISSET(upd)) + continue; + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + value->data = WT_UPDATE_DATA(upd); + value->size = upd->size; + return (0); + } } - /* Check to see if we've finished with this page. */ - if (cbt->slot == cbt->page->entries) - return (WT_NOTFOUND); + /* + * If we've returned the current slot, move to the next slot + * (first checking to see if we're done with this page). + */ + if (F_ISSET(cbt, WT_CBT_RET_SLOT)) + F_CLR(cbt, WT_CBT_RET_SLOT); + else { + if (cbt->slot == cbt->page->entries - 1) + return (WT_NOTFOUND); + ++cbt->slot; + } /* * Set up for this slot, and any insert list that follows this * slot. */ - rip = &cbt->page->u.row_leaf.d[cbt->slot++]; + rip = &cbt->page->u.row_leaf.d[cbt->slot]; cbt->ins_head = WT_ROW_INSERT(cbt->page, rip); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); cbt->ins_entry_cnt = 0; + F_SET(cbt, WT_CBT_RET_INSERT); /* If the slot has been deleted, we don't have a record. */ upd = WT_ROW_UPDATE(cbt->page, rip); - if (skip || (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))) + if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) continue; /* @@ -252,37 +274,60 @@ int __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt, int next) { WT_INSERT *ins; - WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cbt->iface.session; + cbt->flags = 0; /* - * A cursor that was positioned using a search function is now being - * used for iteration. We don't want the search function to be slow, - * so we put off doing expensive tasks until we know the application - * is going to iterate with the cursor. - * - * First, build a stack of pages in the tree. + * If we're in an insert list and moving forward through the tree, and + * it's the "smaller than any page key" insert list, reset the slot to + * 0, that's our traversal slot, and make sure we return the items on + * it. */ - WT_RET(__wt_walk_set(session, cbt->page, &cbt->walk, 0)); + if (next && cbt->ins_head != NULL && + cbt->ins_head == WT_ROW_INSERT_SMALLEST(cbt->page)) { + F_SET(cbt, WT_CBT_RET_SLOT); + cbt->slot = 0; + } /* - * If we're in an insert list and moving forward through the tree, and + * If we're not in an insert list and moving forward through the tree, + * set up the insert list that follows our current slot, it's the next + * thing we return. + */ + if (next && cbt->ins_head == NULL) { + cbt->ins_head = WT_ROW_INSERT_SLOT(cbt->page, cbt->slot); + cbt->ins = WT_SKIP_FIRST(cbt->ins_head); + F_SET(cbt, WT_CBT_RET_INSERT); + } + + /* + * If we're in an insert list and moving backward through the tree, and * it's the "smaller than any page key" insert list, reset the slot to - * 0, that's our traversal slot going forward. If not the "smallest" - * insert list, increment the page slot because we've logically already - * returned the current slot, it appears before any insert list. + * 0, that's our traversal slot. If it's any other insert list, make + * sure we return the items on it. */ - if (cbt->ins_head != NULL && next) { + if (!next && cbt->ins_head != NULL) { if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(cbt->page)) cbt->slot = 0; else - ++cbt->slot; + F_SET(cbt, WT_CBT_RET_SLOT); + } + + /* + * If we're not in an insert list and moving backward through the tree, + * set up the insert list that precedes our current slot, it's the next + * thing we return. + */ + if (!next && cbt->ins_head == NULL) { + cbt->ins_head = cbt->slot == 0 ? + WT_ROW_INSERT_SMALLEST(cbt->page) : + WT_ROW_INSERT_SLOT(cbt->page, cbt->slot - 1); + F_SET(cbt, WT_CBT_RET_INSERT); } /* * If we're in an insert list, figure out how far in, we have to track - * our current slot in case we reverse direction. + * our current slot for previous traversals. */ cbt->ins_entry_cnt = 0; if (cbt->ins_head != NULL) @@ -292,10 +337,6 @@ __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt, int next) break; } - /* Our page hazard reference is now a tree hazard reference. */ - F_CLR(cbt, WT_CBT_PAGE_RELEASE); - F_SET(cbt, WT_CBT_WALK_RELEASE); - return (0); } @@ -306,9 +347,7 @@ __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt, int next) int __wt_btcur_first(WT_CURSOR_BTREE *cbt) { - __wt_cursor_hazard_clear(cbt); - - cbt->iter_state = WT_CBT_NOTHING; + __wt_cursor_clear(cbt); return (__wt_btcur_next(cbt)); } @@ -322,9 +361,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) { WT_CURSOR *cursor; WT_SESSION_IMPL *session; - int newpage, ret, skip; - - skip = 0; + int newpage, ret; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; @@ -332,31 +369,16 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - switch (cbt->iter_state) { - case WT_CBT_NOTHING: - WT_RET(__wt_walk_first(session, &cbt->walk, 0)); - break; - case WT_CBT_SEARCH: + /* If iterating from a search position, there's some setup to do. */ + if (F_ISSET(cbt, WT_CBT_SEARCH_SET)) WT_RET(__wt_btcur_search_setup(cbt, 1)); - /* - * The iteration functions return the record referenced by the - * cursor, and then move the cursor: setting up a search means - * referencing a record we've already returned (or deleted, or - * whatever), skip that record and move to the next record. - */ - skip = 1; - break; - case WT_CBT_ITERATING: - break; - } - /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the next page, until we reach the end of the * file. */ - for (newpage = 0;; newpage = 1, skip = 0) { + for (newpage = 0;; newpage = 1) { if (cbt->page != NULL) { switch (cbt->page->type) { case WT_PAGE_COL_FIX: @@ -369,7 +391,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) break; case WT_PAGE_ROW_LEAF: ret = __next_row( - cbt, &cursor->key, &cursor->value, skip); + cbt, &cursor->key, &cursor->value, newpage); break; WT_ILLEGAL_FORMAT_ERR(session); } @@ -378,25 +400,108 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) } do { - WT_ERR(__wt_walk_next(session, &cbt->walk, &cbt->page)); + WT_ERR(__wt_xxx_np(session, &cbt->page, 1)); WT_ERR_TEST(cbt->page == NULL, WT_NOTFOUND); } while ( cbt->page->type == WT_PAGE_COL_INT || cbt->page->type == WT_PAGE_ROW_INT); - - switch (cbt->page->type) { - case WT_PAGE_ROW_LEAF: - cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page); - cbt->ins = WT_SKIP_FIRST(cbt->ins_head); - cbt->slot = 0; - break; - } } err: if (ret != 0) return (ret); F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - cbt->iter_state = WT_CBT_ITERATING; + return (0); +} + +/* + * __wt_xxx_np -- + * Move to the next/previous page in the tree. + */ +int +__wt_xxx_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int next) +{ + WT_BTREE *btree; + WT_PAGE *page, *t; + WT_ROW_REF *rref; + uint32_t slot; + + btree = session->btree; + + /* + * Take a copy of any returned page; we have a hazard reference on the + * page, by definition. + */ + page = *pagep; + *pagep = NULL; + + /* If no page is active, begin a walk from the start of the tree. */ + if (page == NULL) { + if ((page = btree->root_page.page) == NULL) + return (WT_ERROR); + slot = 0; + goto descend; + } + + /* If the active page was the root, we've reached the walk's end. */ + if (WT_PAGE_IS_ROOT(page)) + return (0); + + /* + * Swap our hazard reference for the hazard reference of our parent. + * + * We're hazard-reference coupling up the tree and that's OK: first, + * hazard references can't deadlock, so there's none of the usual + * problems found when logically locking up a Btree; second, we don't + * release our current hazard reference until we have our parent's + * hazard reference. If the eviction thread tries to evict the active + * page, that fails because of our hazard reference. If eviction tries + * to evict our parent, that fails because the parent has a child page + * that can't be discarded. + * + * Get the page if it's not the root page; we could access it directly + * because we know it's in memory, but we need a hazard reference. + */ + t = page->parent; + if (!WT_PAGE_IS_ROOT(t)) + WT_RET(__wt_page_in(session, t, t->parent_ref, 0)); + + /* Figure out the currently slot. */ + slot = WT_ROW_REF_SLOT(t, page->parent_ref); + + /* Release our previous hazard reference. */ + __wt_page_release(session, page); + page = t; + + /* + * If we're at the last/first slot on the page, return this page in + * post-order traversal. Otherwise we move to the next/prev slot + * and left/right-most element in its subtree. + */ + if ((next && slot == page->entries - 1) || (!next && slot == 0)) { + *pagep = page; + return (0); + } + if (next) + ++slot; + else + --slot; + +descend: + /* + * We're starting a new subtree on page/slot, descend to the left-most + * item in the subtree, swapping hazard references at each level. + */ + for (;;) { + rref = &page->u.row_int.t[slot]; + WT_RET(__wt_page_in(session, page, &rref->ref, 0)); + __wt_page_release(session, page); + + page = WT_ROW_REF_PAGE(rref); + if (page->type != WT_PAGE_ROW_INT) + break; + slot = next ? 0 : page->entries - 1; + } + *pagep = page; return (0); } |