diff options
author | Keith Bostic <keith@wiredtiger.com> | 2015-03-28 13:15:53 -0400 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2015-03-28 13:15:53 -0400 |
commit | 0c9f1341e2fdb93d3bd4d3fc58176f6ad169825e (patch) | |
tree | fd787a5b5b1953186f08c67872bca74f49cc8ec9 | |
parent | 835bbdfa6a2a0c842237764bd2d8dd3a522ab525 (diff) | |
download | mongo-0c9f1341e2fdb93d3bd4d3fc58176f6ad169825e.tar.gz |
When we find a record in the slot's update skiplist, but then want to
jump past the rest of the deleted records, we have to adjust based on
the starting record of the slot, use the page's repeat array to find
that starting record.
Another run at the __col_insert_search_gt (the greater-than skiplist
search), hopefully it's finally correct.
-rw-r--r-- | dist/s_string.ok | 1 | ||||
-rw-r--r-- | src/btree/bt_curnext.c | 44 | ||||
-rw-r--r-- | src/btree/bt_curprev.c | 43 | ||||
-rw-r--r-- | src/btree/col_srch.c | 2 | ||||
-rw-r--r-- | src/include/column.i | 38 |
5 files changed, 83 insertions, 45 deletions
diff --git a/dist/s_string.ok b/dist/s_string.ok index 4bf793624e0..5e714947d68 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -282,6 +282,7 @@ REQ RET RHEL RLE +RLEs RNG ROCKSDB RPC diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index b9b71087bb9..37ad61247b5 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -169,7 +169,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage) WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; - uint64_t rle; + uint64_t rle, rle_start; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; @@ -191,7 +191,8 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage) __cursor_set_recno(cbt, cbt->recno + 1); new_page: /* Find the matching WT_COL slot. */ - if ((cip = __col_var_search(page, cbt->recno)) == NULL) + if ((cip = + __col_var_search(page, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); @@ -223,23 +224,38 @@ new_page: /* Find the matching WT_COL slot. */ continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) { + if ((rle = __wt_cell_rle(&unpack)) == 1) + continue; + /* * There can be huge gaps in the variable-length * column-store name space appearing as deleted * records. If more than one deleted record, do - * the work of finding the next useful record. - * Note adjustment for the increment done in the - * outer loop. + * the work of finding the next record to return + * instead of looping through the records. + * + * First, find the smallest record in the update + * list that's larger than the current record. */ - if ((rle = __wt_cell_rle(&unpack)) > 1) { - if ((ins = __col_insert_search_gt( - cbt->ins_head, cbt->recno)) == NULL) - cbt->recno += rle - 1; - else - cbt->recno = - WT_MIN(cbt->recno + rle, - WT_INSERT_RECNO(ins)) - 1; - } + ins = __col_insert_search_gt( + cbt->ins_head, cbt->recno); + + /* + * Second, for records with RLEs greater than 1, + * the above call to __col_var_search located + * this record in the page's list of repeating + * records, and returned the starting record. + * The starting record plus the RLE is the + * record to which we could skip, if there was + * no smaller record in the update list. + */ + cbt->recno = rle_start + rle; + if (ins != NULL && + WT_INSERT_RECNO(ins) < cbt->recno) + cbt->recno = WT_INSERT_RECNO(ins); + + /* Adjust for the outer loop increment. */ + --cbt->recno; continue; } WT_RET(__wt_page_cell_data_ref( diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 27f4f87d48e..f6fca0621e0 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -306,7 +306,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; - uint64_t rle; + uint64_t rle_start; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; @@ -329,7 +329,8 @@ new_page: if (cbt->recno < page->pg_var_recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ - if ((cip = __col_var_search(page, cbt->recno)) == NULL) + if ((cip = + __col_var_search(page, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); @@ -361,23 +362,37 @@ new_page: if (cbt->recno < page->pg_var_recno) continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) { + if (__wt_cell_rle(&unpack) == 1) + continue; /* * There can be huge gaps in the variable-length * column-store name space appearing as deleted * records. If more than one deleted record, do - * the work of finding the next useful record. - * Note adjustment for the increment done in the - * outer loop. + * the work of finding the next record to return + * instead of looping through the records. + * + * First, find the largest record in the update + * list that's smaller than the current record. */ - if ((rle = __wt_cell_rle(&unpack)) > 1) { - if ((ins = __col_insert_search_lt( - cbt->ins_head, cbt->recno)) == NULL) - cbt->recno -= rle - 1; - else - cbt->recno = - WT_MAX(cbt->recno - rle, - WT_INSERT_RECNO(ins)) + 1; - } + ins = __col_insert_search_lt( + cbt->ins_head, cbt->recno); + + /* + * Second, for records with RLEs greater than 1, + * the above call to __col_var_search located + * this record in the page's list of repeating + * records, and returned the starting record. + * The starting record - 1 is the record to + * which we could skip, if there was no larger + * record in the update list. + */ + cbt->recno = rle_start - 1; + if (ins != NULL && + WT_INSERT_RECNO(ins) > cbt->recno) + cbt->recno = WT_INSERT_RECNO(ins); + + /* Adjust for the outer loop decrement. */ + ++cbt->recno; continue; } WT_RET(__wt_page_cell_data_ref( diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 4ebdde8674c..df31748e569 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -133,7 +133,7 @@ leaf_only: } else ins_head = WT_COL_UPDATE_SINGLE(page); } else - if ((cip = __col_var_search(page, recno)) == NULL) { + if ((cip = __col_var_search(page, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(page); goto past_end; } else { diff --git a/src/include/column.i b/src/include/column.i index 5c1f4ee36b8..62422d3ddde 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -28,26 +28,29 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ + ins = NULL; for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) - if (*insp != NULL && recno <= WT_INSERT_RECNO(*insp)) { - ins = *insp; + if (*insp != NULL && recno > WT_INSERT_RECNO(*insp)) { + ins = *insp; /* GTE: keep going at this level */ insp = &(*insp)->next[i]; } else { - --i; + --i; /* LT: drop down a level */ --insp; } /* - * If the skiplist consisted of a single record larger than the target, - * we never updated ins and it references that record. Otherwise, ins - * references a record less-than-or-equal to the target, move to a - * record after it, that is, the subsequent record that's greater than - * the target. Because inserts happen concurrently, additional records - * may be inserted after the searched-for record that are still smaller - * than the target, continue to move forward until reaching a record - * larger than the target. There isn't any safety testing because we - * confirmed such a record exists before searching. + * If we didn't find any records smaller than the target, we never set + * the return value, set it to the first record in the list. Otherwise, + * it references a record less-than-or-equal to the target, move to a + * later record, that is, a subsequent record greater than the target. + * Because inserts happen concurrently, additional records might be + * inserted after the searched-for record that are still smaller than + * the target, continue to move forward until reaching a record larger + * than the target. There isn't any safety testing because we confirmed + * such a record exists before searching. */ + if (ins == NULL) + ins = WT_SKIP_FIRST(inshead); while (recno >= WT_INSERT_RECNO(ins)) ins = WT_SKIP_NEXT(ins); return (ins); @@ -77,10 +80,10 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) */ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) if (*insp != NULL && recno < WT_INSERT_RECNO(*insp)) { - ins = *insp; + ins = *insp; /* LT: keep going at this level */ insp = &(*insp)->next[i]; } else { - --i; + --i; /* GTE: drop down a level */ --insp; } @@ -235,7 +238,7 @@ __col_fix_last_recno(WT_PAGE *page) * Search a variable-length column-store page for a record. */ static inline WT_COL * -__col_var_search(WT_PAGE *page, uint64_t recno) +__col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop) { WT_COL_RLE *repeat; uint64_t start_recno; @@ -255,8 +258,11 @@ __col_var_search(WT_PAGE *page, uint64_t recno) repeat = page->pg_var_repeats + indx; if (recno >= repeat->recno && - recno < repeat->recno + repeat->rle) + recno < repeat->recno + repeat->rle) { + if (start_recnop != NULL) + *start_recnop = repeat->recno; return (page->pg_var_d + repeat->indx); + } if (recno < repeat->recno) continue; base = indx + 1; |