diff options
35 files changed, 1083 insertions, 1206 deletions
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4 index 9ca84e7bd9b..d41c4c14fbc 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -3,7 +3,7 @@ dnl build by dist/s_version VERSION_MAJOR=0 VERSION_MINOR=7 VERSION_PATCH=0 -VERSION_STRING='"WiredTiger 0.7.0: (September 6, 2011)"' +VERSION_STRING='"WiredTiger 0.7.0: (September 11, 2011)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) diff --git a/dist/api_data.py b/dist/api_data.py index e68547c9de0..f0411456d05 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -61,14 +61,6 @@ file_meta = format_meta + [ Config('allocation_size', '512B', r''' file unit allocation size, in bytes''', min='512B', max='128MB'), - Config('column_internal_extend', '10000', r''' - configure the number of records a column-store internal page is - extended by when records are appended''', - min='500', max='10M'), - Config('column_leaf_extend', '10000', r''' - configure the number of records a column-store leaf page is - extended by when records are appended''', - min='500', max='10M'), Config('huffman_key', '', r''' use Huffman encoding for Btree keys. Permitted values are empty (off), \c "english" or \c "<filename>". 
See @ref huffman diff --git a/dist/filelist b/dist/filelist index 91bd06053d8..e873ef35505 100644 --- a/dist/filelist +++ b/dist/filelist @@ -40,7 +40,6 @@ src/btree/bt_sync.c src/btree/bt_vrfy.c src/btree/bt_vrfy_dsk.c src/btree/bt_walk.c -src/btree/col_extend.c src/btree/col_modify.c src/btree/col_srch.c src/btree/row_key.c diff --git a/dist/serial.py b/dist/serial.py index 7136b2f415f..c9520631fa4 100644 --- a/dist/serial.py +++ b/dist/serial.py @@ -17,23 +17,21 @@ class Serial: self.args = args msgtypes = [ +Serial('append', 'WT_WORKQ_FUNC', 1, [ + SerialArg('WT_INSERT_HEAD **', 'inshead'), + SerialArg('WT_INSERT ***', 'ins_stack'), + SerialArg('WT_INSERT_HEAD **', 'new_inslist', 1), + SerialArg('WT_INSERT_HEAD *', 'new_inshead', 1), + SerialArg('WT_INSERT *', 'new_ins', 0), + SerialArg('u_int', 'skipdepth'), + ]), + Serial('cache_read', 'WT_WORKQ_READ', 0, [ SerialArg('WT_PAGE *', 'parent'), SerialArg('WT_REF *', 'parent_ref'), SerialArg('int', 'dsk_verify'), ]), -Serial('col_extend', 'WT_WORKQ_FUNC', 1, [ - SerialArg('WT_PAGE *', 'page'), - SerialArg('WT_PAGE *', 'new_intl', 1), - SerialArg('WT_COL_REF *', 't', 1), - SerialArg('uint32_t', 'internal_extend'), - SerialArg('WT_PAGE *', 'new_leaf', 1), - SerialArg('void *', 'entries', 1), - SerialArg('uint32_t', 'leaf_extend'), - SerialArg('uint64_t', 'recno'), - ]), - Serial('evict_file', 'WT_WORKQ_EVICT', 0, [ SerialArg('int', 'close_method'), ]), @@ -45,8 +43,8 @@ Serial('insert', 'WT_WORKQ_FUNC', 1, [ SerialArg('WT_INSERT ***', 'ins_stack'), SerialArg('WT_INSERT_HEAD **', 'new_inslist', 1), SerialArg('WT_INSERT_HEAD *', 'new_inshead', 1), - SerialArg('WT_INSERT *', 'ins', 1), - SerialArg('uint32_t', 'depth'), + SerialArg('WT_INSERT *', 'new_ins', 0), + SerialArg('u_int', 'skipdepth'), ]), Serial('row_key', 'WT_WORKQ_FUNC', 1, [ @@ -60,7 +58,7 @@ Serial('update', 'WT_WORKQ_FUNC', 1, [ SerialArg('uint32_t', 'write_gen'), SerialArg('WT_UPDATE **', 'srch_upd'), SerialArg('WT_UPDATE **', 'new_upd', 1), - 
SerialArg('WT_UPDATE *', 'upd', 1), + SerialArg('WT_UPDATE *', 'upd', 0), ]), ] diff --git a/src/api/config_def.c b/src/api/config_def.c index f8203f36dd8..96d3a752683 100644 --- a/src/api/config_def.c +++ b/src/api/config_def.c @@ -76,26 +76,23 @@ __wt_confchk_cursor_close = const char * __wt_confdfl_file_meta = - "allocation_size=512B,block_compressor=,column_internal_extend=10000," - "column_leaf_extend=10000,columns=,huffman_key=,huffman_value=," - "internal_key_truncate=true,internal_node_max=2KB,internal_node_min=2KB," - "key_format=u,key_gap=10,leaf_node_max=1MB,leaf_node_min=32KB," - "prefix_compression=true,split_min=false,split_pct=75,type=btree," - "value_format=u"; + "allocation_size=512B,block_compressor=,columns=,huffman_key=," + "huffman_value=,internal_key_truncate=true,internal_node_max=2KB," + "internal_node_min=2KB,key_format=u,key_gap=10,leaf_node_max=1MB," + "leaf_node_min=32KB,prefix_compression=true,split_min=false,split_pct=75," + "type=btree,value_format=u"; const char * __wt_confchk_file_meta = "allocation_size=(type=int,min=512B,max=128MB),block_compressor=()," - "column_internal_extend=(type=int,min=500,max=10M)," - "column_leaf_extend=(type=int,min=500,max=10M),columns=(type=list)," - "huffman_key=(),huffman_value=(),internal_key_truncate=(type=boolean)," - "internal_node_max=(type=int,min=512B,max=512MB)," - "internal_node_min=(type=int,min=512B,max=512MB),key_format=(type=format)" - ",key_gap=(type=int,min=0),leaf_node_max=(type=int,min=512B,max=512MB)," - "leaf_node_min=(type=int,min=512B,max=512MB)," - "prefix_compression=(type=boolean),split_min=(type=boolean)," - "split_pct=(type=int,min=0,max=100),type=(choices=[\"btree\"])," - "value_format=(type=format)"; + "columns=(type=list),huffman_key=(),huffman_value=()," + "internal_key_truncate=(type=boolean),internal_node_max=(type=int," + "min=512B,max=512MB),internal_node_min=(type=int,min=512B,max=512MB)," + "key_format=(type=format),key_gap=(type=int,min=0)," + 
"leaf_node_max=(type=int,min=512B,max=512MB),leaf_node_min=(type=int," + "min=512B,max=512MB),prefix_compression=(type=boolean)," + "split_min=(type=boolean),split_pct=(type=int,min=0,max=100)," + "type=(choices=[\"btree\"]),value_format=(type=format)"; const char * __wt_confdfl_index_meta = @@ -144,8 +141,7 @@ __wt_confchk_session_commit_transaction = const char * __wt_confdfl_session_create = - "allocation_size=512B,block_compressor=,colgroups=," - "column_internal_extend=10000,column_leaf_extend=10000,columns=,columns=," + "allocation_size=512B,block_compressor=,colgroups=,columns=,columns=," "exclusive=false,filename=,huffman_key=,huffman_value=," "internal_key_truncate=true,internal_node_max=2KB,internal_node_min=2KB," "key_format=u,key_format=u,key_gap=10,leaf_node_max=1MB," @@ -155,18 +151,16 @@ __wt_confdfl_session_create = const char * __wt_confchk_session_create = "allocation_size=(type=int,min=512B,max=128MB),block_compressor=()," - "colgroups=(),column_internal_extend=(type=int,min=500,max=10M)," - "column_leaf_extend=(type=int,min=500,max=10M),columns=(type=list)," - "columns=(type=list),exclusive=(type=boolean),filename=(),huffman_key=()," - "huffman_value=(),internal_key_truncate=(type=boolean)," - "internal_node_max=(type=int,min=512B,max=512MB)," - "internal_node_min=(type=int,min=512B,max=512MB),key_format=(type=format)" - ",key_format=(type=format),key_gap=(type=int,min=0)," - "leaf_node_max=(type=int,min=512B,max=512MB),leaf_node_min=(type=int," - "min=512B,max=512MB),prefix_compression=(type=boolean)," - "split_min=(type=boolean),split_pct=(type=int,min=0,max=100)," - "type=(choices=[\"btree\"]),value_format=(type=format)," - "value_format=(type=format)"; + "colgroups=(),columns=(type=list),columns=(type=list)," + "exclusive=(type=boolean),filename=(),huffman_key=(),huffman_value=()," + "internal_key_truncate=(type=boolean),internal_node_max=(type=int," + "min=512B,max=512MB),internal_node_min=(type=int,min=512B,max=512MB)," + 
"key_format=(type=format),key_format=(type=format),key_gap=(type=int," + "min=0),leaf_node_max=(type=int,min=512B,max=512MB)," + "leaf_node_min=(type=int,min=512B,max=512MB)," + "prefix_compression=(type=boolean),split_min=(type=boolean)," + "split_pct=(type=int,min=0,max=100),type=(choices=[\"btree\"])," + "value_format=(type=format),value_format=(type=format)"; const char * __wt_confdfl_session_drop = diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c index 95d7be5deb4..15479bcb41f 100644 --- a/src/btree/bt_bulk.c +++ b/src/btree/bt_bulk.c @@ -36,6 +36,8 @@ __wt_bulk_init(WT_CURSOR_BULK *cbulk) if (F_ISSET(btree->root_page.page, WT_PAGE_INITIAL_EMPTY)) { btree->root_page.state = WT_REF_DISK; __wt_free(session, btree->root_page.page); + + btree->last_page = NULL; } else { __wt_errx( session, "bulk-load is only possible for empty trees"); @@ -158,8 +160,7 @@ __bulk_col_var(WT_CURSOR_BULK *cbulk) * Allocate an WT_UPDATE item and append the V object onto the page's * update list. */ - WT_RET(__wt_update_alloc( - session, (WT_ITEM *)&cursor->value, &upd, NULL)); + WT_RET(__wt_update_alloc(session, (WT_ITEM *)&cursor->value, &upd)); (*cbulk->updp) = upd; cbulk->updp = &upd->next; @@ -196,10 +197,10 @@ __bulk_row(WT_CURSOR_BULK *cbulk) * Allocate a WT_INSERT/WT_UPDATE pair and append the K/V pair onto the * page's insert list. 
*/ - WT_RET(__wt_row_insert_alloc( - session, (WT_ITEM *)&cursor->key, 1, &ins, NULL)); - WT_ERR(__wt_update_alloc( - session, (WT_ITEM *)&cursor->value, &ins->upd, NULL)); + WT_RET( + __wt_row_insert_alloc(session, (WT_ITEM *)&cursor->key, 1, &ins)); + WT_ERR( + __wt_update_alloc(session, (WT_ITEM *)&cursor->value, &ins->upd)); *cbulk->insp = ins; cbulk->insp = &WT_SKIP_NEXT(ins); diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 7ec71be6d48..c50f3c045cc 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -8,6 +8,42 @@ #include "wt_internal.h" /* + * __cursor_col_append_next -- + * Return the next entry on the append list. + */ +static inline int +__cursor_col_append_next(WT_CURSOR_BTREE *cbt, int newpage) +{ + WT_BUF *val; + + val = &cbt->iface.value; + + if (newpage) { + cbt->ins_entry_cnt = 1; + goto new_page; + } + + for (;;) { + if ((cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL) + return (WT_NOTFOUND); + ++cbt->ins_entry_cnt; + +new_page: if (cbt->page->type == WT_PAGE_COL_FIX) { + val->data = WT_UPDATE_DATA(cbt->ins->upd); + val->size = 1; + break; + } else { + if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd)) + continue; + val->data = WT_UPDATE_DATA(cbt->ins->upd); + val->size = cbt->ins->upd->size; + break; + } + } + return (0); +} + +/* * __cursor_fix_next -- * Move to the next, fixed-length column-store item. */ @@ -29,6 +65,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage) if (newpage) { cbt->ins = WT_SKIP_FIRST(WT_COL_INSERT_SINGLE(cbt->page)); cbt->recno = cbt->page->u.col_leaf.recno; + cbt->last_standard_recno = __col_last_recno(cbt->page); goto new_page; } @@ -41,8 +78,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage) /* Move to the next entry and return the item. 
*/ for (;;) { - if (cbt->recno >= - cbt->page->u.col_leaf.recno + (cbt->page->entries - 1)) + if (cbt->recno >= cbt->last_standard_recno) return (WT_NOTFOUND); ++cbt->recno; new_page: *recnop = cbt->recno; @@ -104,18 +140,20 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage) /* Initialize for each new page. */ if (newpage) { cbt->recno = cbt->page->u.col_leaf.recno; + cbt->last_standard_recno = __col_last_recno(cbt->page); cbt->vslot = UINT32_MAX; goto new_page; } /* Move to the next entry and return the item. */ for (;;) { + if (cbt->recno >= cbt->last_standard_recno) + return (WT_NOTFOUND); ++cbt->recno; new_page: *recnop = cbt->recno; /* Find the matching WT_COL slot. */ - if ((cip = - __cursor_col_rle_search(cbt->page, cbt->recno)) == NULL) + if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL) return (WT_NOTFOUND); slot = WT_COL_SLOT(cbt->page, cip); @@ -273,44 +311,71 @@ new_insert: if (cbt->ins != NULL) { } /* - * __wt_btcur_search_setup -- - * Initialize a cursor for iteration based on a search. + * __wt_btcur_iterate_setup -- + * Initialize a cursor for iteration, usually based on a search. */ -int -__wt_btcur_search_setup(WT_CURSOR_BTREE *cbt) +void +__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next) { WT_INSERT *ins; + WT_PAGE *page; - if (cbt->page->type != WT_PAGE_ROW_LEAF) - return (0); + WT_UNUSED(next); /* - * For row-store pages, we need a single item that tells us the part - * of the page we're walking (otherwise switching from next to prev - * and vice-versa is just too complicated), so we map the WT_ROW and - * WT_INSERT_HEAD array slots into a single name space: slot 1 is the - * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is - * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are - * odd-numbered slots, and WT_ROW array slots are even-numbered slots. - * - * !!! - * I'm re-using WT_CURSOR_BTREE->slot for this purpose, which means that - * WT_CURSOR_BTREE->slot is now useless outside of cursor next/prev. 
If - * that turns out to be a bad idea because we need the original value of - * WT_CURSOR_BTREE->slot after a next/prev call, switch to another field - * to hold the iteration slot. + * We don't currently have to do any setup when we switch between next + * and prev calls, but I'm sure we will someday -- I'm leaving support + * here for both flags for that reason. */ - cbt->slot = (cbt->slot + 1) * 2; - if (cbt->ins_head != NULL) { - if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(cbt->page)) - cbt->slot = 1; - else - cbt->slot += 1; + F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV); + + /* If we don't have a search page, then we're done. */ + if ((page = cbt->page) == NULL) + return; + + if (page->type == WT_PAGE_ROW_LEAF) { + /* + * For row-store pages, we need a single item that tells us the + * part of the page we're walking (otherwise switching from next + * to prev and vice-versa is just too complicated), so we map + * the WT_ROW and WT_INSERT_HEAD array slots into a single name + * space: slot 1 is the "smallest key insert list", slot 2 is + * WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This + * means WT_INSERT lists are odd-numbered slots, and WT_ROW + * array slots are even-numbered slots. + * + * !!! + * I'm re-using WT_CURSOR_BTREE->slot for this purpose, which + * means that WT_CURSOR_BTREE->slot is now useless outside of + * cursor next/prev. If that turns out to be a bad idea because + * we need the original value of WT_CURSOR_BTREE->slot after a + * next/prev call, switch to another field to hold the iteration + * slot. + */ + cbt->slot = (cbt->slot + 1) * 2; + if (cbt->ins_head != NULL) { + if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page)) + cbt->slot = 1; + else + cbt->slot += 1; + } + } else { + /* + * For column-store pages, calculate the largest record on the + * page. + */ + cbt->last_standard_recno = __col_last_recno(page); + + /* If we're traversing the append list, set the reference. 
*/ + if (cbt->ins_head != NULL && + cbt->ins_head == WT_COL_INSERT_APPEND(page)) + F_SET(cbt, WT_CBT_ITERATE_APPEND); } /* - * If we're in an insert list, figure out how far in, we have to track - * our current slot for previous traversals. + * If we're in a row-store insert list or a column-store append list, + * figure out how far in, we have to track the current slot for prev + * traversals. */ cbt->ins_entry_cnt = 0; if (cbt->ins_head != NULL) @@ -319,9 +384,6 @@ __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt) if (ins == cbt->ins) break; } - - F_CLR(cbt, WT_CBT_SEARCH_SET); - return (0); } /* @@ -334,11 +396,10 @@ __wt_btcur_first(WT_CURSOR_BTREE *cbt) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; - WT_BSTAT_INCR(session, cursor_first); - __cursor_func_clear(cbt, 1); - F_CLR(cbt, WT_CBT_SEARCH_SET); + __cursor_func_init(cbt, 1); + F_SET(cbt, WT_CBT_ITERATE_NEXT); return (__wt_btcur_next(cbt)); } @@ -350,19 +411,20 @@ __wt_btcur_first(WT_CURSOR_BTREE *cbt) int __wt_btcur_next(WT_CURSOR_BTREE *cbt) { - WT_CURSOR *cursor; WT_SESSION_IMPL *session; int newpage, ret; - cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = (WT_SESSION_IMPL *)cbt->iface.session; WT_BSTAT_INCR(session, cursor_read_next); - __cursor_func_clear(cbt, 0); + __cursor_func_init(cbt, 0); - /* If iterating from a search position, there's some setup to do. */ - if (F_ISSET(cbt, WT_CBT_SEARCH_SET)) - WT_RET(__wt_btcur_search_setup(cbt)); + /* + * If we aren't already iterating in the right direction, there's + * some setup to do. + */ + if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) + __wt_btcur_iterate_setup(cbt, 1); /* * Walk any page we're holding until the underlying call returns not- @@ -370,7 +432,13 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) * file. 
*/ for (newpage = 0;; newpage = 1) { - if (cbt->page != NULL) { + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + if ((ret = __cursor_col_append_next(cbt, newpage)) == 0) + break; + F_CLR(cbt, WT_CBT_ITERATE_APPEND); + if (ret != WT_NOTFOUND) + break; + } else if (cbt->page != NULL) { switch (cbt->page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); @@ -385,6 +453,18 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) } if (ret != WT_NOTFOUND) break; + + /* + * The last page in a column-store has appended entries. + * We handle it separately from the usual cursor code: + * it's only that one page and it's in a simple format. + */ + if (cbt->page->type != WT_PAGE_ROW_LEAF && + (cbt->ins = WT_SKIP_FIRST( + WT_COL_INSERT_APPEND(cbt->page))) != NULL) { + F_SET(cbt, WT_CBT_ITERATE_APPEND); + continue; + } } do { @@ -395,6 +475,6 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt) cbt->page->type == WT_PAGE_ROW_INT); } -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 65b8160d60f..d7e70b0a4be 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -8,43 +8,49 @@ #include "wt_internal.h" /* - * __search_insert -- - * Search an insert list. + * __cursor_col_append_prev -- + * Return the previous entry on the append list. */ -static inline WT_INSERT * -__search_insert(WT_INSERT_HEAD *inshead, uint64_t recno) +static inline int +__cursor_col_append_prev(WT_CURSOR_BTREE *cbt, int newpage) { - WT_INSERT **ins; - uint64_t ins_recno; - int cmp, i; + WT_BUF *val; + WT_INSERT *ins; + uint32_t i; - /* If there's no insert chain to search, we're done. */ - if (inshead == NULL) - return (NULL); + val = &cbt->iface.value; - /* - * The insert list is a skip list: start at the highest skip level, then - * go as far as possible at each level before stepping down to the next. 
- */ - for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) { - if (*ins == NULL) - cmp = -1; - else { - ins_recno = WT_INSERT_RECNO(*ins); - cmp = (recno == ins_recno) ? 0 : - (recno < ins_recno) ? -1 : 1; + if (newpage) { + cbt->ins_entry_cnt = 0; + WT_SKIP_FOREACH(ins, cbt->ins_head) + ++cbt->ins_entry_cnt; + goto new_page; + } + + for (;;) { + if (--cbt->ins_entry_cnt == 0) { + F_CLR(cbt, WT_CBT_ITERATE_APPEND); + return (WT_NOTFOUND); } - if (cmp == 0) /* Exact match: return */ - return (*ins); - else if (cmp > 0) /* Keep going at this level */ - ins = &(*ins)->next[i]; - else { /* Drop down a level */ - --i; - --ins; + +new_page: for (i = cbt->ins_entry_cnt, + ins = WT_SKIP_FIRST(cbt->ins_head); i > 1; --i) + ins = WT_SKIP_NEXT(ins); + cbt->ins = ins; + + if (cbt->page->type == WT_PAGE_COL_FIX) { + val->data = WT_UPDATE_DATA(cbt->ins->upd); + val->size = 1; + break; + } else { + if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd)) + continue; + val->data = WT_UPDATE_DATA(cbt->ins->upd); + val->size = cbt->ins->upd->size; + break; } } - - return (NULL); + return (0); } /* @@ -74,8 +80,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage) /* Initialize for each new page. */ if (newpage) { - cbt->recno = - cbt->page->u.col_leaf.recno + (cbt->page->entries - 1); + cbt->recno = __col_last_recno(cbt->page); goto new_page; } @@ -92,7 +97,7 @@ new_page: *recnop = cbt->recno; * to search the entire list. We use the skiplist structure, * rather than doing it linearly. */ - if ((ins = __search_insert( + if ((ins = __col_insert_search( WT_COL_INSERT_SINGLE(cbt->page), cbt->recno)) != NULL) { val->data = WT_UPDATE_DATA(ins->upd); val->size = 1; @@ -135,7 +140,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) /* Initialize for each new page. 
*/ if (newpage) { - cbt->recno = __cursor_col_rle_last(cbt->page); + cbt->recno = __col_last_recno(cbt->page); cbt->vslot = UINT32_MAX; goto new_page; } @@ -148,8 +153,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) new_page: *recnop = cbt->recno; /* Find the matching WT_COL slot. */ - if ((cip = - __cursor_col_rle_search(cbt->page, cbt->recno)) == NULL) + if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL) return (WT_NOTFOUND); slot = WT_COL_SLOT(cbt->page, cip); @@ -159,7 +163,7 @@ new_page: *recnop = cbt->recno; * to search the entire list. We use the skiplist structure, * rather than doing it linearly. */ - if ((ins = __search_insert( + if ((ins = __col_insert_search( WT_COL_INSERT(cbt->page, cip), cbt->recno)) != NULL) { if (WT_UPDATE_DELETED_ISSET(ins->upd)) continue; @@ -228,8 +232,11 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage) * New page configuration. */ if (newpage) { - cbt->ins_head = WT_ROW_INSERT_SLOT(cbt->page, - (cbt->page->entries > 0) ? cbt->page->entries - 1 : 0); + if (cbt->page->entries == 0) + cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page); + else + cbt->ins_head = WT_ROW_INSERT_SLOT( + cbt->page, cbt->page->entries - 1); cbt->ins_entry_cnt = 0; WT_SKIP_FOREACH(ins, cbt->ins_head) ++cbt->ins_entry_cnt; @@ -309,8 +316,8 @@ __wt_btcur_last(WT_CURSOR_BTREE *cbt) WT_BSTAT_INCR(session, cursor_last); - __cursor_func_clear(cbt, 1); - F_CLR(cbt, WT_CBT_SEARCH_SET); + __cursor_func_init(cbt, 1); + F_SET(cbt, WT_CBT_ITERATE_PREV); return (__wt_btcur_prev(cbt)); } @@ -322,19 +329,20 @@ __wt_btcur_last(WT_CURSOR_BTREE *cbt) int __wt_btcur_prev(WT_CURSOR_BTREE *cbt) { - WT_CURSOR *cursor; WT_SESSION_IMPL *session; int newpage, ret; - cursor = &cbt->iface; - session = (WT_SESSION_IMPL *)cursor->session; + session = (WT_SESSION_IMPL *)cbt->iface.session; WT_BSTAT_INCR(session, cursor_read_prev); - __cursor_func_clear(cbt, 0); + __cursor_func_init(cbt, 0); - /* If iterating from a search position, there's some setup to do. 
*/ - if (F_ISSET(cbt, WT_CBT_SEARCH_SET)) - WT_RET(__wt_btcur_search_setup(cbt)); + /* + * If we aren't already iterating in the right direction, there's + * some setup to do. + */ + if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) + __wt_btcur_iterate_setup(cbt, 0); /* * Walk any page we're holding until the underlying call returns not- @@ -342,6 +350,14 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt) * of the file. */ for (newpage = 0;; newpage = 1) { + if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { + if ((ret = __cursor_col_append_prev(cbt, newpage)) == 0) + break; + F_CLR(cbt, WT_CBT_ITERATE_APPEND); + if (ret != WT_NOTFOUND) + break; + newpage = 1; + } if (cbt->page != NULL) { switch (cbt->page->type) { case WT_PAGE_COL_FIX: @@ -365,8 +381,18 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt) } while ( cbt->page->type == WT_PAGE_COL_INT || cbt->page->type == WT_PAGE_ROW_INT); + + /* + * The last page in a column-store has appended entries. + * We handle it separately from the usual cursor code: + * it's only that one page and it's in a simple format. + */ + if (cbt->page->type != WT_PAGE_ROW_LEAF && + (cbt->ins = WT_SKIP_FIRST( + WT_COL_INSERT_APPEND(cbt->page))) != NULL) + F_SET(cbt, WT_CBT_ITERATE_APPEND); } -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 951eaa5064b..13126d8a563 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -74,7 +74,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_BSTAT_INCR(session, cursor_read); - __cursor_func_clear(cbt, 1); + __cursor_func_init(cbt, 1); WT_ERR(btree->type == BTREE_ROW ? 
__wt_row_search(session, cbt, 0) : @@ -84,7 +84,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) else ret = __wt_kv_return(session, cbt, 0); -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } @@ -117,7 +117,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact) * If we find an exact match, or the search key is smaller than the tree * key, and the tree key has not been deleted, return the tree key. */ - __cursor_func_clear(cbt, 1); + __cursor_func_init(cbt, 1); WT_ERR(srch(session, cbt, 0)); if (cbt->compare == 0 || cbt->compare == 1) if (!__cursor_invalid(cbt)) { @@ -142,7 +142,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact) * a subsequent record. If we don't find a previous record, there's no * record to return, quit. */ - __cursor_func_clear(cbt, 1); + __cursor_func_init(cbt, 1); WT_ERR(srch(session, cbt, 0)); if (!__cursor_invalid(cbt)) { *exact = cbt->compare; @@ -154,7 +154,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact) ret = __wt_btcur_prev(cbt); err: -done: __cursor_func_set(cbt, ret); +done: __cursor_func_resolve(cbt, ret); return (ret); } @@ -176,7 +176,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cursor->session; WT_BSTAT_INCR(session, cursor_inserts); -retry: __cursor_func_clear(cbt, 1); +retry: __cursor_func_init(cbt, 1); switch (btree->type) { case BTREE_COL_FIX: @@ -190,15 +190,22 @@ retry: __cursor_func_clear(cbt, 1); /* FALLTHROUGH */ case BTREE_COL_VAR: /* - * Insert in column stores allocates a new key (ignoring the - * application's key), and creates a new record. - * - * XXX - * This semantic not yet implemented. + * If WT_CURSTD_OVERWRITE set, insert/update the application + * specified record, otherwise insert a new record (ignoring + * the application's record number), return the record number + * to the application. 
*/ - WT_ERR(__wt_col_search(session, cbt, 1)); - if ((ret = __wt_col_modify(session, cbt, 0)) == WT_RESTART) - goto retry; + if (F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + WT_ERR(__wt_col_search(session, cbt, 1)); + if ((ret = __wt_col_modify(session, + cbt, cbt->compare == 0 ? 3 : 1)) == WT_RESTART) + goto retry; + } else { + if ((ret = + __wt_col_modify(session, cbt, 1)) == WT_RESTART) + goto retry; + cbt->iface.recno = cbt->recno; + } break; case BTREE_ROW: /* @@ -206,22 +213,20 @@ retry: __cursor_func_clear(cbt, 1); * configuration "overwrite" not set), otherwise creates * a new record. */ - while ((ret = __wt_row_search(session, cbt, 1)) == WT_RESTART) - ; - if (ret == 0) { - if (cbt->compare == 0 && !__cursor_invalid(cbt) && - !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) - ret = WT_DUPLICATE_KEY; - else - if ((ret = __wt_row_modify( - session, cbt, 0)) == WT_RESTART) - goto retry; + WT_ERR(__wt_row_search(session, cbt, 1)); + if (cbt->compare == 0 && + !__cursor_invalid(cbt) && + !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + ret = WT_DUPLICATE_KEY; + break; } + if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART) + goto retry; break; WT_ILLEGAL_FORMAT(session); } -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } @@ -243,18 +248,20 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cursor->session; WT_BSTAT_INCR(session, cursor_removes); -retry: __cursor_func_clear(cbt, 1); +retry: __cursor_func_init(cbt, 1); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: + /* Remove the record if it exists. */ WT_ERR(__wt_col_search(session, cbt, 1)); if (cbt->compare != 0 || __cursor_invalid(cbt)) ret = WT_NOTFOUND; - else if ((ret = __wt_col_modify(session, cbt, 1)) == WT_RESTART) + else if ((ret = __wt_col_modify(session, cbt, 2)) == WT_RESTART) goto retry; break; case BTREE_ROW: + /* Remove the record if it exists. 
*/ WT_ERR(__wt_row_search(session, cbt, 1)); if (cbt->compare != 0 || __cursor_invalid(cbt)) ret = WT_NOTFOUND; @@ -264,7 +271,7 @@ retry: __cursor_func_clear(cbt, 1); WT_ILLEGAL_FORMAT(session); } -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } @@ -286,7 +293,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cursor->session; WT_BSTAT_INCR(session, cursor_updates); -retry: __cursor_func_clear(cbt, 1); +retry: __cursor_func_init(cbt, 1); switch (btree->type) { case BTREE_COL_FIX: @@ -299,16 +306,15 @@ retry: __cursor_func_clear(cbt, 1); } /* FALLTHROUGH */ case BTREE_COL_VAR: - /* Update in column stores is an unconditional overwrite. */ + /* Update the record if it exists. */ WT_ERR(__wt_col_search(session, cbt, 1)); - if ((ret = __wt_col_modify(session, cbt, 0)) == WT_RESTART) + if (cbt->compare != 0 || __cursor_invalid(cbt)) + ret = WT_NOTFOUND; + else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART) goto retry; break; case BTREE_ROW: - /* - * Update in row stores fails if the key doesn't exist, else - * overwrites the value. - */ + /* Update the record if it exists. 
*/ WT_ERR(__wt_row_search(session, cbt, 1)); if (cbt->compare != 0 || __cursor_invalid(cbt)) ret = WT_NOTFOUND; @@ -318,7 +324,7 @@ retry: __cursor_func_clear(cbt, 1); WT_ILLEGAL_FORMAT(session); } -err: __cursor_func_set(cbt, ret); +err: __cursor_func_resolve(cbt, ret); return (ret); } @@ -335,7 +341,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, const char *config) WT_UNUSED(config); session = (WT_SESSION_IMPL *)cbt->iface.session; - __cursor_func_clear(cbt, 1); + __cursor_func_init(cbt, 1); __wt_buf_free(session, &cbt->value); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 96c79eb21e2..c7d72cd87c1 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -25,7 +25,7 @@ typedef struct { static void __debug_byte_string(WT_DBG *, const uint8_t *, size_t); static int __debug_cell(WT_DBG *, WT_CELL_UNPACK *); static int __debug_cell_data(WT_DBG *, const char *, WT_CELL_UNPACK *); -static void __debug_col_insert(WT_DBG *, WT_INSERT_HEAD *, int); +static void __debug_col_list(WT_DBG *, WT_INSERT_HEAD *, const char *, int); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, WT_PAGE_DISK *); static void __debug_dsk_col_fix(WT_DBG *, WT_PAGE_DISK *); @@ -39,7 +39,7 @@ static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); static int __debug_page_work(WT_DBG *, WT_PAGE *, uint32_t); static void __debug_ref(WT_DBG *, WT_REF *); -static void __debug_row_insert(WT_DBG *, WT_INSERT_HEAD *); +static void __debug_row_list(WT_DBG *, WT_INSERT_HEAD *); static void __debug_update(WT_DBG *, WT_UPDATE *, int); static void __dmsg(WT_DBG *, const char *, ...) 
WT_GCC_ATTRIBUTE((format (printf, 2, 3))); @@ -415,7 +415,6 @@ static void __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) { WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_INSERT *ins; WT_PAGE_DISK *dsk; WT_SESSION_IMPL *session; @@ -425,7 +424,6 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) session = ds->session; btree = session->btree; - conn = S2C(session); dsk = page->dsk; recno = page->u.col_leaf.recno; @@ -447,8 +445,15 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) ++recno; } } - __dmsg(ds, "%s\n", conn->sep); - __debug_col_insert(ds, WT_COL_INSERT_SINGLE(page), 1); + + if (WT_COL_INSERT_SINGLE(page) != NULL) { + __dmsg(ds, "%s\n", S2C(session)->sep); + __debug_col_list(ds, WT_COL_INSERT_SINGLE(page), "insert", 1); + } + if (WT_COL_INSERT_APPEND(page) != NULL) { + __dmsg(ds, "%s\n", S2C(session)->sep); + __debug_col_list(ds, WT_COL_INSERT_APPEND(page), "append", 1); + } } /* @@ -487,10 +492,12 @@ __debug_page_col_var(WT_DBG *ds, WT_PAGE *page) WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT_HEAD *inshead; + WT_SESSION_IMPL *session; uint64_t recno, rle; uint32_t i; char tag[64]; + session = ds->session; unpack = &_unpack; recno = page->u.col_leaf.recno; @@ -506,9 +513,15 @@ __debug_page_col_var(WT_DBG *ds, WT_PAGE *page) WT_RET(__debug_cell_data(ds, tag, unpack)); if ((inshead = WT_COL_INSERT(page, cip)) != NULL) - __debug_col_insert(ds, inshead, 0); + __debug_col_list(ds, inshead, "update", 0); recno += rle; } + + if (WT_COL_INSERT_APPEND(page) != NULL) { + __dmsg(ds, "%s\n", S2C(session)->sep); + __debug_col_list(ds, WT_COL_INSERT_APPEND(page), "append", 0); + } + return (0); } @@ -559,7 +572,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) * key on the page. */ if ((inshead = WT_ROW_INSERT_SMALLEST(page)) != NULL) - __debug_row_insert(ds, inshead); + __debug_row_list(ds, inshead); /* Dump the page's K/V pairs. 
*/ WT_ROW_FOREACH(page, rip, i) { @@ -581,34 +594,35 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page) __debug_update(ds, upd, 0); if ((inshead = WT_ROW_INSERT(page, rip)) != NULL) - __debug_row_insert(ds, inshead); + __debug_row_list(ds, inshead); } return (0); } /* - * __debug_col_insert -- - * Dump a column-store insert array. + * __debug_col_list -- + * Dump a column-store skiplist. */ static void -__debug_col_insert(WT_DBG *ds, WT_INSERT_HEAD *inshead, int hexbyte) +__debug_col_list( + WT_DBG *ds, WT_INSERT_HEAD *inshead, const char *tag, int hexbyte) { WT_INSERT *ins; WT_SKIP_FOREACH(ins, inshead) { __dmsg(ds, - "\tinsert %" PRIu64 "\n", WT_INSERT_RECNO(ins)); + "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins)); __debug_update(ds, ins->upd, hexbyte); } } /* - * __debug_row_insert -- + * __debug_row_list -- * Dump an insert array. */ static void -__debug_row_insert(WT_DBG *ds, WT_INSERT_HEAD *inshead) +__debug_row_list(WT_DBG *ds, WT_INSERT_HEAD *inshead) { WT_INSERT *ins; diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index b250b937761..8c6ea22abb4 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -208,9 +208,8 @@ __free_insert( */ for (insheadp = insert_head; entries > 0; --entries, ++insheadp) if (*insheadp != NULL) { - __free_insert_list(session, - WT_SKIP_FIRST(*insheadp)); - __wt_sb_free(session, (*insheadp)->sb); + __free_insert_list(session, WT_SKIP_FIRST(*insheadp)); + __wt_free(session, *insheadp); } /* Free the page's array of inserts. 
*/ diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index 2038859bbaa..0576c52c50f 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -61,11 +61,10 @@ __evict_clr(WT_EVICT_LIST *e) static inline void __evict_req_set(WT_SESSION_IMPL *session, WT_EVICT_REQ *r, int close_method) { + /* Should be empty */ + WT_ASSERT(session, r->session == NULL); + r->close_method = close_method; - WT_ASSERT(session, r->retry == NULL); - WT_ASSERT(session, r->retry_next == 0); - WT_ASSERT(session, r->retry_entries == 0); - WT_ASSERT(session, r->retry_allocated == 0); WT_MEMORY_FLUSH; /* Flush before turning entry on */ r->session = session; @@ -74,16 +73,15 @@ __evict_req_set(WT_SESSION_IMPL *session, WT_EVICT_REQ *r, int close_method) /* * __evict_req_clr -- - * Set an entry in the eviction request list. + * Clear an entry in the eviction request list. */ static inline void __evict_req_clr(WT_SESSION_IMPL *session, WT_EVICT_REQ *r) { __wt_free(session, r->retry); - r->retry_next = r->retry_entries = 0; - r->retry_allocated = 0; - r->session = NULL; + memset(r, 0, sizeof(WT_EVICT_REQ)); + WT_MEMORY_FLUSH; /* Turn entry off */ } @@ -413,7 +411,8 @@ __evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er) */ if (!er->close_method && !WT_PAGE_IS_MODIFIED(page)) continue; - if (__wt_page_reconcile(session, page, flags) == 0) + if (!F_ISSET(page, WT_PAGE_PINNED) && + __wt_page_reconcile(session, page, flags) == 0) continue; /* @@ -449,9 +448,10 @@ err: /* End the walk cleanly. */ static int __evict_request_retry(WT_SESSION_IMPL *session) { - WT_SESSION_IMPL *request_session; WT_CACHE *cache; WT_EVICT_REQ *er, *er_end; + WT_PAGE *page; + WT_SESSION_IMPL *request_session; uint32_t i, flags; int pending_retry; @@ -490,10 +490,10 @@ __evict_request_retry(WT_SESSION_IMPL *session) /* Walk the list of retry requests. 
*/ for (pending_retry = 0, i = 0; i < er->retry_entries; ++i) { - if (er->retry[i] == NULL) + if ((page = er->retry[i]) == NULL) continue; - if (__wt_page_reconcile( - session, er->retry[i], flags) == 0) + if (!F_ISSET(page, WT_PAGE_PINNED) && + __wt_page_reconcile(session, page, flags) == 0) er->retry[i] = NULL; else pending_retry = 1; @@ -503,10 +503,11 @@ __evict_request_retry(WT_SESSION_IMPL *session) * If we finished, clean up and resolve the request, otherwise * there's still work to do. */ - if (pending_retry) + if (pending_retry && ++er->retry_cnt < 5) cache->pending_retry = 1; else { - __wt_session_serialize_wrapup(request_session, NULL, 0); + __wt_session_serialize_wrapup( + request_session, NULL, pending_retry ? EBUSY : 0); __evict_req_clr(session, er); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index ce25cea43cf..3d77ec1476a 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -8,6 +8,7 @@ #include "wt_internal.h" static int __btree_conf(WT_SESSION_IMPL *, const char *, const char *); +static int __btree_last(WT_SESSION_IMPL *); static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_type(WT_SESSION_IMPL *); @@ -125,9 +126,14 @@ __wt_btree_open(WT_SESSION_IMPL *session, WT_ERR(__wt_page_in(session, NULL, &btree->root_page, LF_ISSET(WT_BTREE_VERIFY) ? 1 : 0)); F_SET(btree->root_page.page, WT_PAGE_PINNED); + + WT_MEMORY_FLUSH; /* flush pin before release */ __wt_hazard_clear(session, btree->root_page.page); } + /* Get the last page of the file. */ + WT_ERR(__btree_last(session)); + done: /* Add to the connection list. */ __wt_lock(session, conn->mtx); btree->refcnt = 1; @@ -246,6 +252,40 @@ __wt_btree_root_init(WT_SESSION_IMPL *session) } /* + * __btree_last -- + * Read and pin the last page of the file. 
+ */ +static int +__btree_last(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_PAGE *page; + + btree = session->btree; + + page = NULL; + WT_RET(__wt_tree_np(session, &page, 0)); + if (page == NULL) + return (WT_NOTFOUND); + + btree->last_page = page; + if (page->type != WT_PAGE_ROW_LEAF) + btree->last_recno = __col_last_recno(page); + + /* + * If the page is already pinned (that is, the last page is the root + * page), we're done, otherwise, pin the last page into memory. + */ + if (!F_ISSET(page, WT_PAGE_PINNED)) { + F_SET(page, WT_PAGE_PINNED); + + WT_MEMORY_FLUSH; /* flush pin before release */ + __wt_hazard_clear(session, page); + } + return (0); +} + +/* * __wt_btree_close -- * Close a Btree. */ @@ -271,6 +311,12 @@ __wt_btree_close(WT_SESSION_IMPL *session) if (inuse) return (0); + /* Unpin any pages we have locked down. */ + if (btree->last_page != NULL) + F_CLR(btree->last_page, WT_PAGE_PINNED); + if (btree->root_page.page != NULL) + F_CLR(btree->root_page.page, WT_PAGE_PINNED); + /* * If it's a normal tree, ask the eviction thread to flush any pages * that remain in the cache. If there is still a root page in memory, diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index ea86db9aeac..ed3d1caf8ee 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -49,7 +49,8 @@ __wt_page_in_func( * can't get a hazard reference is because the page is * being evicted; yield and try again. 
*/ - if (__wt_hazard_set(session, ref + if (F_ISSET(ref->page, WT_PAGE_PINNED) || + __wt_hazard_set(session, ref #ifdef HAVE_DIAGNOSTIC , file, line #endif diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c index 8f008976a21..06b7b57007b 100644 --- a/src/btree/bt_reconcile.c +++ b/src/btree/bt_reconcile.c @@ -588,9 +588,9 @@ __rec_init(WT_SESSION_IMPL *session, uint32_t flags) r->key_pfx_compress_conf = (cval.val != 0); } - r->evict = LF_ISSET(WT_REC_EVICT); - r->locked = LF_ISSET(WT_REC_LOCKED); - r->salvage = LF_ISSET(WT_REC_SALVAGE); + r->evict = LF_ISSET(WT_REC_EVICT) ? 1 : 0; + r->locked = LF_ISSET(WT_REC_LOCKED) ? 1 : 0; + r->salvage = LF_ISSET(WT_REC_SALVAGE) ? 1 : 0; /* * During internal page reconciliation we track referenced objects that @@ -1636,10 +1636,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) WT_SKIP_FOREACH(ins, WT_COL_INSERT_APPEND(page)) for (;;) { /* - * The application may have inserted records, leaving - * gaps in the name space, fill in any gaps. + * The application may have inserted records which left + * gaps in the name space. */ - for (recno = WT_INSERT_RECNO(ins) - 1; + for (recno = WT_INSERT_RECNO(ins); nrecs > 0 && r->recno < recno; --nrecs, ++entry, ++r->recno) __bit_setv( @@ -1648,10 +1648,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) if (nrecs > 0) { __bit_setv(r->first_free, entry, btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(ins->upd))[0]); + --nrecs; ++entry; ++r->recno; - if (--nrecs > 0 || WT_SKIP_NEXT(ins) == NULL) - break; + break; } /* @@ -1670,7 +1670,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) } /* Update the counters. */ - r->recno += entry; __rec_incrv(session, r, entry, __bitstr_size(entry * btree->bitcnt)); /* Write the remnant page. 
*/ @@ -1724,7 +1723,7 @@ __rec_col_fix_slvg( --nrecs, --page_take, ++page_start, ++entry) __bit_setv(r->first_free, entry, btree->bitcnt, __bit_getv(page->u.col_leaf.bitf, - (uint32_t)page_start, btree->bitcnt)); + (uint32_t)page_start, btree->bitcnt)); r->recno += entry; __rec_incrv( @@ -2043,9 +2042,9 @@ __rec_col_var( } /* Swap the current/last state. */ - last_deleted = deleted; - if (!last_deleted) + if (!deleted) WT_RET(__wt_buf_set(session, last, data, size)); + last_deleted = deleted; /* Reset RLE counter and turn on comparisons. */ rle = repeat_count; @@ -2053,6 +2052,51 @@ __rec_col_var( } } + /* Walk any append list. */ + WT_SKIP_FOREACH(ins, WT_COL_INSERT_APPEND(page)) + for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + /* + * The application may have inserted records which left + * gaps in the name space. + */ + if (src_recno < n) + deleted = 1; + else { + upd = ins->upd; + deleted = WT_UPDATE_DELETED_ISSET(upd); + if (!deleted) { + data = WT_UPDATE_DATA(upd); + size = upd->size; + } + } + + /* + * Handle RLE accounting and comparisons -- see comment + * above, this code fragment does the same thing. + */ + if (can_compare) { + if ((deleted && last_deleted) || + (!last_deleted && !deleted && + last->size == size && + memcmp(last->data, data, size) == 0)) { + ++rle; + continue; + } + + WT_RET(__rec_col_var_helper(session, + salvage, last, last_deleted, 0, rle)); + } + + /* Swap the current/last state. */ + if (!deleted) + WT_RET(__wt_buf_set(session, last, data, size)); + last_deleted = deleted; + + /* Reset RLE counter and turn on comparisons. */ + rle = 1; + can_compare = 1; + } + /* If we were tracking a record, write it. 
*/ if (can_compare) WT_RET(__rec_col_var_helper( diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 2459332e643..5e35ca4359b 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -311,7 +311,7 @@ recno_chk: if (parent_recno != recno) { ref = &cref->ref; WT_RET(__wt_page_in(session, page, ref, 1)); ret = __verify_tree(session, ref, cref->recno, vs); - __wt_hazard_clear(session, ref->page); + __wt_page_release(session, ref->page); WT_RET_TEST(ret, ret); } break; @@ -336,7 +336,7 @@ recno_chk: if (parent_recno != recno) { ref = &rref->ref; WT_RET(__wt_page_in(session, page, ref, 1)); ret = __verify_tree(session, ref, (uint64_t)0, vs); - __wt_hazard_clear(session, ref->page); + __wt_page_release(session, ref->page); WT_RET_TEST(ret, ret); } break; diff --git a/src/btree/col_extend.c b/src/btree/col_extend.c deleted file mode 100644 index 1cdb8f80890..00000000000 --- a/src/btree/col_extend.c +++ /dev/null @@ -1,369 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 2008-2011 WiredTiger, Inc. - * All rights reserved. - */ - -#include "wt_internal.h" - -static int __col_next_recno(WT_SESSION_IMPL *, WT_PAGE *, uint64_t *); - -/* - * __wt_col_extend -- - * Extend a column-store file. - */ -int -__wt_col_extend(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno) -{ - WT_BTREE *btree; - WT_COL *d; - WT_COL_REF *t; - WT_CONFIG_ITEM cval; - WT_PAGE *new_intl, *new_leaf, *parent; - uint64_t next; - size_t entries_size, new_intl_size, new_leaf_size, t_size; - uint32_t internal_extend, leaf_extend; - uint8_t *bitf; - int ret; - void *entries; - - btree = session->btree; - d = NULL; - t = NULL; - new_intl = new_leaf = NULL; - entries_size = new_intl_size = new_leaf_size = t_size = 0; - internal_extend = leaf_extend = 0; - bitf = NULL; - ret = 0; - - /* - * Another thread may have already done the work, or a default extension - * may not be sufficient. 
Get the starting record for the next page and - * make sure we're doing what we need to do. - */ - WT_RET(__col_next_recno(session, page, &next)); - if (recno < next) /* Fits on the current page. */ - return (WT_RESTART); - - /* - * Figure out how much we'll extend the leaf key space. - * - * If it's a fixed-length store, we can't allocate more than maximum - * leaf page size number of bits, because we can't ever split those - * pages. - * - * If it's a variable-length store, we can split those pages so we - * can allocate whatever we need. - * - * XXX - * If the application is extending the file by more than will reasonably - * fit on a page, insert an RLE record that gets us all the way to the - * insert record. - * - * We always need a new bitfield or entries array, allocate them. - */ - switch (page->type) { - case WT_PAGE_COL_FIX: - leaf_extend = WT_FIX_NRECS(btree); - - WT_RET(__wt_calloc_def(session, (size_t)leaf_extend, &bitf)); - entries = bitf; - entries_size = leaf_extend; - break; - case WT_PAGE_COL_VAR: - WT_RET(__wt_config_getones(session, - session->btree->config, "column_leaf_extend", &cval)); - leaf_extend = (uint32_t)cval.val; - if (recno >= next + leaf_extend) - leaf_extend = (uint32_t)(recno - next) + 100; - - WT_RET(__wt_calloc_def(session, (size_t)leaf_extend, &d)); - entries = d; - entries_size = leaf_extend * sizeof(WT_COL); - break; - } - - /* - * Check if the page is a newly created page: all we'll need is a new - * entries array. - */ - if (page->entries == 0) - goto done; - - /* We'll need a new leaf page. */ - WT_ERR(__wt_calloc_def(session, 1, &new_leaf)); - new_leaf_size = sizeof(WT_PAGE); - - /* Check if there's a parent page with room for a new leaf page. */ - parent = page->parent; - if (!WT_PAGE_IS_ROOT(page) && - parent->u.col_int.ext_entries > parent->entries) - goto done; - - /* We'll need a new parent page, with its own entries array. 
*/ - WT_ERR(__wt_calloc_def(session, 1, &new_intl)); - new_intl_size = sizeof(WT_PAGE); - WT_RET(__wt_config_getones(session, - session->btree->config, "column_internal_extend", &cval)); - internal_extend = (uint32_t)cval.val; - WT_ERR(__wt_calloc_def(session, (size_t)internal_extend, &t)); - t_size = internal_extend * sizeof(WT_COL_REF); - -done: return (__wt_col_extend_serial(session, page, &new_intl, new_intl_size, - &t, t_size, internal_extend, &new_leaf, new_leaf_size, &entries, - entries_size, leaf_extend, recno)); - -err: - if (d != NULL) - __wt_free(session, d); - if (t != NULL) - __wt_free(session, t); - if (new_intl != NULL) - __wt_free(session, new_intl); - if (new_leaf != NULL) - __wt_free(session, new_leaf); - if (bitf != NULL) - __wt_free(session, bitf); - return (ret); -} - -/* - * __wt_col_extend_serial_func -- - * Server function to extend a column-store page. - */ -int -__wt_col_extend_serial_func(WT_SESSION_IMPL *session) -{ - WT_COL_REF *cref, *t; - WT_PAGE *new_leaf, *new_intl, *page, *parent; - WT_REF *orig_ref; - uint64_t next, recno; - uint32_t internal_extend, leaf_extend; - int ret; - void *entries; - - __wt_col_extend_unpack(session, &page, &new_intl, - &t, &internal_extend, &new_leaf, &entries, &leaf_extend, &recno); - - ret = 0; - - /* - * We don't care about write generations in this code: in the hard cases - * we're working in the tree above the page in which we ran out of room, - * not the search page, and the search page's write generation doesn't - * matter. In other words, we depend on our review of the situation on - * ground. - * - * This is safe because the reconciliation code can't touch the subtree - * we're in: we have a hazard reference on the lowest page, that fixes - * the tree into memory. - * - * We need a new entries array or bitfield, make sure our caller passed - * us one. - */ - if (entries == NULL) - goto done; - - /* - * Check if the current page needs an entries array. 
- * - * Setting the page's entries value turns on the change. - */ - switch (page->type) { - case WT_PAGE_COL_FIX: - if (page->u.col_leaf.bitf == NULL) { - page->u.col_leaf.bitf = entries; - goto entries; - } - break; - case WT_PAGE_COL_VAR: - if (page->u.col_leaf.d == NULL) { - page->u.col_leaf.d = entries; - -entries: __wt_col_extend_entries_taken(session, page); - WT_MEMORY_FLUSH; - page->entries = leaf_extend; - goto done; - } - break; - } - - /* We need a new leaf page, make sure our caller passed us one. */ - if (new_leaf == NULL) - goto done; - - /* - * Get the starting record for the next page, but check, another thread - * may have already done the work. - */ - WT_RET(__col_next_recno(session, page, &next)); - if (next > recno) - goto done; - - /* - * Check if the page's parent has room for a new leaf page. - * - * Setting the parent page's entries value turns on the change. - */ - parent = page->parent; - if (!WT_PAGE_IS_ROOT(page) && - parent->u.col_int.ext_entries > parent->entries) { - cref = &parent->u.col_int.t[parent->entries]; - cref->recno = next; - WT_COL_REF_ADDR(cref) = WT_ADDR_INVALID; - WT_COL_REF_PAGE(cref) = new_leaf; - WT_COL_REF_SIZE(cref) = 0; - WT_COL_REF_STATE(cref) = WT_REF_MEM; - WT_PAGE_SET_MODIFIED(parent); - - new_leaf->parent = page->parent; - new_leaf->parent_ref = &cref->ref; - new_leaf->read_gen = __wt_cache_read_gen(session); - new_leaf->u.col_leaf.recno = next; - new_leaf->u.col_leaf.d = entries; - new_leaf->u.col_leaf.bitf = entries; - new_leaf->dsk = NULL; - new_leaf->entries = leaf_extend; - new_leaf->type = page->type; - WT_PAGE_SET_MODIFIED(new_leaf); - __wt_cache_page_workq(session); - - WT_MEMORY_FLUSH; - ++parent->entries; - - __wt_col_extend_new_leaf_taken(session, new_leaf); - __wt_col_extend_entries_taken(session, new_leaf); - - goto done; - } - - /* We need a new internal page, make sure our caller passed us one. 
*/ - if (new_intl == NULL) - goto done; - - /* - * Split by replacing the existing leaf page with an internal page that - * references the leaf page (which deepens the tree by a level). This - * is a little like splits in the reconciliation code, but it's all done - * while other threads of control are going through the structures. - * - * Get a reference to the top WT_REF structure, and mark the top-level - * page dirty, we're going to have to reconcile it so our newly created - * level is merged back in. - */ - orig_ref = page->parent_ref; - if (!WT_PAGE_IS_ROOT(page)) - WT_PAGE_SET_MODIFIED(page->parent); - - /* - * Configure the new internal page. - */ - new_intl->parent = page->parent; - new_intl->parent_ref = page->parent_ref; - new_intl->read_gen = __wt_cache_read_gen(session); - new_intl->u.col_int.recno = page->u.col_leaf.recno; - new_intl->u.col_int.ext_entries = internal_extend; - new_intl->u.col_int.t = t; - new_intl->dsk = NULL; - new_intl->entries = 2; - new_intl->type = WT_PAGE_COL_INT; - WT_PAGE_SET_MODIFIED(new_intl); - __wt_cache_page_workq(session); - - /* - * If the new internal page isn't the root page, then we should merge - * it into its parent, we don't want the tree to deepen permanently. - */ - if (!WT_PAGE_IS_ROOT(page)) - F_SET(new_intl, WT_PAGE_MERGE); - - /* Slot 0 of the new internal page references the original leaf page. */ - cref = &new_intl->u.col_int.t[0]; - cref->recno = page->u.col_leaf.recno; - cref->ref = *page->parent_ref; - - /* Re-point the original page. */ - page->parent = new_intl; - page->parent_ref = &new_intl->u.col_int.t[0].ref; - - /* Slot 1 of the new internal page references the new leaf page. */ - cref = &new_intl->u.col_int.t[1]; - cref->recno = next; - WT_COL_REF_ADDR(cref) = WT_ADDR_INVALID; - WT_COL_REF_PAGE(cref) = new_leaf; - WT_COL_REF_SIZE(cref) = 0; - WT_COL_REF_STATE(cref) = WT_REF_MEM; - - /* Configure the new leaf page. 
*/ - new_leaf->parent = new_intl; - new_leaf->parent_ref = &new_intl->u.col_int.t[1].ref; - new_leaf->read_gen = __wt_cache_read_gen(session); - new_leaf->u.col_leaf.recno = next; - new_leaf->u.col_leaf.d = entries; - new_leaf->u.col_leaf.bitf = entries; - new_leaf->dsk = NULL; - new_leaf->entries = leaf_extend; - new_leaf->type = page->type; - WT_PAGE_SET_MODIFIED(new_leaf); - __wt_cache_page_workq(session); - - __wt_col_extend_new_intl_taken(session, new_intl); - __wt_col_extend_t_taken(session, new_intl); - __wt_col_extend_new_leaf_taken(session, new_leaf); - __wt_col_extend_entries_taken(session, new_leaf); - - /* - * Make the switch: set the addr/size pair then update the pointer (we - * are not changing the state, nor are we changing the record number). - * This is safe as we're updating one set of structures for another set - * of structures which reference identical information. Eviction can't - * get in here because we hold a hazard reference on the original page. - * Setting the original page parent's in-memory pointer to reference our - * new internal page turns on the change. - */ - orig_ref->addr = WT_ADDR_INVALID; - orig_ref->size = 0; - WT_MEMORY_FLUSH; - orig_ref->page = new_intl; - WT_MEMORY_FLUSH; - -done: __wt_session_serialize_wrapup(session, page, ret); - return (ret); -} - -/* - * __col_next_recno -- - * Return the recno of the next page following the argument page. 
- */ -static int -__col_next_recno(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t *recnop) -{ - WT_CELL *cell; - WT_CELL_UNPACK *unpack, _unpack; - WT_COL *cip; - uint32_t i; - uint64_t recno; - - recno = page->u.col_leaf.recno; - unpack = &_unpack; - - switch (page->type) { - case WT_PAGE_COL_FIX: - recno += page->entries; - break; - case WT_PAGE_COL_VAR: - WT_COL_FOREACH(page, cip, i) - if ((cell = WT_COL_PTR(page, cip)) == NULL) - ++recno; - else { - __wt_cell_unpack(cell, unpack); - recno += unpack->rle; - } - break; - WT_ILLEGAL_FORMAT(session); - } - - *recnop = recno; - return (0); -} diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index c91bda56308..1ad680013f0 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -7,63 +7,51 @@ #include "wt_internal.h" -static int __col_insert_alloc( - WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *); +static int __col_insert_alloc(WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **); /* * __wt_col_modify -- - * Column-store delete insert, and update. + * Column-store delete, insert, and update. 
*/ int -__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) +__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op) { WT_BTREE *btree; WT_INSERT *ins; WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist; WT_ITEM *value, _value; WT_PAGE *page; - WT_SESSION_BUFFER *sb; WT_UPDATE *upd; - size_t ins_size, new_inslist_size, new_inshead_size, upd_size; + size_t new_inshead_size, new_inslist_size; uint64_t recno; u_int skipdepth; - int hazard_ref, i, ret; + int i, ret; btree = cbt->btree; page = cbt->page; - recno = cbt->iface.recno; - if (is_remove) { + switch (op) { + case 1: /* Insert */ + page = btree->last_page; + __cursor_search_clear(cbt); + + value = (WT_ITEM *)&cbt->iface.value; + recno = 0; /* Engine allocates */ + break; + case 2: /* Remove */ if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; - } else + recno = cbt->iface.recno; /* App specified */ + break; + case 3: /* Update */ + default: value = (WT_ITEM *)&cbt->iface.value; - - /* - * Append a column-store entry (the only place you can insert into a - * column-store file is after the key space, column-store records are - * immutable). If we don't have an exact match, it's an append and we - * need to extend the file. - */ - if (cbt->compare != 0) { - /* - * We may have, and need to hold, a hazard reference on a page, - * but we're possibly doing some page shuffling of the root, - * which means the standard test to determine whether we should - * release a hazard reference on the page isn't right. Check - * now, before we do the page shuffling. - */ - hazard_ref = page == session->btree->root_page.page ? 0 : 1; - ret = __wt_col_extend(session, page, recno); - if (hazard_ref) { - __wt_page_release(session, page); - cbt->page = NULL; /* XXX WRONG */ - } - return (ret == 0 ? 
WT_RESTART : 0); + recno = cbt->iface.recno; /* App specified */ + break; } ins = NULL; @@ -73,63 +61,45 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) ret = 0; /* - * Delete or update a column-store entry. - * Column-store changes mean working in a WT_INSERT list. + * Delete, insert or update a column-store entry. */ - if (cbt->ins != NULL) { - /* - * If changing an already changed record, create a new WT_UPDATE - * entry and have the workQ link it into an existing WT_INSERT - * entry's WT_UPDATE list. - */ - WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); - - /* workQ: insert the WT_UPDATE structure. */ - ret = __wt_update_serial(session, page, - cbt->write_gen, &cbt->ins->upd, NULL, 0, &upd, upd_size); - } else { - /* - * We may not have an WT_INSERT_HEAD array (in the case of - * variable length column store) or WT_INSERT_HEAD slot (in the - * case of fixed length column store). Also, there may be an - * insert array but no list at the point we are inserting. - * Allocate as necessary. - */ + if (cbt->ins == NULL) { + /* There may be no WT_INSERT_HEAD, allocate as necessary. 
*/ new_inshead_size = new_inslist_size = 0; - if (page->u.col_leaf.ins == NULL) - switch (page->type) { - case WT_PAGE_COL_FIX: + if (op == 1) { + if (page == NULL || page->u.col_leaf.append == NULL) { new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *); - WT_ERR(__wt_calloc_def(session, - 1, &new_inslist)); + WT_ERR( + __wt_calloc_def(session, 1, &new_inslist)); inshead = &new_inslist[0]; - break; - case WT_PAGE_COL_VAR: + } else + inshead = &page->u.col_leaf.append[0]; + cbt->ins_head = *inshead; + } else if (page->type == WT_PAGE_COL_FIX) { + if (page->u.col_leaf.ins == NULL) { + new_inslist_size = 1 * + sizeof(WT_INSERT_HEAD *); + WT_ERR( + __wt_calloc_def(session, 1, &new_inslist)); + inshead = &new_inslist[0]; + } else + inshead = &page->u.col_leaf.ins[0]; + } else { + if (page->u.col_leaf.ins == NULL) { new_inslist_size = page->entries * sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_def( session, page->entries, &new_inslist)); inshead = &new_inslist[cbt->slot]; - break; - WT_ILLEGAL_FORMAT(session); - } - else - switch (page->type) { - case WT_PAGE_COL_FIX: - inshead = &page->u.col_leaf.ins[0]; - break; - case WT_PAGE_COL_VAR: + } else inshead = &page->u.col_leaf.ins[cbt->slot]; - break; - WT_ILLEGAL_FORMAT(session); - } + } + /* There may be no WT_INSERT list, allocate as necessary. */ if (*inshead == NULL) { new_inshead_size = sizeof(WT_INSERT_HEAD); - WT_RET(__wt_sb_alloc(session, - sizeof(WT_INSERT_HEAD), &new_inshead, &sb)); - new_inshead->sb = sb; + WT_RET(__wt_calloc_def(session, 1, &new_inshead)); for (i = 0; i < WT_SKIP_MAXDEPTH; i++) cbt->ins_stack[i] = &new_inshead->head[i]; cbt->ins_head = new_inshead; @@ -139,30 +109,43 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) skipdepth = __wt_skip_choose_depth(); /* - * Allocate a new WT_INSERT/WT_UPDATE pair, link it into the - * WT_INSERT array. + * Allocate a WT_INSERT/WT_UPDATE pair, and update the cursor + * to reference it. 
*/ - WT_ERR(__col_insert_alloc( - session, recno, skipdepth, &ins, &ins_size)); - WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__col_insert_alloc(session, recno, skipdepth, &ins)); + WT_ERR(__wt_update_alloc(session, value, &upd)); ins->upd = upd; - ins_size += upd_size; cbt->ins = ins; /* - * workQ: insert the WT_INSERT structure. - * - * For fixed-width stores, we are installing a single insert - * head for the page. Pass NULL to the insert serialization - * function, there is no need to set it again, and we only want - * to account for it once. + * workQ: insert or append the WT_INSERT structure. */ - ret = __wt_insert_serial(session, - page, cbt->write_gen, - inshead, cbt->ins_stack, - &new_inslist, new_inslist_size, - &new_inshead, new_inshead_size, - &ins, ins_size, skipdepth); + if (op == 1) { + WT_ERR(__wt_append_serial(session, + inshead, cbt->ins_stack, + &new_inslist, new_inslist_size, + &new_inshead, new_inshead_size, ins, skipdepth)); + + /* Set up the cursor for the inserted page and value. */ + cbt->page = btree->last_page; + cbt->recno = WT_INSERT_RECNO(ins); + } else + WT_ERR(__wt_insert_serial(session, + page, cbt->write_gen, + inshead, cbt->ins_stack, + &new_inslist, new_inslist_size, + &new_inshead, new_inshead_size, ins, skipdepth)); + } else { + /* + * If changing an already changed record, create a new WT_UPDATE + * entry and have the workQ link it into an existing WT_INSERT + * entry's WT_UPDATE list. + */ + WT_ERR(__wt_update_alloc(session, value, &upd)); + + /* workQ: insert the WT_UPDATE structure. */ + ret = __wt_update_serial(session, page, + cbt->write_gen, &cbt->ins->upd, NULL, 0, upd); } if (ret != 0) { @@ -173,6 +156,7 @@ err: if (ins != NULL) } __wt_free(session, new_inslist); + __wt_free(session, new_inshead); return (ret); } @@ -183,8 +167,8 @@ err: if (ins != NULL) * buffer and fill it in. 
*/ static int -__col_insert_alloc(WT_SESSION_IMPL *session, - uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) +__col_insert_alloc( + WT_SESSION_IMPL *session, uint64_t recno, u_int skipdepth, WT_INSERT **insp) { WT_SESSION_BUFFER *sb; WT_INSERT *ins; @@ -194,14 +178,78 @@ __col_insert_alloc(WT_SESSION_IMPL *session, * Allocate the WT_INSERT structure and skiplist pointers, then copy * the record number into place. */ - ins_size = sizeof(WT_INSERT) + - skipdepth * sizeof(WT_INSERT *); + ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *); WT_RET(__wt_sb_alloc(session, ins_size, &ins, &sb)); ins->sb = sb; WT_INSERT_RECNO(ins) = recno; *insp = ins; - *ins_sizep = ins_size; + return (0); +} + +/* + * __wt_append_serial_func -- + * Server function to append an WT_INSERT entry to the tree. + */ +int +__wt_append_serial_func(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead; + WT_INSERT *new_ins, ***ins_stack; + uint64_t recno; + u_int i, skipdepth; + + btree = session->btree; + page = btree->last_page; + + __wt_append_unpack(session, &inshead, &ins_stack, + &new_inslist, &new_inshead, &new_ins, &skipdepth); + + /* + * If the page does not yet have an insert array, our caller passed + * us one. + */ + if (page->u.col_leaf.append == NULL) { + page->u.col_leaf.append = new_inslist; + __wt_append_new_inslist_taken(session, page); + } + + /* + * If the insert head does not yet have an insert list, our caller + * passed us one. + */ + if (*inshead == NULL) { + *inshead = new_inshead; + __wt_append_new_inshead_taken(session, page); + } + + /* + * If the application specified a record number, there's a race: the + * application may have searched for the record, not found it, then + * called into the append code, and another thread might have added + * the record. 
Fortunately, we're in the right place because if the + * record didn't exist at some point, it can only have been created + * on this list. Search for the record, if specified. + */ + if ((recno = WT_INSERT_RECNO(new_ins)) == 0) + recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno; + (void)__col_insert_search_stack(*inshead, ins_stack, recno); + + /* + * First, point the new WT_INSERT item's skiplist references to the next + * elements in the insert list, then flush memory. Second, update the + * skiplist elements that reference the new WT_INSERT item, this ensures + * the list is never inconsistent. + */ + for (i = 0; i < skipdepth; i++) + new_ins->next[i] = *ins_stack[i]; + WT_MEMORY_FLUSH; + for (i = 0; i < skipdepth; i++) + *ins_stack[i] = new_ins; + + __wt_session_serialize_wrapup(session, page, 0); return (0); } diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index abcb054b181..24a22cb27d4 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -8,44 +8,6 @@ #include "wt_internal.h" /* - * __search_insert -- - * Search the slot's insert list. - */ -static inline WT_INSERT * -__search_insert(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *inshead, uint64_t recno) -{ - WT_INSERT **ins; - uint64_t ins_recno; - int cmp, i; - - /* If there's no insert chain to search, we're done. */ - if (inshead == NULL) - return (NULL); - - /* - * The insert list is a skip list: start at the highest skip level, then - * go as far as possible at each level before stepping down to the next. - */ - for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) { - if (*ins == NULL) { - cbt->ins_stack[i--] = ins--; - continue; - } - - ins_recno = WT_INSERT_RECNO(*ins); - cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? 
-1 : 1; - - if (cmp == 0) /* Exact match: return */ - return (*ins); - else if (cmp > 0) /* Keep going at this level */ - ins = &(*ins)->next[i]; - else /* Drop down a level */ - cbt->ins_stack[i--] = ins--; - } - return (NULL); -} - -/* * __wt_col_search -- * Search a column-store tree for a specific record-based key. */ @@ -125,42 +87,39 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) WT_MEMORY_FLUSH; } cbt->page = page; + cbt->compare = 0; /* * Search the leaf page. We do not check in the search path for a * record greater than the maximum record in the tree; in that case, * we arrive here with a record that's impossibly large for the page. */ - switch (page->type) { - case WT_PAGE_COL_FIX: + if (page->type == WT_PAGE_COL_FIX) { if (recno >= page->u.col_leaf.recno + page->entries) { - cbt->compare = 1; - F_SET(cbt, WT_CBT_SEARCH_SET); - return (0); - } - cbt->ins_head = - page->u.col_leaf.ins == NULL ? NULL : *page->u.col_leaf.ins; - cbt->compare = 0; - break; - case WT_PAGE_COL_VAR: - if ((cip = __cursor_col_rle_search(page, recno)) == NULL) - cbt->compare = 1; - else { - cbt->compare = 0; + cbt->compare = -1; + cbt->ins_head = WT_COL_INSERT_APPEND(page); + } else + cbt->ins_head = WT_COL_INSERT_SINGLE(page); + } else { + if ((cip = __col_var_search(page, recno)) == NULL) { + cbt->compare = -1; + cbt->ins_head = WT_COL_INSERT_APPEND(page); + } else { cbt->slot = WT_COL_SLOT(page, cip); cbt->ins_head = WT_COL_INSERT_SLOT(page, cbt->slot); } - break; - WT_ILLEGAL_FORMAT(session); } /* - * Search the insert list for a match; __search_insert sets the return - * insert information appropriately. + * Search the insert or append list for a match; __search_insert sets + * the return insert information appropriately. 
*/ - cbt->ins = __search_insert(cbt, cbt->ins_head, recno); - - F_SET(cbt, WT_CBT_SEARCH_SET); + if (cbt->ins_head == NULL) + cbt->ins = NULL; + else + if ((cbt->ins = __col_insert_search_stack( + cbt->ins_head, cbt->ins_stack, recno)) != NULL) + cbt->compare = 0; return (0); err: __wt_page_release(session, page); diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index b01729d9e47..90fdf6ea6e6 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -18,11 +18,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist; WT_ITEM *key, *value; WT_PAGE *page; - WT_SESSION_BUFFER *sb; WT_UPDATE **new_upd, *upd, **upd_entry; - size_t ins_size, new_inshead_size, new_inslist_size; - size_t new_upd_size, upd_size; - uint32_t ins_slot, skipdepth; + size_t new_inshead_size, new_inslist_size, new_upd_size; + uint32_t ins_slot; + u_int skipdepth; int i, ret; key = (WT_ITEM *)&cbt->iface.key; @@ -66,11 +65,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) upd_entry = &cbt->ins->upd; /* Allocate room for the new value from per-thread memory. */ - WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_update_alloc(session, value, &upd)); /* workQ: insert the WT_UPDATE structure. 
*/ - ret = __wt_update_serial(session, page, cbt->write_gen, - upd_entry, &new_upd, new_upd_size, &upd, upd_size); + ret = __wt_update_serial(session, page, + cbt->write_gen, upd_entry, &new_upd, new_upd_size, upd); } else { /* * Allocate insert array if necessary, and set the WT_INSERT @@ -106,9 +105,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) */ if (*inshead == NULL) { new_inshead_size = sizeof(WT_INSERT_HEAD); - WT_ERR(__wt_sb_alloc(session, - sizeof(WT_INSERT_HEAD), &new_inshead, &sb)); - new_inshead->sb = sb; + WT_ERR(__wt_calloc_def(session, 1, &new_inshead)); for (i = 0; i < WT_SKIP_MAXDEPTH; i++) cbt->ins_stack[i] = &new_inshead->head[i]; cbt->ins_head = new_inshead; @@ -121,20 +118,16 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) * Allocate a WT_INSERT/WT_UPDATE pair, and update the cursor * to reference it. */ - WT_ERR(__wt_row_insert_alloc( - session, key, skipdepth, &ins, &ins_size)); - WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_row_insert_alloc(session, key, skipdepth, &ins)); + WT_ERR(__wt_update_alloc(session, value, &upd)); ins->upd = upd; - ins_size += upd_size; cbt->ins = ins; /* workQ: insert the WT_INSERT structure. */ - ret = __wt_insert_serial(session, - page, cbt->write_gen, + ret = __wt_insert_serial(session, page, cbt->write_gen, inshead, cbt->ins_stack, &new_inslist, new_inslist_size, - &new_inshead, new_inshead_size, - &ins, ins_size, skipdepth); + &new_inshead, new_inshead_size, ins, skipdepth); } if (ret != 0) { @@ -145,8 +138,8 @@ err: if (ins != NULL) } /* Free any insert, update arrays. */ - __wt_free(session, new_inshead); __wt_free(session, new_inslist); + __wt_free(session, new_inshead); __wt_free(session, new_upd); return (ret); @@ -158,8 +151,8 @@ err: if (ins != NULL) * buffer and fill it in. 
*/ int -__wt_row_insert_alloc(WT_SESSION_IMPL *session, - WT_ITEM *key, uint32_t skipdepth, WT_INSERT **insp, size_t *ins_sizep) +__wt_row_insert_alloc( + WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp) { WT_SESSION_BUFFER *sb; WT_INSERT *ins; @@ -179,29 +172,27 @@ __wt_row_insert_alloc(WT_SESSION_IMPL *session, memcpy(WT_INSERT_KEY(ins), key->data, key->size); *insp = ins; - if (ins_sizep != NULL) - *ins_sizep = ins_size; - return (0); } /* * __wt_insert_serial_func -- - * Server function to add an WT_INSERT entry to the page tree. + * Server function to add an WT_INSERT entry to the page. */ int __wt_insert_serial_func(WT_SESSION_IMPL *session) { WT_PAGE *page; WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead; - WT_INSERT *ins, ***ins_stack; - uint32_t i, skipdepth, write_gen; - int ret; + WT_INSERT *new_ins, ***ins_stack; + uint32_t write_gen; + u_int i, skipdepth; + int ret; ret = 0; __wt_insert_unpack(session, &page, &write_gen, &inshead, - &ins_stack, &new_inslist, &new_inshead, &ins, &skipdepth); + &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth); /* Check the page's write-generation. */ WT_ERR(__wt_page_write_gen_check(page, write_gen)); @@ -222,8 +213,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session) } /* - * If the slot does not yet have an insert list, our caller passed us - * one. + * If the insert head does not yet have an insert list, our caller + * passed us one. */ if (*inshead == NULL) { *inshead = new_inshead; @@ -237,11 +228,10 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session) * the list is never inconsistent. 
*/ for (i = 0; i < skipdepth; i++) - ins->next[i] = *ins_stack[i]; + new_ins->next[i] = *ins_stack[i]; WT_MEMORY_FLUSH; for (i = 0; i < skipdepth; i++) - *ins_stack[i] = ins; - __wt_insert_ins_taken(session, page); + *ins_stack[i] = new_ins; err: __wt_session_serialize_wrapup(session, page, 0); return (ret); @@ -253,8 +243,7 @@ err: __wt_session_serialize_wrapup(session, page, 0); * buffer and fill it in. */ int -__wt_update_alloc(WT_SESSION_IMPL *session, - WT_ITEM *value, WT_UPDATE **updp, size_t *upd_sizep) +__wt_update_alloc(WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp) { WT_SESSION_BUFFER *sb; WT_UPDATE *upd; @@ -274,10 +263,7 @@ __wt_update_alloc(WT_SESSION_IMPL *session, memcpy(WT_UPDATE_DATA(upd), value->data, size); } - if (upd_sizep != NULL) - *upd_sizep = size + sizeof(WT_UPDATE); *updp = upd; - return (0); } @@ -319,7 +305,6 @@ __wt_update_serial_func(WT_SESSION_IMPL *session) upd->next = *upd_entry; WT_MEMORY_FLUSH; *upd_entry = upd; - __wt_update_upd_taken(session, page); err: __wt_session_serialize_wrapup(session, page, 0); return (ret); diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index c383b394962..c2acab719d0 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -183,8 +183,6 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) WT_ASSERT(session, rip != NULL); cbt->compare = 0; cbt->slot = WT_ROW_SLOT(page, rip); - - F_SET(cbt, WT_CBT_SEARCH_SET); return (0); } @@ -227,8 +225,6 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) * insert information appropriately. 
*/ cbt->ins = __search_insert(session, cbt, cbt->ins_head, key); - - F_SET(cbt, WT_CBT_SEARCH_SET); return (0); err: __wt_page_release(session, page); diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 6bac16c2e2b..2bc1095f6c2 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -134,6 +134,8 @@ __curfile_insert(WT_CURSOR *cursor) WT_SESSION_IMPL *session; int ret; + ret = 0; + cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, insert, cbt->btree); if (cbt->btree->type == BTREE_ROW) diff --git a/src/include/btmem.h b/src/include/btmem.h index 9bf2854e093..6679c755e34 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -43,11 +43,34 @@ struct __wt_page { /* Column-store leaf page. */ struct { uint64_t recno; /* Starting recno */ - uint8_t *bitf; /* COL_FIX bits */ - WT_COL *d; /* COL_VAR objects */ - WT_INSERT_HEAD **ins; /* Inserts (RLE) */ + + uint8_t *bitf; /* COL_FIX items */ + WT_COL *d; /* COL_VAR items */ + + /* + * The last page of both fix- and variable-length column + * stores includes a skiplist of appended entries. + */ + WT_INSERT_HEAD **append;/* Appended items */ + + /* + * Updated items in column-stores: variable-length RLE + * entries can expand to multiple entries which requires + * some kind of list we can expand on demand. Updated + * items in fixed-length files could be done based on an + * WT_UPDATE array as in row-stores, but there can be a + * very large number of bits on a single page, and the + * cost of the WT_UPDATE array would be huge. + */ + WT_INSERT_HEAD **ins; /* Updated items */ + + /* + * Variable-length column-store files maintain a list of + * RLE entries on the page so it's unnecessary to walk + * the page counting records to find a specific entry. + */ WT_COL_RLE *repeats; /* RLE array for lookups */ - uint32_t nrepeats; /* Number of repeats. */ + uint32_t nrepeats; /* Number of repeat slots. */ } col_leaf; /* Bulk-loaded linked list. 
*/ @@ -496,7 +519,6 @@ struct __wt_insert { * The head of a skip list of WT_INSERT items. */ struct __wt_insert_head { - WT_SESSION_BUFFER *sb; /* session buffer holding this update */ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */ }; @@ -530,11 +552,16 @@ struct __wt_insert_head { #define WT_COL_INSERT(page, ip) \ WT_COL_INSERT_SLOT(page, WT_COL_SLOT(page, ip)) /* - * WT_COL_INSERT_SINGLE references a single WT_INSERT list, which is used for - * fixed-length column-store updates. + * WT_COL_INSERT_{APPEND,SINGLE} reference a single WT_INSERT list, which are + * used for fixed-length column-store updates, and variable- and fixed-length + * column-store appends. */ +#define WT_COL_INSERT_APPEND(page) \ + ((page)->u.col_leaf.append == NULL ? \ + NULL : (page)->u.col_leaf.append[0]) #define WT_COL_INSERT_SINGLE(page) \ - ((page)->u.col_leaf.ins == NULL ? NULL : (page)->u.col_leaf.ins[0]) + ((page)->u.col_leaf.ins == NULL ? \ + NULL : (page)->u.col_leaf.ins[0]) /* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */ #define WT_FIX_FOREACH(btree, dsk, v, i) \ diff --git a/src/include/btree.h b/src/include/btree.h index 34b3e31bf0f..203551f3d2a 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -125,7 +125,10 @@ struct __wt_btree { uint32_t leafmin; /* Min/max leaf page size */ uint32_t leafmax; - WT_WALK evict_walk; /* Eviction thread's walk state */ + WT_WALK evict_walk; /* Eviction thread's walk state */ + + WT_PAGE *last_page; /* Col-store append, last page */ + uint64_t last_recno; /* Col-store append, last recno */ void *reconcile; /* Reconciliation structure */ diff --git a/src/include/btree.i b/src/include/btree.i index aa01dd3620f..bda693248ba 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -6,20 +6,6 @@ */ /* - * __wt_cache_page_workq -- - * Create pages into the cache. 
- */ -static inline void -__wt_cache_page_workq(WT_SESSION_IMPL *session) -{ - WT_CACHE *cache; - - cache = S2C(session)->cache; - - ++cache->pages_workq; -} - -/* * __wt_cache_page_workq_incr -- * Increment a page's memory footprint in the cache. */ @@ -95,7 +81,7 @@ __wt_cache_pages_inuse(WT_CACHE *cache) * (although "interesting" corruption is vanishingly unlikely, these * values just increment over time). */ - pages_in = cache->pages_read + cache->pages_workq; + pages_in = cache->pages_read; pages_out = cache->pages_evict; return (pages_in > pages_out ? pages_in - pages_out : 0); } @@ -167,13 +153,13 @@ __wt_page_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* * __wt_page_release -- - * Release a reference to a page, unless it's the root page, which remains - * pinned for the life of the table handle. + * Release a reference to a page, unless it's pinned into memory, in which + * case we never acquired a hazard reference. */ static inline void __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page) { - if (page != NULL && !WT_PAGE_IS_ROOT(page)) + if (page != NULL && !F_ISSET(page, WT_PAGE_PINNED)) __wt_hazard_clear(session, page); } @@ -181,10 +167,11 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page) * __wt_skip_choose_depth -- * Randomly choose a depth for a skiplist insert. */ -static inline uint32_t +static inline u_int __wt_skip_choose_depth(void) { - uint32_t d; + u_int d; + for (d = 1; d < WT_SKIP_MAXDEPTH && __wt_random() < WT_SKIP_PROBABILITY; d++) ; diff --git a/src/include/cache.h b/src/include/cache.h index bd9e6eea48e..a93579a5baf 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -26,6 +26,7 @@ struct __wt_evict_req { uint32_t retry_next; /* Next retry slot */ uint32_t retry_entries; /* Total retry slots */ size_t retry_allocated; /* Bytes allocated */ + int retry_cnt; /* We only try a few times. 
*/ int close_method; /* Discard pages */ }; @@ -92,7 +93,6 @@ struct __wt_cache { uint64_t bytes_read; /* Bytes/pages read by read server */ uint64_t pages_read; uint64_t bytes_workq; /* Bytes/pages created by workQ */ - uint64_t pages_workq; uint64_t bytes_evict; /* Bytes/pages discarded by eviction */ uint64_t pages_evict; }; diff --git a/src/include/column.i b/src/include/column.i new file mode 100644 index 00000000000..31c659ce1fc --- /dev/null +++ b/src/include/column.i @@ -0,0 +1,164 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + */ + +/* + * __col_insert_search -- + * Search an column-store insert list. + */ +static inline WT_INSERT * +__col_insert_search(WT_INSERT_HEAD *inshead, uint64_t recno) +{ + WT_INSERT **ins; + uint64_t ins_recno; + int cmp, i; + + /* If there's no insert chain to search, we're done. */ + if (inshead == NULL) + return (NULL); + + /* + * The insert list is a skip list: start at the highest skip level, then + * go as far as possible at each level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) { + if (*ins == NULL) { + --i; + --ins; + continue; + } + + ins_recno = WT_INSERT_RECNO(*ins); + cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; + + if (cmp == 0) /* Exact match: return */ + return (*ins); + else if (cmp > 0) /* Keep going at this level */ + ins = &(*ins)->next[i]; + else { /* Drop down a level */ + --i; + --ins; + } + } + + return (NULL); +} + +/* + * __col_insert_search_stack -- + * Search a column-store insert list, updating the skiplist stack as we go. 
+ */ +static inline WT_INSERT * +__col_insert_search_stack( + WT_INSERT_HEAD *inshead, WT_INSERT ***ins_stack, uint64_t recno) +{ + WT_INSERT **ins; + uint64_t ins_recno; + int cmp, i; + + /* + * The insert list is a skip list: start at the highest skip level, then + * go as far as possible at each level before stepping down to the next. + */ + for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) { + if (*ins == NULL) { + ins_stack[i--] = ins--; + continue; + } + + ins_recno = WT_INSERT_RECNO(*ins); + cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; + + if (cmp == 0) /* Exact match: return */ + return (*ins); + else if (cmp > 0) /* Keep going at this level */ + ins = &(*ins)->next[i]; + else /* Drop down a level */ + ins_stack[i--] = ins--; + } + return (NULL); +} + +/* + * __col_last_recno -- + * Return the last record number for a variable-length column-store page. + */ +static inline uint64_t +__col_last_recno(WT_PAGE *page) +{ + WT_COL_RLE *repeat; + + /* + * If there's an append list (the last page), then there may be more + * records on the page. This function ignores those records, so our + * callers have to handle that explicitly, if they care. + * + * WT_PAGE_COL_FIX pages have no nrepeat values, so this works for + * fixed-length column-stores without any further check. + */ + if (page->u.col_leaf.nrepeats == 0) + return (page->entries == 0 ? 0 : + page->u.col_leaf.recno + (page->entries - 1)); + + repeat = &page->u.col_leaf.repeats[page->u.col_leaf.nrepeats - 1]; + return ( + (repeat->recno + repeat->rle) - 1 + + (page->entries - (repeat->indx + 1))); +} + +/* + * __col_var_search -- + * Search a variable-length column-store page for a record. + */ +static inline WT_COL * +__col_var_search(WT_PAGE *page, uint64_t recno) +{ + WT_COL_RLE *repeat; + uint64_t start_recno; + uint32_t base, indx, limit, start_indx; + + /* + * Find the matching slot. 
+ * + * This is done in two stages: first, we do a binary search among any + * repeating records to find largest repeating less than the search key. + * Once there, we can do a simple offset calculation to find the correct + * slot for this record number, because we know any intervening records + * have repeat counts of 1. + */ + for (base = 0, + limit = page->u.col_leaf.nrepeats; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + + repeat = page->u.col_leaf.repeats + indx; + if (recno >= repeat->recno && + recno < repeat->recno + repeat->rle) + return (page->u.col_leaf.d + repeat->indx); + if (recno < repeat->recno) + continue; + base = indx + 1; + --limit; + } + + /* + * We didn't find an exact match, move forward from the largest repeat + * less than the search key. + */ + if (base == 0) { + start_indx = 0; + start_recno = page->u.col_leaf.recno; + } else { + repeat = page->u.col_leaf.repeats + (base - 1); + start_indx = repeat->indx + 1; + start_recno = repeat->recno + repeat->rle; + } + + if (recno >= start_recno + (page->entries - start_indx)) + return (NULL); + + return (page->u.col_leaf.d + + start_indx + (uint32_t)(recno - start_recno)); +} diff --git a/src/include/cursor.h b/src/include/cursor.h index f1d0e9b29ed..d37b5527c4e 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -37,17 +37,21 @@ struct __wt_cursor_btree { int compare; /* - * The following fields are maintained by cursor iteration functions. - * * We can't walk an insert list in reverse order, it's only linked in a - * forward, sorted order. We don't care for column-store files, the - * record number gives us a "key" for lookup; for row-store files, we - * maintain a count of the current entry we're on. For each iteration, - * we return one entry earlier in the list. + * forward, sorted order. Maintain a count of the current entry we're + * on. For each iteration, we return one entry earlier in the list. 
*/ uint32_t ins_entry_cnt; /* 1-based insert list entry count */ /* + * It's relatively expensive to calculate the last record on a variable- + * length column-store page because of the repeat values. Calculate it + * once per page and cache it. This value doesn't include the skiplist + * of appended entries on the last page. + */ + uint64_t last_standard_recno; + + /* * Variable-length column-store items are run-length encoded, and * optionally Huffman encoded. To avoid repeatedly decompressing the * item, we decompress it once into the value buffer. The vslot field @@ -61,12 +65,15 @@ struct __wt_cursor_btree { /* * Fixed-length column-store items are a single byte, and it's simpler - * and cheaper to allocate the space for it now. + * and cheaper to allocate the space for it now than keep checking to + * see if we need to grow the buffer. */ uint8_t v; /* Fixed-length return value */ -#define WT_CBT_SEARCH_SET 0x01 /* Search has set a page */ -#define WT_CBT_SEARCH_SMALLEST 0x02 /* Smallest-key insert list */ +#define WT_CBT_ITERATE_APPEND 0x01 /* Col-store: iterating append list */ +#define WT_CBT_ITERATE_NEXT 0x02 /* Next iteration configuration */ +#define WT_CBT_ITERATE_PREV 0x04 /* Prev iteration configuration */ +#define WT_CBT_SEARCH_SMALLEST 0x08 /* Row-store: small-key insert list */ uint8_t flags; }; diff --git a/src/include/cursor.i b/src/include/cursor.i index 226510b96c8..392467d9807 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -20,21 +20,22 @@ __cursor_search_clear(WT_CURSOR_BTREE *cbt) cbt->ins = NULL; /* We don't bother clearing the insert stack, that's more expensive. */ + cbt->recno = 0; /* Illegal value */ cbt->write_gen = 0; cbt->compare = 2; /* Illegal value */ cbt->vslot = WT_CBT_VSLOT_OOB; - F_CLR(cbt, WT_CBT_SEARCH_SET | WT_CBT_SEARCH_SMALLEST); + cbt->flags = 0; } /* - * __cursor_func_clear -- + * __cursor_func_init -- * Reset the cursor's state for a new call. 
*/ static inline void -__cursor_func_clear(WT_CURSOR_BTREE *cbt, int page_release) +__cursor_func_init(WT_CURSOR_BTREE *cbt, int page_release) { WT_CURSOR *cursor; WT_SESSION_IMPL *session; @@ -53,11 +54,11 @@ __cursor_func_clear(WT_CURSOR_BTREE *cbt, int page_release) } /* - * __cursor_func_set -- + * __cursor_func_resolve -- * Resolve the cursor's state for return. */ static inline void -__cursor_func_set(WT_CURSOR_BTREE *cbt, int ret) +__cursor_func_resolve(WT_CURSOR_BTREE *cbt, int ret) { WT_CURSOR *cursor; WT_SESSION_IMPL *session; @@ -73,11 +74,8 @@ __cursor_func_set(WT_CURSOR_BTREE *cbt, int ret) if (ret == 0) F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); else { - if (cbt->page != NULL) { - __wt_page_release(session, cbt->page); - cbt->page = NULL; - } - F_CLR(cbt, WT_CBT_SEARCH_SET | WT_CBT_SEARCH_SMALLEST); + __cursor_func_init(cbt, 1); + __cursor_search_clear(cbt); } } @@ -139,75 +137,3 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip) return (0); } - -/* - * __cursor_col_rle_last -- - * Return the last record number for a variable-length column-store page. - */ -static inline uint64_t -__cursor_col_rle_last(WT_PAGE *page) -{ - WT_COL_RLE *repeat; - - if (page->u.col_leaf.nrepeats == 0) - return (page->u.col_leaf.recno + (page->entries - 1)); - - repeat = &page->u.col_leaf.repeats[page->u.col_leaf.nrepeats - 1]; - return ( - (repeat->recno + repeat->rle) - 1 + - (page->entries - (repeat->indx + 1))); -} - -/* - * __cursor_col_rle_search -- - * Search a variable-length column-store page for a record. - */ -static inline WT_COL * -__cursor_col_rle_search(WT_PAGE *page, uint64_t recno) -{ - WT_COL_RLE *repeat; - uint64_t start_recno; - uint32_t base, indx, limit, start_indx; - - /* - * Find the matching slot. - * - * This is done in two stages: first, we do a binary search among any - * repeating records to find largest repeating less than the search key. 
- * Once there, we can do a simple offset calculation to find the correct - * slot for this record number, because we know any intervening records - * have repeat counts of 1. - */ - for (base = 0, - limit = page->u.col_leaf.nrepeats; limit != 0; limit >>= 1) { - indx = base + (limit >> 1); - - repeat = page->u.col_leaf.repeats + indx; - if (recno >= repeat->recno && - recno < repeat->recno + repeat->rle) - return (page->u.col_leaf.d + repeat->indx); - if (recno < repeat->recno) - continue; - base = indx + 1; - --limit; - } - - /* - * We didn't find an exact match, move forward from the largest repeat - * less than the search key. - */ - if (base == 0) { - start_indx = 0; - start_recno = page->u.col_leaf.recno; - } else { - repeat = page->u.col_leaf.repeats + (base - 1); - start_indx = repeat->indx + 1; - start_recno = repeat->recno + repeat->rle; - } - - if (recno >= start_recno + (page->entries - start_indx)) - return (NULL); - - return (page->u.col_leaf.d + - start_indx + (uint32_t)(recno - start_recno)); -} diff --git a/src/include/extern.h b/src/include/extern.h index 3e55c02c149..a129c19af31 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -155,7 +155,7 @@ extern int __wt_cell_unpack_copy( WT_SESSION_IMPL *session, extern int __wt_btree_lex_compare( WT_BTREE *btree, const WT_ITEM *user_item, const WT_ITEM *tree_item); -extern int __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt); +extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next); extern int __wt_btcur_first(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_last(WT_CURSOR_BTREE *cbt); @@ -193,13 +193,13 @@ extern void __wt_workq_evict_server_exit(WT_CONNECTION_IMPL *conn); extern int __wt_evict_file_serial_func(WT_SESSION_IMPL *session); extern void *__wt_cache_evict_server(void *arg); extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename); -extern int __wt_btree_root_init(WT_SESSION_IMPL *session); extern int 
__wt_btree_open(WT_SESSION_IMPL *session, const char *name, const char *filename, const char *treeconfig, const char *cfg[], uint32_t flags); +extern int __wt_btree_root_init(WT_SESSION_IMPL *session); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); @@ -253,13 +253,10 @@ extern int __wt_walk_prev(WT_SESSION_IMPL *session, WT_WALK *walk, WT_PAGE **pagep); extern int __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int next); -extern int __wt_col_extend(WT_SESSION_IMPL *session, - WT_PAGE *page, - uint64_t recno); -extern int __wt_col_extend_serial_func(WT_SESSION_IMPL *session); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - int is_remove); + int op); +extern int __wt_append_serial_func(WT_SESSION_IMPL *session); extern int __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify); @@ -277,16 +274,14 @@ extern int __wt_row_key_serial_func(WT_SESSION_IMPL *session); extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove); -extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, +extern int __wt_row_insert_alloc( WT_SESSION_IMPL *session, WT_ITEM *key, - uint32_t skipdepth, - WT_INSERT **insp, - size_t *ins_sizep); + u_int skipdepth, + WT_INSERT **insp); extern int __wt_insert_serial_func(WT_SESSION_IMPL *session); extern int __wt_update_alloc(WT_SESSION_IMPL *session, WT_ITEM *value, - WT_UPDATE **updp, - size_t *upd_sizep); + WT_UPDATE **updp); extern int __wt_update_serial_func(WT_SESSION_IMPL *session); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, diff --git a/src/include/serial.i b/src/include/serial.i index dc812dbe525..3fe1723cca1 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -1,196 +1,142 @@ /* DO NOT EDIT: automatically built by dist/serial.py. 
*/ typedef struct { - WT_PAGE *parent; - WT_REF *parent_ref; - int dsk_verify; -} __wt_cache_read_args; - -static inline int -__wt_cache_read_serial( - WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, int - dsk_verify) -{ - __wt_cache_read_args _args, *args = &_args; - int ret; - - args->parent = parent; - - args->parent_ref = parent_ref; - - args->dsk_verify = dsk_verify; - - ret = __wt_session_serialize_func(session, - WT_WORKQ_READ, 0, __wt_cache_read_serial_func, args); - - return (ret); -} - -static inline void -__wt_cache_read_unpack( - WT_SESSION_IMPL *session, WT_PAGE **parentp, WT_REF **parent_refp, int - *dsk_verifyp) -{ - __wt_cache_read_args *args = - (__wt_cache_read_args *)session->wq_args; - - *parentp = args->parent; - *parent_refp = args->parent_ref; - *dsk_verifyp = args->dsk_verify; -} - -typedef struct { - WT_PAGE *page; - WT_PAGE *new_intl; - size_t new_intl_size; - int new_intl_taken; - WT_COL_REF *t; - size_t t_size; - int t_taken; - uint32_t internal_extend; - WT_PAGE *new_leaf; - size_t new_leaf_size; - int new_leaf_taken; - void *entries; - size_t entries_size; - int entries_taken; - uint32_t leaf_extend; - uint64_t recno; -} __wt_col_extend_args; + WT_INSERT_HEAD **inshead; + WT_INSERT ***ins_stack; + WT_INSERT_HEAD **new_inslist; + size_t new_inslist_size; + int new_inslist_taken; + WT_INSERT_HEAD *new_inshead; + size_t new_inshead_size; + int new_inshead_taken; + WT_INSERT *new_ins; + u_int skipdepth; +} __wt_append_args; static inline int -__wt_col_extend_serial( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE **new_intlp, size_t - new_intl_size, WT_COL_REF **tp, size_t t_size, uint32_t - internal_extend, WT_PAGE **new_leafp, size_t new_leaf_size, void - **entriesp, size_t entries_size, uint32_t leaf_extend, uint64_t recno) +__wt_append_serial( + WT_SESSION_IMPL *session, WT_INSERT_HEAD **inshead, WT_INSERT + ***ins_stack, WT_INSERT_HEAD ***new_inslistp, size_t new_inslist_size, + WT_INSERT_HEAD **new_insheadp, size_t 
new_inshead_size, WT_INSERT + *new_ins, u_int skipdepth) { - __wt_col_extend_args _args, *args = &_args; + __wt_append_args _args, *args = &_args; int ret; - args->page = page; - - if (new_intlp == NULL) - args->new_intl = NULL; - else { - args->new_intl = *new_intlp; - *new_intlp = NULL; - args->new_intl_size = new_intl_size; - } - args->new_intl_taken = 0; - - if (tp == NULL) - args->t = NULL; - else { - args->t = *tp; - *tp = NULL; - args->t_size = t_size; - } - args->t_taken = 0; + args->inshead = inshead; - args->internal_extend = internal_extend; + args->ins_stack = ins_stack; - if (new_leafp == NULL) - args->new_leaf = NULL; + if (new_inslistp == NULL) + args->new_inslist = NULL; else { - args->new_leaf = *new_leafp; - *new_leafp = NULL; - args->new_leaf_size = new_leaf_size; + args->new_inslist = *new_inslistp; + *new_inslistp = NULL; + args->new_inslist_size = new_inslist_size; } - args->new_leaf_taken = 0; + args->new_inslist_taken = 0; - if (entriesp == NULL) - args->entries = NULL; + if (new_insheadp == NULL) + args->new_inshead = NULL; else { - args->entries = *entriesp; - *entriesp = NULL; - args->entries_size = entries_size; + args->new_inshead = *new_insheadp; + *new_insheadp = NULL; + args->new_inshead_size = new_inshead_size; } - args->entries_taken = 0; + args->new_inshead_taken = 0; - args->leaf_extend = leaf_extend; + args->new_ins = new_ins; - args->recno = recno; + args->skipdepth = skipdepth; ret = __wt_session_serialize_func(session, - WT_WORKQ_FUNC, 1, __wt_col_extend_serial_func, args); - - if (!args->new_intl_taken) - __wt_free(session, args->new_intl); - if (!args->t_taken) - __wt_free(session, args->t); - if (!args->new_leaf_taken) - __wt_free(session, args->new_leaf); - if (!args->entries_taken) - __wt_free(session, args->entries); + WT_WORKQ_FUNC, 1, __wt_append_serial_func, args); + + if (!args->new_inslist_taken) + __wt_free(session, args->new_inslist); + if (!args->new_inshead_taken) + __wt_free(session, args->new_inshead); return 
(ret); } static inline void -__wt_col_extend_unpack( - WT_SESSION_IMPL *session, WT_PAGE **pagep, WT_PAGE **new_intlp, - WT_COL_REF **tp, uint32_t *internal_extendp, WT_PAGE **new_leafp, void - **entriesp, uint32_t *leaf_extendp, uint64_t *recnop) +__wt_append_unpack( + WT_SESSION_IMPL *session, WT_INSERT_HEAD ***insheadp, WT_INSERT + ****ins_stackp, WT_INSERT_HEAD ***new_inslistp, WT_INSERT_HEAD + **new_insheadp, WT_INSERT **new_insp, u_int *skipdepthp) { - __wt_col_extend_args *args = - (__wt_col_extend_args *)session->wq_args; + __wt_append_args *args = + (__wt_append_args *)session->wq_args; - *pagep = args->page; - *new_intlp = args->new_intl; - *tp = args->t; - *internal_extendp = args->internal_extend; - *new_leafp = args->new_leaf; - *entriesp = args->entries; - *leaf_extendp = args->leaf_extend; - *recnop = args->recno; + *insheadp = args->inshead; + *ins_stackp = args->ins_stack; + *new_inslistp = args->new_inslist; + *new_insheadp = args->new_inshead; + *new_insp = args->new_ins; + *skipdepthp = args->skipdepth; } static inline void -__wt_col_extend_new_intl_taken(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_append_new_inslist_taken(WT_SESSION_IMPL *session, WT_PAGE *page) { - __wt_col_extend_args *args = - (__wt_col_extend_args *)session->wq_args; + __wt_append_args *args = + (__wt_append_args *)session->wq_args; - args->new_intl_taken = 1; + args->new_inslist_taken = 1; - WT_ASSERT(session, args->new_intl_size != 0); - __wt_cache_page_workq_incr(session, page, args->new_intl_size); + WT_ASSERT(session, args->new_inslist_size != 0); + __wt_cache_page_workq_incr(session, page, args->new_inslist_size); } static inline void -__wt_col_extend_t_taken(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_append_new_inshead_taken(WT_SESSION_IMPL *session, WT_PAGE *page) { - __wt_col_extend_args *args = - (__wt_col_extend_args *)session->wq_args; + __wt_append_args *args = + (__wt_append_args *)session->wq_args; - args->t_taken = 1; + args->new_inshead_taken = 1; - 
WT_ASSERT(session, args->t_size != 0); - __wt_cache_page_workq_incr(session, page, args->t_size); + WT_ASSERT(session, args->new_inshead_size != 0); + __wt_cache_page_workq_incr(session, page, args->new_inshead_size); } -static inline void -__wt_col_extend_new_leaf_taken(WT_SESSION_IMPL *session, WT_PAGE *page) +typedef struct { + WT_PAGE *parent; + WT_REF *parent_ref; + int dsk_verify; +} __wt_cache_read_args; + +static inline int +__wt_cache_read_serial( + WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, int + dsk_verify) { - __wt_col_extend_args *args = - (__wt_col_extend_args *)session->wq_args; + __wt_cache_read_args _args, *args = &_args; + int ret; + + args->parent = parent; + + args->parent_ref = parent_ref; - args->new_leaf_taken = 1; + args->dsk_verify = dsk_verify; - WT_ASSERT(session, args->new_leaf_size != 0); - __wt_cache_page_workq_incr(session, page, args->new_leaf_size); + ret = __wt_session_serialize_func(session, + WT_WORKQ_READ, 0, __wt_cache_read_serial_func, args); + + return (ret); } static inline void -__wt_col_extend_entries_taken(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_cache_read_unpack( + WT_SESSION_IMPL *session, WT_PAGE **parentp, WT_REF **parent_refp, int + *dsk_verifyp) { - __wt_col_extend_args *args = - (__wt_col_extend_args *)session->wq_args; - - args->entries_taken = 1; + __wt_cache_read_args *args = + (__wt_cache_read_args *)session->wq_args; - WT_ASSERT(session, args->entries_size != 0); - __wt_cache_page_workq_incr(session, page, args->entries_size); + *parentp = args->parent; + *parent_refp = args->parent_ref; + *dsk_verifyp = args->dsk_verify; } typedef struct { @@ -233,10 +179,8 @@ typedef struct { WT_INSERT_HEAD *new_inshead; size_t new_inshead_size; int new_inshead_taken; - WT_INSERT *ins; - size_t ins_size; - int ins_taken; - uint32_t depth; + WT_INSERT *new_ins; + u_int skipdepth; } __wt_insert_args; static inline int @@ -244,8 +188,8 @@ __wt_insert_serial( WT_SESSION_IMPL *session, WT_PAGE *page, 
uint32_t write_gen, WT_INSERT_HEAD **inshead, WT_INSERT ***ins_stack, WT_INSERT_HEAD ***new_inslistp, size_t new_inslist_size, WT_INSERT_HEAD - **new_insheadp, size_t new_inshead_size, WT_INSERT **insp, size_t - ins_size, uint32_t depth) + **new_insheadp, size_t new_inshead_size, WT_INSERT *new_ins, u_int + skipdepth) { __wt_insert_args _args, *args = &_args; int ret; @@ -276,16 +220,9 @@ __wt_insert_serial( } args->new_inshead_taken = 0; - if (insp == NULL) - args->ins = NULL; - else { - args->ins = *insp; - *insp = NULL; - args->ins_size = ins_size; - } - args->ins_taken = 0; + args->new_ins = new_ins; - args->depth = depth; + args->skipdepth = skipdepth; ret = __wt_session_serialize_func(session, WT_WORKQ_FUNC, 1, __wt_insert_serial_func, args); @@ -294,8 +231,6 @@ __wt_insert_serial( __wt_free(session, args->new_inslist); if (!args->new_inshead_taken) __wt_free(session, args->new_inshead); - if (!args->ins_taken) - __wt_free(session, args->ins); return (ret); } @@ -303,8 +238,8 @@ static inline void __wt_insert_unpack( WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t *write_genp, WT_INSERT_HEAD ***insheadp, WT_INSERT ****ins_stackp, WT_INSERT_HEAD - ***new_inslistp, WT_INSERT_HEAD **new_insheadp, WT_INSERT **insp, - uint32_t *depthp) + ***new_inslistp, WT_INSERT_HEAD **new_insheadp, WT_INSERT **new_insp, + u_int *skipdepthp) { __wt_insert_args *args = (__wt_insert_args *)session->wq_args; @@ -315,8 +250,8 @@ __wt_insert_unpack( *ins_stackp = args->ins_stack; *new_inslistp = args->new_inslist; *new_insheadp = args->new_inshead; - *insp = args->ins; - *depthp = args->depth; + *new_insp = args->new_ins; + *skipdepthp = args->skipdepth; } static inline void @@ -343,18 +278,6 @@ __wt_insert_new_inshead_taken(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_cache_page_workq_incr(session, page, args->new_inshead_size); } -static inline void -__wt_insert_ins_taken(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - __wt_insert_args *args = - (__wt_insert_args 
*)session->wq_args; - - args->ins_taken = 1; - - WT_ASSERT(session, args->ins_size != 0); - __wt_cache_page_workq_incr(session, page, args->ins_size); -} - typedef struct { WT_PAGE *page; WT_ROW *row_arg; @@ -402,15 +325,13 @@ typedef struct { size_t new_upd_size; int new_upd_taken; WT_UPDATE *upd; - size_t upd_size; - int upd_taken; } __wt_update_args; static inline int __wt_update_serial( WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t write_gen, WT_UPDATE **srch_upd, WT_UPDATE ***new_updp, size_t new_upd_size, WT_UPDATE - **updp, size_t upd_size) + *upd) { __wt_update_args _args, *args = &_args; int ret; @@ -430,22 +351,13 @@ __wt_update_serial( } args->new_upd_taken = 0; - if (updp == NULL) - args->upd = NULL; - else { - args->upd = *updp; - *updp = NULL; - args->upd_size = upd_size; - } - args->upd_taken = 0; + args->upd = upd; ret = __wt_session_serialize_func(session, WT_WORKQ_FUNC, 1, __wt_update_serial_func, args); if (!args->new_upd_taken) __wt_free(session, args->new_upd); - if (!args->upd_taken) - __wt_free(session, args->upd); return (ret); } @@ -475,15 +387,3 @@ __wt_update_new_upd_taken(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ASSERT(session, args->new_upd_size != 0); __wt_cache_page_workq_incr(session, page, args->new_upd_size); } - -static inline void -__wt_update_upd_taken(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - __wt_update_args *args = - (__wt_update_args *)session->wq_args; - - args->upd_taken = 1; - - WT_ASSERT(session, args->upd_size != 0); - __wt_cache_page_workq_incr(session, page, args->upd_size); -} diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index ec02eddbaae..aef78a9e55b 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -509,12 +509,6 @@ struct wt_session { * must appear in at least one column group. Each column group must be * created with a separate call to WT_SESSION::create. 
The value must be * a string.,} - * @config{column_internal_extend, configure the number of records a - * column-store internal page is extended by when records are appended. - * The value must be an integer between 500 and 10M.,\c 10000} - * @config{column_leaf_extend, configure the number of records a - * column-store leaf page is extended by when records are appended. The - * value must be an integer between 500 and 10M.,\c 10000} * @config{columns, list of the column names. Comma-separated list of * the form <code>(column[\,...])</code>. For tables\, the number of * entries must match the total number of values in \c key_format and \c diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in index 196d0b82607..b078dd90464 100644 --- a/src/include/wt_internal.in +++ b/src/include/wt_internal.in @@ -171,6 +171,7 @@ struct __wt_walk_entry; #include "bitstring.i" #include "btree.i" #include "cell.i" +#include "column.i" #include "cursor.i" #include "log.i" #include "packing.i" diff --git a/test/format/format.h b/test/format/format.h index 6115b3144df..cf03481235a 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -45,6 +45,7 @@ typedef struct { void *wts_conn; /* WT_CONNECTION handle */ void *wts_cursor; /* WT_CURSOR handle */ + void *wts_cursor_insert; /* WT_CURSOR insert handle */ void *wts_session; /* WT_SESSION handle */ FILE *rand_log; /* Random number log */ diff --git a/test/format/wts.c b/test/format/wts.c index 0a9a4ee6038..2c74d53fce8 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -10,9 +10,10 @@ static int bulk(WT_ITEM **, WT_ITEM **); static int wts_close(WT_CONNECTION *); static int wts_col_del(uint64_t, int *); -static int wts_col_put(uint64_t, int); -static int wts_np(int, int *); +static int wts_col_insert(uint64_t *); +static int wts_col_put(uint64_t); static int wts_notfound_chk(const char *, int, int, uint64_t); +static int wts_np(int, int, int *); static int wts_open(WT_CONNECTION **, WT_SESSION **session); 
static int wts_read(uint64_t); static int wts_row_del(uint64_t, int *); @@ -116,7 +117,7 @@ wts_startup(void) { time_t now; WT_CONNECTION *conn; - WT_CURSOR *cursor; + WT_CURSOR *cursor, *cursor_insert; WT_SESSION *session; int ret; char config[512], *end, *p; @@ -162,6 +163,21 @@ wts_startup(void) return (1); } + /* + * We open 2 cursors, one configured for overwriting, one not configured + * for overwriting. The reason is that for row-store and column-store + * files where we're testing with existing records, we don't track if a + * record was deleted or not, which means we need to use cursor->insert + * with overwriting configured. But, in column-store files where we're + * testing with new, appended records, we don't want to have to specify + * the record number, which means we can't configure with overwriting. + */ + if ((ret = session->open_cursor( + session, WT_TABLENAME, NULL, NULL, &cursor_insert)) != 0) { + fprintf(stderr, "%s: open_cursor: %s\n", + g.progname, wiredtiger_strerror(ret)); + return (1); + } if ((ret = session->open_cursor( session, WT_TABLENAME, NULL, "overwrite", &cursor)) != 0) { fprintf(stderr, "%s: open_cursor: %s\n", @@ -179,6 +195,8 @@ wts_startup(void) g.wts_conn = conn; g.wts_session = session; g.wts_cursor = cursor; + g.wts_cursor_insert = cursor_insert; + return (0); } @@ -481,13 +499,13 @@ wts_ops(void) uint64_t cnt, keyno; uint32_t op; u_int np; - int notfound; + int insert, notfound; for (cnt = 0; cnt < g.c_ops; ++cnt) { if (cnt % 10 == 0) track("read/write ops", cnt); - notfound = 0; + insert = notfound = 0; keyno = MMRAND(1, g.c_rows); /* @@ -509,17 +527,6 @@ wts_ops(void) return (1); break; case FIX: - /* - * We don't delete records in fixed-length - * column-store files: a "delete" is the same - * as a store of 0x00, which means we're not - * really testing anything interesting, and, - * if we reconcile the page, the engine code - * discards trailing, deleted records, which - * can give us test failures because we don't - * 
match the contents of the BDB database. - */ - break; case VAR: if (wts_col_del(keyno, &notfound)) return (1); @@ -532,22 +539,10 @@ wts_ops(void) return (1); break; case FIX: - /* - * We don't insert records in fixed-length - * column-store files: an insert extends the - * file by creating a large number of "deleted" - * records: since a deleted record is a store - * of 0x00, we can't distinguish between a - * legitimate value and a deleted record, and - * so we don't match the contents of the BDB - * database. - */ - break; case VAR: - /* Column-store tables only support append. */ - keyno = ++g.c_rows; - if (wts_col_put(keyno, 1)) + if (wts_col_insert(&keyno)) return (1); + insert = 1; break; } } else if ( @@ -559,7 +554,7 @@ wts_ops(void) break; case FIX: case VAR: - if (wts_col_put(keyno, 0)) + if (wts_col_put(keyno)) return (1); break; } @@ -576,7 +571,7 @@ wts_ops(void) for (np = 0; np < MMRAND(1, 4); ++np) { if (notfound) break; - if (wts_np(MMRAND(0, 1), &notfound)) + if (wts_np(MMRAND(0, 1), insert, &notfound)) return (1); } @@ -705,7 +700,7 @@ wts_read(uint64_t keyno) * Read and verify the next/prev element in a row- or column-store file. */ static int -wts_np(int next, int *notfoundp) +wts_np(int next, int insert, int *notfoundp) { static WT_ITEM key, value, bdb_key, bdb_value; WT_CURSOR *cursor; @@ -715,7 +710,7 @@ wts_np(int next, int *notfoundp) uint8_t bitfield; const char *which; - cursor = g.wts_cursor; + cursor = insert ? g.wts_cursor_insert : g.wts_cursor; session = g.wts_session; which = next ? "next" : "prev"; @@ -774,7 +769,7 @@ wts_np(int next, int *notfoundp) /* * wts_row_put -- - * Replace an element in a row-store file. + * Update an element in a row-store file. */ static int wts_row_put(uint64_t keyno, int insert) @@ -818,10 +813,10 @@ wts_row_put(uint64_t keyno, int insert) /* * wts_col_put -- - * Replace an element in a column-store file. + * Update an element in a column-store file. 
*/ static int -wts_col_put(uint64_t keyno, int insert) +wts_col_put(uint64_t keyno) { static WT_ITEM key, value; WT_CURSOR *cursor; @@ -835,41 +830,96 @@ wts_col_put(uint64_t keyno, int insert) value_gen(&value.data, &value.size, keyno); /* Log the operation */ - if (g.logging) { + if (g.logging) if (g.c_file_type == FIX) (void)session->msg_printf(session, "%-10s%" PRIu64 " {0x%02" PRIx8 "}", - insert ? "insert" : "put", - keyno, ((uint8_t *)value.data)[0]); + "put", keyno, + ((uint8_t *)value.data)[0]); else (void)session->msg_printf(session, "%-10s%" PRIu64 " {%.*s}", - insert ? "insert" : "put", - keyno, (int)value.size, (char *)value.data); - } + "put", keyno, + (int)value.size, (char *)value.data); - if (bdb_put(key.data, key.size, value.data, value.size, &notfound)) - return (1); - cursor->set_key(cursor, keyno); if (g.c_file_type == FIX) cursor->set_value(cursor, *(uint8_t *)value.data); else cursor->set_value(cursor, &value); - ret = cursor->insert(cursor); + ret = cursor->update(cursor); if (ret != 0 && ret != WT_NOTFOUND) { fprintf(stderr, - "%s: wts_col_put: %s col %" PRIu64 " by key: %s\n", - g.progname, - insert ? "insert" : "put", keyno, wiredtiger_strerror(ret)); + "%s: wts_col_put: %" PRIu64 " : %s\n", + g.progname, keyno, wiredtiger_strerror(ret)); return (1); } + if (bdb_put(key.data, key.size, value.data, value.size, &notfound)) + return (1); + NTF_CHK(wts_notfound_chk("wts_col_put", ret, notfound, keyno)); return (0); } /* + * wts_col_insert -- + * Insert an element in a column-store file. 
+ */ +static int +wts_col_insert(uint64_t *keynop) +{ + static WT_ITEM key, value; + WT_CURSOR *cursor; + WT_SESSION *session; + uint64_t keyno; + int notfound, ret; + + cursor = g.wts_cursor_insert; + session = g.wts_session; + + value_gen(&value.data, &value.size, 0); + + if (g.c_file_type == FIX) + cursor->set_value(cursor, *(uint8_t *)value.data); + else + cursor->set_value(cursor, &value); + ret = cursor->insert(cursor); + if (ret != 0) { + fprintf(stderr, "%s: wts_col_insert: %s\n", + g.progname, wiredtiger_strerror(ret)); + return (1); + } + if ((ret = cursor->get_key(cursor, &keyno)) != 0) { + fprintf(stderr, "%s: cursor->get_key: %s\n", + g.progname, wiredtiger_strerror(ret)); + return (1); + } + if (keyno <= g.c_rows) { + fprintf(stderr, + "%s: inserted key did not create new row\n", g.progname); + return (1); + } + g.c_rows = *keynop = (uint32_t)keyno; + + if (g.logging) + if (g.c_file_type == FIX) + (void)session->msg_printf(session, + "%-10s%" PRIu64 " {0x%02" PRIx8 "}", + "insert", keyno, + ((uint8_t *)value.data)[0]); + else + (void)session->msg_printf(session, + "%-10s%" PRIu64 " {%.*s}", + "insert", keyno, + (int)value.size, (char *)value.data); + + key_gen(&key.data, &key.size, keyno, 0); + return (bdb_put( + key.data, key.size, value.data, value.size, &notfound) ? 1 : 0); +} + +/* * wts_row_del -- * Delete an element from a row-store file. */ |