summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- build_posix/aclocal/version-set.m4 | 2
-rw-r--r-- dist/api_data.py | 8
-rw-r--r-- dist/filelist | 1
-rw-r--r-- dist/serial.py | 26
-rw-r--r-- src/api/config_def.c | 54
-rw-r--r-- src/btree/bt_bulk.c | 13
-rw-r--r-- src/btree/bt_curnext.c | 174
-rw-r--r-- src/btree/bt_curprev.c | 124
-rw-r--r-- src/btree/bt_cursor.c | 80
-rw-r--r-- src/btree/bt_debug.c | 44
-rw-r--r-- src/btree/bt_discard.c | 5
-rw-r--r-- src/btree/bt_evict.c | 31
-rw-r--r-- src/btree/bt_handle.c | 46
-rw-r--r-- src/btree/bt_page.c | 3
-rw-r--r-- src/btree/bt_reconcile.c | 68
-rw-r--r-- src/btree/bt_vrfy.c | 4
-rw-r--r-- src/btree/col_extend.c | 369
-rw-r--r-- src/btree/col_modify.c | 244
-rw-r--r-- src/btree/col_srch.c | 79
-rw-r--r-- src/btree/row_modify.c | 65
-rw-r--r-- src/btree/row_srch.c | 4
-rw-r--r-- src/cursor/cur_file.c | 2
-rw-r--r-- src/include/btmem.h | 43
-rw-r--r-- src/include/btree.h | 5
-rw-r--r-- src/include/btree.i | 27
-rw-r--r-- src/include/cache.h | 2
-rw-r--r-- src/include/column.i | 164
-rw-r--r-- src/include/cursor.h | 25
-rw-r--r-- src/include/cursor.i | 90
-rw-r--r-- src/include/extern.h | 21
-rw-r--r-- src/include/serial.i | 308
-rw-r--r-- src/include/wiredtiger.in | 6
-rw-r--r-- src/include/wt_internal.in | 1
-rw-r--r-- test/format/format.h | 1
-rw-r--r-- test/format/wts.c | 150
35 files changed, 1083 insertions, 1206 deletions
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4
index 9ca84e7bd9b..d41c4c14fbc 100644
--- a/build_posix/aclocal/version-set.m4
+++ b/build_posix/aclocal/version-set.m4
@@ -3,7 +3,7 @@ dnl build by dist/s_version
VERSION_MAJOR=0
VERSION_MINOR=7
VERSION_PATCH=0
-VERSION_STRING='"WiredTiger 0.7.0: (September 6, 2011)"'
+VERSION_STRING='"WiredTiger 0.7.0: (September 11, 2011)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
diff --git a/dist/api_data.py b/dist/api_data.py
index e68547c9de0..f0411456d05 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -61,14 +61,6 @@ file_meta = format_meta + [
Config('allocation_size', '512B', r'''
file unit allocation size, in bytes''',
min='512B', max='128MB'),
- Config('column_internal_extend', '10000', r'''
- configure the number of records a column-store internal page is
- extended by when records are appended''',
- min='500', max='10M'),
- Config('column_leaf_extend', '10000', r'''
- configure the number of records a column-store leaf page is
- extended by when records are appended''',
- min='500', max='10M'),
Config('huffman_key', '', r'''
use Huffman encoding for Btree keys. Permitted values are
empty (off), \c "english" or \c "<filename>". See @ref huffman
diff --git a/dist/filelist b/dist/filelist
index 91bd06053d8..e873ef35505 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -40,7 +40,6 @@ src/btree/bt_sync.c
src/btree/bt_vrfy.c
src/btree/bt_vrfy_dsk.c
src/btree/bt_walk.c
-src/btree/col_extend.c
src/btree/col_modify.c
src/btree/col_srch.c
src/btree/row_key.c
diff --git a/dist/serial.py b/dist/serial.py
index 7136b2f415f..c9520631fa4 100644
--- a/dist/serial.py
+++ b/dist/serial.py
@@ -17,23 +17,21 @@ class Serial:
self.args = args
msgtypes = [
+Serial('append', 'WT_WORKQ_FUNC', 1, [
+ SerialArg('WT_INSERT_HEAD **', 'inshead'),
+ SerialArg('WT_INSERT ***', 'ins_stack'),
+ SerialArg('WT_INSERT_HEAD **', 'new_inslist', 1),
+ SerialArg('WT_INSERT_HEAD *', 'new_inshead', 1),
+ SerialArg('WT_INSERT *', 'new_ins', 0),
+ SerialArg('u_int', 'skipdepth'),
+ ]),
+
Serial('cache_read', 'WT_WORKQ_READ', 0, [
SerialArg('WT_PAGE *', 'parent'),
SerialArg('WT_REF *', 'parent_ref'),
SerialArg('int', 'dsk_verify'),
]),
-Serial('col_extend', 'WT_WORKQ_FUNC', 1, [
- SerialArg('WT_PAGE *', 'page'),
- SerialArg('WT_PAGE *', 'new_intl', 1),
- SerialArg('WT_COL_REF *', 't', 1),
- SerialArg('uint32_t', 'internal_extend'),
- SerialArg('WT_PAGE *', 'new_leaf', 1),
- SerialArg('void *', 'entries', 1),
- SerialArg('uint32_t', 'leaf_extend'),
- SerialArg('uint64_t', 'recno'),
- ]),
-
Serial('evict_file', 'WT_WORKQ_EVICT', 0, [
SerialArg('int', 'close_method'),
]),
@@ -45,8 +43,8 @@ Serial('insert', 'WT_WORKQ_FUNC', 1, [
SerialArg('WT_INSERT ***', 'ins_stack'),
SerialArg('WT_INSERT_HEAD **', 'new_inslist', 1),
SerialArg('WT_INSERT_HEAD *', 'new_inshead', 1),
- SerialArg('WT_INSERT *', 'ins', 1),
- SerialArg('uint32_t', 'depth'),
+ SerialArg('WT_INSERT *', 'new_ins', 0),
+ SerialArg('u_int', 'skipdepth'),
]),
Serial('row_key', 'WT_WORKQ_FUNC', 1, [
@@ -60,7 +58,7 @@ Serial('update', 'WT_WORKQ_FUNC', 1, [
SerialArg('uint32_t', 'write_gen'),
SerialArg('WT_UPDATE **', 'srch_upd'),
SerialArg('WT_UPDATE **', 'new_upd', 1),
- SerialArg('WT_UPDATE *', 'upd', 1),
+ SerialArg('WT_UPDATE *', 'upd', 0),
]),
]
diff --git a/src/api/config_def.c b/src/api/config_def.c
index f8203f36dd8..96d3a752683 100644
--- a/src/api/config_def.c
+++ b/src/api/config_def.c
@@ -76,26 +76,23 @@ __wt_confchk_cursor_close =
const char *
__wt_confdfl_file_meta =
- "allocation_size=512B,block_compressor=,column_internal_extend=10000,"
- "column_leaf_extend=10000,columns=,huffman_key=,huffman_value=,"
- "internal_key_truncate=true,internal_node_max=2KB,internal_node_min=2KB,"
- "key_format=u,key_gap=10,leaf_node_max=1MB,leaf_node_min=32KB,"
- "prefix_compression=true,split_min=false,split_pct=75,type=btree,"
- "value_format=u";
+ "allocation_size=512B,block_compressor=,columns=,huffman_key=,"
+ "huffman_value=,internal_key_truncate=true,internal_node_max=2KB,"
+ "internal_node_min=2KB,key_format=u,key_gap=10,leaf_node_max=1MB,"
+ "leaf_node_min=32KB,prefix_compression=true,split_min=false,split_pct=75,"
+ "type=btree,value_format=u";
const char *
__wt_confchk_file_meta =
"allocation_size=(type=int,min=512B,max=128MB),block_compressor=(),"
- "column_internal_extend=(type=int,min=500,max=10M),"
- "column_leaf_extend=(type=int,min=500,max=10M),columns=(type=list),"
- "huffman_key=(),huffman_value=(),internal_key_truncate=(type=boolean),"
- "internal_node_max=(type=int,min=512B,max=512MB),"
- "internal_node_min=(type=int,min=512B,max=512MB),key_format=(type=format)"
- ",key_gap=(type=int,min=0),leaf_node_max=(type=int,min=512B,max=512MB),"
- "leaf_node_min=(type=int,min=512B,max=512MB),"
- "prefix_compression=(type=boolean),split_min=(type=boolean),"
- "split_pct=(type=int,min=0,max=100),type=(choices=[\"btree\"]),"
- "value_format=(type=format)";
+ "columns=(type=list),huffman_key=(),huffman_value=(),"
+ "internal_key_truncate=(type=boolean),internal_node_max=(type=int,"
+ "min=512B,max=512MB),internal_node_min=(type=int,min=512B,max=512MB),"
+ "key_format=(type=format),key_gap=(type=int,min=0),"
+ "leaf_node_max=(type=int,min=512B,max=512MB),leaf_node_min=(type=int,"
+ "min=512B,max=512MB),prefix_compression=(type=boolean),"
+ "split_min=(type=boolean),split_pct=(type=int,min=0,max=100),"
+ "type=(choices=[\"btree\"]),value_format=(type=format)";
const char *
__wt_confdfl_index_meta =
@@ -144,8 +141,7 @@ __wt_confchk_session_commit_transaction =
const char *
__wt_confdfl_session_create =
- "allocation_size=512B,block_compressor=,colgroups=,"
- "column_internal_extend=10000,column_leaf_extend=10000,columns=,columns=,"
+ "allocation_size=512B,block_compressor=,colgroups=,columns=,columns=,"
"exclusive=false,filename=,huffman_key=,huffman_value=,"
"internal_key_truncate=true,internal_node_max=2KB,internal_node_min=2KB,"
"key_format=u,key_format=u,key_gap=10,leaf_node_max=1MB,"
@@ -155,18 +151,16 @@ __wt_confdfl_session_create =
const char *
__wt_confchk_session_create =
"allocation_size=(type=int,min=512B,max=128MB),block_compressor=(),"
- "colgroups=(),column_internal_extend=(type=int,min=500,max=10M),"
- "column_leaf_extend=(type=int,min=500,max=10M),columns=(type=list),"
- "columns=(type=list),exclusive=(type=boolean),filename=(),huffman_key=(),"
- "huffman_value=(),internal_key_truncate=(type=boolean),"
- "internal_node_max=(type=int,min=512B,max=512MB),"
- "internal_node_min=(type=int,min=512B,max=512MB),key_format=(type=format)"
- ",key_format=(type=format),key_gap=(type=int,min=0),"
- "leaf_node_max=(type=int,min=512B,max=512MB),leaf_node_min=(type=int,"
- "min=512B,max=512MB),prefix_compression=(type=boolean),"
- "split_min=(type=boolean),split_pct=(type=int,min=0,max=100),"
- "type=(choices=[\"btree\"]),value_format=(type=format),"
- "value_format=(type=format)";
+ "colgroups=(),columns=(type=list),columns=(type=list),"
+ "exclusive=(type=boolean),filename=(),huffman_key=(),huffman_value=(),"
+ "internal_key_truncate=(type=boolean),internal_node_max=(type=int,"
+ "min=512B,max=512MB),internal_node_min=(type=int,min=512B,max=512MB),"
+ "key_format=(type=format),key_format=(type=format),key_gap=(type=int,"
+ "min=0),leaf_node_max=(type=int,min=512B,max=512MB),"
+ "leaf_node_min=(type=int,min=512B,max=512MB),"
+ "prefix_compression=(type=boolean),split_min=(type=boolean),"
+ "split_pct=(type=int,min=0,max=100),type=(choices=[\"btree\"]),"
+ "value_format=(type=format),value_format=(type=format)";
const char *
__wt_confdfl_session_drop =
diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c
index 95d7be5deb4..15479bcb41f 100644
--- a/src/btree/bt_bulk.c
+++ b/src/btree/bt_bulk.c
@@ -36,6 +36,8 @@ __wt_bulk_init(WT_CURSOR_BULK *cbulk)
if (F_ISSET(btree->root_page.page, WT_PAGE_INITIAL_EMPTY)) {
btree->root_page.state = WT_REF_DISK;
__wt_free(session, btree->root_page.page);
+
+ btree->last_page = NULL;
} else {
__wt_errx(
session, "bulk-load is only possible for empty trees");
@@ -158,8 +160,7 @@ __bulk_col_var(WT_CURSOR_BULK *cbulk)
* Allocate an WT_UPDATE item and append the V object onto the page's
* update list.
*/
- WT_RET(__wt_update_alloc(
- session, (WT_ITEM *)&cursor->value, &upd, NULL));
+ WT_RET(__wt_update_alloc(session, (WT_ITEM *)&cursor->value, &upd));
(*cbulk->updp) = upd;
cbulk->updp = &upd->next;
@@ -196,10 +197,10 @@ __bulk_row(WT_CURSOR_BULK *cbulk)
* Allocate a WT_INSERT/WT_UPDATE pair and append the K/V pair onto the
* page's insert list.
*/
- WT_RET(__wt_row_insert_alloc(
- session, (WT_ITEM *)&cursor->key, 1, &ins, NULL));
- WT_ERR(__wt_update_alloc(
- session, (WT_ITEM *)&cursor->value, &ins->upd, NULL));
+ WT_RET(
+ __wt_row_insert_alloc(session, (WT_ITEM *)&cursor->key, 1, &ins));
+ WT_ERR(
+ __wt_update_alloc(session, (WT_ITEM *)&cursor->value, &ins->upd));
*cbulk->insp = ins;
cbulk->insp = &WT_SKIP_NEXT(ins);
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 7ec71be6d48..c50f3c045cc 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -8,6 +8,42 @@
#include "wt_internal.h"
/*
+ * __cursor_col_append_next --
+ * Return the next entry on the append list.
+ */
+static inline int
+__cursor_col_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BUF *val;
+
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins_entry_cnt = 1;
+ goto new_page;
+ }
+
+ for (;;) {
+ if ((cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
+ return (WT_NOTFOUND);
+ ++cbt->ins_entry_cnt;
+
+new_page: if (cbt->page->type == WT_PAGE_COL_FIX) {
+ val->data = WT_UPDATE_DATA(cbt->ins->upd);
+ val->size = 1;
+ break;
+ } else {
+ if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd))
+ continue;
+ val->data = WT_UPDATE_DATA(cbt->ins->upd);
+ val->size = cbt->ins->upd->size;
+ break;
+ }
+ }
+ return (0);
+}
+
+/*
* __cursor_fix_next --
* Move to the next, fixed-length column-store item.
*/
@@ -29,6 +65,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
if (newpage) {
cbt->ins = WT_SKIP_FIRST(WT_COL_INSERT_SINGLE(cbt->page));
cbt->recno = cbt->page->u.col_leaf.recno;
+ cbt->last_standard_recno = __col_last_recno(cbt->page);
goto new_page;
}
@@ -41,8 +78,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
/* Move to the next entry and return the item. */
for (;;) {
- if (cbt->recno >=
- cbt->page->u.col_leaf.recno + (cbt->page->entries - 1))
+ if (cbt->recno >= cbt->last_standard_recno)
return (WT_NOTFOUND);
++cbt->recno;
new_page: *recnop = cbt->recno;
@@ -104,18 +140,20 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
/* Initialize for each new page. */
if (newpage) {
cbt->recno = cbt->page->u.col_leaf.recno;
+ cbt->last_standard_recno = __col_last_recno(cbt->page);
cbt->vslot = UINT32_MAX;
goto new_page;
}
/* Move to the next entry and return the item. */
for (;;) {
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
++cbt->recno;
new_page: *recnop = cbt->recno;
/* Find the matching WT_COL slot. */
- if ((cip =
- __cursor_col_rle_search(cbt->page, cbt->recno)) == NULL)
+ if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL)
return (WT_NOTFOUND);
slot = WT_COL_SLOT(cbt->page, cip);
@@ -273,44 +311,71 @@ new_insert: if (cbt->ins != NULL) {
}
/*
- * __wt_btcur_search_setup --
- * Initialize a cursor for iteration based on a search.
+ * __wt_btcur_iterate_setup --
+ * Initialize a cursor for iteration, usually based on a search.
*/
-int
-__wt_btcur_search_setup(WT_CURSOR_BTREE *cbt)
+void
+__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
{
WT_INSERT *ins;
+ WT_PAGE *page;
- if (cbt->page->type != WT_PAGE_ROW_LEAF)
- return (0);
+ WT_UNUSED(next);
/*
- * For row-store pages, we need a single item that tells us the part
- * of the page we're walking (otherwise switching from next to prev
- * and vice-versa is just too complicated), so we map the WT_ROW and
- * WT_INSERT_HEAD array slots into a single name space: slot 1 is the
- * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
- * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
- * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
- *
- * !!!
- * I'm re-using WT_CURSOR_BTREE->slot for this purpose, which means that
- * WT_CURSOR_BTREE->slot is now useless outside of cursor next/prev. If
- * that turns out to be a bad idea because we need the original value of
- * WT_CURSOR_BTREE->slot after a next/prev call, switch to another field
- * to hold the iteration slot.
+ * We don't currently have to do any setup when we switch between next
+ * and prev calls, but I'm sure we will someday -- I'm leaving support
+ * here for both flags for that reason.
*/
- cbt->slot = (cbt->slot + 1) * 2;
- if (cbt->ins_head != NULL) {
- if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(cbt->page))
- cbt->slot = 1;
- else
- cbt->slot += 1;
+ F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
+
+ /* If we don't have a search page, then we're done. */
+ if ((page = cbt->page) == NULL)
+ return;
+
+ if (page->type == WT_PAGE_ROW_LEAF) {
+ /*
+ * For row-store pages, we need a single item that tells us the
+ * part of the page we're walking (otherwise switching from next
+ * to prev and vice-versa is just too complicated), so we map
+ * the WT_ROW and WT_INSERT_HEAD array slots into a single name
+ * space: slot 1 is the "smallest key insert list", slot 2 is
+ * WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. This
+ * means WT_INSERT lists are odd-numbered slots, and WT_ROW
+ * array slots are even-numbered slots.
+ *
+ * !!!
+ * I'm re-using WT_CURSOR_BTREE->slot for this purpose, which
+ * means that WT_CURSOR_BTREE->slot is now useless outside of
+ * cursor next/prev. If that turns out to be a bad idea because
+ * we need the original value of WT_CURSOR_BTREE->slot after a
+ * next/prev call, switch to another field to hold the iteration
+ * slot.
+ */
+ cbt->slot = (cbt->slot + 1) * 2;
+ if (cbt->ins_head != NULL) {
+ if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
+ cbt->slot = 1;
+ else
+ cbt->slot += 1;
+ }
+ } else {
+ /*
+ * For column-store pages, calculate the largest record on the
+ * page.
+ */
+ cbt->last_standard_recno = __col_last_recno(page);
+
+ /* If we're traversing the append list, set the reference. */
+ if (cbt->ins_head != NULL &&
+ cbt->ins_head == WT_COL_INSERT_APPEND(page))
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
}
/*
- * If we're in an insert list, figure out how far in, we have to track
- * our current slot for previous traversals.
+ * If we're in a row-store insert list or a column-store append list,
+ * figure out how far in we are: we have to track the current slot for
+ * prev traversals.
*/
cbt->ins_entry_cnt = 0;
if (cbt->ins_head != NULL)
@@ -319,9 +384,6 @@ __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt)
if (ins == cbt->ins)
break;
}
-
- F_CLR(cbt, WT_CBT_SEARCH_SET);
- return (0);
}
/*
@@ -334,11 +396,10 @@ __wt_btcur_first(WT_CURSOR_BTREE *cbt)
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cbt->iface.session;
-
WT_BSTAT_INCR(session, cursor_first);
- __cursor_func_clear(cbt, 1);
- F_CLR(cbt, WT_CBT_SEARCH_SET);
+ __cursor_func_init(cbt, 1);
+ F_SET(cbt, WT_CBT_ITERATE_NEXT);
return (__wt_btcur_next(cbt));
}
@@ -350,19 +411,20 @@ __wt_btcur_first(WT_CURSOR_BTREE *cbt)
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt)
{
- WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
int newpage, ret;
- cursor = &cbt->iface;
- session = (WT_SESSION_IMPL *)cursor->session;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
WT_BSTAT_INCR(session, cursor_read_next);
- __cursor_func_clear(cbt, 0);
+ __cursor_func_init(cbt, 0);
- /* If iterating from a search position, there's some setup to do. */
- if (F_ISSET(cbt, WT_CBT_SEARCH_SET))
- WT_RET(__wt_btcur_search_setup(cbt));
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
+ __wt_btcur_iterate_setup(cbt, 1);
/*
* Walk any page we're holding until the underlying call returns not-
@@ -370,7 +432,13 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt)
* file.
*/
for (newpage = 0;; newpage = 1) {
- if (cbt->page != NULL) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ if ((ret = __cursor_col_append_next(cbt, newpage)) == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ } else if (cbt->page != NULL) {
switch (cbt->page->type) {
case WT_PAGE_COL_FIX:
ret = __cursor_fix_next(cbt, newpage);
@@ -385,6 +453,18 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt)
}
if (ret != WT_NOTFOUND)
break;
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (cbt->page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins = WT_SKIP_FIRST(
+ WT_COL_INSERT_APPEND(cbt->page))) != NULL) {
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ continue;
+ }
}
do {
@@ -395,6 +475,6 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt)
cbt->page->type == WT_PAGE_ROW_INT);
}
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 65b8160d60f..d7e70b0a4be 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -8,43 +8,49 @@
#include "wt_internal.h"
/*
- * __search_insert --
- * Search an insert list.
+ * __cursor_col_append_prev --
+ * Return the previous entry on the append list.
*/
-static inline WT_INSERT *
-__search_insert(WT_INSERT_HEAD *inshead, uint64_t recno)
+static inline int
+__cursor_col_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
- WT_INSERT **ins;
- uint64_t ins_recno;
- int cmp, i;
+ WT_BUF *val;
+ WT_INSERT *ins;
+ uint32_t i;
- /* If there's no insert chain to search, we're done. */
- if (inshead == NULL)
- return (NULL);
+ val = &cbt->iface.value;
- /*
- * The insert list is a skip list: start at the highest skip level, then
- * go as far as possible at each level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) {
- if (*ins == NULL)
- cmp = -1;
- else {
- ins_recno = WT_INSERT_RECNO(*ins);
- cmp = (recno == ins_recno) ? 0 :
- (recno < ins_recno) ? -1 : 1;
+ if (newpage) {
+ cbt->ins_entry_cnt = 0;
+ WT_SKIP_FOREACH(ins, cbt->ins_head)
+ ++cbt->ins_entry_cnt;
+ goto new_page;
+ }
+
+ for (;;) {
+ if (--cbt->ins_entry_cnt == 0) {
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ return (WT_NOTFOUND);
}
- if (cmp == 0) /* Exact match: return */
- return (*ins);
- else if (cmp > 0) /* Keep going at this level */
- ins = &(*ins)->next[i];
- else { /* Drop down a level */
- --i;
- --ins;
+
+new_page: for (i = cbt->ins_entry_cnt,
+ ins = WT_SKIP_FIRST(cbt->ins_head); i > 1; --i)
+ ins = WT_SKIP_NEXT(ins);
+ cbt->ins = ins;
+
+ if (cbt->page->type == WT_PAGE_COL_FIX) {
+ val->data = WT_UPDATE_DATA(cbt->ins->upd);
+ val->size = 1;
+ break;
+ } else {
+ if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd))
+ continue;
+ val->data = WT_UPDATE_DATA(cbt->ins->upd);
+ val->size = cbt->ins->upd->size;
+ break;
}
}
-
- return (NULL);
+ return (0);
}
/*
@@ -74,8 +80,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->recno =
- cbt->page->u.col_leaf.recno + (cbt->page->entries - 1);
+ cbt->recno = __col_last_recno(cbt->page);
goto new_page;
}
@@ -92,7 +97,7 @@ new_page: *recnop = cbt->recno;
* to search the entire list. We use the skiplist structure,
* rather than doing it linearly.
*/
- if ((ins = __search_insert(
+ if ((ins = __col_insert_search(
WT_COL_INSERT_SINGLE(cbt->page), cbt->recno)) != NULL) {
val->data = WT_UPDATE_DATA(ins->upd);
val->size = 1;
@@ -135,7 +140,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->recno = __cursor_col_rle_last(cbt->page);
+ cbt->recno = __col_last_recno(cbt->page);
cbt->vslot = UINT32_MAX;
goto new_page;
}
@@ -148,8 +153,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
new_page: *recnop = cbt->recno;
/* Find the matching WT_COL slot. */
- if ((cip =
- __cursor_col_rle_search(cbt->page, cbt->recno)) == NULL)
+ if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL)
return (WT_NOTFOUND);
slot = WT_COL_SLOT(cbt->page, cip);
@@ -159,7 +163,7 @@ new_page: *recnop = cbt->recno;
* to search the entire list. We use the skiplist structure,
* rather than doing it linearly.
*/
- if ((ins = __search_insert(
+ if ((ins = __col_insert_search(
WT_COL_INSERT(cbt->page, cip), cbt->recno)) != NULL) {
if (WT_UPDATE_DELETED_ISSET(ins->upd))
continue;
@@ -228,8 +232,11 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
* New page configuration.
*/
if (newpage) {
- cbt->ins_head = WT_ROW_INSERT_SLOT(cbt->page,
- (cbt->page->entries > 0) ? cbt->page->entries - 1 : 0);
+ if (cbt->page->entries == 0)
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page);
+ else
+ cbt->ins_head = WT_ROW_INSERT_SLOT(
+ cbt->page, cbt->page->entries - 1);
cbt->ins_entry_cnt = 0;
WT_SKIP_FOREACH(ins, cbt->ins_head)
++cbt->ins_entry_cnt;
@@ -309,8 +316,8 @@ __wt_btcur_last(WT_CURSOR_BTREE *cbt)
WT_BSTAT_INCR(session, cursor_last);
- __cursor_func_clear(cbt, 1);
- F_CLR(cbt, WT_CBT_SEARCH_SET);
+ __cursor_func_init(cbt, 1);
+ F_SET(cbt, WT_CBT_ITERATE_PREV);
return (__wt_btcur_prev(cbt));
}
@@ -322,19 +329,20 @@ __wt_btcur_last(WT_CURSOR_BTREE *cbt)
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt)
{
- WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
int newpage, ret;
- cursor = &cbt->iface;
- session = (WT_SESSION_IMPL *)cursor->session;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
WT_BSTAT_INCR(session, cursor_read_prev);
- __cursor_func_clear(cbt, 0);
+ __cursor_func_init(cbt, 0);
- /* If iterating from a search position, there's some setup to do. */
- if (F_ISSET(cbt, WT_CBT_SEARCH_SET))
- WT_RET(__wt_btcur_search_setup(cbt));
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
+ __wt_btcur_iterate_setup(cbt, 0);
/*
* Walk any page we're holding until the underlying call returns not-
@@ -342,6 +350,14 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt)
* of the file.
*/
for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ if ((ret = __cursor_col_append_prev(cbt, newpage)) == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ newpage = 1;
+ }
if (cbt->page != NULL) {
switch (cbt->page->type) {
case WT_PAGE_COL_FIX:
@@ -365,8 +381,18 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt)
} while (
cbt->page->type == WT_PAGE_COL_INT ||
cbt->page->type == WT_PAGE_ROW_INT);
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (cbt->page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins = WT_SKIP_FIRST(
+ WT_COL_INSERT_APPEND(cbt->page))) != NULL)
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
}
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 951eaa5064b..13126d8a563 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -74,7 +74,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_BSTAT_INCR(session, cursor_read);
- __cursor_func_clear(cbt, 1);
+ __cursor_func_init(cbt, 1);
WT_ERR(btree->type == BTREE_ROW ?
__wt_row_search(session, cbt, 0) :
@@ -84,7 +84,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
else
ret = __wt_kv_return(session, cbt, 0);
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
@@ -117,7 +117,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
* If we find an exact match, or the search key is smaller than the tree
* key, and the tree key has not been deleted, return the tree key.
*/
- __cursor_func_clear(cbt, 1);
+ __cursor_func_init(cbt, 1);
WT_ERR(srch(session, cbt, 0));
if (cbt->compare == 0 || cbt->compare == 1)
if (!__cursor_invalid(cbt)) {
@@ -142,7 +142,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
* a subsequent record. If we don't find a previous record, there's no
* record to return, quit.
*/
- __cursor_func_clear(cbt, 1);
+ __cursor_func_init(cbt, 1);
WT_ERR(srch(session, cbt, 0));
if (!__cursor_invalid(cbt)) {
*exact = cbt->compare;
@@ -154,7 +154,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
ret = __wt_btcur_prev(cbt);
err:
-done: __cursor_func_set(cbt, ret);
+done: __cursor_func_resolve(cbt, ret);
return (ret);
}
@@ -176,7 +176,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cursor->session;
WT_BSTAT_INCR(session, cursor_inserts);
-retry: __cursor_func_clear(cbt, 1);
+retry: __cursor_func_init(cbt, 1);
switch (btree->type) {
case BTREE_COL_FIX:
@@ -190,15 +190,22 @@ retry: __cursor_func_clear(cbt, 1);
/* FALLTHROUGH */
case BTREE_COL_VAR:
/*
- * Insert in column stores allocates a new key (ignoring the
- * application's key), and creates a new record.
- *
- * XXX
- * This semantic not yet implemented.
+ * If WT_CURSTD_OVERWRITE is set, insert/update the
+ * application-specified record; otherwise insert a new record
+ * (ignoring the application's record number) and return the
+ * record number to the application.
*/
- WT_ERR(__wt_col_search(session, cbt, 1));
- if ((ret = __wt_col_modify(session, cbt, 0)) == WT_RESTART)
- goto retry;
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ WT_ERR(__wt_col_search(session, cbt, 1));
+ if ((ret = __wt_col_modify(session,
+ cbt, cbt->compare == 0 ? 3 : 1)) == WT_RESTART)
+ goto retry;
+ } else {
+ if ((ret =
+ __wt_col_modify(session, cbt, 1)) == WT_RESTART)
+ goto retry;
+ cbt->iface.recno = cbt->recno;
+ }
break;
case BTREE_ROW:
/*
@@ -206,22 +213,20 @@ retry: __cursor_func_clear(cbt, 1);
* configuration "overwrite" not set), otherwise creates
* a new record.
*/
- while ((ret = __wt_row_search(session, cbt, 1)) == WT_RESTART)
- ;
- if (ret == 0) {
- if (cbt->compare == 0 && !__cursor_invalid(cbt) &&
- !F_ISSET(cursor, WT_CURSTD_OVERWRITE))
- ret = WT_DUPLICATE_KEY;
- else
- if ((ret = __wt_row_modify(
- session, cbt, 0)) == WT_RESTART)
- goto retry;
+ WT_ERR(__wt_row_search(session, cbt, 1));
+ if (cbt->compare == 0 &&
+ !__cursor_invalid(cbt) &&
+ !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ ret = WT_DUPLICATE_KEY;
+ break;
}
+ if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
+ goto retry;
break;
WT_ILLEGAL_FORMAT(session);
}
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
@@ -243,18 +248,20 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cursor->session;
WT_BSTAT_INCR(session, cursor_removes);
-retry: __cursor_func_clear(cbt, 1);
+retry: __cursor_func_init(cbt, 1);
switch (btree->type) {
case BTREE_COL_FIX:
case BTREE_COL_VAR:
+ /* Remove the record if it exists. */
WT_ERR(__wt_col_search(session, cbt, 1));
if (cbt->compare != 0 || __cursor_invalid(cbt))
ret = WT_NOTFOUND;
- else if ((ret = __wt_col_modify(session, cbt, 1)) == WT_RESTART)
+ else if ((ret = __wt_col_modify(session, cbt, 2)) == WT_RESTART)
goto retry;
break;
case BTREE_ROW:
+ /* Remove the record if it exists. */
WT_ERR(__wt_row_search(session, cbt, 1));
if (cbt->compare != 0 || __cursor_invalid(cbt))
ret = WT_NOTFOUND;
@@ -264,7 +271,7 @@ retry: __cursor_func_clear(cbt, 1);
WT_ILLEGAL_FORMAT(session);
}
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
@@ -286,7 +293,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cursor->session;
WT_BSTAT_INCR(session, cursor_updates);
-retry: __cursor_func_clear(cbt, 1);
+retry: __cursor_func_init(cbt, 1);
switch (btree->type) {
case BTREE_COL_FIX:
@@ -299,16 +306,15 @@ retry: __cursor_func_clear(cbt, 1);
}
/* FALLTHROUGH */
case BTREE_COL_VAR:
- /* Update in column stores is an unconditional overwrite. */
+ /* Update the record if it exists, else fail with WT_NOTFOUND. */
WT_ERR(__wt_col_search(session, cbt, 1));
- if ((ret = __wt_col_modify(session, cbt, 0)) == WT_RESTART)
+ if (cbt->compare != 0 || __cursor_invalid(cbt))
+ ret = WT_NOTFOUND;
+ else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
goto retry;
break;
case BTREE_ROW:
- /*
- * Update in row stores fails if the key doesn't exist, else
- * overwrites the value.
- */
+ /* Update the record if it exists. */
WT_ERR(__wt_row_search(session, cbt, 1));
if (cbt->compare != 0 || __cursor_invalid(cbt))
ret = WT_NOTFOUND;
@@ -318,7 +324,7 @@ retry: __cursor_func_clear(cbt, 1);
WT_ILLEGAL_FORMAT(session);
}
-err: __cursor_func_set(cbt, ret);
+err: __cursor_func_resolve(cbt, ret);
return (ret);
}
@@ -335,7 +341,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, const char *config)
WT_UNUSED(config);
session = (WT_SESSION_IMPL *)cbt->iface.session;
- __cursor_func_clear(cbt, 1);
+ __cursor_func_init(cbt, 1);
__wt_buf_free(session, &cbt->value);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 96c79eb21e2..c7d72cd87c1 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -25,7 +25,7 @@ typedef struct {
static void __debug_byte_string(WT_DBG *, const uint8_t *, size_t);
static int __debug_cell(WT_DBG *, WT_CELL_UNPACK *);
static int __debug_cell_data(WT_DBG *, const char *, WT_CELL_UNPACK *);
-static void __debug_col_insert(WT_DBG *, WT_INSERT_HEAD *, int);
+static void __debug_col_list(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int __debug_dsk_cell(WT_DBG *, WT_PAGE_DISK *);
static void __debug_dsk_col_fix(WT_DBG *, WT_PAGE_DISK *);
@@ -39,7 +39,7 @@ static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
static int __debug_page_work(WT_DBG *, WT_PAGE *, uint32_t);
static void __debug_ref(WT_DBG *, WT_REF *);
-static void __debug_row_insert(WT_DBG *, WT_INSERT_HEAD *);
+static void __debug_row_list(WT_DBG *, WT_INSERT_HEAD *);
static void __debug_update(WT_DBG *, WT_UPDATE *, int);
static void __dmsg(WT_DBG *, const char *, ...)
WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
@@ -415,7 +415,6 @@ static void
__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
{
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_INSERT *ins;
WT_PAGE_DISK *dsk;
WT_SESSION_IMPL *session;
@@ -425,7 +424,6 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
session = ds->session;
btree = session->btree;
- conn = S2C(session);
dsk = page->dsk;
recno = page->u.col_leaf.recno;
@@ -447,8 +445,15 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
++recno;
}
}
- __dmsg(ds, "%s\n", conn->sep);
- __debug_col_insert(ds, WT_COL_INSERT_SINGLE(page), 1);
+
+ if (WT_COL_INSERT_SINGLE(page) != NULL) {
+ __dmsg(ds, "%s\n", S2C(session)->sep);
+ __debug_col_list(ds, WT_COL_INSERT_SINGLE(page), "insert", 1);
+ }
+ if (WT_COL_INSERT_APPEND(page) != NULL) {
+ __dmsg(ds, "%s\n", S2C(session)->sep);
+ __debug_col_list(ds, WT_COL_INSERT_APPEND(page), "append", 1);
+ }
}
/*
@@ -487,10 +492,12 @@ __debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT_HEAD *inshead;
+ WT_SESSION_IMPL *session;
uint64_t recno, rle;
uint32_t i;
char tag[64];
+ session = ds->session;
unpack = &_unpack;
recno = page->u.col_leaf.recno;
@@ -506,9 +513,15 @@ __debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
WT_RET(__debug_cell_data(ds, tag, unpack));
if ((inshead = WT_COL_INSERT(page, cip)) != NULL)
- __debug_col_insert(ds, inshead, 0);
+ __debug_col_list(ds, inshead, "update", 0);
recno += rle;
}
+
+ if (WT_COL_INSERT_APPEND(page) != NULL) {
+ __dmsg(ds, "%s\n", S2C(session)->sep);
+ __debug_col_list(ds, WT_COL_INSERT_APPEND(page), "append", 0);
+ }
+
return (0);
}
@@ -559,7 +572,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
* key on the page.
*/
if ((inshead = WT_ROW_INSERT_SMALLEST(page)) != NULL)
- __debug_row_insert(ds, inshead);
+ __debug_row_list(ds, inshead);
/* Dump the page's K/V pairs. */
WT_ROW_FOREACH(page, rip, i) {
@@ -581,34 +594,35 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
__debug_update(ds, upd, 0);
if ((inshead = WT_ROW_INSERT(page, rip)) != NULL)
- __debug_row_insert(ds, inshead);
+ __debug_row_list(ds, inshead);
}
return (0);
}
/*
- * __debug_col_insert --
- * Dump a column-store insert array.
+ * __debug_col_list --
+ * Dump a column-store skiplist.
*/
static void
-__debug_col_insert(WT_DBG *ds, WT_INSERT_HEAD *inshead, int hexbyte)
+__debug_col_list(
+ WT_DBG *ds, WT_INSERT_HEAD *inshead, const char *tag, int hexbyte)
{
WT_INSERT *ins;
WT_SKIP_FOREACH(ins, inshead) {
__dmsg(ds,
- "\tinsert %" PRIu64 "\n", WT_INSERT_RECNO(ins));
+ "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins));
__debug_update(ds, ins->upd, hexbyte);
}
}
/*
- * __debug_row_insert --
+ * __debug_row_list --
* Dump an insert array.
*/
static void
-__debug_row_insert(WT_DBG *ds, WT_INSERT_HEAD *inshead)
+__debug_row_list(WT_DBG *ds, WT_INSERT_HEAD *inshead)
{
WT_INSERT *ins;
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index b250b937761..8c6ea22abb4 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -208,9 +208,8 @@ __free_insert(
*/
for (insheadp = insert_head; entries > 0; --entries, ++insheadp)
if (*insheadp != NULL) {
- __free_insert_list(session,
- WT_SKIP_FIRST(*insheadp));
- __wt_sb_free(session, (*insheadp)->sb);
+ __free_insert_list(session, WT_SKIP_FIRST(*insheadp));
+ __wt_free(session, *insheadp);
}
/* Free the page's array of inserts. */
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 2038859bbaa..0576c52c50f 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -61,11 +61,10 @@ __evict_clr(WT_EVICT_LIST *e)
static inline void
__evict_req_set(WT_SESSION_IMPL *session, WT_EVICT_REQ *r, int close_method)
{
+ /* Should be empty */
+ WT_ASSERT(session, r->session == NULL);
+
r->close_method = close_method;
- WT_ASSERT(session, r->retry == NULL);
- WT_ASSERT(session, r->retry_next == 0);
- WT_ASSERT(session, r->retry_entries == 0);
- WT_ASSERT(session, r->retry_allocated == 0);
WT_MEMORY_FLUSH; /* Flush before turning entry on */
r->session = session;
@@ -74,16 +73,15 @@ __evict_req_set(WT_SESSION_IMPL *session, WT_EVICT_REQ *r, int close_method)
/*
* __evict_req_clr --
- * Set an entry in the eviction request list.
+ * Clear an entry in the eviction request list.
*/
static inline void
__evict_req_clr(WT_SESSION_IMPL *session, WT_EVICT_REQ *r)
{
__wt_free(session, r->retry);
- r->retry_next = r->retry_entries = 0;
- r->retry_allocated = 0;
- r->session = NULL;
+ memset(r, 0, sizeof(WT_EVICT_REQ));
+
WT_MEMORY_FLUSH; /* Turn entry off */
}
@@ -413,7 +411,8 @@ __evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er)
*/
if (!er->close_method && !WT_PAGE_IS_MODIFIED(page))
continue;
- if (__wt_page_reconcile(session, page, flags) == 0)
+ if (!F_ISSET(page, WT_PAGE_PINNED) &&
+ __wt_page_reconcile(session, page, flags) == 0)
continue;
/*
@@ -449,9 +448,10 @@ err: /* End the walk cleanly. */
static int
__evict_request_retry(WT_SESSION_IMPL *session)
{
- WT_SESSION_IMPL *request_session;
WT_CACHE *cache;
WT_EVICT_REQ *er, *er_end;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *request_session;
uint32_t i, flags;
int pending_retry;
@@ -490,10 +490,10 @@ __evict_request_retry(WT_SESSION_IMPL *session)
/* Walk the list of retry requests. */
for (pending_retry = 0, i = 0; i < er->retry_entries; ++i) {
- if (er->retry[i] == NULL)
+ if ((page = er->retry[i]) == NULL)
continue;
- if (__wt_page_reconcile(
- session, er->retry[i], flags) == 0)
+ if (!F_ISSET(page, WT_PAGE_PINNED) &&
+ __wt_page_reconcile(session, page, flags) == 0)
er->retry[i] = NULL;
else
pending_retry = 1;
@@ -503,10 +503,11 @@ __evict_request_retry(WT_SESSION_IMPL *session)
* If we finished, clean up and resolve the request, otherwise
* there's still work to do.
*/
- if (pending_retry)
+ if (pending_retry && ++er->retry_cnt < 5)
cache->pending_retry = 1;
else {
- __wt_session_serialize_wrapup(request_session, NULL, 0);
+ __wt_session_serialize_wrapup(
+ request_session, NULL, pending_retry ? EBUSY : 0);
__evict_req_clr(session, er);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index ce25cea43cf..3d77ec1476a 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -8,6 +8,7 @@
#include "wt_internal.h"
static int __btree_conf(WT_SESSION_IMPL *, const char *, const char *);
+static int __btree_last(WT_SESSION_IMPL *);
static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_type(WT_SESSION_IMPL *);
@@ -125,9 +126,14 @@ __wt_btree_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_page_in(session, NULL,
&btree->root_page, LF_ISSET(WT_BTREE_VERIFY) ? 1 : 0));
F_SET(btree->root_page.page, WT_PAGE_PINNED);
+
+ WT_MEMORY_FLUSH; /* flush pin before release */
__wt_hazard_clear(session, btree->root_page.page);
}
+ /* Get the last page of the file. */
+ WT_ERR(__btree_last(session));
+
done: /* Add to the connection list. */
__wt_lock(session, conn->mtx);
btree->refcnt = 1;
@@ -246,6 +252,40 @@ __wt_btree_root_init(WT_SESSION_IMPL *session)
}
/*
+ * __btree_last --
+ * Read and pin the last page of the file.
+ *
+ * Records the page in btree->last_page and, for any page that is not a
+ * row-store leaf, records the result of __col_last_recno in
+ * btree->last_recno. Returns WT_NOTFOUND if the walk yields no page,
+ * any error from __wt_tree_np, otherwise 0.
+ */
+static int
+__btree_last(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+
+ btree = session->btree;
+
+ /*
+ * Start the walk from a NULL page.
+ * NOTE(review): presumably a NULL start with a final argument of 0
+ * makes __wt_tree_np return the last leaf page -- confirm against
+ * __wt_tree_np.
+ */
+ page = NULL;
+ WT_RET(__wt_tree_np(session, &page, 0));
+ if (page == NULL)
+ return (WT_NOTFOUND);
+
+ /* Remember the page; last_recno is only set for non-row-leaf pages. */
+ btree->last_page = page;
+ if (page->type != WT_PAGE_ROW_LEAF)
+ btree->last_recno = __col_last_recno(page);
+
+ /*
+ * If the page is already pinned (that is, the last page is the root
+ * page), we're done; otherwise, pin the last page into memory.
+ *
+ * NOTE(review): in the already-pinned case no hazard reference is
+ * cleared here; presumably none was taken for a pinned page -- confirm.
+ */
+ if (!F_ISSET(page, WT_PAGE_PINNED)) {
+ F_SET(page, WT_PAGE_PINNED);
+
+ WT_MEMORY_FLUSH; /* flush pin before release */
+ __wt_hazard_clear(session, page);
+ }
+ return (0);
+}
+
+/*
* __wt_btree_close --
* Close a Btree.
*/
@@ -271,6 +311,12 @@ __wt_btree_close(WT_SESSION_IMPL *session)
if (inuse)
return (0);
+ /* Unpin any pages we have locked down. */
+ if (btree->last_page != NULL)
+ F_CLR(btree->last_page, WT_PAGE_PINNED);
+ if (btree->root_page.page != NULL)
+ F_CLR(btree->root_page.page, WT_PAGE_PINNED);
+
/*
* If it's a normal tree, ask the eviction thread to flush any pages
* that remain in the cache. If there is still a root page in memory,
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index ea86db9aeac..ed3d1caf8ee 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -49,7 +49,8 @@ __wt_page_in_func(
* can't get a hazard reference is because the page is
* being evicted; yield and try again.
*/
- if (__wt_hazard_set(session, ref
+ if (F_ISSET(ref->page, WT_PAGE_PINNED) ||
+ __wt_hazard_set(session, ref
#ifdef HAVE_DIAGNOSTIC
, file, line
#endif
diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c
index 8f008976a21..06b7b57007b 100644
--- a/src/btree/bt_reconcile.c
+++ b/src/btree/bt_reconcile.c
@@ -588,9 +588,9 @@ __rec_init(WT_SESSION_IMPL *session, uint32_t flags)
r->key_pfx_compress_conf = (cval.val != 0);
}
- r->evict = LF_ISSET(WT_REC_EVICT);
- r->locked = LF_ISSET(WT_REC_LOCKED);
- r->salvage = LF_ISSET(WT_REC_SALVAGE);
+ r->evict = LF_ISSET(WT_REC_EVICT) ? 1 : 0;
+ r->locked = LF_ISSET(WT_REC_LOCKED) ? 1 : 0;
+ r->salvage = LF_ISSET(WT_REC_SALVAGE) ? 1 : 0;
/*
* During internal page reconciliation we track referenced objects that
@@ -1636,10 +1636,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_SKIP_FOREACH(ins, WT_COL_INSERT_APPEND(page))
for (;;) {
/*
- * The application may have inserted records, leaving
- * gaps in the name space, fill in any gaps.
+ * The application may have inserted records which left
+ * gaps in the name space.
*/
- for (recno = WT_INSERT_RECNO(ins) - 1;
+ for (recno = WT_INSERT_RECNO(ins);
nrecs > 0 && r->recno < recno;
--nrecs, ++entry, ++r->recno)
__bit_setv(
@@ -1648,10 +1648,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
if (nrecs > 0) {
__bit_setv(r->first_free, entry, btree->bitcnt,
((uint8_t *)WT_UPDATE_DATA(ins->upd))[0]);
+ --nrecs;
++entry;
++r->recno;
- if (--nrecs > 0 || WT_SKIP_NEXT(ins) == NULL)
- break;
+ break;
}
/*
@@ -1670,7 +1670,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/* Update the counters. */
- r->recno += entry;
__rec_incrv(session, r, entry, __bitstr_size(entry * btree->bitcnt));
/* Write the remnant page. */
@@ -1724,7 +1723,7 @@ __rec_col_fix_slvg(
--nrecs, --page_take, ++page_start, ++entry)
__bit_setv(r->first_free, entry, btree->bitcnt,
__bit_getv(page->u.col_leaf.bitf,
- (uint32_t)page_start, btree->bitcnt));
+ (uint32_t)page_start, btree->bitcnt));
r->recno += entry;
__rec_incrv(
@@ -2043,9 +2042,9 @@ __rec_col_var(
}
/* Swap the current/last state. */
- last_deleted = deleted;
- if (!last_deleted)
+ if (!deleted)
WT_RET(__wt_buf_set(session, last, data, size));
+ last_deleted = deleted;
/* Reset RLE counter and turn on comparisons. */
rle = repeat_count;
@@ -2053,6 +2052,51 @@ __rec_col_var(
}
}
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_INSERT_APPEND(page))
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ if (src_recno < n)
+ deleted = 1;
+ else {
+ upd = ins->upd;
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (can_compare) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ continue;
+ }
+
+ WT_RET(__rec_col_var_helper(session,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /* Swap the current/last state. */
+ if (!deleted)
+ WT_RET(__wt_buf_set(session, last, data, size));
+ last_deleted = deleted;
+
+ /* Reset RLE counter and turn on comparisons. */
+ rle = 1;
+ can_compare = 1;
+ }
+
/* If we were tracking a record, write it. */
if (can_compare)
WT_RET(__rec_col_var_helper(
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 2459332e643..5e35ca4359b 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -311,7 +311,7 @@ recno_chk: if (parent_recno != recno) {
ref = &cref->ref;
WT_RET(__wt_page_in(session, page, ref, 1));
ret = __verify_tree(session, ref, cref->recno, vs);
- __wt_hazard_clear(session, ref->page);
+ __wt_page_release(session, ref->page);
WT_RET_TEST(ret, ret);
}
break;
@@ -336,7 +336,7 @@ recno_chk: if (parent_recno != recno) {
ref = &rref->ref;
WT_RET(__wt_page_in(session, page, ref, 1));
ret = __verify_tree(session, ref, (uint64_t)0, vs);
- __wt_hazard_clear(session, ref->page);
+ __wt_page_release(session, ref->page);
WT_RET_TEST(ret, ret);
}
break;
diff --git a/src/btree/col_extend.c b/src/btree/col_extend.c
deleted file mode 100644
index 1cdb8f80890..00000000000
--- a/src/btree/col_extend.c
+++ /dev/null
@@ -1,369 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 2008-2011 WiredTiger, Inc.
- * All rights reserved.
- */
-
-#include "wt_internal.h"
-
-static int __col_next_recno(WT_SESSION_IMPL *, WT_PAGE *, uint64_t *);
-
-/*
- * __wt_col_extend --
- * Extend a column-store file.
- */
-int
-__wt_col_extend(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno)
-{
- WT_BTREE *btree;
- WT_COL *d;
- WT_COL_REF *t;
- WT_CONFIG_ITEM cval;
- WT_PAGE *new_intl, *new_leaf, *parent;
- uint64_t next;
- size_t entries_size, new_intl_size, new_leaf_size, t_size;
- uint32_t internal_extend, leaf_extend;
- uint8_t *bitf;
- int ret;
- void *entries;
-
- btree = session->btree;
- d = NULL;
- t = NULL;
- new_intl = new_leaf = NULL;
- entries_size = new_intl_size = new_leaf_size = t_size = 0;
- internal_extend = leaf_extend = 0;
- bitf = NULL;
- ret = 0;
-
- /*
- * Another thread may have already done the work, or a default extension
- * may not be sufficient. Get the starting record for the next page and
- * make sure we're doing what we need to do.
- */
- WT_RET(__col_next_recno(session, page, &next));
- if (recno < next) /* Fits on the current page. */
- return (WT_RESTART);
-
- /*
- * Figure out how much we'll extend the leaf key space.
- *
- * If it's a fixed-length store, we can't allocate more than maximum
- * leaf page size number of bits, because we can't ever split those
- * pages.
- *
- * If it's a variable-length store, we can split those pages so we
- * can allocate whatever we need.
- *
- * XXX
- * If the application is extending the file by more than will reasonably
- * fit on a page, insert an RLE record that gets us all the way to the
- * insert record.
- *
- * We always need a new bitfield or entries array, allocate them.
- */
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- leaf_extend = WT_FIX_NRECS(btree);
-
- WT_RET(__wt_calloc_def(session, (size_t)leaf_extend, &bitf));
- entries = bitf;
- entries_size = leaf_extend;
- break;
- case WT_PAGE_COL_VAR:
- WT_RET(__wt_config_getones(session,
- session->btree->config, "column_leaf_extend", &cval));
- leaf_extend = (uint32_t)cval.val;
- if (recno >= next + leaf_extend)
- leaf_extend = (uint32_t)(recno - next) + 100;
-
- WT_RET(__wt_calloc_def(session, (size_t)leaf_extend, &d));
- entries = d;
- entries_size = leaf_extend * sizeof(WT_COL);
- break;
- }
-
- /*
- * Check if the page is a newly created page: all we'll need is a new
- * entries array.
- */
- if (page->entries == 0)
- goto done;
-
- /* We'll need a new leaf page. */
- WT_ERR(__wt_calloc_def(session, 1, &new_leaf));
- new_leaf_size = sizeof(WT_PAGE);
-
- /* Check if there's a parent page with room for a new leaf page. */
- parent = page->parent;
- if (!WT_PAGE_IS_ROOT(page) &&
- parent->u.col_int.ext_entries > parent->entries)
- goto done;
-
- /* We'll need a new parent page, with its own entries array. */
- WT_ERR(__wt_calloc_def(session, 1, &new_intl));
- new_intl_size = sizeof(WT_PAGE);
- WT_RET(__wt_config_getones(session,
- session->btree->config, "column_internal_extend", &cval));
- internal_extend = (uint32_t)cval.val;
- WT_ERR(__wt_calloc_def(session, (size_t)internal_extend, &t));
- t_size = internal_extend * sizeof(WT_COL_REF);
-
-done: return (__wt_col_extend_serial(session, page, &new_intl, new_intl_size,
- &t, t_size, internal_extend, &new_leaf, new_leaf_size, &entries,
- entries_size, leaf_extend, recno));
-
-err:
- if (d != NULL)
- __wt_free(session, d);
- if (t != NULL)
- __wt_free(session, t);
- if (new_intl != NULL)
- __wt_free(session, new_intl);
- if (new_leaf != NULL)
- __wt_free(session, new_leaf);
- if (bitf != NULL)
- __wt_free(session, bitf);
- return (ret);
-}
-
-/*
- * __wt_col_extend_serial_func --
- * Server function to extend a column-store page.
- */
-int
-__wt_col_extend_serial_func(WT_SESSION_IMPL *session)
-{
- WT_COL_REF *cref, *t;
- WT_PAGE *new_leaf, *new_intl, *page, *parent;
- WT_REF *orig_ref;
- uint64_t next, recno;
- uint32_t internal_extend, leaf_extend;
- int ret;
- void *entries;
-
- __wt_col_extend_unpack(session, &page, &new_intl,
- &t, &internal_extend, &new_leaf, &entries, &leaf_extend, &recno);
-
- ret = 0;
-
- /*
- * We don't care about write generations in this code: in the hard cases
- * we're working in the tree above the page in which we ran out of room,
- * not the search page, and the search page's write generation doesn't
- * matter. In other words, we depend on our review of the situation on
- * ground.
- *
- * This is safe because the reconciliation code can't touch the subtree
- * we're in: we have a hazard reference on the lowest page, that fixes
- * the tree into memory.
- *
- * We need a new entries array or bitfield, make sure our caller passed
- * us one.
- */
- if (entries == NULL)
- goto done;
-
- /*
- * Check if the current page needs an entries array.
- *
- * Setting the page's entries value turns on the change.
- */
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- if (page->u.col_leaf.bitf == NULL) {
- page->u.col_leaf.bitf = entries;
- goto entries;
- }
- break;
- case WT_PAGE_COL_VAR:
- if (page->u.col_leaf.d == NULL) {
- page->u.col_leaf.d = entries;
-
-entries: __wt_col_extend_entries_taken(session, page);
- WT_MEMORY_FLUSH;
- page->entries = leaf_extend;
- goto done;
- }
- break;
- }
-
- /* We need a new leaf page, make sure our caller passed us one. */
- if (new_leaf == NULL)
- goto done;
-
- /*
- * Get the starting record for the next page, but check, another thread
- * may have already done the work.
- */
- WT_RET(__col_next_recno(session, page, &next));
- if (next > recno)
- goto done;
-
- /*
- * Check if the page's parent has room for a new leaf page.
- *
- * Setting the parent page's entries value turns on the change.
- */
- parent = page->parent;
- if (!WT_PAGE_IS_ROOT(page) &&
- parent->u.col_int.ext_entries > parent->entries) {
- cref = &parent->u.col_int.t[parent->entries];
- cref->recno = next;
- WT_COL_REF_ADDR(cref) = WT_ADDR_INVALID;
- WT_COL_REF_PAGE(cref) = new_leaf;
- WT_COL_REF_SIZE(cref) = 0;
- WT_COL_REF_STATE(cref) = WT_REF_MEM;
- WT_PAGE_SET_MODIFIED(parent);
-
- new_leaf->parent = page->parent;
- new_leaf->parent_ref = &cref->ref;
- new_leaf->read_gen = __wt_cache_read_gen(session);
- new_leaf->u.col_leaf.recno = next;
- new_leaf->u.col_leaf.d = entries;
- new_leaf->u.col_leaf.bitf = entries;
- new_leaf->dsk = NULL;
- new_leaf->entries = leaf_extend;
- new_leaf->type = page->type;
- WT_PAGE_SET_MODIFIED(new_leaf);
- __wt_cache_page_workq(session);
-
- WT_MEMORY_FLUSH;
- ++parent->entries;
-
- __wt_col_extend_new_leaf_taken(session, new_leaf);
- __wt_col_extend_entries_taken(session, new_leaf);
-
- goto done;
- }
-
- /* We need a new internal page, make sure our caller passed us one. */
- if (new_intl == NULL)
- goto done;
-
- /*
- * Split by replacing the existing leaf page with an internal page that
- * references the leaf page (which deepens the tree by a level). This
- * is a little like splits in the reconciliation code, but it's all done
- * while other threads of control are going through the structures.
- *
- * Get a reference to the top WT_REF structure, and mark the top-level
- * page dirty, we're going to have to reconcile it so our newly created
- * level is merged back in.
- */
- orig_ref = page->parent_ref;
- if (!WT_PAGE_IS_ROOT(page))
- WT_PAGE_SET_MODIFIED(page->parent);
-
- /*
- * Configure the new internal page.
- */
- new_intl->parent = page->parent;
- new_intl->parent_ref = page->parent_ref;
- new_intl->read_gen = __wt_cache_read_gen(session);
- new_intl->u.col_int.recno = page->u.col_leaf.recno;
- new_intl->u.col_int.ext_entries = internal_extend;
- new_intl->u.col_int.t = t;
- new_intl->dsk = NULL;
- new_intl->entries = 2;
- new_intl->type = WT_PAGE_COL_INT;
- WT_PAGE_SET_MODIFIED(new_intl);
- __wt_cache_page_workq(session);
-
- /*
- * If the new internal page isn't the root page, then we should merge
- * it into its parent, we don't want the tree to deepen permanently.
- */
- if (!WT_PAGE_IS_ROOT(page))
- F_SET(new_intl, WT_PAGE_MERGE);
-
- /* Slot 0 of the new internal page references the original leaf page. */
- cref = &new_intl->u.col_int.t[0];
- cref->recno = page->u.col_leaf.recno;
- cref->ref = *page->parent_ref;
-
- /* Re-point the original page. */
- page->parent = new_intl;
- page->parent_ref = &new_intl->u.col_int.t[0].ref;
-
- /* Slot 1 of the new internal page references the new leaf page. */
- cref = &new_intl->u.col_int.t[1];
- cref->recno = next;
- WT_COL_REF_ADDR(cref) = WT_ADDR_INVALID;
- WT_COL_REF_PAGE(cref) = new_leaf;
- WT_COL_REF_SIZE(cref) = 0;
- WT_COL_REF_STATE(cref) = WT_REF_MEM;
-
- /* Configure the new leaf page. */
- new_leaf->parent = new_intl;
- new_leaf->parent_ref = &new_intl->u.col_int.t[1].ref;
- new_leaf->read_gen = __wt_cache_read_gen(session);
- new_leaf->u.col_leaf.recno = next;
- new_leaf->u.col_leaf.d = entries;
- new_leaf->u.col_leaf.bitf = entries;
- new_leaf->dsk = NULL;
- new_leaf->entries = leaf_extend;
- new_leaf->type = page->type;
- WT_PAGE_SET_MODIFIED(new_leaf);
- __wt_cache_page_workq(session);
-
- __wt_col_extend_new_intl_taken(session, new_intl);
- __wt_col_extend_t_taken(session, new_intl);
- __wt_col_extend_new_leaf_taken(session, new_leaf);
- __wt_col_extend_entries_taken(session, new_leaf);
-
- /*
- * Make the switch: set the addr/size pair then update the pointer (we
- * are not changing the state, nor are we changing the record number).
- * This is safe as we're updating one set of structures for another set
- * of structures which reference identical information. Eviction can't
- * get in here because we hold a hazard reference on the original page.
- * Setting the original page parent's in-memory pointer to reference our
- * new internal page turns on the change.
- */
- orig_ref->addr = WT_ADDR_INVALID;
- orig_ref->size = 0;
- WT_MEMORY_FLUSH;
- orig_ref->page = new_intl;
- WT_MEMORY_FLUSH;
-
-done: __wt_session_serialize_wrapup(session, page, ret);
- return (ret);
-}
-
-/*
- * __col_next_recno --
- * Return the recno of the next page following the argument page.
- */
-static int
-__col_next_recno(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t *recnop)
-{
- WT_CELL *cell;
- WT_CELL_UNPACK *unpack, _unpack;
- WT_COL *cip;
- uint32_t i;
- uint64_t recno;
-
- recno = page->u.col_leaf.recno;
- unpack = &_unpack;
-
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- recno += page->entries;
- break;
- case WT_PAGE_COL_VAR:
- WT_COL_FOREACH(page, cip, i)
- if ((cell = WT_COL_PTR(page, cip)) == NULL)
- ++recno;
- else {
- __wt_cell_unpack(cell, unpack);
- recno += unpack->rle;
- }
- break;
- WT_ILLEGAL_FORMAT(session);
- }
-
- *recnop = recno;
- return (0);
-}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index c91bda56308..1ad680013f0 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -7,63 +7,51 @@
#include "wt_internal.h"
-static int __col_insert_alloc(
- WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);
+static int __col_insert_alloc(WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **);
/*
* __wt_col_modify --
- * Column-store delete insert, and update.
+ * Column-store delete, insert, and update.
*/
int
-__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
{
WT_BTREE *btree;
WT_INSERT *ins;
WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
WT_ITEM *value, _value;
WT_PAGE *page;
- WT_SESSION_BUFFER *sb;
WT_UPDATE *upd;
- size_t ins_size, new_inslist_size, new_inshead_size, upd_size;
+ size_t new_inshead_size, new_inslist_size;
uint64_t recno;
u_int skipdepth;
- int hazard_ref, i, ret;
+ int i, ret;
btree = cbt->btree;
page = cbt->page;
- recno = cbt->iface.recno;
- if (is_remove) {
+ switch (op) {
+ case 1: /* Insert */
+ page = btree->last_page;
+ __cursor_search_clear(cbt);
+
+ value = (WT_ITEM *)&cbt->iface.value;
+ recno = 0; /* Engine allocates */
+ break;
+ case 2: /* Remove */
if (btree->type == BTREE_COL_FIX) {
value = &_value;
value->data = "";
value->size = 1;
} else
value = NULL;
- } else
+ recno = cbt->iface.recno; /* App specified */
+ break;
+ case 3: /* Update */
+ default:
value = (WT_ITEM *)&cbt->iface.value;
-
- /*
- * Append a column-store entry (the only place you can insert into a
- * column-store file is after the key space, column-store records are
- * immutable). If we don't have an exact match, it's an append and we
- * need to extend the file.
- */
- if (cbt->compare != 0) {
- /*
- * We may have, and need to hold, a hazard reference on a page,
- * but we're possibly doing some page shuffling of the root,
- * which means the standard test to determine whether we should
- * release a hazard reference on the page isn't right. Check
- * now, before we do the page shuffling.
- */
- hazard_ref = page == session->btree->root_page.page ? 0 : 1;
- ret = __wt_col_extend(session, page, recno);
- if (hazard_ref) {
- __wt_page_release(session, page);
- cbt->page = NULL; /* XXX WRONG */
- }
- return (ret == 0 ? WT_RESTART : 0);
+ recno = cbt->iface.recno; /* App specified */
+ break;
}
ins = NULL;
@@ -73,63 +61,45 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
ret = 0;
/*
- * Delete or update a column-store entry.
- * Column-store changes mean working in a WT_INSERT list.
+ * Delete, insert or update a column-store entry.
*/
- if (cbt->ins != NULL) {
- /*
- * If changing an already changed record, create a new WT_UPDATE
- * entry and have the workQ link it into an existing WT_INSERT
- * entry's WT_UPDATE list.
- */
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
-
- /* workQ: insert the WT_UPDATE structure. */
- ret = __wt_update_serial(session, page,
- cbt->write_gen, &cbt->ins->upd, NULL, 0, &upd, upd_size);
- } else {
- /*
- * We may not have an WT_INSERT_HEAD array (in the case of
- * variable length column store) or WT_INSERT_HEAD slot (in the
- * case of fixed length column store). Also, there may be an
- * insert array but no list at the point we are inserting.
- * Allocate as necessary.
- */
+ if (cbt->ins == NULL) {
+ /* There may be no WT_INSERT_HEAD, allocate as necessary. */
new_inshead_size = new_inslist_size = 0;
- if (page->u.col_leaf.ins == NULL)
- switch (page->type) {
- case WT_PAGE_COL_FIX:
+ if (op == 1) {
+ if (page == NULL || page->u.col_leaf.append == NULL) {
new_inslist_size = 1 *
sizeof(WT_INSERT_HEAD *);
- WT_ERR(__wt_calloc_def(session,
- 1, &new_inslist));
+ WT_ERR(
+ __wt_calloc_def(session, 1, &new_inslist));
inshead = &new_inslist[0];
- break;
- case WT_PAGE_COL_VAR:
+ } else
+ inshead = &page->u.col_leaf.append[0];
+ cbt->ins_head = *inshead;
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ if (page->u.col_leaf.ins == NULL) {
+ new_inslist_size = 1 *
+ sizeof(WT_INSERT_HEAD *);
+ WT_ERR(
+ __wt_calloc_def(session, 1, &new_inslist));
+ inshead = &new_inslist[0];
+ } else
+ inshead = &page->u.col_leaf.ins[0];
+ } else {
+ if (page->u.col_leaf.ins == NULL) {
new_inslist_size = page->entries *
sizeof(WT_INSERT_HEAD *);
WT_ERR(__wt_calloc_def(
session, page->entries, &new_inslist));
inshead = &new_inslist[cbt->slot];
- break;
- WT_ILLEGAL_FORMAT(session);
- }
- else
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- inshead = &page->u.col_leaf.ins[0];
- break;
- case WT_PAGE_COL_VAR:
+ } else
inshead = &page->u.col_leaf.ins[cbt->slot];
- break;
- WT_ILLEGAL_FORMAT(session);
- }
+ }
+ /* There may be no WT_INSERT list, allocate as necessary. */
if (*inshead == NULL) {
new_inshead_size = sizeof(WT_INSERT_HEAD);
- WT_RET(__wt_sb_alloc(session,
- sizeof(WT_INSERT_HEAD), &new_inshead, &sb));
- new_inshead->sb = sb;
+ WT_RET(__wt_calloc_def(session, 1, &new_inshead));
for (i = 0; i < WT_SKIP_MAXDEPTH; i++)
cbt->ins_stack[i] = &new_inshead->head[i];
cbt->ins_head = new_inshead;
@@ -139,30 +109,43 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
skipdepth = __wt_skip_choose_depth();
/*
- * Allocate a new WT_INSERT/WT_UPDATE pair, link it into the
- * WT_INSERT array.
+ * Allocate a WT_INSERT/WT_UPDATE pair, and update the cursor
+ * to reference it.
*/
- WT_ERR(__col_insert_alloc(
- session, recno, skipdepth, &ins, &ins_size));
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__col_insert_alloc(session, recno, skipdepth, &ins));
+ WT_ERR(__wt_update_alloc(session, value, &upd));
ins->upd = upd;
- ins_size += upd_size;
cbt->ins = ins;
/*
- * workQ: insert the WT_INSERT structure.
- *
- * For fixed-width stores, we are installing a single insert
- * head for the page. Pass NULL to the insert serialization
- * function, there is no need to set it again, and we only want
- * to account for it once.
+ * workQ: insert or append the WT_INSERT structure.
*/
- ret = __wt_insert_serial(session,
- page, cbt->write_gen,
- inshead, cbt->ins_stack,
- &new_inslist, new_inslist_size,
- &new_inshead, new_inshead_size,
- &ins, ins_size, skipdepth);
+ if (op == 1) {
+ WT_ERR(__wt_append_serial(session,
+ inshead, cbt->ins_stack,
+ &new_inslist, new_inslist_size,
+ &new_inshead, new_inshead_size, ins, skipdepth));
+
+ /* Set up the cursor for the inserted page and value. */
+ cbt->page = btree->last_page;
+ cbt->recno = WT_INSERT_RECNO(ins);
+ } else
+ WT_ERR(__wt_insert_serial(session,
+ page, cbt->write_gen,
+ inshead, cbt->ins_stack,
+ &new_inslist, new_inslist_size,
+ &new_inshead, new_inshead_size, ins, skipdepth));
+ } else {
+ /*
+ * If changing an already changed record, create a new WT_UPDATE
+ * entry and have the workQ link it into an existing WT_INSERT
+ * entry's WT_UPDATE list.
+ */
+ WT_ERR(__wt_update_alloc(session, value, &upd));
+
+ /* workQ: insert the WT_UPDATE structure. */
+ ret = __wt_update_serial(session, page,
+ cbt->write_gen, &cbt->ins->upd, NULL, 0, upd);
}
if (ret != 0) {
@@ -173,6 +156,7 @@ err: if (ins != NULL)
}
__wt_free(session, new_inslist);
+ __wt_free(session, new_inshead);
return (ret);
}
@@ -183,8 +167,8 @@ err: if (ins != NULL)
* buffer and fill it in.
*/
static int
-__col_insert_alloc(WT_SESSION_IMPL *session,
- uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+__col_insert_alloc(
+ WT_SESSION_IMPL *session, uint64_t recno, u_int skipdepth, WT_INSERT **insp)
{
WT_SESSION_BUFFER *sb;
WT_INSERT *ins;
@@ -194,14 +178,78 @@ __col_insert_alloc(WT_SESSION_IMPL *session,
* Allocate the WT_INSERT structure and skiplist pointers, then copy
* the record number into place.
*/
- ins_size = sizeof(WT_INSERT) +
- skipdepth * sizeof(WT_INSERT *);
+ ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
WT_RET(__wt_sb_alloc(session, ins_size, &ins, &sb));
ins->sb = sb;
WT_INSERT_RECNO(ins) = recno;
*insp = ins;
- *ins_sizep = ins_size;
+ return (0);
+}
+
+/*
+ * __wt_append_serial_func --
+ * Server function to append a WT_INSERT entry to the tree.
+ *
+ * Operates on btree->last_page: installs the caller-supplied insert
+ * array and insert head if they are missing, assigns a record number
+ * when the new entry's record number is 0, then links the entry into
+ * the skiplist. Always returns 0.
+ */
+int
+__wt_append_serial_func(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead;
+ WT_INSERT *new_ins, ***ins_stack;
+ uint64_t recno;
+ u_int i, skipdepth;
+
+ btree = session->btree;
+ page = btree->last_page;
+
+ /* Retrieve the arguments the caller passed to the serial call. */
+ __wt_append_unpack(session, &inshead, &ins_stack,
+ &new_inslist, &new_inshead, &new_ins, &skipdepth);
+
+ /*
+ * If the page does not yet have an insert array, our caller passed
+ * us one.
+ *
+ * NOTE(review): inshead was computed by the caller before this
+ * function ran. If the caller pointed it into new_inslist and the
+ * page has since acquired an append array, new_inslist is not taken
+ * here while inshead may still reference it -- confirm this cannot
+ * happen or is handled by the caller.
+ */
+ if (page->u.col_leaf.append == NULL) {
+ page->u.col_leaf.append = new_inslist;
+ __wt_append_new_inslist_taken(session, page);
+ }
+
+ /*
+ * If the insert head does not yet have an insert list, our caller
+ * passed us one.
+ */
+ if (*inshead == NULL) {
+ *inshead = new_inshead;
+ __wt_append_new_inshead_taken(session, page);
+ }
+
+ /*
+ * If the application specified a record number, there's a race: the
+ * application may have searched for the record, not found it, then
+ * called into the append code, and another thread might have added
+ * the record. Fortunately, we're in the right place because if the
+ * record didn't exist at some point, it can only have been created
+ * on this list. Search for the record, if specified.
+ *
+ * A record number of 0 means none was specified: take the next one
+ * from btree->last_recno and store it in the new entry.
+ *
+ * NOTE(review): the search runs in both cases and its return value is
+ * discarded, so a record that already exists on the list does not
+ * appear to be detected here; presumably the call is only used to
+ * fill in ins_stack -- confirm.
+ */
+ if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
+ recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;
+ (void)__col_insert_search_stack(*inshead, ins_stack, recno);
+
+ /*
+ * First, point the new WT_INSERT item's skiplist references to the next
+ * elements in the insert list, then flush memory. Second, update the
+ * skiplist elements that reference the new WT_INSERT item, this ensures
+ * the list is never inconsistent.
+ */
+ for (i = 0; i < skipdepth; i++)
+ new_ins->next[i] = *ins_stack[i];
+ WT_MEMORY_FLUSH;
+ for (i = 0; i < skipdepth; i++)
+ *ins_stack[i] = new_ins;
+
+ /* Resolve the serialized request with a zero (success) return value. */
+ __wt_session_serialize_wrapup(session, page, 0);
return (0);
}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index abcb054b181..24a22cb27d4 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -8,44 +8,6 @@
#include "wt_internal.h"
/*
- * __search_insert --
- * Search the slot's insert list.
- */
-static inline WT_INSERT *
-__search_insert(WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *inshead, uint64_t recno)
-{
- WT_INSERT **ins;
- uint64_t ins_recno;
- int cmp, i;
-
- /* If there's no insert chain to search, we're done. */
- if (inshead == NULL)
- return (NULL);
-
- /*
- * The insert list is a skip list: start at the highest skip level, then
- * go as far as possible at each level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) {
- if (*ins == NULL) {
- cbt->ins_stack[i--] = ins--;
- continue;
- }
-
- ins_recno = WT_INSERT_RECNO(*ins);
- cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
-
- if (cmp == 0) /* Exact match: return */
- return (*ins);
- else if (cmp > 0) /* Keep going at this level */
- ins = &(*ins)->next[i];
- else /* Drop down a level */
- cbt->ins_stack[i--] = ins--;
- }
- return (NULL);
-}
-
-/*
* __wt_col_search --
* Search a column-store tree for a specific record-based key.
*/
@@ -125,42 +87,39 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
WT_MEMORY_FLUSH;
}
cbt->page = page;
+ cbt->compare = 0;
/*
* Search the leaf page. We do not check in the search path for a
* record greater than the maximum record in the tree; in that case,
* we arrive here with a record that's impossibly large for the page.
*/
- switch (page->type) {
- case WT_PAGE_COL_FIX:
+ if (page->type == WT_PAGE_COL_FIX) {
if (recno >= page->u.col_leaf.recno + page->entries) {
- cbt->compare = 1;
- F_SET(cbt, WT_CBT_SEARCH_SET);
- return (0);
- }
- cbt->ins_head =
- page->u.col_leaf.ins == NULL ? NULL : *page->u.col_leaf.ins;
- cbt->compare = 0;
- break;
- case WT_PAGE_COL_VAR:
- if ((cip = __cursor_col_rle_search(page, recno)) == NULL)
- cbt->compare = 1;
- else {
- cbt->compare = 0;
+ cbt->compare = -1;
+ cbt->ins_head = WT_COL_INSERT_APPEND(page);
+ } else
+ cbt->ins_head = WT_COL_INSERT_SINGLE(page);
+ } else {
+ if ((cip = __col_var_search(page, recno)) == NULL) {
+ cbt->compare = -1;
+ cbt->ins_head = WT_COL_INSERT_APPEND(page);
+ } else {
cbt->slot = WT_COL_SLOT(page, cip);
cbt->ins_head = WT_COL_INSERT_SLOT(page, cbt->slot);
}
- break;
- WT_ILLEGAL_FORMAT(session);
}
/*
- * Search the insert list for a match; __search_insert sets the return
- * insert information appropriately.
+ * Search the insert or append list for a match; on an exact match,
+ * __col_insert_search_stack returns the item and compare is set to 0.
*/
- cbt->ins = __search_insert(cbt, cbt->ins_head, recno);
-
- F_SET(cbt, WT_CBT_SEARCH_SET);
+ if (cbt->ins_head == NULL)
+ cbt->ins = NULL;
+ else
+ if ((cbt->ins = __col_insert_search_stack(
+ cbt->ins_head, cbt->ins_stack, recno)) != NULL)
+ cbt->compare = 0;
return (0);
err: __wt_page_release(session, page);
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index b01729d9e47..90fdf6ea6e6 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -18,11 +18,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
WT_ITEM *key, *value;
WT_PAGE *page;
- WT_SESSION_BUFFER *sb;
WT_UPDATE **new_upd, *upd, **upd_entry;
- size_t ins_size, new_inshead_size, new_inslist_size;
- size_t new_upd_size, upd_size;
- uint32_t ins_slot, skipdepth;
+ size_t new_inshead_size, new_inslist_size, new_upd_size;
+ uint32_t ins_slot;
+ u_int skipdepth;
int i, ret;
key = (WT_ITEM *)&cbt->iface.key;
@@ -66,11 +65,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
upd_entry = &cbt->ins->upd;
/* Allocate room for the new value from per-thread memory. */
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_update_alloc(session, value, &upd));
/* workQ: insert the WT_UPDATE structure. */
- ret = __wt_update_serial(session, page, cbt->write_gen,
- upd_entry, &new_upd, new_upd_size, &upd, upd_size);
+ ret = __wt_update_serial(session, page,
+ cbt->write_gen, upd_entry, &new_upd, new_upd_size, upd);
} else {
/*
* Allocate insert array if necessary, and set the WT_INSERT
@@ -106,9 +105,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
*/
if (*inshead == NULL) {
new_inshead_size = sizeof(WT_INSERT_HEAD);
- WT_ERR(__wt_sb_alloc(session,
- sizeof(WT_INSERT_HEAD), &new_inshead, &sb));
- new_inshead->sb = sb;
+ WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
for (i = 0; i < WT_SKIP_MAXDEPTH; i++)
cbt->ins_stack[i] = &new_inshead->head[i];
cbt->ins_head = new_inshead;
@@ -121,20 +118,16 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
* Allocate a WT_INSERT/WT_UPDATE pair, and update the cursor
* to reference it.
*/
- WT_ERR(__wt_row_insert_alloc(
- session, key, skipdepth, &ins, &ins_size));
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_row_insert_alloc(session, key, skipdepth, &ins));
+ WT_ERR(__wt_update_alloc(session, value, &upd));
ins->upd = upd;
- ins_size += upd_size;
cbt->ins = ins;
/* workQ: insert the WT_INSERT structure. */
- ret = __wt_insert_serial(session,
- page, cbt->write_gen,
+ ret = __wt_insert_serial(session, page, cbt->write_gen,
inshead, cbt->ins_stack,
&new_inslist, new_inslist_size,
- &new_inshead, new_inshead_size,
- &ins, ins_size, skipdepth);
+ &new_inshead, new_inshead_size, ins, skipdepth);
}
if (ret != 0) {
@@ -145,8 +138,8 @@ err: if (ins != NULL)
}
/* Free any insert, update arrays. */
- __wt_free(session, new_inshead);
__wt_free(session, new_inslist);
+ __wt_free(session, new_inshead);
__wt_free(session, new_upd);
return (ret);
@@ -158,8 +151,8 @@ err: if (ins != NULL)
* buffer and fill it in.
*/
int
-__wt_row_insert_alloc(WT_SESSION_IMPL *session,
- WT_ITEM *key, uint32_t skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+__wt_row_insert_alloc(
+ WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp)
{
WT_SESSION_BUFFER *sb;
WT_INSERT *ins;
@@ -179,29 +172,27 @@ __wt_row_insert_alloc(WT_SESSION_IMPL *session,
memcpy(WT_INSERT_KEY(ins), key->data, key->size);
*insp = ins;
- if (ins_sizep != NULL)
- *ins_sizep = ins_size;
-
return (0);
}
/*
* __wt_insert_serial_func --
- * Server function to add an WT_INSERT entry to the page tree.
+ * Server function to add a WT_INSERT entry to the page.
*/
int
__wt_insert_serial_func(WT_SESSION_IMPL *session)
{
WT_PAGE *page;
WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead;
- WT_INSERT *ins, ***ins_stack;
- uint32_t i, skipdepth, write_gen;
- int ret;
+ WT_INSERT *new_ins, ***ins_stack;
+ uint32_t write_gen;
+ u_int i, skipdepth;
+ int ret;
ret = 0;
__wt_insert_unpack(session, &page, &write_gen, &inshead,
- &ins_stack, &new_inslist, &new_inshead, &ins, &skipdepth);
+ &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);
/* Check the page's write-generation. */
WT_ERR(__wt_page_write_gen_check(page, write_gen));
@@ -222,8 +213,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session)
}
/*
- * If the slot does not yet have an insert list, our caller passed us
- * one.
+ * If the insert head does not yet have an insert list, our caller
+ * passed us one.
*/
if (*inshead == NULL) {
*inshead = new_inshead;
@@ -237,11 +228,10 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session)
* the list is never inconsistent.
*/
for (i = 0; i < skipdepth; i++)
- ins->next[i] = *ins_stack[i];
+ new_ins->next[i] = *ins_stack[i];
WT_MEMORY_FLUSH;
for (i = 0; i < skipdepth; i++)
- *ins_stack[i] = ins;
- __wt_insert_ins_taken(session, page);
+ *ins_stack[i] = new_ins;
err: __wt_session_serialize_wrapup(session, page, 0);
return (ret);
@@ -253,8 +243,7 @@ err: __wt_session_serialize_wrapup(session, page, 0);
* buffer and fill it in.
*/
int
-__wt_update_alloc(WT_SESSION_IMPL *session,
- WT_ITEM *value, WT_UPDATE **updp, size_t *upd_sizep)
+__wt_update_alloc(WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp)
{
WT_SESSION_BUFFER *sb;
WT_UPDATE *upd;
@@ -274,10 +263,7 @@ __wt_update_alloc(WT_SESSION_IMPL *session,
memcpy(WT_UPDATE_DATA(upd), value->data, size);
}
- if (upd_sizep != NULL)
- *upd_sizep = size + sizeof(WT_UPDATE);
*updp = upd;
-
return (0);
}
@@ -319,7 +305,6 @@ __wt_update_serial_func(WT_SESSION_IMPL *session)
upd->next = *upd_entry;
WT_MEMORY_FLUSH;
*upd_entry = upd;
- __wt_update_upd_taken(session, page);
err: __wt_session_serialize_wrapup(session, page, 0);
return (ret);
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index c383b394962..c2acab719d0 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -183,8 +183,6 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
WT_ASSERT(session, rip != NULL);
cbt->compare = 0;
cbt->slot = WT_ROW_SLOT(page, rip);
-
- F_SET(cbt, WT_CBT_SEARCH_SET);
return (0);
}
@@ -227,8 +225,6 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
* insert information appropriately.
*/
cbt->ins = __search_insert(session, cbt, cbt->ins_head, key);
-
- F_SET(cbt, WT_CBT_SEARCH_SET);
return (0);
err: __wt_page_release(session, page);
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index 6bac16c2e2b..2bc1095f6c2 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -134,6 +134,8 @@ __curfile_insert(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
int ret;
+ ret = 0;
+
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_API_CALL(cursor, session, insert, cbt->btree);
if (cbt->btree->type == BTREE_ROW)
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 9bf2854e093..6679c755e34 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -43,11 +43,34 @@ struct __wt_page {
/* Column-store leaf page. */
struct {
uint64_t recno; /* Starting recno */
- uint8_t *bitf; /* COL_FIX bits */
- WT_COL *d; /* COL_VAR objects */
- WT_INSERT_HEAD **ins; /* Inserts (RLE) */
+
+ uint8_t *bitf; /* COL_FIX items */
+ WT_COL *d; /* COL_VAR items */
+
+ /*
+ * The last page of both fix- and variable-length column
+ * stores includes a skiplist of appended entries.
+ */
+ WT_INSERT_HEAD **append;/* Appended items */
+
+ /*
+ * Updated items in column-stores: variable-length RLE
+ * entries can expand to multiple entries which requires
+ * some kind of list we can expand on demand. Updated
+ * items in fixed-length files could be done based on an
+ * WT_UPDATE array as in row-stores, but there can be a
+ * very large number of bits on a single page, and the
+ * cost of the WT_UPDATE array would be huge.
+ */
+ WT_INSERT_HEAD **ins; /* Updated items */
+
+ /*
+ * Variable-length column-store files maintain a list of
+ * RLE entries on the page so it's unnecessary to walk
+ * the page counting records to find a specific entry.
+ */
WT_COL_RLE *repeats; /* RLE array for lookups */
- uint32_t nrepeats; /* Number of repeats. */
+ uint32_t nrepeats; /* Number of repeat slots. */
} col_leaf;
/* Bulk-loaded linked list. */
@@ -496,7 +519,6 @@ struct __wt_insert {
* The head of a skip list of WT_INSERT items.
*/
struct __wt_insert_head {
- WT_SESSION_BUFFER *sb; /* session buffer holding this update */
WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
};
@@ -530,11 +552,16 @@ struct __wt_insert_head {
#define WT_COL_INSERT(page, ip) \
WT_COL_INSERT_SLOT(page, WT_COL_SLOT(page, ip))
/*
- * WT_COL_INSERT_SINGLE references a single WT_INSERT list, which is used for
- * fixed-length column-store updates.
+ * WT_COL_INSERT_{APPEND,SINGLE} reference a single WT_INSERT list, which are
+ * used for fixed-length column-store updates, and variable- and fixed-length
+ * column-store appends.
*/
+#define WT_COL_INSERT_APPEND(page) \
+ ((page)->u.col_leaf.append == NULL ? \
+ NULL : (page)->u.col_leaf.append[0])
#define WT_COL_INSERT_SINGLE(page) \
- ((page)->u.col_leaf.ins == NULL ? NULL : (page)->u.col_leaf.ins[0])
+ ((page)->u.col_leaf.ins == NULL ? \
+ NULL : (page)->u.col_leaf.ins[0])
/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
#define WT_FIX_FOREACH(btree, dsk, v, i) \
diff --git a/src/include/btree.h b/src/include/btree.h
index 34b3e31bf0f..203551f3d2a 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -125,7 +125,10 @@ struct __wt_btree {
uint32_t leafmin; /* Min/max leaf page size */
uint32_t leafmax;
- WT_WALK evict_walk; /* Eviction thread's walk state */
+ WT_WALK evict_walk; /* Eviction thread's walk state */
+
+ WT_PAGE *last_page; /* Col-store append, last page */
+ uint64_t last_recno; /* Col-store append, last recno */
void *reconcile; /* Reconciliation structure */
diff --git a/src/include/btree.i b/src/include/btree.i
index aa01dd3620f..bda693248ba 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -6,20 +6,6 @@
*/
/*
- * __wt_cache_page_workq --
- * Create pages into the cache.
- */
-static inline void
-__wt_cache_page_workq(WT_SESSION_IMPL *session)
-{
- WT_CACHE *cache;
-
- cache = S2C(session)->cache;
-
- ++cache->pages_workq;
-}
-
-/*
* __wt_cache_page_workq_incr --
* Increment a page's memory footprint in the cache.
*/
@@ -95,7 +81,7 @@ __wt_cache_pages_inuse(WT_CACHE *cache)
* (although "interesting" corruption is vanishingly unlikely, these
* values just increment over time).
*/
- pages_in = cache->pages_read + cache->pages_workq;
+ pages_in = cache->pages_read;
pages_out = cache->pages_evict;
return (pages_in > pages_out ? pages_in - pages_out : 0);
}
@@ -167,13 +153,13 @@ __wt_page_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
/*
* __wt_page_release --
- * Release a reference to a page, unless it's the root page, which remains
- * pinned for the life of the table handle.
+ * Release a reference to a page, unless it's pinned into memory, in which
+ * case we never acquired a hazard reference.
*/
static inline void
__wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- if (page != NULL && !WT_PAGE_IS_ROOT(page))
+ if (page != NULL && !F_ISSET(page, WT_PAGE_PINNED))
__wt_hazard_clear(session, page);
}
@@ -181,10 +167,11 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page)
* __wt_skip_choose_depth --
* Randomly choose a depth for a skiplist insert.
*/
-static inline uint32_t
+static inline u_int
__wt_skip_choose_depth(void)
{
- uint32_t d;
+ u_int d;
+
for (d = 1; d < WT_SKIP_MAXDEPTH &&
__wt_random() < WT_SKIP_PROBABILITY; d++)
;
diff --git a/src/include/cache.h b/src/include/cache.h
index bd9e6eea48e..a93579a5baf 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -26,6 +26,7 @@ struct __wt_evict_req {
uint32_t retry_next; /* Next retry slot */
uint32_t retry_entries; /* Total retry slots */
size_t retry_allocated; /* Bytes allocated */
+ int retry_cnt; /* We only try a few times. */
int close_method; /* Discard pages */
};
@@ -92,7 +93,6 @@ struct __wt_cache {
uint64_t bytes_read; /* Bytes/pages read by read server */
uint64_t pages_read;
uint64_t bytes_workq; /* Bytes/pages created by workQ */
- uint64_t pages_workq;
uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
uint64_t pages_evict;
};
diff --git a/src/include/column.i b/src/include/column.i
new file mode 100644
index 00000000000..31c659ce1fc
--- /dev/null
+++ b/src/include/column.i
@@ -0,0 +1,164 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * __col_insert_search --
+ * Search a column-store insert list, returning the match or NULL.
+ */
+static inline WT_INSERT *
+__col_insert_search(WT_INSERT_HEAD *inshead, uint64_t recno)
+{
+ WT_INSERT **ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if (inshead == NULL)
+ return (NULL);
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) {
+ if (*ins == NULL) {
+ --i;
+ --ins;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(*ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp == 0) /* Exact match: return */
+ return (*ins);
+ else if (cmp > 0) /* Keep going at this level */
+ ins = &(*ins)->next[i];
+ else { /* Drop down a level */
+ --i;
+ --ins;
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * __col_insert_search_stack --
+ * Search a column-store insert list, updating the skiplist stack as we go.
+ */
+static inline WT_INSERT *
+__col_insert_search_stack(
+ WT_INSERT_HEAD *inshead, WT_INSERT ***ins_stack, uint64_t recno)
+{
+ WT_INSERT **ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /*
+ * Skip list search, highest level first: inshead is dereferenced with
+ * no NULL check, and ins_stack[i] is set whenever we step down a level.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, ins = &inshead->head[i]; i >= 0; ) {
+ if (*ins == NULL) {
+ ins_stack[i--] = ins--;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(*ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp == 0) /* Exact match: stack left partial */
+ return (*ins);
+ else if (cmp > 0) /* Keep going at this level */
+ ins = &(*ins)->next[i];
+ else /* Drop down a level */
+ ins_stack[i--] = ins--;
+ }
+ return (NULL);
+}
+
+/*
+ * __col_last_recno --
+ * Return the last on-page record number for a column-store leaf page.
+ */
+static inline uint64_t
+__col_last_recno(WT_PAGE *page)
+{
+ WT_COL_RLE *repeat;
+
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ *
+ * WT_PAGE_COL_FIX pages have no nrepeat values, so this works for
+ * fixed-length column-stores without any further check.
+ */
+ if (page->u.col_leaf.nrepeats == 0)
+ return (page->entries == 0 ? 0 :
+ page->u.col_leaf.recno + (page->entries - 1));
+
+ repeat = &page->u.col_leaf.repeats[page->u.col_leaf.nrepeats - 1];
+ return (
+ (repeat->recno + repeat->rle) - 1 +
+ (page->entries - (repeat->indx + 1)));
+}
+
+/*
+ * __col_var_search --
+ * Search a variable-length column-store page for a record.
+ */
+static inline WT_COL *
+__col_var_search(WT_PAGE *page, uint64_t recno)
+{
+ WT_COL_RLE *repeat;
+ uint64_t start_recno;
+ uint32_t base, indx, limit, start_indx;
+
+ /*
+ * Find the matching slot.
+ *
+ * This is done in two stages: first, we do a binary search among any
+ * repeating records to find the largest repeat less than the search key.
+ * Once there, we can do a simple offset calculation to find the correct
+ * slot for this record number, because we know any intervening records
+ * have repeat counts of 1.
+ */
+ for (base = 0,
+ limit = page->u.col_leaf.nrepeats; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ repeat = page->u.col_leaf.repeats + indx;
+ if (recno >= repeat->recno &&
+ recno < repeat->recno + repeat->rle)
+ return (page->u.col_leaf.d + repeat->indx);
+ if (recno < repeat->recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+ * We didn't find an exact match, move forward from the largest repeat
+ * less than the search key.
+ */
+ if (base == 0) {
+ start_indx = 0;
+ start_recno = page->u.col_leaf.recno;
+ } else {
+ repeat = page->u.col_leaf.repeats + (base - 1);
+ start_indx = repeat->indx + 1;
+ start_recno = repeat->recno + repeat->rle;
+ }
+
+ if (recno >= start_recno + (page->entries - start_indx))
+ return (NULL);
+
+ return (page->u.col_leaf.d +
+ start_indx + (uint32_t)(recno - start_recno));
+}
diff --git a/src/include/cursor.h b/src/include/cursor.h
index f1d0e9b29ed..d37b5527c4e 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -37,17 +37,21 @@ struct __wt_cursor_btree {
int compare;
/*
- * The following fields are maintained by cursor iteration functions.
- *
* We can't walk an insert list in reverse order, it's only linked in a
- * forward, sorted order. We don't care for column-store files, the
- * record number gives us a "key" for lookup; for row-store files, we
- * maintain a count of the current entry we're on. For each iteration,
- * we return one entry earlier in the list.
+ * forward, sorted order. Maintain a count of the current entry we're
+ * on. For each iteration, we return one entry earlier in the list.
*/
uint32_t ins_entry_cnt; /* 1-based insert list entry count */
/*
+ * It's relatively expensive to calculate the last record on a variable-
+ * length column-store page because of the repeat values. Calculate it
+ * once per page and cache it. This value doesn't include the skiplist
+ * of appended entries on the last page.
+ */
+ uint64_t last_standard_recno;
+
+ /*
* Variable-length column-store items are run-length encoded, and
* optionally Huffman encoded. To avoid repeatedly decompressing the
* item, we decompress it once into the value buffer. The vslot field
@@ -61,12 +65,15 @@ struct __wt_cursor_btree {
/*
* Fixed-length column-store items are a single byte, and it's simpler
- * and cheaper to allocate the space for it now.
+ * and cheaper to allocate the space for it now than keep checking to
+ * see if we need to grow the buffer.
*/
uint8_t v; /* Fixed-length return value */
-#define WT_CBT_SEARCH_SET 0x01 /* Search has set a page */
-#define WT_CBT_SEARCH_SMALLEST 0x02 /* Smallest-key insert list */
+#define WT_CBT_ITERATE_APPEND 0x01 /* Col-store: iterating append list */
+#define WT_CBT_ITERATE_NEXT 0x02 /* Next iteration configuration */
+#define WT_CBT_ITERATE_PREV 0x04 /* Prev iteration configuration */
+#define WT_CBT_SEARCH_SMALLEST 0x08 /* Row-store: small-key insert list */
uint8_t flags;
};
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 226510b96c8..392467d9807 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -20,21 +20,22 @@ __cursor_search_clear(WT_CURSOR_BTREE *cbt)
cbt->ins = NULL;
/* We don't bother clearing the insert stack, that's more expensive. */
+ cbt->recno = 0; /* Illegal value */
cbt->write_gen = 0;
cbt->compare = 2; /* Illegal value */
cbt->vslot = WT_CBT_VSLOT_OOB;
- F_CLR(cbt, WT_CBT_SEARCH_SET | WT_CBT_SEARCH_SMALLEST);
+ cbt->flags = 0;
}
/*
- * __cursor_func_clear --
+ * __cursor_func_init --
* Reset the cursor's state for a new call.
*/
static inline void
-__cursor_func_clear(WT_CURSOR_BTREE *cbt, int page_release)
+__cursor_func_init(WT_CURSOR_BTREE *cbt, int page_release)
{
WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
@@ -53,11 +54,11 @@ __cursor_func_clear(WT_CURSOR_BTREE *cbt, int page_release)
}
/*
- * __cursor_func_set --
+ * __cursor_func_resolve --
* Resolve the cursor's state for return.
*/
static inline void
-__cursor_func_set(WT_CURSOR_BTREE *cbt, int ret)
+__cursor_func_resolve(WT_CURSOR_BTREE *cbt, int ret)
{
WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
@@ -73,11 +74,8 @@ __cursor_func_set(WT_CURSOR_BTREE *cbt, int ret)
if (ret == 0)
F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
else {
- if (cbt->page != NULL) {
- __wt_page_release(session, cbt->page);
- cbt->page = NULL;
- }
- F_CLR(cbt, WT_CBT_SEARCH_SET | WT_CBT_SEARCH_SMALLEST);
+ __cursor_func_init(cbt, 1);
+ __cursor_search_clear(cbt);
}
}
@@ -139,75 +137,3 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip)
return (0);
}
-
-/*
- * __cursor_col_rle_last --
- * Return the last record number for a variable-length column-store page.
- */
-static inline uint64_t
-__cursor_col_rle_last(WT_PAGE *page)
-{
- WT_COL_RLE *repeat;
-
- if (page->u.col_leaf.nrepeats == 0)
- return (page->u.col_leaf.recno + (page->entries - 1));
-
- repeat = &page->u.col_leaf.repeats[page->u.col_leaf.nrepeats - 1];
- return (
- (repeat->recno + repeat->rle) - 1 +
- (page->entries - (repeat->indx + 1)));
-}
-
-/*
- * __cursor_col_rle_search --
- * Search a variable-length column-store page for a record.
- */
-static inline WT_COL *
-__cursor_col_rle_search(WT_PAGE *page, uint64_t recno)
-{
- WT_COL_RLE *repeat;
- uint64_t start_recno;
- uint32_t base, indx, limit, start_indx;
-
- /*
- * Find the matching slot.
- *
- * This is done in two stages: first, we do a binary search among any
- * repeating records to find largest repeating less than the search key.
- * Once there, we can do a simple offset calculation to find the correct
- * slot for this record number, because we know any intervening records
- * have repeat counts of 1.
- */
- for (base = 0,
- limit = page->u.col_leaf.nrepeats; limit != 0; limit >>= 1) {
- indx = base + (limit >> 1);
-
- repeat = page->u.col_leaf.repeats + indx;
- if (recno >= repeat->recno &&
- recno < repeat->recno + repeat->rle)
- return (page->u.col_leaf.d + repeat->indx);
- if (recno < repeat->recno)
- continue;
- base = indx + 1;
- --limit;
- }
-
- /*
- * We didn't find an exact match, move forward from the largest repeat
- * less than the search key.
- */
- if (base == 0) {
- start_indx = 0;
- start_recno = page->u.col_leaf.recno;
- } else {
- repeat = page->u.col_leaf.repeats + (base - 1);
- start_indx = repeat->indx + 1;
- start_recno = repeat->recno + repeat->rle;
- }
-
- if (recno >= start_recno + (page->entries - start_indx))
- return (NULL);
-
- return (page->u.col_leaf.d +
- start_indx + (uint32_t)(recno - start_recno));
-}
diff --git a/src/include/extern.h b/src/include/extern.h
index 3e55c02c149..a129c19af31 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -155,7 +155,7 @@ extern int __wt_cell_unpack_copy( WT_SESSION_IMPL *session,
extern int __wt_btree_lex_compare( WT_BTREE *btree,
const WT_ITEM *user_item,
const WT_ITEM *tree_item);
-extern int __wt_btcur_search_setup(WT_CURSOR_BTREE *cbt);
+extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next);
extern int __wt_btcur_first(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_last(WT_CURSOR_BTREE *cbt);
@@ -193,13 +193,13 @@ extern void __wt_workq_evict_server_exit(WT_CONNECTION_IMPL *conn);
extern int __wt_evict_file_serial_func(WT_SESSION_IMPL *session);
extern void *__wt_cache_evict_server(void *arg);
extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename);
-extern int __wt_btree_root_init(WT_SESSION_IMPL *session);
extern int __wt_btree_open(WT_SESSION_IMPL *session,
const char *name,
const char *filename,
const char *treeconfig,
const char *cfg[],
uint32_t flags);
+extern int __wt_btree_root_init(WT_SESSION_IMPL *session);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
@@ -253,13 +253,10 @@ extern int __wt_walk_prev(WT_SESSION_IMPL *session,
WT_WALK *walk,
WT_PAGE **pagep);
extern int __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int next);
-extern int __wt_col_extend(WT_SESSION_IMPL *session,
- WT_PAGE *page,
- uint64_t recno);
-extern int __wt_col_extend_serial_func(WT_SESSION_IMPL *session);
extern int __wt_col_modify(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
- int is_remove);
+ int op);
+extern int __wt_append_serial_func(WT_SESSION_IMPL *session);
extern int __wt_col_search(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
int is_modify);
@@ -277,16 +274,14 @@ extern int __wt_row_key_serial_func(WT_SESSION_IMPL *session);
extern int __wt_row_modify(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
int is_remove);
-extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session,
+extern int __wt_row_insert_alloc( WT_SESSION_IMPL *session,
WT_ITEM *key,
- uint32_t skipdepth,
- WT_INSERT **insp,
- size_t *ins_sizep);
+ u_int skipdepth,
+ WT_INSERT **insp);
extern int __wt_insert_serial_func(WT_SESSION_IMPL *session);
extern int __wt_update_alloc(WT_SESSION_IMPL *session,
WT_ITEM *value,
- WT_UPDATE **updp,
- size_t *upd_sizep);
+ WT_UPDATE **updp);
extern int __wt_update_serial_func(WT_SESSION_IMPL *session);
extern int __wt_row_search(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
diff --git a/src/include/serial.i b/src/include/serial.i
index dc812dbe525..3fe1723cca1 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -1,196 +1,142 @@
/* DO NOT EDIT: automatically built by dist/serial.py. */
typedef struct {
- WT_PAGE *parent;
- WT_REF *parent_ref;
- int dsk_verify;
-} __wt_cache_read_args;
-
-static inline int
-__wt_cache_read_serial(
- WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, int
- dsk_verify)
-{
- __wt_cache_read_args _args, *args = &_args;
- int ret;
-
- args->parent = parent;
-
- args->parent_ref = parent_ref;
-
- args->dsk_verify = dsk_verify;
-
- ret = __wt_session_serialize_func(session,
- WT_WORKQ_READ, 0, __wt_cache_read_serial_func, args);
-
- return (ret);
-}
-
-static inline void
-__wt_cache_read_unpack(
- WT_SESSION_IMPL *session, WT_PAGE **parentp, WT_REF **parent_refp, int
- *dsk_verifyp)
-{
- __wt_cache_read_args *args =
- (__wt_cache_read_args *)session->wq_args;
-
- *parentp = args->parent;
- *parent_refp = args->parent_ref;
- *dsk_verifyp = args->dsk_verify;
-}
-
-typedef struct {
- WT_PAGE *page;
- WT_PAGE *new_intl;
- size_t new_intl_size;
- int new_intl_taken;
- WT_COL_REF *t;
- size_t t_size;
- int t_taken;
- uint32_t internal_extend;
- WT_PAGE *new_leaf;
- size_t new_leaf_size;
- int new_leaf_taken;
- void *entries;
- size_t entries_size;
- int entries_taken;
- uint32_t leaf_extend;
- uint64_t recno;
-} __wt_col_extend_args;
+ WT_INSERT_HEAD **inshead;
+ WT_INSERT ***ins_stack;
+ WT_INSERT_HEAD **new_inslist;
+ size_t new_inslist_size;
+ int new_inslist_taken;
+ WT_INSERT_HEAD *new_inshead;
+ size_t new_inshead_size;
+ int new_inshead_taken;
+ WT_INSERT *new_ins;
+ u_int skipdepth;
+} __wt_append_args;
static inline int
-__wt_col_extend_serial(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE **new_intlp, size_t
- new_intl_size, WT_COL_REF **tp, size_t t_size, uint32_t
- internal_extend, WT_PAGE **new_leafp, size_t new_leaf_size, void
- **entriesp, size_t entries_size, uint32_t leaf_extend, uint64_t recno)
+__wt_append_serial(
+ WT_SESSION_IMPL *session, WT_INSERT_HEAD **inshead, WT_INSERT
+ ***ins_stack, WT_INSERT_HEAD ***new_inslistp, size_t new_inslist_size,
+ WT_INSERT_HEAD **new_insheadp, size_t new_inshead_size, WT_INSERT
+ *new_ins, u_int skipdepth)
{
- __wt_col_extend_args _args, *args = &_args;
+ __wt_append_args _args, *args = &_args;
int ret;
- args->page = page;
-
- if (new_intlp == NULL)
- args->new_intl = NULL;
- else {
- args->new_intl = *new_intlp;
- *new_intlp = NULL;
- args->new_intl_size = new_intl_size;
- }
- args->new_intl_taken = 0;
-
- if (tp == NULL)
- args->t = NULL;
- else {
- args->t = *tp;
- *tp = NULL;
- args->t_size = t_size;
- }
- args->t_taken = 0;
+ args->inshead = inshead;
- args->internal_extend = internal_extend;
+ args->ins_stack = ins_stack;
- if (new_leafp == NULL)
- args->new_leaf = NULL;
+ if (new_inslistp == NULL)
+ args->new_inslist = NULL;
else {
- args->new_leaf = *new_leafp;
- *new_leafp = NULL;
- args->new_leaf_size = new_leaf_size;
+ args->new_inslist = *new_inslistp;
+ *new_inslistp = NULL;
+ args->new_inslist_size = new_inslist_size;
}
- args->new_leaf_taken = 0;
+ args->new_inslist_taken = 0;
- if (entriesp == NULL)
- args->entries = NULL;
+ if (new_insheadp == NULL)
+ args->new_inshead = NULL;
else {
- args->entries = *entriesp;
- *entriesp = NULL;
- args->entries_size = entries_size;
+ args->new_inshead = *new_insheadp;
+ *new_insheadp = NULL;
+ args->new_inshead_size = new_inshead_size;
}
- args->entries_taken = 0;
+ args->new_inshead_taken = 0;
- args->leaf_extend = leaf_extend;
+ args->new_ins = new_ins;
- args->recno = recno;
+ args->skipdepth = skipdepth;
ret = __wt_session_serialize_func(session,
- WT_WORKQ_FUNC, 1, __wt_col_extend_serial_func, args);
-
- if (!args->new_intl_taken)
- __wt_free(session, args->new_intl);
- if (!args->t_taken)
- __wt_free(session, args->t);
- if (!args->new_leaf_taken)
- __wt_free(session, args->new_leaf);
- if (!args->entries_taken)
- __wt_free(session, args->entries);
+ WT_WORKQ_FUNC, 1, __wt_append_serial_func, args);
+
+ if (!args->new_inslist_taken)
+ __wt_free(session, args->new_inslist);
+ if (!args->new_inshead_taken)
+ __wt_free(session, args->new_inshead);
return (ret);
}
static inline void
-__wt_col_extend_unpack(
- WT_SESSION_IMPL *session, WT_PAGE **pagep, WT_PAGE **new_intlp,
- WT_COL_REF **tp, uint32_t *internal_extendp, WT_PAGE **new_leafp, void
- **entriesp, uint32_t *leaf_extendp, uint64_t *recnop)
+__wt_append_unpack(
+ WT_SESSION_IMPL *session, WT_INSERT_HEAD ***insheadp, WT_INSERT
+ ****ins_stackp, WT_INSERT_HEAD ***new_inslistp, WT_INSERT_HEAD
+ **new_insheadp, WT_INSERT **new_insp, u_int *skipdepthp)
{
- __wt_col_extend_args *args =
- (__wt_col_extend_args *)session->wq_args;
+ __wt_append_args *args =
+ (__wt_append_args *)session->wq_args;
- *pagep = args->page;
- *new_intlp = args->new_intl;
- *tp = args->t;
- *internal_extendp = args->internal_extend;
- *new_leafp = args->new_leaf;
- *entriesp = args->entries;
- *leaf_extendp = args->leaf_extend;
- *recnop = args->recno;
+ *insheadp = args->inshead;
+ *ins_stackp = args->ins_stack;
+ *new_inslistp = args->new_inslist;
+ *new_insheadp = args->new_inshead;
+ *new_insp = args->new_ins;
+ *skipdepthp = args->skipdepth;
}
static inline void
-__wt_col_extend_new_intl_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
+__wt_append_new_inslist_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- __wt_col_extend_args *args =
- (__wt_col_extend_args *)session->wq_args;
+ __wt_append_args *args =
+ (__wt_append_args *)session->wq_args;
- args->new_intl_taken = 1;
+ args->new_inslist_taken = 1;
- WT_ASSERT(session, args->new_intl_size != 0);
- __wt_cache_page_workq_incr(session, page, args->new_intl_size);
+ WT_ASSERT(session, args->new_inslist_size != 0);
+ __wt_cache_page_workq_incr(session, page, args->new_inslist_size);
}
static inline void
-__wt_col_extend_t_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
+__wt_append_new_inshead_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- __wt_col_extend_args *args =
- (__wt_col_extend_args *)session->wq_args;
+ __wt_append_args *args =
+ (__wt_append_args *)session->wq_args;
- args->t_taken = 1;
+ args->new_inshead_taken = 1;
- WT_ASSERT(session, args->t_size != 0);
- __wt_cache_page_workq_incr(session, page, args->t_size);
+ WT_ASSERT(session, args->new_inshead_size != 0);
+ __wt_cache_page_workq_incr(session, page, args->new_inshead_size);
}
-static inline void
-__wt_col_extend_new_leaf_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
+typedef struct {
+ WT_PAGE *parent;
+ WT_REF *parent_ref;
+ int dsk_verify;
+} __wt_cache_read_args;
+
+static inline int
+__wt_cache_read_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, int
+ dsk_verify)
{
- __wt_col_extend_args *args =
- (__wt_col_extend_args *)session->wq_args;
+ __wt_cache_read_args _args, *args = &_args;
+ int ret;
+
+ args->parent = parent;
+
+ args->parent_ref = parent_ref;
- args->new_leaf_taken = 1;
+ args->dsk_verify = dsk_verify;
- WT_ASSERT(session, args->new_leaf_size != 0);
- __wt_cache_page_workq_incr(session, page, args->new_leaf_size);
+ ret = __wt_session_serialize_func(session,
+ WT_WORKQ_READ, 0, __wt_cache_read_serial_func, args);
+
+ return (ret);
}
static inline void
-__wt_col_extend_entries_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
+__wt_cache_read_unpack(
+ WT_SESSION_IMPL *session, WT_PAGE **parentp, WT_REF **parent_refp, int
+ *dsk_verifyp)
{
- __wt_col_extend_args *args =
- (__wt_col_extend_args *)session->wq_args;
-
- args->entries_taken = 1;
+ __wt_cache_read_args *args =
+ (__wt_cache_read_args *)session->wq_args;
- WT_ASSERT(session, args->entries_size != 0);
- __wt_cache_page_workq_incr(session, page, args->entries_size);
+ *parentp = args->parent;
+ *parent_refp = args->parent_ref;
+ *dsk_verifyp = args->dsk_verify;
}
typedef struct {
@@ -233,10 +179,8 @@ typedef struct {
WT_INSERT_HEAD *new_inshead;
size_t new_inshead_size;
int new_inshead_taken;
- WT_INSERT *ins;
- size_t ins_size;
- int ins_taken;
- uint32_t depth;
+ WT_INSERT *new_ins;
+ u_int skipdepth;
} __wt_insert_args;
static inline int
@@ -244,8 +188,8 @@ __wt_insert_serial(
WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t write_gen,
WT_INSERT_HEAD **inshead, WT_INSERT ***ins_stack, WT_INSERT_HEAD
***new_inslistp, size_t new_inslist_size, WT_INSERT_HEAD
- **new_insheadp, size_t new_inshead_size, WT_INSERT **insp, size_t
- ins_size, uint32_t depth)
+ **new_insheadp, size_t new_inshead_size, WT_INSERT *new_ins, u_int
+ skipdepth)
{
__wt_insert_args _args, *args = &_args;
int ret;
@@ -276,16 +220,9 @@ __wt_insert_serial(
}
args->new_inshead_taken = 0;
- if (insp == NULL)
- args->ins = NULL;
- else {
- args->ins = *insp;
- *insp = NULL;
- args->ins_size = ins_size;
- }
- args->ins_taken = 0;
+ args->new_ins = new_ins;
- args->depth = depth;
+ args->skipdepth = skipdepth;
ret = __wt_session_serialize_func(session,
WT_WORKQ_FUNC, 1, __wt_insert_serial_func, args);
@@ -294,8 +231,6 @@ __wt_insert_serial(
__wt_free(session, args->new_inslist);
if (!args->new_inshead_taken)
__wt_free(session, args->new_inshead);
- if (!args->ins_taken)
- __wt_free(session, args->ins);
return (ret);
}
@@ -303,8 +238,8 @@ static inline void
__wt_insert_unpack(
WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t *write_genp,
WT_INSERT_HEAD ***insheadp, WT_INSERT ****ins_stackp, WT_INSERT_HEAD
- ***new_inslistp, WT_INSERT_HEAD **new_insheadp, WT_INSERT **insp,
- uint32_t *depthp)
+ ***new_inslistp, WT_INSERT_HEAD **new_insheadp, WT_INSERT **new_insp,
+ u_int *skipdepthp)
{
__wt_insert_args *args =
(__wt_insert_args *)session->wq_args;
@@ -315,8 +250,8 @@ __wt_insert_unpack(
*ins_stackp = args->ins_stack;
*new_inslistp = args->new_inslist;
*new_insheadp = args->new_inshead;
- *insp = args->ins;
- *depthp = args->depth;
+ *new_insp = args->new_ins;
+ *skipdepthp = args->skipdepth;
}
static inline void
@@ -343,18 +278,6 @@ __wt_insert_new_inshead_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_cache_page_workq_incr(session, page, args->new_inshead_size);
}
-static inline void
-__wt_insert_ins_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- __wt_insert_args *args =
- (__wt_insert_args *)session->wq_args;
-
- args->ins_taken = 1;
-
- WT_ASSERT(session, args->ins_size != 0);
- __wt_cache_page_workq_incr(session, page, args->ins_size);
-}
-
typedef struct {
WT_PAGE *page;
WT_ROW *row_arg;
@@ -402,15 +325,13 @@ typedef struct {
size_t new_upd_size;
int new_upd_taken;
WT_UPDATE *upd;
- size_t upd_size;
- int upd_taken;
} __wt_update_args;
static inline int
__wt_update_serial(
WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t write_gen, WT_UPDATE
**srch_upd, WT_UPDATE ***new_updp, size_t new_upd_size, WT_UPDATE
- **updp, size_t upd_size)
+ *upd)
{
__wt_update_args _args, *args = &_args;
int ret;
@@ -430,22 +351,13 @@ __wt_update_serial(
}
args->new_upd_taken = 0;
- if (updp == NULL)
- args->upd = NULL;
- else {
- args->upd = *updp;
- *updp = NULL;
- args->upd_size = upd_size;
- }
- args->upd_taken = 0;
+ args->upd = upd;
ret = __wt_session_serialize_func(session,
WT_WORKQ_FUNC, 1, __wt_update_serial_func, args);
if (!args->new_upd_taken)
__wt_free(session, args->new_upd);
- if (!args->upd_taken)
- __wt_free(session, args->upd);
return (ret);
}
@@ -475,15 +387,3 @@ __wt_update_new_upd_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ASSERT(session, args->new_upd_size != 0);
__wt_cache_page_workq_incr(session, page, args->new_upd_size);
}
-
-static inline void
-__wt_update_upd_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- __wt_update_args *args =
- (__wt_update_args *)session->wq_args;
-
- args->upd_taken = 1;
-
- WT_ASSERT(session, args->upd_size != 0);
- __wt_cache_page_workq_incr(session, page, args->upd_size);
-}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index ec02eddbaae..aef78a9e55b 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -509,12 +509,6 @@ struct wt_session {
* must appear in at least one column group. Each column group must be
* created with a separate call to WT_SESSION::create. The value must be
* a string.,}
- * @config{column_internal_extend, configure the number of records a
- * column-store internal page is extended by when records are appended.
- * The value must be an integer between 500 and 10M.,\c 10000}
- * @config{column_leaf_extend, configure the number of records a
- * column-store leaf page is extended by when records are appended. The
- * value must be an integer between 500 and 10M.,\c 10000}
* @config{columns, list of the column names. Comma-separated list of
* the form <code>(column[\,...])</code>. For tables\, the number of
* entries must match the total number of values in \c key_format and \c
diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in
index 196d0b82607..b078dd90464 100644
--- a/src/include/wt_internal.in
+++ b/src/include/wt_internal.in
@@ -171,6 +171,7 @@ struct __wt_walk_entry;
#include "bitstring.i"
#include "btree.i"
#include "cell.i"
+#include "column.i"
#include "cursor.i"
#include "log.i"
#include "packing.i"
diff --git a/test/format/format.h b/test/format/format.h
index 6115b3144df..cf03481235a 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -45,6 +45,7 @@ typedef struct {
void *wts_conn; /* WT_CONNECTION handle */
void *wts_cursor; /* WT_CURSOR handle */
+ void *wts_cursor_insert; /* WT_CURSOR insert handle */
void *wts_session; /* WT_SESSION handle */
FILE *rand_log; /* Random number log */
diff --git a/test/format/wts.c b/test/format/wts.c
index 0a9a4ee6038..2c74d53fce8 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -10,9 +10,10 @@
static int bulk(WT_ITEM **, WT_ITEM **);
static int wts_close(WT_CONNECTION *);
static int wts_col_del(uint64_t, int *);
-static int wts_col_put(uint64_t, int);
-static int wts_np(int, int *);
+static int wts_col_insert(uint64_t *);
+static int wts_col_put(uint64_t);
static int wts_notfound_chk(const char *, int, int, uint64_t);
+static int wts_np(int, int, int *);
static int wts_open(WT_CONNECTION **, WT_SESSION **session);
static int wts_read(uint64_t);
static int wts_row_del(uint64_t, int *);
@@ -116,7 +117,7 @@ wts_startup(void)
{
time_t now;
WT_CONNECTION *conn;
- WT_CURSOR *cursor;
+ WT_CURSOR *cursor, *cursor_insert;
WT_SESSION *session;
int ret;
char config[512], *end, *p;
@@ -162,6 +163,21 @@ wts_startup(void)
return (1);
}
+ /*
+ * We open 2 cursors: one configured for overwriting, one not configured
+ * for overwriting. The reason is that for row-store and column-store
+ * files where we're testing with existing records, we don't track
+ * whether a record was deleted, which means we must use cursor->insert
+ * with overwriting configured. But in column-store files where we're
+ * testing with new, appended records, we don't want to have to specify
+ * the record number, which means we can't configure with overwriting.
+ */
+ if ((ret = session->open_cursor(
+ session, WT_TABLENAME, NULL, NULL, &cursor_insert)) != 0) {
+ fprintf(stderr, "%s: open_cursor: %s\n",
+ g.progname, wiredtiger_strerror(ret));
+ return (1);
+ }
if ((ret = session->open_cursor(
session, WT_TABLENAME, NULL, "overwrite", &cursor)) != 0) {
fprintf(stderr, "%s: open_cursor: %s\n",
@@ -179,6 +195,8 @@ wts_startup(void)
g.wts_conn = conn;
g.wts_session = session;
g.wts_cursor = cursor;
+ g.wts_cursor_insert = cursor_insert;
+
return (0);
}
@@ -481,13 +499,13 @@ wts_ops(void)
uint64_t cnt, keyno;
uint32_t op;
u_int np;
- int notfound;
+ int insert, notfound;
for (cnt = 0; cnt < g.c_ops; ++cnt) {
if (cnt % 10 == 0)
track("read/write ops", cnt);
- notfound = 0;
+ insert = notfound = 0;
keyno = MMRAND(1, g.c_rows);
/*
@@ -509,17 +527,6 @@ wts_ops(void)
return (1);
break;
case FIX:
- /*
- * We don't delete records in fixed-length
- * column-store files: a "delete" is the same
- * as a store of 0x00, which means we're not
- * really testing anything interesting, and,
- * if we reconcile the page, the engine code
- * discards trailing, deleted records, which
- * can give us test failures because we don't
- * match the contents of the BDB database.
- */
- break;
case VAR:
if (wts_col_del(keyno, &notfound))
return (1);
@@ -532,22 +539,10 @@ wts_ops(void)
return (1);
break;
case FIX:
- /*
- * We don't insert records in fixed-length
- * column-store files: an insert extends the
- * file by creating a large number of "deleted"
- * records: since a deleted record is a store
- * of 0x00, we can't distinguish between a
- * legitimate value and a deleted record, and
- * so we don't match the contents of the BDB
- * database.
- */
- break;
case VAR:
- /* Column-store tables only support append. */
- keyno = ++g.c_rows;
- if (wts_col_put(keyno, 1))
+ if (wts_col_insert(&keyno))
return (1);
+ insert = 1;
break;
}
} else if (
@@ -559,7 +554,7 @@ wts_ops(void)
break;
case FIX:
case VAR:
- if (wts_col_put(keyno, 0))
+ if (wts_col_put(keyno))
return (1);
break;
}
@@ -576,7 +571,7 @@ wts_ops(void)
for (np = 0; np < MMRAND(1, 4); ++np) {
if (notfound)
break;
- if (wts_np(MMRAND(0, 1), &notfound))
+ if (wts_np(MMRAND(0, 1), insert, &notfound))
return (1);
}
@@ -705,7 +700,7 @@ wts_read(uint64_t keyno)
* Read and verify the next/prev element in a row- or column-store file.
*/
static int
-wts_np(int next, int *notfoundp)
+wts_np(int next, int insert, int *notfoundp)
{
static WT_ITEM key, value, bdb_key, bdb_value;
WT_CURSOR *cursor;
@@ -715,7 +710,7 @@ wts_np(int next, int *notfoundp)
uint8_t bitfield;
const char *which;
- cursor = g.wts_cursor;
+ cursor = insert ? g.wts_cursor_insert : g.wts_cursor;
session = g.wts_session;
which = next ? "next" : "prev";
@@ -774,7 +769,7 @@ wts_np(int next, int *notfoundp)
/*
* wts_row_put --
- * Replace an element in a row-store file.
+ * Update an element in a row-store file.
*/
static int
wts_row_put(uint64_t keyno, int insert)
@@ -818,10 +813,10 @@ wts_row_put(uint64_t keyno, int insert)
/*
* wts_col_put --
- * Replace an element in a column-store file.
+ * Update an element in a column-store file.
*/
static int
-wts_col_put(uint64_t keyno, int insert)
+wts_col_put(uint64_t keyno)
{
static WT_ITEM key, value;
WT_CURSOR *cursor;
@@ -835,41 +830,96 @@ wts_col_put(uint64_t keyno, int insert)
value_gen(&value.data, &value.size, keyno);
/* Log the operation */
- if (g.logging) {
+ if (g.logging)
if (g.c_file_type == FIX)
(void)session->msg_printf(session,
"%-10s%" PRIu64 " {0x%02" PRIx8 "}",
- insert ? "insert" : "put",
- keyno, ((uint8_t *)value.data)[0]);
+ "put", keyno,
+ ((uint8_t *)value.data)[0]);
else
(void)session->msg_printf(session,
"%-10s%" PRIu64 " {%.*s}",
- insert ? "insert" : "put",
- keyno, (int)value.size, (char *)value.data);
- }
+ "put", keyno,
+ (int)value.size, (char *)value.data);
- if (bdb_put(key.data, key.size, value.data, value.size, &notfound))
- return (1);
-
cursor->set_key(cursor, keyno);
if (g.c_file_type == FIX)
cursor->set_value(cursor, *(uint8_t *)value.data);
else
cursor->set_value(cursor, &value);
- ret = cursor->insert(cursor);
+ ret = cursor->update(cursor);
if (ret != 0 && ret != WT_NOTFOUND) {
fprintf(stderr,
- "%s: wts_col_put: %s col %" PRIu64 " by key: %s\n",
- g.progname,
- insert ? "insert" : "put", keyno, wiredtiger_strerror(ret));
+ "%s: wts_col_put: %" PRIu64 " : %s\n",
+ g.progname, keyno, wiredtiger_strerror(ret));
return (1);
}
+ if (bdb_put(key.data, key.size, value.data, value.size, &notfound))
+ return (1);
+
NTF_CHK(wts_notfound_chk("wts_col_put", ret, notfound, keyno));
return (0);
}
/*
+ * wts_col_insert --
+ * Insert an element in a column-store file.
+ */
+static int
+wts_col_insert(uint64_t *keynop)
+{
+ static WT_ITEM key, value;
+ WT_CURSOR *cursor;
+ WT_SESSION *session;
+ uint64_t keyno;
+ int notfound, ret;
+
+ cursor = g.wts_cursor_insert;
+ session = g.wts_session;
+
+ value_gen(&value.data, &value.size, 0);
+
+ if (g.c_file_type == FIX)
+ cursor->set_value(cursor, *(uint8_t *)value.data);
+ else
+ cursor->set_value(cursor, &value);
+ ret = cursor->insert(cursor);
+ if (ret != 0) {
+ fprintf(stderr, "%s: wts_col_insert: %s\n",
+ g.progname, wiredtiger_strerror(ret));
+ return (1);
+ }
+ if ((ret = cursor->get_key(cursor, &keyno)) != 0) {
+ fprintf(stderr, "%s: cursor->get_key: %s\n",
+ g.progname, wiredtiger_strerror(ret));
+ return (1);
+ }
+ if (keyno <= g.c_rows) {
+ fprintf(stderr,
+ "%s: inserted key did not create new row\n", g.progname);
+ return (1);
+ }
+ g.c_rows = *keynop = (uint32_t)keyno;
+
+ if (g.logging)
+ if (g.c_file_type == FIX)
+ (void)session->msg_printf(session,
+ "%-10s%" PRIu64 " {0x%02" PRIx8 "}",
+ "insert", keyno,
+ ((uint8_t *)value.data)[0]);
+ else
+ (void)session->msg_printf(session,
+ "%-10s%" PRIu64 " {%.*s}",
+ "insert", keyno,
+ (int)value.size, (char *)value.data);
+
+ key_gen(&key.data, &key.size, keyno, 0);
+ return (bdb_put(
+ key.data, key.size, value.data, value.size, &notfound) ? 1 : 0);
+}
+
+/*
* wts_row_del --
* Delete an element from a row-store file.
*/