diff options
Diffstat (limited to 'src/btree')
-rw-r--r-- | src/btree/bt_compact.c | 12 | ||||
-rw-r--r-- | src/btree/bt_curnext.c | 10 | ||||
-rw-r--r-- | src/btree/bt_curprev.c | 10 | ||||
-rw-r--r-- | src/btree/bt_cursor.c | 248 | ||||
-rw-r--r-- | src/btree/bt_debug.c | 8 | ||||
-rw-r--r-- | src/btree/bt_delete.c | 4 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 4 | ||||
-rw-r--r-- | src/btree/bt_handle.c | 22 | ||||
-rw-r--r-- | src/btree/bt_huffman.c | 2 | ||||
-rw-r--r-- | src/btree/bt_io.c | 2 | ||||
-rw-r--r-- | src/btree/bt_misc.c | 2 | ||||
-rw-r--r-- | src/btree/bt_ovfl.c | 2 | ||||
-rw-r--r-- | src/btree/bt_page.c | 2 | ||||
-rw-r--r-- | src/btree/bt_random.c | 5 | ||||
-rw-r--r-- | src/btree/bt_read.c | 26 | ||||
-rw-r--r-- | src/btree/bt_rebalance.c | 2 | ||||
-rw-r--r-- | src/btree/bt_ret.c | 12 | ||||
-rw-r--r-- | src/btree/bt_slvg.c | 2 | ||||
-rw-r--r-- | src/btree/bt_split.c | 182 | ||||
-rw-r--r-- | src/btree/bt_stat.c | 25 | ||||
-rw-r--r-- | src/btree/bt_sync.c | 26 | ||||
-rw-r--r-- | src/btree/bt_upgrade.c | 2 | ||||
-rw-r--r-- | src/btree/bt_vrfy.c | 2 | ||||
-rw-r--r-- | src/btree/bt_vrfy_dsk.c | 2 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 101 | ||||
-rw-r--r-- | src/btree/col_modify.c | 44 | ||||
-rw-r--r-- | src/btree/col_srch.c | 2 | ||||
-rw-r--r-- | src/btree/row_key.c | 4 | ||||
-rw-r--r-- | src/btree/row_modify.c | 62 | ||||
-rw-r--r-- | src/btree/row_srch.c | 2 |
30 files changed, 370 insertions, 459 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 2edcac76d0b..c6a412aa84e 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) */ if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) - __wt_writelock(session, &page->page_lock); + WT_PAGE_LOCK(session, page); if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, @@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); return (ret); } @@ -228,12 +228,8 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) bm, session, addr, addr_size, skipp); } - /* - * Reset the WT_REF state and push the change. The full-barrier isn't - * necessary, but it's better to keep pages in circulation than not. - */ + /* Reset the WT_REF state. */ ref->state = WT_REF_DISK; - WT_FULL_BARRIER(); return (ret); } diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 21e575ffca9..7b92a58991d 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -142,7 +142,7 @@ new_page: if (cbt->ins == NULL) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) continue; - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -205,7 +205,7 @@ new_page: /* Find the matching WT_COL slot. */ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -325,7 +325,7 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL) continue; - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -358,7 +358,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); - if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { + if (upd != NULL && upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index bf4bdad6529..55b5095fe91 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -288,7 +288,7 @@ new_page: if (cbt->ins == NULL) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) continue; - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -352,7 +352,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -482,7 +482,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL) continue; - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; @@ -517,7 +517,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); - if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { + if (upd != NULL && upd->type == WT_UPDATE_DELETED) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 944e276fc01..52435eeefed 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -64,29 +64,6 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt) } /* - * __cursor_copy_int_key -- - * If we're pointing into the tree, save the key into local memory. 
- */ -static inline int -__cursor_copy_int_key(WT_CURSOR *cursor) -{ - /* - * We're about to discard the cursor's position and the cursor layer - * might retry the operation. We discard pinned pages on error, which - * will invalidate pinned keys. Clear WT_CURSTD_KEY_INT in all cases, - * the underlying page is gone whether we can allocate memory or not. - */ - if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { - F_CLR(cursor, WT_CURSTD_KEY_INT); - if (!WT_DATA_IN_ITEM(&cursor->key)) - WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session, - &cursor->key, cursor->key.data, cursor->key.size)); - F_SET(cursor, WT_CURSTD_KEY_EXT); - } - return (0); -} - -/* * __cursor_size_chk -- * Return if an inserted item is too large. */ @@ -247,7 +224,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) + if (upd->type == WT_UPDATE_DELETED) return (false); if (updp != NULL) *updp = upd; @@ -320,7 +297,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) page->modify->mod_row_update != NULL && (upd = __wt_txn_read(session, page->modify->mod_row_update[cbt->slot])) != NULL) { - if (WT_UPDATE_DELETED_ISSET(upd)) + if (upd->type == WT_UPDATE_DELETED) return (false); if (updp != NULL) *updp = upd; @@ -366,10 +343,10 @@ __cursor_row_search( */ static inline int __cursor_col_modify( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove) + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type) { - return (__wt_col_modify(session, - cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove)); + return (__wt_col_modify(session, cbt, + cbt->iface.recno, &cbt->iface.value, NULL, modify_type, false)); } /* @@ -378,10 +355,10 @@ __cursor_col_modify( */ static inline int __cursor_row_modify( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove) + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type) { - return 
(__wt_row_modify(session, - cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove)); + return (__wt_row_modify(session, cbt, + &cbt->iface.key, &cbt->iface.value, NULL, modify_type, false)); } /* @@ -431,10 +408,14 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) __cursor_state_save(cursor, &state); /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key, then re-save the cursor state. + * The pinned page goes away if we search the tree, get a local copy of + * any pinned key and discard any pinned value, then re-save the cursor + * state. Done before searching pinned pages (unlike other cursor + * functions), because we don't anticipate applications searching for a + * key they currently have pinned. */ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + __cursor_novalue(cursor); __cursor_state_save(cursor, &state); /* @@ -516,10 +497,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) __cursor_state_save(cursor, &state); /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key, then re-save the cursor state. + * The pinned page goes away if we search the tree, get a local copy of + * any pinned key and discard any pinned value, then re-save the cursor + * state. Done before searching pinned pages (unlike other cursor + * functions), because we don't anticipate applications searching for a + * key they currently have pinned. 
*/ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + __cursor_novalue(cursor); __cursor_state_save(cursor, &state); /* @@ -640,8 +625,6 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); - __cursor_state_save(cursor, &state); - if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); @@ -658,6 +641,9 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) append_key = F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW; + /* Save the cursor state. */ + __cursor_state_save(cursor, &state); + /* * If inserting with overwrite configured, and positioned to an on-page * key, the update doesn't require another search. The cursor won't be @@ -676,28 +662,30 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, false) : - __cursor_col_modify(session, cbt, false); + __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD) : + __cursor_col_modify(session, cbt, WT_UPDATE_STANDARD); if (ret == 0) goto done; /* - * The pinned page goes away if we fail for any reason, make - * sure there's a local copy of any key. (Restart could still + * The pinned page goes away if we fail for any reason, get a + * local copy of any pinned key or value. (Restart could still * use the pinned page, but that's an unlikely path.) Re-save * the cursor state: we may retry but eventually fail. */ - WT_TRET(__cursor_copy_int_key(cursor)); + WT_TRET(__cursor_localkey(cursor)); + WT_TRET(__cursor_localvalue(cursor)); __cursor_state_save(cursor, &state); goto err; } /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key. Re-save the cursor state: we may retry but + * The pinned page goes away if we do a search, get a local copy of any + * pinned key or value. 
Re-save the cursor state: we may retry but * eventually fail. */ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + WT_ERR(__cursor_localvalue(cursor)); __cursor_state_save(cursor, &state); retry: WT_ERR(__cursor_func_init(cbt, true)); @@ -712,7 +700,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); - ret = __cursor_row_modify(session, cbt, false); + ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD); } else { /* * Optionally insert a new record (ignoring the application's @@ -735,7 +723,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); - WT_ERR(__cursor_col_modify(session, cbt, false)); + WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD)); if (append_key) cbt->iface.recno = cbt->recno; @@ -812,12 +800,13 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cursor->session; /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key. Unlike most of the btree cursor routines, - * we don't have to save/restore the cursor key state, none of the - * work done here changes the key state. + * The pinned page goes away if we do a search, get a local copy of any + * pinned key and discard any pinned value. Unlike most of the btree + * cursor routines, we don't have to save/restore the cursor key state, + * none of the work done here changes the cursor state. 
*/ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + __cursor_novalue(cursor); retry: WT_ERR(__cursor_func_init(cbt, true)); @@ -865,14 +854,15 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); - __cursor_state_save(cursor, &state); - /* * WT_CURSOR.remove has a unique semantic, the cursor stays positioned * if it starts positioned, otherwise clear the cursor on completion. */ positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); + /* Save the cursor state. */ + __cursor_state_save(cursor, &state); + /* * If remove positioned to an on-page key, the remove doesn't require * another search. We don't care about the "overwrite" configuration @@ -891,28 +881,33 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, true) : - __cursor_col_modify(session, cbt, true); + __cursor_row_modify(session, cbt, WT_UPDATE_DELETED) : + __cursor_col_modify(session, cbt, WT_UPDATE_DELETED); if (ret == 0) goto done; /* - * The pinned page goes away if we fail for any reason, make - * sure there's a local copy of any key. (Restart could still - * use the pinned page, but that's an unlikely path.) Re-save - * the cursor state: we may retry but eventually fail. + * The pinned page goes away if we fail for any reason, get a + * local copy of any pinned key and discard any value (remove + * discards any previous value on success or failure). (Restart + * could still use the pinned page, but that's an unlikely + * path.) Re-save the cursor state: we may retry but eventually + * fail. */ - WT_TRET(__cursor_copy_int_key(cursor)); + WT_TRET(__cursor_localkey(cursor)); + F_CLR(cursor, WT_CURSTD_VALUE_SET); __cursor_state_save(cursor, &state); goto err; } /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key. 
Re-save the cursor state: we may retry but - * eventually fail. + * The pinned page goes away if we do a search, get a local copy of any + * pinned key and discard any value (remove discards any previous + * value on success or failure). Re-save the cursor state: we may retry + * but eventually fail. */ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + F_CLR(cursor, WT_CURSTD_VALUE_SET); __cursor_state_save(cursor, &state); retry: WT_ERR(__cursor_func_init(cbt, true)); @@ -926,7 +921,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); - ret = __cursor_row_modify(session, cbt, true); + ret = __cursor_row_modify(session, cbt, WT_UPDATE_DELETED); } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); @@ -953,7 +948,8 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ cbt->recno = cursor->recno; } else - ret = __cursor_col_modify(session, cbt, true); + ret = __cursor_col_modify( + session, cbt, WT_UPDATE_DELETED); } err: if (ret == WT_RESTART) { @@ -987,11 +983,11 @@ done: /* } /* - * __wt_btcur_update -- + * __btcur_update -- * Update a record in the tree. */ -int -__wt_btcur_update(WT_CURSOR_BTREE *cbt) +static int +__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type) { WT_BTREE *btree; WT_CURFILE_STATE state; @@ -1003,19 +999,12 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; - WT_STAT_CONN_INCR(session, cursor_update); - WT_STAT_DATA_INCR(session, cursor_update); - WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); - - __cursor_state_save(cursor, &state); - - if (btree->type == BTREE_ROW) - WT_RET(__cursor_size_chk(session, &cursor->key)); - WT_RET(__cursor_size_chk(session, &cursor->value)); - /* It's no longer possible to bulk-load into the tree. */ __cursor_disable_bulk(session, btree); + /* Save the cursor state. 
*/ + __cursor_state_save(cursor, &state); + /* * If update positioned to an on-page key, the update doesn't require * another search. We don't care about the "overwrite" configuration @@ -1033,28 +1022,30 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, false) : - __cursor_col_modify(session, cbt, false); + __cursor_row_modify(session, cbt, modify_type) : + __cursor_col_modify(session, cbt, modify_type); if (ret == 0) goto done; /* - * The pinned page goes away if we fail for any reason, make - * sure there's a local copy of any key. (Restart could still + * The pinned page goes away if we fail for any reason, get a + * local copy of any pinned key or value. (Restart could still * use the pinned page, but that's an unlikely path.) Re-save * the cursor state: we may retry but eventually fail. */ - WT_TRET(__cursor_copy_int_key(cursor)); + WT_TRET(__cursor_localkey(cursor)); + WT_TRET(__cursor_localvalue(cursor)); __cursor_state_save(cursor, &state); goto err; } /* - * The pinned page goes away if we do a search, make sure there's a - * local copy of any key. Re-save the cursor state: we may retry but + * The pinned page goes away if we do a search, get a local copy of any + * pinned key or value. Re-save the cursor state: we may retry but * eventually fail. 
*/ - WT_ERR(__cursor_copy_int_key(cursor)); + WT_ERR(__cursor_localkey(cursor)); + WT_ERR(__cursor_localvalue(cursor)); __cursor_state_save(cursor, &state); retry: WT_ERR(__cursor_func_init(cbt, true)); @@ -1070,7 +1061,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } - ret = __cursor_row_modify(session, cbt, false); + ret = __cursor_row_modify(session, cbt, modify_type); } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); @@ -1089,7 +1080,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } - ret = __cursor_col_modify(session, cbt, false); + ret = __cursor_col_modify(session, cbt, modify_type); } err: if (ret == WT_RESTART) { @@ -1106,8 +1097,14 @@ err: if (ret == WT_RESTART) { * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ -done: if (ret == 0) - WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); +done: if (ret == 0) { + if (modify_type == WT_UPDATE_RESERVED) { + F_CLR(cursor, WT_CURSTD_VALUE_SET); + WT_TRET(__wt_key_return(session, cbt)); + } else + WT_TRET( + __wt_kv_return(session, cbt, cbt->modify_update)); + } if (ret != 0) { WT_TRET(__cursor_reset(cbt)); @@ -1118,6 +1115,59 @@ done: if (ret == 0) } /* + * __wt_btcur_reserve -- + * Reserve a record in the tree. + */ +int +__wt_btcur_reserve(WT_CURSOR_BTREE *cbt) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_SESSION_IMPL *session; + bool overwrite; + + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_STAT_CONN_INCR(session, cursor_reserve); + WT_STAT_DATA_INCR(session, cursor_reserve); + + /* WT_CURSOR.reserve is update-without-overwrite and a special value. 
*/ + overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); + F_CLR(cursor, WT_CURSTD_OVERWRITE); + ret = __btcur_update(cbt, WT_UPDATE_RESERVED); + if (overwrite) + F_SET(cursor, WT_CURSTD_OVERWRITE); + return (ret); +} + +/* + * __wt_btcur_update -- + * Update a record in the tree. + */ +int +__wt_btcur_update(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_SESSION_IMPL *session; + + btree = cbt->btree; + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_STAT_CONN_INCR(session, cursor_update); + WT_STAT_DATA_INCR(session, cursor_update); + WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); + + if (btree->type == BTREE_ROW) + WT_RET(__cursor_size_chk(session, &cursor->key)); + WT_RET(__cursor_size_chk(session, &cursor->value)); + + return (__btcur_update(cbt, WT_UPDATE_STANDARD)); +} + +/* * __wt_btcur_compare -- * Return a comparison between two cursors. */ @@ -1237,7 +1287,7 @@ __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) static int __cursor_truncate(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop, - int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool)) + int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int)) { WT_DECL_RET; @@ -1265,7 +1315,7 @@ retry: WT_RET(__wt_btcur_search(start)); F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); for (;;) { - if ((ret = rmfunc(session, start, 1)) != 0) + if ((ret = rmfunc(session, start, WT_UPDATE_DELETED)) != 0) break; if (stop != NULL && __cursor_equals(start, stop)) @@ -1292,7 +1342,7 @@ retry: WT_RET(__wt_btcur_search(start)); static int __cursor_truncate_fix(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop, - int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool)) + int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int)) { WT_DECL_RET; const uint8_t *value; @@ -1323,7 +1373,7 @@ retry: WT_RET(__wt_btcur_search(start)); for (;;) { value = 
(const uint8_t *)start->iface.value.data; if (*value != 0 && - (ret = rmfunc(session, start, 1)) != 0) + (ret = rmfunc(session, start, WT_UPDATE_DELETED)) != 0) break; if (stop != NULL && __cursor_equals(start, stop)) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d3f02e29b90..394ac6c7b84 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -689,8 +689,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean")); - WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked( - session, &page->page_lock) ? "locked" : "unlocked")); if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(ds->f(ds, ", keys-built")); @@ -985,8 +983,10 @@ static int __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) { for (; upd != NULL; upd = upd->next) - if (WT_UPDATE_DELETED_ISSET(upd)) + if (upd->type == WT_UPDATE_DELETED) WT_RET(ds->f(ds, "\tvalue {deleted}\n")); + else if (upd->type == WT_UPDATE_RESERVED) + WT_RET(ds->f(ds, "\tvalue {reserved}\n")); else if (hexbyte) { WT_RET(ds->f(ds, "\t{")); WT_RET(__debug_hex_byte(ds, diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index b55ad291c5e..4a88b672d47 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -333,7 +333,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) */ for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); - WT_UPDATE_DELETED_SET(upd); + upd->type = WT_UPDATE_DELETED; if (page_del == NULL) upd->txnid = WT_TXN_NONE; /* Globally visible */ diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index bab7b8145d6..bfa8eb25aac 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -98,7 +98,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock)); /* * If a root page split, there may be one or more pages linked from the @@ -254,6 +253,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_ovfl_discard_free(session, page); __wt_free(session, page->modify->ovfl_track); + __wt_spin_destroy(session, &page->modify->page_lock); __wt_free(session, page->modify); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index d76720b19ae..06fbd6b74c7 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -418,15 +418,13 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_compressor_config(session, &cval, &btree->compressor)); /* - * We do not use __wt_config_gets_none here because "none" - * and the empty string have different meanings. The - * empty string means inherit the system encryption setting - * and "none" means this table is in the clear even if the - * database is encrypted. 
If this is the metadata handle - * always inherit from the connection. + * We do not use __wt_config_gets_none here because "none" and the empty + * string have different meanings. The empty string means inherit the + * system encryption setting and "none" means this table is in the clear + * even if the database is encrypted. */ WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval)); - if (WT_IS_METADATA(btree->dhandle) || cval.len == 0) + if (cval.len == 0) btree->kencryptor = conn->kencryptor; else if (WT_STRING_MATCH("none", cval.str, cval.len)) btree->kencryptor = NULL; @@ -444,12 +442,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Initialize locks. */ - __wt_rwlock_init(session, &btree->ovfl_lock); + WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock)); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); - btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ btree->modified = false; /* Clean */ - btree->write_gen = ckpt->write_gen; /* Write generation */ + + btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ + btree->write_gen = ckpt->write_gen; /* Write generation */ + btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); return (0); } diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c index 918791d9c6e..c5cc9ccf0b0 100644 --- a/src/btree/bt_huffman.c +++ b/src/btree/bt_huffman.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index b5e4d52394a..262532a4eab 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index 3bec65c2567..04b607082d1 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index ae0da62af57..3d09f655c65 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index f20f6398e37..ca5f05fe3dc 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c index c5948ec4ab5..1bdf0fd1c8b 100644 --- a/src/btree/bt_random.c +++ b/src/btree/bt_random.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) */ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { n = skip; - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip)); if (n == skip) { if (skip == 0) break; diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 64874547b9c..de84a711019 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -90,7 +90,8 @@ __col_instantiate(WT_SESSION_IMPL *session, { /* Search the page and add updates. */ WT_RET(__wt_col_search(session, recno, ref, cbt)); - WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false)); + WT_RET(__wt_col_modify( + session, cbt, recno, NULL, upd, WT_UPDATE_STANDARD, false)); return (0); } @@ -104,7 +105,8 @@ __row_instantiate(WT_SESSION_IMPL *session, { /* Search the page and add updates. */ WT_RET(__wt_row_search(session, key, ref, cbt, true)); - WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false)); + WT_RET(__wt_row_modify( + session, cbt, key, NULL, upd, WT_UPDATE_STANDARD, false)); return (0); } @@ -127,7 +129,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; - uint32_t las_id, upd_size, session_flags; + uint32_t las_id, session_flags; + uint8_t upd_type; int exact; const uint8_t *p; @@ -188,10 +191,10 @@ __las_page_instantiate(WT_SESSION_IMPL *session, /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( - cursor, &upd_txnid, &upd_size, las_value)); - WT_ERR(__wt_update_alloc(session, - (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, - &upd, &incr)); + cursor, &upd_txnid, &upd_type, las_value)); + WT_ERR(__wt_update_alloc(session, las_value, &upd, &incr, + upd_type == WT_UPDATE_DELETED ? + WT_UPDATE_DELETED : WT_UPDATE_STANDARD)); total_incr += incr; upd->txnid = upd_txnid; @@ -586,15 +589,10 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. - * In-memory split of large pages is allowed while - * no_eviction is set on btree, whereas reconciliation - * is not allowed. 
*/ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - btree->lsm_primary || - (btree->evict_disabled > 0 && - !F_ISSET(btree, WT_BTREE_ALLOW_SPLITS))) + btree->evict_disabled > 0 || btree->lsm_primary) goto skip_evict; /* diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 68848c7c8f5..47c7888af35 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index f17fa1b85d1..7212de72d6e 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -147,9 +147,13 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; /* - * We may already have an internal key, in which case the cursor may - * not be set up to get another copy (for example, when we rely on a - * search-function result). + * We may already have an internal key and the cursor may not be set up + * to get another copy, so we have to leave it alone. Consider a cursor + * search followed by an update: the update doesn't repeat the search, + * it simply updates the currently referenced key's value. We will end + * up here with the correct internal key, but we can't "return" the key + * again even if we wanted to do the additional work, the cursor isn't + * set up for that because we didn't just complete a search. */ F_CLR(cursor, WT_CURSTD_KEY_EXT); if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) { diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 165f932afb2..eb39301abc7 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. 
+ * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 49043c8bab4..71346baee2e 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -31,143 +31,6 @@ typedef enum { } WT_SPLIT_ERROR_PHASE; /* - * __split_oldest_gen -- - * Calculate the oldest active split generation. - */ -static uint64_t -__split_oldest_gen(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_SESSION_IMPL *s; - uint64_t gen, oldest; - u_int i, session_cnt; - - conn = S2C(session); - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1; - i < session_cnt; - i++, s++) - if (((gen = s->split_gen) != 0) && gen < oldest) - oldest = gen; - - return (oldest); -} - -/* - * __wt_split_obsolete -- - * Check if it is safe to free / evict based on split generation. - */ -bool -__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) -{ - return (split_gen < __split_oldest_gen(session)); -} - -/* - * __split_stash_add -- - * Add a new entry into the session's split stash list. - */ -static int -__split_stash_add( - WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len) -{ - WT_CONNECTION_IMPL *conn; - WT_SPLIT_STASH *stash; - - WT_ASSERT(session, p != NULL); - - conn = S2C(session); - - /* Grow the list as necessary. 
*/ - WT_RET(__wt_realloc_def(session, &session->split_stash_alloc, - session->split_stash_cnt + 1, &session->split_stash)); - - stash = session->split_stash + session->split_stash_cnt++; - stash->split_gen = split_gen; - stash->p = p; - stash->len = len; - - (void)__wt_atomic_add64(&conn->split_stashed_bytes, len); - (void)__wt_atomic_add64(&conn->split_stashed_objects, 1); - - /* See if we can free any previous entries. */ - if (session->split_stash_cnt > 1) - __wt_split_stash_discard(session); - - return (0); -} - -/* - * __wt_split_stash_discard -- - * Discard any memory from a session's split stash that we can. - */ -void -__wt_split_stash_discard(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_SPLIT_STASH *stash; - uint64_t oldest; - size_t i; - - conn = S2C(session); - - /* Get the oldest split generation. */ - oldest = __split_oldest_gen(session); - - for (i = 0, stash = session->split_stash; - i < session->split_stash_cnt; - ++i, ++stash) { - if (stash->p == NULL) - continue; - if (stash->split_gen >= oldest) - break; - /* - * It's a bad thing if another thread is in this memory after - * we free it, make sure nothing good happens to that thread. - */ - (void)__wt_atomic_sub64(&conn->split_stashed_bytes, stash->len); - (void)__wt_atomic_sub64(&conn->split_stashed_objects, 1); - __wt_overwrite_and_free_len(session, stash->p, stash->len); - } - - /* - * If there are enough free slots at the beginning of the list, shuffle - * everything down. - */ - if (i > 100 || i == session->split_stash_cnt) - if ((session->split_stash_cnt -= i) > 0) - memmove(session->split_stash, stash, - session->split_stash_cnt * sizeof(*stash)); -} - -/* - * __wt_split_stash_discard_all -- - * Discard all memory from a session's split stash. 
- */ -void -__wt_split_stash_discard_all( - WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) -{ - WT_SPLIT_STASH *stash; - size_t i; - - /* - * This function is called during WT_CONNECTION.close to discard any - * memory that remains. For that reason, we take two WT_SESSION_IMPL - * arguments: session_safe is still linked to the WT_CONNECTION and - * can be safely used for calls to other WiredTiger functions, while - * session is the WT_SESSION_IMPL we're cleaning up. - */ - for (i = 0, stash = session->split_stash; - i < session->split_stash_cnt; - ++i, ++stash) - __wt_free(session_safe, stash->p); - - __wt_free(session_safe, session->split_stash); - session->split_stash_cnt = session->split_stash_alloc = 0; -} - -/* * __split_safe_free -- * Free a buffer if we can be sure no thread is accessing it, or schedule * it to be freed otherwise. @@ -177,13 +40,14 @@ __split_safe_free(WT_SESSION_IMPL *session, uint64_t split_gen, bool exclusive, void *p, size_t s) { /* We should only call safe free if we aren't pinning the memory. */ - WT_ASSERT(session, session->split_gen != split_gen); + WT_ASSERT(session, + __wt_session_gen(session, WT_GEN_SPLIT) != split_gen); /* * We have swapped something in a page: if we don't have exclusive * access, check whether there are other threads in the same tree. */ - if (!exclusive && __split_oldest_gen(session) > split_gen) + if (!exclusive && __wt_gen_oldest(session, WT_GEN_SPLIT) > split_gen) exclusive = true; if (exclusive) { @@ -191,7 +55,7 @@ __split_safe_free(WT_SESSION_IMPL *session, return (0); } - return (__split_stash_add(session, split_gen, p, s)); + return (__wt_stash_add(session, WT_GEN_SPLIT, split_gen, p, s)); } #ifdef HAVE_DIAGNOSTIC @@ -645,7 +509,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * generation to block splits in newly created pages, so get one. 
*/ WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, session->split_gen, false); + __split_ref_prepare(session, alloc_index, + __wt_session_gen(session, WT_GEN_SPLIT), false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -662,7 +527,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * after the new index is swapped into place in order to know that no * readers are looking at the old index. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + split_gen = __wt_gen_next(session, WT_GEN_SPLIT); root->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC @@ -848,7 +713,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * the new index is swapped into place in order to know that no readers * are looking at the old index. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + split_gen = __wt_gen_next(session, WT_GEN_SPLIT); parent->pg_intl_split_gen = split_gen; /* @@ -1173,7 +1038,8 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * generation to block splits in newly created pages, so get one. */ WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, session->split_gen, true); + __split_ref_prepare(session, alloc_index, + __wt_session_gen(session, WT_GEN_SPLIT), true); /* Split into the parent. */ if ((ret = __split_parent(session, page_ref, alloc_index->index, @@ -1194,7 +1060,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * after the new index is swapped into place in order to know that no * readers are looking at the old index. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + split_gen = __wt_gen_next(session, WT_GEN_SPLIT); page->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC @@ -1300,13 +1166,19 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; + /* + * The page will be marked dirty, and we can only lock a page + * with a modify structure. + */ + WT_RET(__wt_page_modify_init(session, parent)); + if (trylock) - WT_RET(__wt_try_writelock(session, &parent->page_lock)); + WT_RET(WT_PAGE_TRYLOCK(session, parent)); else - __wt_writelock(session, &parent->page_lock); + WT_PAGE_LOCK(session, parent); if (parent == ref->home) break; - __wt_writeunlock(session, &parent->page_lock); + WT_PAGE_UNLOCK(session, parent); } /* @@ -1329,7 +1201,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, *parentp = parent; return (0); -err: __wt_writeunlock(session, &parent->page_lock); +err: WT_PAGE_UNLOCK(session, parent); return (ret); } @@ -1345,7 +1217,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) if (hazard) ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref); - __wt_writeunlock(session, &parent->page_lock); + WT_PAGE_UNLOCK(session, parent); return (ret); } @@ -1558,8 +1430,8 @@ __split_multi_inmem( WT_ERR(__wt_col_search(session, recno, ref, &cbt)); /* Apply the modification. */ - WT_ERR(__wt_col_modify( - session, &cbt, recno, NULL, upd, false)); + WT_ERR(__wt_col_modify(session, + &cbt, recno, NULL, upd, WT_UPDATE_STANDARD, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ @@ -1580,8 +1452,8 @@ __split_multi_inmem( WT_ERR(__wt_row_search(session, key, ref, &cbt, true)); /* Apply the modification. 
*/ - WT_ERR(__wt_row_modify( - session, &cbt, key, NULL, upd, false)); + WT_ERR(__wt_row_modify(session, &cbt, + key, NULL, upd, WT_UPDATE_STANDARD, true)); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 0da0e0807bd..e3b9bbced48 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -178,7 +178,9 @@ __stat_page_col_var( */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; - if (WT_UPDATE_DELETED_ISSET(upd)) { + if (upd->type == WT_UPDATE_RESERVED) + continue; + if (upd->type == WT_UPDATE_DELETED) { if (!orig_deleted) { ++deleted_cnt; --entry_cnt; @@ -192,11 +194,14 @@ __stat_page_col_var( } /* Walk any append list. */ - WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) - if (WT_UPDATE_DELETED_ISSET(ins->upd)) + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { + if (ins->upd->type == WT_UPDATE_RESERVED) + continue; + if (ins->upd->type == WT_UPDATE_DELETED) ++deleted_cnt; else ++entry_cnt; + } WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt); WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt); @@ -263,7 +268,8 @@ __stat_page_row_leaf( * key on the page. */ WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page)) - if (!WT_UPDATE_DELETED_ISSET(ins->upd)) + if (ins->upd->type != WT_UPDATE_DELETED && + ins->upd->type != WT_UPDATE_RESERVED) ++entry_cnt; /* @@ -272,16 +278,19 @@ __stat_page_row_leaf( */ WT_ROW_FOREACH(page, rip, i) { upd = WT_ROW_UPDATE(page, rip); - if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd)) + if (upd == NULL || + (upd->type != WT_UPDATE_DELETED && + upd->type != WT_UPDATE_RESERVED)) ++entry_cnt; if (upd == NULL && (cell = __wt_row_leaf_value_cell(page, rip, NULL)) != NULL && __wt_cell_type(cell) == WT_CELL_VALUE_OVFL) - ++ovfl_cnt; + ++ovfl_cnt; /* Walk K/V pairs inserted after the on-page K/V pair. 
*/ WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip)) - if (!WT_UPDATE_DELETED_ISSET(ins->upd)) + if (ins->upd->type != WT_UPDATE_DELETED && + ins->upd->type != WT_UPDATE_RESERVED) ++entry_cnt; } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index ead6ccc4ac0..5b0bf53dc6c 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -179,22 +179,9 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ - WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); - - /* - * Sync for checkpoint allows splits to happen while the queue - * is being drained, but not reconciliation. We need to do this, - * since draining the queue can take long enough for hot pages - * to grow significantly larger than the configured maximum - * size. - */ - F_SET(btree, WT_BTREE_ALLOW_SPLITS); - ret = __wt_evict_file_exclusive_on(session); - F_CLR(btree, WT_BTREE_ALLOW_SPLITS); - WT_ERR(ret); - __wt_evict_file_exclusive_off(session); - - WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); + btree->checkpointing = WT_CKPT_PREPARE; + (void)__wt_gen_next_drain(session, WT_GEN_EVICT); + btree->checkpointing = WT_CKPT_RUNNING; /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; @@ -268,9 +255,8 @@ err: /* On error, clear any left-over tree walk. */ saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); - /* Clear the checkpoint flag and push the change. */ - if (btree->checkpointing != WT_CKPT_OFF) - WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF); + /* Clear the checkpoint flag. 
*/ + btree->checkpointing = WT_CKPT_OFF; __wt_spin_unlock(session, &btree->flush_lock); diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c index a9ff16ad496..a7fe3283218 100644 --- a/src/btree/bt_upgrade.c +++ b/src/btree/bt_upgrade.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 7475811adc5..21ba2d7a715 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index a4071c44aee..55c96bbed55 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 86484feb7c9..225e6812aa1 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -497,29 +497,21 @@ restart: /* } /* - * Optionally skip leaf pages: skip all leaf pages if - * WT_READ_SKIP_LEAF is set, when the skip-leaf-count - * variable is non-zero, skip some count of leaf pages. - * If this page is disk-based, crack the cell to figure - * out it's a leaf page without reading it. + * Optionally skip leaf pages: when the skip-leaf-count + * variable is non-zero, skip some count of leaf pages, + * then take the next leaf page we can. * - * If skipping some number of leaf pages, decrement the - * count of pages to zero, and then take the next leaf - * page we can. 
Be cautious around the page decrement, - * if for some reason don't take this particular page, - * we can take the next one, and, there are additional - * tests/decrements when we're about to return a leaf - * page. + * The reason to do some of this work here (rather than + * in our caller), is because we can look at the cell + * and know it's a leaf page without reading it into + * memory. If this page is disk-based, crack the cell + * to figure out it's a leaf page without reading it. */ - if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) - if (__ref_is_leaf(ref)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } + if (skipleafcntp != NULL && + *skipleafcntp > 0 && __ref_is_leaf(ref)) { + --*skipleafcntp; + break; + } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); @@ -626,34 +618,18 @@ descend: empty_internal = true; session, ref, &pindex); slot = pindex->entries - 1; } - } else { - /* - * At the lowest tree level (considering a leaf - * page), turn off the initial-descent state. - * Descent race tests are different when moving - * through the tree vs. the initial descent. - */ - initial_descent = false; - - /* - * Optionally skip leaf pages, the second half. - * We didn't have an on-page cell to figure out - * if it was a leaf page, we had to acquire the - * hazard pointer and look at the page. - */ - if (skipleafcntp != NULL || - LF_ISSET(WT_READ_SKIP_LEAF)) { - if (LF_ISSET(WT_READ_SKIP_LEAF)) - break; - if (*skipleafcntp > 0) { - --*skipleafcntp; - break; - } - } - - *refp = ref; - goto done; + continue; } + + /* + * The tree-walk restart code knows we return any leaf + * page we acquire (never hazard-pointer coupling on + * after acquiring a leaf page), and asserts no restart + * happens while holding a leaf page. This page must be + * returned to our caller. 
+ */ + *refp = ref; + goto done; } } @@ -690,8 +666,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session, * of leaf pages before returning. */ int -__wt_tree_walk_skip(WT_SESSION_IMPL *session, - WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) +__wt_tree_walk_skip( + WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) { - return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags)); + /* + * Optionally skip leaf pages, the second half. The tree-walk function + * didn't have an on-page cell it could use to figure out if the page + * was a leaf page or not, it had to acquire the hazard pointer and look + * at the page. The tree-walk code never acquires a hazard pointer on a + * leaf page without returning it, and it's not trivial to change that. + * So, the tree-walk code returns all leaf pages here and we deal with + * decrementing the count. + */ + do { + WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + + /* + * The walk skipped internal pages, any page returned must be a + * leaf page. + */ + if (*skipleafcntp > 0) + --*skipleafcntp; + } while (*skipleafcntp > 0); + + return (0); } diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 9ccb9728189..2a64ec03952 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -17,13 +17,14 @@ static int __col_insert_alloc( */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) + uint64_t recno, const WT_ITEM *value, + WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { + static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 }; WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; - WT_ITEM _value; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; @@ -37,14 +38,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, upd = upd_arg; append = logged = false; - /* This code expects a remove to have a NULL value. */ - if (is_remove) { - if (btree->type == BTREE_COL_FIX) { - value = &_value; - value->data = ""; - value->size = 1; - } else - value = NULL; + if (modify_type == WT_UPDATE_DELETED || + modify_type == WT_UPDATE_RESERVED) { + /* + * Fixed-size column-store doesn't have on-page deleted values, + * it's a nul byte. + */ + if (modify_type == WT_UPDATE_DELETED && + btree->type == BTREE_COL_FIX) { + modify_type = WT_UPDATE_STANDARD; + value = &col_fix_remove; + } } else { /* * There's some chance the application specified a record past @@ -83,11 +87,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ASSERT(session, upd_arg == NULL); /* Make sure the update can proceed. */ - WT_ERR(__wt_txn_update_check( - session, old_upd = cbt->ins->upd)); + WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. */ - WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_update_alloc(session, + value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; @@ -103,7 +107,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Serialize the update. 
*/ WT_ERR(__wt_update_serial( - session, page, &cbt->ins->upd, &upd, upd_size)); + session, page, &cbt->ins->upd, &upd, upd_size, false)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { @@ -147,8 +151,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, mod->mod_col_split_recno > recno)); if (upd_arg == NULL) { - WT_ERR( - __wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_update_alloc(session, + value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; @@ -185,15 +189,15 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, &cbt->recno, skipdepth)); + &ins, ins_size, &cbt->recno, skipdepth, exclusive)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, skipdepth)); + &ins, ins_size, skipdepth, exclusive)); } /* If the update was successful, add it to the in-memory log. */ - if (logged) + if (logged && modify_type != WT_UPDATE_RESERVED) WT_ERR(__wt_txn_log_op(session, cbt)); if (0) { diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index c72d66f8796..78ee367dc69 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 032fdf7d897..a016568898f 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -471,6 +471,8 @@ __wt_row_ikey_alloc(WT_SESSION_IMPL *session, { WT_IKEY *ikey; + WT_ASSERT(session, key != NULL); /* quiet clang scan-build */ + /* * Allocate memory for the WT_IKEY structure and the key, then copy * the key into place. diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index b1a81ca3d9f..cab07341a1c 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -15,18 +15,13 @@ int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_PAGE_MODIFY *modify; - conn = S2C(session); - WT_RET(__wt_calloc_one(session, &modify)); - /* - * Select a spinlock for the page; let the barrier immediately below - * keep things from racing too badly. - */ - modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS; + /* Initialize the spinlock for the page. */ + WT_ERR(__wt_spin_init(session, &modify->page_lock, "btree page")); /* * Multiple threads of control may be searching and deciding to modify @@ -37,8 +32,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) if (__wt_atomic_cas_ptr(&page->modify, NULL, modify)) __wt_cache_page_inmem_incr(session, page, sizeof(*modify)); else - __wt_free(session, modify); - return (0); +err: __wt_free(session, modify); + return (ret); } /* @@ -47,7 +42,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) */ int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) + const WT_ITEM *key, const WT_ITEM *value, + WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { WT_DECL_RET; WT_INSERT *ins; @@ -65,10 +61,6 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, upd = upd_arg; logged = false; - /* This code expects a remove to have a NULL value. 
*/ - if (is_remove) - value = NULL; - /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); mod = page->modify; @@ -99,8 +91,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, session, old_upd = *upd_entry)); /* Allocate a WT_UPDATE structure and transaction ID. */ - WT_ERR( - __wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_update_alloc(session, + value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; @@ -132,7 +124,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Serialize the update. */ WT_ERR(__wt_update_serial( - session, page, upd_entry, &upd, upd_size)); + session, page, upd_entry, &upd, upd_size, exclusive)); } else { /* * Allocate the insert array as necessary. @@ -170,8 +162,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins = ins; if (upd_arg == NULL) { - WT_ERR( - __wt_update_alloc(session, value, &upd, &upd_size)); + WT_ERR(__wt_update_alloc(session, + value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; @@ -207,10 +199,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Insert the WT_INSERT structure. */ WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, skipdepth)); + &ins, ins_size, skipdepth, exclusive)); } - if (logged) + if (logged && modify_type != WT_UPDATE_RESERVED) WT_ERR(__wt_txn_log_op(session, cbt)); if (0) { @@ -235,7 +227,7 @@ err: /* */ int __wt_row_insert_alloc(WT_SESSION_IMPL *session, - WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) + const WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) { WT_INSERT *ins; size_t ins_size; @@ -263,11 +255,10 @@ __wt_row_insert_alloc(WT_SESSION_IMPL *session, * Allocate a WT_UPDATE structure and associated value and fill it in. 
*/ int -__wt_update_alloc( - WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) +__wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, + WT_UPDATE **updp, size_t *sizep, u_int modify_type) { WT_UPDATE *upd; - size_t size; *updp = NULL; @@ -275,15 +266,18 @@ __wt_update_alloc( * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ - size = value == NULL ? 0 : value->size; - WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd)); - if (value == NULL) - WT_UPDATE_DELETED_SET(upd); + if (modify_type == WT_UPDATE_DELETED || + modify_type == WT_UPDATE_RESERVED) + WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd)); else { - upd->size = WT_STORE_SIZE(size); - if (size != 0) - memcpy(WT_UPDATE_DATA(upd), value->data, size); + WT_RET(__wt_calloc( + session, 1, sizeof(WT_UPDATE) + value->size, &upd)); + if (value->size != 0) { + upd->size = WT_STORE_SIZE(value->size); + memcpy(WT_UPDATE_DATA(upd), value->data, value->size); + } } + upd->type = (uint8_t)modify_type; *updp = upd; *sizep = WT_UPDATE_MEMSIZE(upd); diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 9c3d467340e..76bebde7de7 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2014-2017 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * |