From 3d6953c361213c5bfab23e51ab274ce592edafe6 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Thu, 16 Jan 2020 15:56:44 +1100 Subject: Import wiredtiger: 723a4c13292b0bc7e27be411db4d006a0b865bd8 from branch mongodb-3.6 ref: 4c72feeb92..723a4c1329 for: 3.6.17 WT-4636 Fix strace in syscall test WT-5042 Reduce configuration parsing overhead from checkpoints WT-5120 Checkpoint hangs when reconciliation doesn't release the eviction generation WT-5135 Change lookaside file inserts to use cursor.insert WT-5218 Improve eviction to differentiate between clean and dirty pages with WT_CACHE_EVICT_NOKEEP readgen WT-5247 Ensure that only idempotent modify operations are logged WT-5277 Cursor key out-of-order detected in the lookaside file --- src/third_party/wiredtiger/dist/s_define.list | 1 + src/third_party/wiredtiger/dist/s_string.ok | 3 + src/third_party/wiredtiger/import.data | 8 +- src/third_party/wiredtiger/src/btree/bt_curnext.c | 14 +- src/third_party/wiredtiger/src/btree/bt_curprev.c | 8 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 224 ++++++++++++--------- src/third_party/wiredtiger/src/btree/bt_random.c | 10 +- src/third_party/wiredtiger/src/btree/bt_read.c | 8 +- src/third_party/wiredtiger/src/btree/bt_ret.c | 25 ++- src/third_party/wiredtiger/src/btree/bt_split.c | 9 +- src/third_party/wiredtiger/src/btree/col_modify.c | 4 +- src/third_party/wiredtiger/src/btree/col_srch.c | 24 +-- src/third_party/wiredtiger/src/btree/row_modify.c | 6 +- src/third_party/wiredtiger/src/btree/row_srch.c | 19 +- src/third_party/wiredtiger/src/cache/cache_las.c | 13 +- src/third_party/wiredtiger/src/conn/conn_dhandle.c | 43 +++- src/third_party/wiredtiger/src/include/btree.i | 7 + src/third_party/wiredtiger/src/include/cursor.i | 9 +- src/third_party/wiredtiger/src/include/dhandle.h | 1 + src/third_party/wiredtiger/src/include/extern.h | 19 +- .../wiredtiger/src/include/wiredtiger.in | 5 +- .../wiredtiger/src/include/wt_internal.h | 10 + src/third_party/wiredtiger/src/meta/meta_ckpt.c | 59 ++++-- .../wiredtiger/src/reconcile/rec_write.c | 13 +- src/third_party/wiredtiger/src/support/modify.c | 51 +++++ src/third_party/wiredtiger/src/txn/txn_log.c | 28 ++- src/third_party/wiredtiger/test/syscall/syscall.py | 2 +- .../wiredtiger/test/syscall/wt2336_base/base.run | 41 ++-- 28 files changed, 438 insertions(+), 226 deletions(-) diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index f199900e860..d585c1e268d 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -72,6 +72,7 @@ WT_TRACK_OP_END WT_TRACK_OP_INIT WT_TRET_ERROR_OK WT_UPDATE_SIZE +WT_USE_OPENAT WT_WITH_LOCK_NOWAIT WT_WITH_LOCK_WAIT __F diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index b3b95a4e50a..dc20a154981 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -153,6 +153,7 @@ Fsync Fuerst GCC GIDs +GLIBC Gcc Geoff GetEnvironmentVariableA @@ -1040,6 +1041,7 @@ online onpage oo opcode +openat opendir openfile oplist @@ -1200,6 +1202,7 @@ stdin stdout stepp str +strace strcmp strdup strerror diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 602318b9214..ae855e43736 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,6 +1,6 @@ { - "commit": "4c72feeb921607b30984301f4e007fc24b54e26b", - "github": "wiredtiger/wiredtiger.git", - "vendor": "wiredtiger", - "branch": "mongodb-3.6" + "vendor": "wiredtiger", + "github": "wiredtiger/wiredtiger.git", + "branch": "mongodb-3.6", + "commit": "723a4c13292b0bc7e27be411db4d006a0b865bd8" } diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index c9cccc63bf6..d80186ca91b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -150,7 +150,7 @@ new_page: if (cbt->ins == NULL) ++cbt->page_deleted_count; continue; } - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* NOTREACHED */ } @@ -211,7 +211,7 @@ new_page: /* Find the matching WT_COL slot. */ ++cbt->page_deleted_count; continue; } - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* @@ -331,7 +331,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* Check for the end of the page. */ @@ -468,8 +468,12 @@ __wt_cursor_key_order_check( * search. */ int -__wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__wt_cursor_key_order_init(WT_CURSOR_BTREE *cbt) { + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbt->iface.session; + /* * Cursor searches set the position for cursor movements, set the * last-key value for diagnostic checking. @@ -610,7 +614,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) * If the update, which returned prepared conflict is * visible, return the value. */ - return (__cursor_kv_return(session, cbt, upd)); + return (__cursor_kv_return(cbt, upd)); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index f72b935c441..daee4cef8f4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -296,7 +296,7 @@ new_page: if (cbt->ins == NULL) ++cbt->page_deleted_count; continue; } - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* NOTREACHED */ } @@ -358,7 +358,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) ++cbt->page_deleted_count; continue; } - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* @@ -488,7 +488,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); } /* Check for the beginning of the page. */ @@ -564,7 +564,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) * If the update, which returned prepared conflict is * visible, return the value. */ - return (__cursor_kv_return(session, cbt, upd)); + return (__cursor_kv_return(cbt, upd)); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 8f4f7982e3d..9d62ef50e92 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -362,13 +362,15 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid) * Column-store search from a cursor. */ static inline int -__cursor_col_search( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_REF *leaf) +__cursor_col_search(WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool *leaf_foundp) { WT_DECL_RET; + WT_SESSION_IMPL *session; - WT_WITH_PAGE_INDEX(session, - ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt, false)); + session = (WT_SESSION_IMPL *)cbt->iface.session; + WT_WITH_PAGE_INDEX( + session, ret = __wt_col_search( + cbt, cbt->iface.recno, leaf, false, leaf_foundp)); return (ret); } @@ -378,12 +380,15 @@ __cursor_col_search( */ static inline int __cursor_row_search( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool insert) + WT_CURSOR_BTREE *cbt, bool insert, WT_REF *leaf, bool *leaf_foundp) { WT_DECL_RET; + WT_SESSION_IMPL *session; - WT_WITH_PAGE_INDEX(session, ret = __wt_row_search( - session, &cbt->iface.key, leaf, cbt, insert, false)); + session = (WT_SESSION_IMPL *)cbt->iface.session; + WT_WITH_PAGE_INDEX( + session, ret = __wt_row_search( + cbt, &cbt->iface.key, insert, leaf, false, leaf_foundp)); return (ret); } @@ -392,11 +397,10 @@ __cursor_row_search( * Column-store modify from a cursor, with a separate value. */ static inline int -__cursor_col_modify_v(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) +__cursor_col_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) { - return (__wt_col_modify(session, cbt, - cbt->iface.recno, value, NULL, modify_type, false)); + return (__wt_col_modify( + cbt, cbt->iface.recno, value, NULL, modify_type, false)); } /* @@ -404,11 +408,10 @@ __cursor_col_modify_v(WT_SESSION_IMPL *session, * Row-store modify from a cursor, with a separate value. */ static inline int -__cursor_row_modify_v(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) +__cursor_row_modify_v(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) { - return (__wt_row_modify(session, cbt, - &cbt->iface.key, value, NULL, modify_type, false)); + return (__wt_row_modify( + cbt, &cbt->iface.key, value, NULL, modify_type, false)); } /* @@ -416,11 +419,11 @@ __cursor_row_modify_v(WT_SESSION_IMPL *session, * Column-store modify from a cursor. */ static inline int -__cursor_col_modify( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type) +__cursor_col_modify(WT_CURSOR_BTREE *cbt, u_int modify_type) { - return (__wt_col_modify(session, cbt, - cbt->iface.recno, &cbt->iface.value, NULL, modify_type, false)); + return (__wt_col_modify( + cbt, cbt->iface.recno, &cbt->iface.value, + NULL, modify_type, false)); } /* @@ -428,11 +431,11 @@ __cursor_col_modify( * Row-store modify from a cursor. */ static inline int -__cursor_row_modify( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type) +__cursor_row_modify(WT_CURSOR_BTREE *cbt, u_int modify_type) { - return (__wt_row_modify(session, cbt, - &cbt->iface.key, &cbt->iface.value, NULL, modify_type, false)); + return (__wt_row_modify( + cbt, &cbt->iface.key, &cbt->iface.value, + NULL, modify_type, false)); } /* @@ -483,7 +486,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; - bool valid; + bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -517,19 +520,19 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) __wt_txn_cursor_op(session); WT_ERR(btree->type == BTREE_ROW ? - __cursor_row_search(session, cbt, cbt->ref, false) : - __cursor_col_search(session, cbt, cbt->ref)); + __cursor_row_search(cbt, false, cbt->ref, &leaf_found) : + __cursor_col_search(cbt, cbt->ref, &leaf_found)); /* Return, if prepare conflict encountered. */ - if (cbt->compare == 0) + if (leaf_found && cbt->compare == 0) WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? - __cursor_row_search(session, cbt, NULL, false) : - __cursor_col_search(session, cbt, NULL)); + __cursor_row_search(cbt, false, NULL, NULL) : + __cursor_col_search(cbt, NULL, NULL)); /* Return, if prepare conflict encountered. */ if (cbt->compare == 0) @@ -537,7 +540,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) } if (valid) - ret = __cursor_kv_return(session, cbt, upd); + ret = __cursor_kv_return(cbt, upd); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length @@ -554,7 +557,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) #ifdef HAVE_DIAGNOSTIC if (ret == 0) - WT_ERR(__wt_cursor_key_order_init(session, cbt)); + WT_ERR(__wt_cursor_key_order_init(cbt)); #endif err: if (ret != 0) { @@ -578,7 +581,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact; - bool valid; + bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -610,34 +613,43 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * pages in the case of column-store, search-near isn't an interesting * enough case for column-store to add the complexity needed to avoid * the tree search. - * - * Set the "insert" flag for the btree row-store search; we may intend - * to position the cursor at the end of the tree, rather than match an - * existing record. */ valid = false; if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) { __wt_txn_cursor_op(session); - - WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); + /* + * Set the "insert" flag for the btree row-store search; we may + * intend to position the cursor at the end of the tree, rather + * than match an existing record. + */ + WT_ERR(__cursor_row_search(cbt, true, cbt->ref, &leaf_found)); /* - * Search-near is trickier than search when searching an already - * pinned page. If search returns the first or last page slots, - * discard the results and search the full tree as the neighbor - * pages might offer better matches. This test is simplistic as - * we're ignoring append lists (there may be no page slots or we - * might be legitimately positioned after the last page slot). - * Ignore those cases, it makes things too complicated. + * Only use the pinned page search results if search returns an + * exact match or a slot other than the page's boundary slots, + * if that's not the case, a neighbor page might offer a better + * match. This test is simplistic as we're ignoring append + * lists (there may be no page slots or we might be + * legitimately positioned after the last page slot). Ignore + * those cases, it makes things too complicated. */ - if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) + if (leaf_found && + (cbt->compare == 0 || + (cbt->slot != 0 && + cbt->slot != cbt->ref->page->entries - 1))) WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); + + /* + * Set the "insert" flag for the btree row-store search; we may + * intend to position the cursor at the end of the tree, rather + * than match an existing record. + */ WT_ERR(btree->type == BTREE_ROW ? - __cursor_row_search(session, cbt, NULL, true) : - __cursor_col_search(session, cbt, NULL)); + __cursor_row_search(cbt, true, NULL, NULL) : + __cursor_col_search(cbt, NULL, NULL)); WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } @@ -660,7 +672,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (valid) { exact = cbt->compare; - ret = __cursor_kv_return(session, cbt, upd); + ret = __cursor_kv_return(cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; @@ -711,7 +723,7 @@ err: if (ret == 0 && exactp != NULL) #ifdef HAVE_DIAGNOSTIC if (ret == 0) - WT_TRET(__wt_cursor_key_order_init(session, cbt)); + WT_TRET(__wt_cursor_key_order_init(cbt)); #endif if (ret != 0) { @@ -785,8 +797,8 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD) : - __cursor_col_modify(session, cbt, WT_UPDATE_STANDARD); + __cursor_row_modify(cbt, WT_UPDATE_STANDARD) : + __cursor_col_modify(cbt, WT_UPDATE_STANDARD); if (ret == 0) goto done; @@ -814,7 +826,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + WT_ERR(__cursor_row_search(cbt, true, NULL, NULL)); /* * If not overwriting, fail if the key exists, else insert the * key/value pair. @@ -826,7 +838,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(WT_DUPLICATE_KEY); } - ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD); + ret = __cursor_row_modify(cbt, WT_UPDATE_STANDARD); } else if (append_key) { /* * Optionally insert a new record (ignoring the application's @@ -835,11 +847,11 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ cbt->iface.recno = WT_RECNO_OOB; cbt->compare = 1; - WT_ERR(__cursor_col_search(session, cbt, NULL)); - WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD)); + WT_ERR(__cursor_col_search(cbt, NULL, NULL)); + WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD)); cursor->recno = cbt->recno; } else { - WT_ERR(__cursor_col_search(session, cbt, NULL)); + WT_ERR(__cursor_col_search(cbt, NULL, NULL)); /* * If not overwriting, fail if the key exists. Creating a @@ -856,7 +868,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(WT_DUPLICATE_KEY); } - WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD)); + WT_ERR(__cursor_col_modify(cbt, WT_UPDATE_STANDARD)); } err: if (ret == WT_RESTART) { @@ -940,7 +952,7 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) __cursor_novalue(cursor); retry: WT_ERR(__cursor_func_init(cbt, true)); - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + WT_ERR(__cursor_row_search(cbt, true, NULL, NULL)); /* Just check for conflicts. */ ret = __curfile_update_check(cbt); @@ -1045,8 +1057,8 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, WT_UPDATE_TOMBSTONE) : - __cursor_col_modify(session, cbt, WT_UPDATE_TOMBSTONE); + __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE) : + __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE); if (ret == 0) goto done; goto err; @@ -1068,7 +1080,7 @@ retry: if (positioned == POSITIONED) WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { - WT_ERR(__cursor_row_search(session, cbt, NULL, false)); + WT_ERR(__cursor_row_search(cbt, false, NULL, NULL)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); @@ -1079,9 +1091,9 @@ retry: if (positioned == POSITIONED) if (!valid) WT_ERR(WT_NOTFOUND); - ret = __cursor_row_modify(session, cbt, WT_UPDATE_TOMBSTONE); + ret = __cursor_row_modify(cbt, WT_UPDATE_TOMBSTONE); } else { - WT_ERR(__cursor_col_search(session, cbt, NULL)); + WT_ERR(__cursor_col_search(cbt, NULL, NULL)); /* * If we find a matching record, check whether an update would @@ -1109,8 +1121,7 @@ retry: if (positioned == POSITIONED) */ cbt->recno = cursor->recno; } else - ret = __cursor_col_modify( - session, cbt, WT_UPDATE_TOMBSTONE); + ret = __cursor_col_modify(cbt, WT_UPDATE_TOMBSTONE); } err: if (ret == WT_RESTART) { @@ -1139,7 +1150,7 @@ done: switch (positioned) { * Positioned and we did a search anyway, get a key to * return. */ - WT_TRET(__wt_key_return(session, cbt)); + WT_TRET(__wt_key_return(cbt)); break; } } @@ -1194,7 +1205,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) WT_DECL_RET; WT_SESSION_IMPL *session; uint64_t yield_count, sleep_usecs; - bool valid; + bool leaf_found, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -1227,8 +1238,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify_v(session, cbt, value, modify_type) : - __cursor_col_modify_v(session, cbt, value, modify_type); + __cursor_row_modify_v(cbt, value, modify_type) : + __cursor_col_modify_v(cbt, value, modify_type); if (ret == 0) goto done; @@ -1253,11 +1264,38 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) WT_ERR(__cursor_localvalue(cursor)); __cursor_state_save(cursor, &state); -retry: WT_ERR(__cursor_func_init(cbt, true)); + /* + * If our caller configures for a local search and we have a page + * pinned, do that search. + */ + if (F_ISSET(cursor, WT_CURSTD_UPDATE_LOCAL) + && __cursor_page_pinned(cbt)) { + __wt_txn_cursor_op(session); + WT_ERR(__wt_txn_autocommit_check(session)); + WT_ERR(btree->type == BTREE_ROW ? + __cursor_row_search(cbt, true, cbt->ref, &leaf_found) : + __cursor_col_search(cbt, cbt->ref, &leaf_found)); + /* + * Only use the pinned page search results if search returns an + * exact match or a slot other than the page's boundary slots, + * if that's not the case, a neighbor page might offer a better + * match. This test is simplistic as we're ignoring append + * lists (there may be no page slots or we might be + * legitimately positioned after the last page slot). Ignore + * those cases, it makes things too complicated. + */ + if (leaf_found && (cbt->compare == 0 + || (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1))) + goto update_local; + } +retry: + WT_ERR(__cursor_func_init(cbt, true)); + WT_ERR(btree->type == BTREE_ROW ? + __cursor_row_search(cbt, true, NULL, NULL) : + __cursor_col_search(cbt, NULL, NULL)); +update_local: if (btree->type == BTREE_ROW) { - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* * If not overwriting, check for conflicts and fail if the key * does not exist. @@ -1270,10 +1308,8 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); if (!valid) WT_ERR(WT_NOTFOUND); } - ret = __cursor_row_modify_v(session, cbt, value, modify_type); + ret = __cursor_row_modify_v(cbt, value, modify_type); } else { - WT_ERR(__cursor_col_search(session, cbt, NULL)); - /* * If not overwriting, fail if the key doesn't exist. If we * find an update for the key, check for conflicts. Update the @@ -1291,7 +1327,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } - ret = __cursor_col_modify_v(session, cbt, value, modify_type); + ret = __cursor_col_modify_v(cbt, value, modify_type); } err: if (ret == WT_RESTART) { @@ -1313,8 +1349,7 @@ done: switch (modify_type) { /* * WT_CURSOR.update returns a key and a value. */ - ret = __cursor_kv_return( - session, cbt, cbt->modify_update); + ret = __cursor_kv_return(cbt, cbt->modify_update); break; case WT_UPDATE_RESERVE: /* @@ -1327,7 +1362,7 @@ done: switch (modify_type) { * WT_CURSOR.modify has already created the return value * and our job is to leave it untouched. */ - ret = __wt_key_return(session, cbt); + ret = __wt_key_return(cbt); break; case WT_UPDATE_BIRTHMARK: case WT_UPDATE_TOMBSTONE: @@ -1670,13 +1705,15 @@ __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) * tree. */ static int -__cursor_truncate(WT_SESSION_IMPL *session, +__cursor_truncate( WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop, - int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int)) + int (*rmfunc)(WT_CURSOR_BTREE *, u_int)) { WT_DECL_RET; + WT_SESSION_IMPL *session; uint64_t yield_count, sleep_usecs; + session = (WT_SESSION_IMPL *)start->iface.session; yield_count = sleep_usecs = 0; /* @@ -1698,12 +1735,13 @@ __cursor_truncate(WT_SESSION_IMPL *session, * instantiated the end cursor, so we know that page is pinned in memory * and we can proceed without concern. */ -retry: WT_ERR(__wt_btcur_search(start)); +retry: + WT_ERR(__wt_btcur_search(start)); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); for (;;) { - WT_ERR(rmfunc(session, start, WT_UPDATE_TOMBSTONE)); + WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE)); if (stop != NULL && __cursor_equals(start, stop)) return (0); @@ -1727,14 +1765,16 @@ err: if (ret == WT_RESTART) { * Discard a cursor range from fixed-width column-store tree. */ static int -__cursor_truncate_fix(WT_SESSION_IMPL *session, +__cursor_truncate_fix( WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop, - int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int)) + int (*rmfunc)(WT_CURSOR_BTREE *, u_int)) { WT_DECL_RET; + WT_SESSION_IMPL *session; uint64_t yield_count, sleep_usecs; const uint8_t *value; + session = (WT_SESSION_IMPL *)start->iface.session; yield_count = sleep_usecs = 0; /* @@ -1763,7 +1803,7 @@ retry: WT_ERR(__wt_btcur_search(start)); for (;;) { value = (const uint8_t *)start->iface.value.data; if (*value != 0) - WT_ERR(rmfunc(session, start, WT_UPDATE_TOMBSTONE)); + WT_ERR(rmfunc(start, WT_UPDATE_TOMBSTONE)); if (stop != NULL && __cursor_equals(start, stop)) return (0); @@ -1797,6 +1837,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) btree = start->btree; WT_STAT_DATA_INCR(session, cursor_truncate); + WT_RET(__wt_txn_autocommit_check(session)); /* * For recovery, log the start and stop keys for a truncate operation, * not the individual records removed. On the other hand, for rollback @@ -1811,12 +1852,10 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) switch (btree->type) { case BTREE_COL_FIX: - WT_ERR(__cursor_truncate_fix( - session, start, stop, __cursor_col_modify)); + WT_ERR(__cursor_truncate_fix(start, stop, __cursor_col_modify)); break; case BTREE_COL_VAR: - WT_ERR(__cursor_truncate( - session, start, stop, __cursor_col_modify)); + WT_ERR(__cursor_truncate(start, stop, __cursor_col_modify)); break; case BTREE_ROW: /* @@ -1831,8 +1870,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) * that ever changes, we'd need to do something here to ensure a * fully instantiated cursor. */ - WT_ERR(__cursor_truncate( - session, start, stop, __cursor_row_modify)); + WT_ERR(__cursor_truncate(start, stop, __cursor_row_modify)); break; } diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index ed68513b245..a66cdb2c3b4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -13,16 +13,18 @@ * Return a random key from a row-store leaf page. */ int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__wt_row_random_leaf(WT_CURSOR_BTREE *cbt) { WT_INSERT *ins, **start, **stop; WT_INSERT_HEAD *ins_head; WT_PAGE *page; + WT_SESSION_IMPL *session; uint64_t samples; uint32_t choice, entries, i; int level; page = cbt->ref->page; + session = (WT_SESSION_IMPL *)cbt->iface.session; start = stop = NULL; /* [-Wconditional-uninitialized] */ entries = 0; /* [-Wconditional-uninitialized] */ @@ -425,11 +427,11 @@ random_page_entry: * Select a random entry from the leaf page. If it's not valid, move to * the next entry, if that doesn't work, move to the previous entry. */ - WT_ERR(__wt_row_random_leaf(session, cbt)); + WT_ERR(__wt_row_random_leaf(cbt)); WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); if (valid) { - WT_ERR(__wt_key_return(session, cbt)); - WT_ERR(__wt_value_return(session, cbt, upd)); + WT_ERR(__wt_key_return(cbt)); + WT_ERR(__wt_value_return(cbt, upd)); } else { if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) ret = __wt_btcur_prev(cbt, false); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 69b0f95d205..5b0cba71c9c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -31,9 +31,9 @@ __col_instantiate(WT_SESSION_IMPL *session, __wt_free_update_list(session, upd); /* Search the page and add updates. */ - WT_RET(__wt_col_search(session, recno, ref, cbt, true)); + WT_RET(__wt_col_search(cbt, recno, ref, true, NULL)); WT_RET(__wt_col_modify( - session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); + cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } @@ -60,9 +60,9 @@ __row_instantiate(WT_SESSION_IMPL *session, __wt_free_update_list(session, upd); /* Search the page and add updates. */ - WT_RET(__wt_row_search(session, key, ref, cbt, true, true)); + WT_RET(__wt_row_search(cbt, key, true, ref, true, NULL)); WT_RET(__wt_row_modify( - session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); + cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 60cb3d53699..8613bf38585 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -13,15 +13,17 @@ * Change the cursor to reference an internal return key. */ static inline int -__key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__key_return(WT_CURSOR_BTREE *cbt) { WT_CURSOR *cursor; WT_ITEM *tmp; WT_PAGE *page; WT_ROW *rip; + WT_SESSION_IMPL *session; page = cbt->ref->page; cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cbt->iface.session; if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -78,7 +80,7 @@ __key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * Change the cursor to reference an internal original-page return value. */ static inline int -__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__value_return(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CELL *cell; @@ -86,8 +88,10 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_PAGE *page; WT_ROW *rip; + WT_SESSION_IMPL *session; uint8_t v; + session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); page = cbt->ref->page; @@ -134,11 +138,12 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * value. */ int -__wt_value_return_upd(WT_SESSION_IMPL *session, +__wt_value_return_upd( WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) { WT_CURSOR *cursor; WT_DECL_RET; + WT_SESSION_IMPL *session; WT_UPDATE **listp, *list[WT_MODIFY_ARRAY_SIZE]; size_t allocated_bytes; u_int i; @@ -146,6 +151,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, cursor = &cbt->iface; allocated_bytes = 0; + session = (WT_SESSION_IMPL *)cbt->iface.session; /* * We're passed a "standard" or "modified" update that's visible to us. @@ -237,7 +243,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, */ WT_ASSERT(session, cbt->slot != UINT32_MAX); - WT_ERR(__value_return(session, cbt)); + WT_ERR(__value_return(cbt)); } } else if (upd->type == WT_UPDATE_TOMBSTONE) WT_ERR(__wt_buf_set(session, &cursor->value, "", 0)); @@ -262,7 +268,7 @@ err: if (allocated_bytes != 0) * Change the cursor to reference an internal return key. */ int -__wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +__wt_key_return(WT_CURSOR_BTREE *cbt) { WT_CURSOR *cursor; @@ -279,7 +285,7 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) */ F_CLR(cursor, WT_CURSTD_KEY_EXT); if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) { - WT_RET(__key_return(session, cbt)); + WT_RET(__key_return(cbt)); F_SET(cursor, WT_CURSTD_KEY_INT); } return (0); @@ -290,8 +296,7 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * Change the cursor to reference an internal return value. */ int -__wt_value_return( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_CURSOR *cursor; @@ -299,9 +304,9 @@ __wt_value_return( F_CLR(cursor, WT_CURSTD_VALUE_EXT); if (upd == NULL) - WT_RET(__value_return(session, cbt)); + WT_RET(__value_return(cbt)); else - WT_RET(__wt_value_return_upd(session, cbt, upd, false)); + WT_RET(__wt_value_return_upd(cbt, upd, false)); F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 0ea367fa360..80eacc95e02 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1476,11 +1476,10 @@ __split_multi_inmem( recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ - WT_ERR(__wt_col_search( - session, recno, ref, &cbt, true)); + WT_ERR(__wt_col_search(&cbt, recno, ref, true, NULL)); /* Apply the modification. */ - WT_ERR(__wt_col_modify(session, &cbt, + WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true)); break; case WT_PAGE_ROW_LEAF: @@ -1500,7 +1499,7 @@ __split_multi_inmem( /* Search the page. */ WT_ERR(__wt_row_search( - session, key, ref, &cbt, true, true)); + &cbt, key, true, ref, true, NULL)); /* * Birthmarks should only be applied to on-page values. @@ -1509,7 +1508,7 @@ __split_multi_inmem( upd->type != WT_UPDATE_BIRTHMARK); /* Apply the modification. */ - WT_ERR(__wt_row_modify(session, + WT_ERR(__wt_row_modify( &cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); break; WT_ILLEGAL_VALUE_ERR(session, orig->type); diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 233a88c9404..4183840a5f3 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -16,7 +16,7 @@ static int __col_insert_alloc( * Column-store delete, insert, and update. */ int -__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, +__wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { @@ -27,6 +27,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; WT_PAGE_MODIFY *mod; + WT_SESSION_IMPL *session; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; @@ -35,6 +36,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, btree = cbt->btree; ins = NULL; page = cbt->ref->page; + session = (WT_SESSION_IMPL *)cbt->iface.session; upd = upd_arg; append = logged = false; diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index e72ee7455da..ee49b5340a4 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -61,8 +61,9 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, * Search a column-store tree for a specific record-based key. */ int -__wt_col_search(WT_SESSION_IMPL *session, - uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore) +__wt_col_search( + WT_CURSOR_BTREE *cbt, uint64_t search_recno, + WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) { WT_BTREE *btree; WT_COL *cip; @@ -72,10 +73,12 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; + WT_SESSION_IMPL *session; uint64_t recno; uint32_t base, indx, limit, read_flags; int depth; + session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); current = NULL; @@ -93,23 +96,20 @@ __wt_col_search(WT_SESSION_IMPL *session, * the normal case where we are searching a tree, check the page's * parent keys before doing the full search, it's faster when the * cursor is being re-positioned. Skip this if the page is being - * re-instantiated in memory. + * re-instantiated in memory. when the cursor is being re-positioned. + * Skip that check if we know the page is the right one + * (for example, when re-instantiating a page in memory, in that + * case we know the target must be on the current page). */ if (leaf != NULL) { WT_ASSERT(session, search_recno != WT_RECNO_OOB); - if (!restore) { + if (!leaf_safe) { WT_RET(__check_leaf_key_range( session, recno, leaf, cbt)); - if (cbt->compare != 0) { - /* - * !!! - * WT_CURSOR.search_near uses the slot value to - * decide if there was an on-page match. - */ - cbt->slot = 0; + *leaf_foundp = cbt->compare == 0; + if (!*leaf_foundp) return (0); - } } current = leaf; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 0f89d09f948..c5904916e66 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -41,8 +41,8 @@ err: __wt_free(session, modify); * Row-store insert, update and delete. */ int -__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - const WT_ITEM *key, const WT_ITEM *value, +__wt_row_modify( + WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { WT_DECL_RET; @@ -50,6 +50,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; WT_PAGE_MODIFY *mod; + WT_SESSION_IMPL *session; WT_UPDATE *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; @@ -58,6 +59,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, ins = NULL; page = cbt->ref->page; + session = (WT_SESSION_IMPL *)cbt->iface.session; upd = upd_arg; logged = false; diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 5dff4b6fa60..5a582196557 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -205,9 +205,8 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, * Search a row-store tree for a specific key. */ int -__wt_row_search(WT_SESSION_IMPL *session, - WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, - bool insert, bool restore) +__wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, + bool insert, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) { WT_BTREE *btree; WT_COLLATOR *collator; @@ -218,11 +217,13 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; WT_ROW *rip; + WT_SESSION_IMPL *session; size_t match, skiphigh, skiplow; uint32_t base, indx, limit, read_flags; int cmp, depth; bool append_check, descend_right, done; + session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); collator = btree->collator; item = cbt->tmp; @@ -258,18 +259,12 @@ __wt_row_search(WT_SESSION_IMPL *session, * re-instantiated in memory. */ if (leaf != NULL) { - if (!restore) { + if (!leaf_safe) { WT_RET(__check_leaf_key_range( session, srch_key, leaf, cbt)); - if (cbt->compare != 0) { - /* - * !!! - * WT_CURSOR.search_near uses the slot value to - * decide if there was an on-page match. - */ - cbt->slot = 0; + *leaf_foundp = cbt->compare == 0; + if (!*leaf_foundp) return (0); - } } current = leaf; diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index b38da22fc4a..35a76210a0c 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -704,6 +704,12 @@ __wt_las_insert_block(WT_CURSOR *cursor, WT_ERR(__wt_txn_begin(session, NULL)); local_txn = true; + /* + * Inserts should be on the same page absent a split, search any pinned + * leaf page. + */ + F_SET(cursor, WT_CURSTD_UPDATE_LOCAL); + /* Enter each update in the boundary's list into the lookaside store. */ for (las_counter = 0, i = 0, list = multi->supd; i < multi->supd_entries; ++i, ++list) { @@ -799,10 +805,8 @@ __wt_las_insert_block(WT_CURSOR *cursor, upd->type, &las_value); /* - * Using update looks a little strange because the keys - * are guaranteed to not exist, but since we're - * appending, we want the cursor to stay positioned in - * between inserts. + * Using update instead of insert so the page stays + * pinned and can be searched before the tree. */ WT_ERR(cursor->update(cursor)); ++insert_cnt; @@ -831,6 +835,7 @@ err: /* Resolve the transaction. */ } __las_restore_isolation(session, saved_isolation); + F_CLR(cursor, WT_CURSTD_UPDATE_LOCAL); if (ret == 0 && insert_cnt > 0) { multi->page_las.las_pageid = las_pageid; diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index eeaa71683f1..aa1bd4cb08a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -25,6 +25,7 @@ __conn_dhandle_config_clear(WT_SESSION_IMPL *session) for (a = dhandle->cfg; *a != NULL; ++a) __wt_free(session, *a); __wt_free(session, dhandle->cfg); + __wt_free(session, dhandle->meta_base); } /* @@ -36,9 +37,12 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session) { WT_DATA_HANDLE *dhandle; WT_DECL_RET; - char *metaconf; + const char *base, *cfg[3]; + char *metaconf, *tmp; dhandle = session->dhandle; + base = NULL; + tmp = NULL; /* * Read the object's entry from the metadata file, we're done if we @@ -68,8 +72,40 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session) WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg)); switch (dhandle->type) { case WT_DHANDLE_TYPE_BTREE: + /* + * We are stripping out the checkpoint and checkpoint_lsn + * information from the config string. We save the rest of + * the metadata string, that is essentially static and + * unchanging and then concatenate the new checkpoint and + * LSN information on each checkpoint. The reason is + * performance and avoiding a lot of calls to the config + * parsing functions during a checkpoint for information + * that changes in a very well known way. + */ + cfg[0] = metaconf; + cfg[1] = "checkpoint=()"; + cfg[2] = NULL; WT_ERR(__wt_strdup(session, WT_CONFIG_BASE(session, file_meta), &dhandle->cfg[0])); + WT_ASSERT(session, dhandle->meta_base == NULL); + /* + * First collapse and overwrite any checkpoint information + * because we do not know the name or how many checkpoints + * may be in this metadata. So first we have to set the string + * to the empty checkpoint string and call collapse to + * overwrite anything existing. + */ + WT_ERR(__wt_config_collapse(session, cfg, &tmp)); + /* + * Now strip out the checkpoint and checkpoint LSN items + * from the configuration string and that is now our + * base metadata string. + */ + cfg[0] = tmp; + cfg[1] = NULL; + WT_ERR(__wt_config_merge(session, + cfg, "checkpoint=,checkpoint_lsn=", &base)); + __wt_free(session, tmp); break; case WT_DHANDLE_TYPE_TABLE: WT_ERR(__wt_strdup(session, @@ -77,9 +113,12 @@ __conn_dhandle_config_set(WT_SESSION_IMPL *session) break; } dhandle->cfg[1] = metaconf; + dhandle->meta_base = base; return (0); -err: __wt_free(session, metaconf); +err: __wt_free(session, base); + __wt_free(session, metaconf); + __wt_free(session, tmp); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 0984dc93d57..61fb79db907 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -516,6 +516,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_atomic_add32(&page->modify->page_state, 1) == WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); + /* + * In the event we dirty a page which is flagged for eviction + * soon, we update its read generation to avoid evicting a + * dirty page prematurely. + */ + if (page->read_gen == WT_READGEN_WONT_NEED) + __wt_cache_read_gen_new(session, page); /* * We won the race to dirty the page, but another thread could diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index cb665e17f5b..52a9736e383 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -315,11 +315,10 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) * Return a page referenced key/value pair to the application. */ static inline int -__cursor_kv_return( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__cursor_kv_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { - WT_RET(__wt_key_return(session, cbt)); - WT_RET(__wt_value_return(session, cbt, upd)); + WT_RET(__wt_key_return(cbt)); + WT_RET(__wt_value_return(cbt, upd)); return (0); } @@ -457,7 +456,7 @@ value: * (if any) is visible. */ if (upd != NULL) - return (__wt_value_return(session, cbt, upd)); + return (__wt_value_return(cbt, upd)); /* Else, simple values have their location encoded in the WT_ROW. */ if (__wt_row_leaf_value(page, rip, vb)) diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index f47db3f762c..4b58ed126d9 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -74,6 +74,7 @@ struct __wt_data_handle { uint64_t name_hash; /* Hash of name */ const char *checkpoint; /* Checkpoint name (or NULL) */ const char **cfg; /* Configuration information */ + const char *meta_base; /* Base metadata configuration */ /* * Sessions holding a connection's data handle will have a non-zero diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index afc89795500..251c9393c6c 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -95,7 +95,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config) WT_GCC_FUNC_DECL extern int __wt_compact(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_key_order_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cursor_key_order_init(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -157,7 +157,7 @@ extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNP extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_row_random_leaf(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int @@ -167,9 +167,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ); extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_value_return_upd(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_value_return_upd(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_key_return(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_value_return(WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -186,8 +186,8 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flag extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_custom_skip(WT_SESSION_IMPL *session, WT_REF **refp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_col_search(WT_CURSOR_BTREE *cbt, uint64_t search_recno, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_copy(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -195,12 +195,12 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, const WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_UPDATE **updp, size_t *sizep, u_int modify_type) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_las_empty(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); @@ -731,6 +731,7 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg); extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol); extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_modify_idempotent(const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_apply_api(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_modify_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 36f9ecff5c7..8828dd31f80 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -716,8 +716,9 @@ struct __wt_cursor { #define WT_CURSTD_OVERWRITE 0x02000u #define WT_CURSTD_RAW 0x04000u #define WT_CURSTD_RAW_SEARCH 0x08000u -#define WT_CURSTD_VALUE_EXT 0x10000u /* Value points out of tree. */ -#define WT_CURSTD_VALUE_INT 0x20000u /* Value points into tree. */ +#define WT_CURSTD_UPDATE_LOCAL 0x10000u +#define WT_CURSTD_VALUE_EXT 0x20000u /* Value points out of tree. */ +#define WT_CURSTD_VALUE_INT 0x40000u /* Value points into tree. */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index df3ee2da174..c5a4693319e 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -328,6 +328,16 @@ union __wt_rand_state; #elif defined(_MSC_VER) #include "msvc.h" #endif +/* + * GLIBC 2.26 and later use the openat syscall to implement open. + * Set this flag so that our strace tests know to expect this. + */ +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 26) +#define WT_USE_OPENAT 1 +#endif +#endif + #include "hardware.h" #include "swap.h" diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 13e84efc199..13467a6e635 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -14,7 +14,7 @@ static int __ckpt_load(WT_SESSION_IMPL *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *); static int __ckpt_named( WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *); -static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *); +static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *, bool); static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *); /* @@ -94,7 +94,7 @@ __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname) * metadata entry. If no entry is found to update and we're trying to * clear the checkpoint, just ignore it. */ - WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL)); + WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL, false)); return (0); } @@ -104,25 +104,46 @@ __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname) * Set a file's checkpoint. */ static int -__ckpt_set(WT_SESSION_IMPL *session, const char *fname, const char *v) +__ckpt_set( + WT_SESSION_IMPL *session, const char *fname, const char *v, bool use_base) { + WT_DECL_ITEM(tmp); WT_DECL_RET; - const char *cfg[3]; + const char *cfg[3], *str; char *config, *newcfg; + /* + * If the caller knows we're on a path like checkpoints where we + * have a valid checkpoint and checkpoint LSN and should use the base, + * then use that faster path. Some paths don't have a dhandle or want + * to have the older value retained from the existing metadata. + * In those cases, use the slower path through configuration + * parsing functions. + */ config = newcfg = NULL; - - /* Retrieve the metadata for this file. */ - WT_ERR(__wt_metadata_search(session, fname, &config)); - - /* Replace the checkpoint entry. */ - cfg[0] = config; - cfg[1] = v == NULL ? "checkpoint=()" : v; - cfg[2] = NULL; - WT_ERR(__wt_config_collapse(session, cfg, &newcfg)); - WT_ERR(__wt_metadata_update(session, fname, newcfg)); - -err: __wt_free(session, config); + str = v == NULL ? "checkpoint=(),checkpoint_lsn=" : v; + if (use_base && session->dhandle != NULL) { + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ASSERT(session, strcmp(session->dhandle->name, fname) == 0); + /* + * Concatenate the metadata base string with the checkpoint + * string. + */ + WT_ERR(__wt_buf_fmt(session, + tmp, "%s,%s", session->dhandle->meta_base, str)); + WT_ERR(__wt_metadata_update(session, fname, tmp->mem)); + } else { + /* Retrieve the metadata for this file. */ + WT_ERR(__wt_metadata_search(session, fname, &config)); + /* Replace the checkpoint entry. */ + cfg[0] = config; + cfg[1] = str; + cfg[2] = NULL; + WT_ERR(__wt_config_collapse(session, cfg, &newcfg)); + WT_ERR(__wt_metadata_update(session, fname, newcfg)); + } +err: __wt_scr_free(session, &tmp); + __wt_free(session, config); __wt_free(session, newcfg); return (ret); } @@ -375,6 +396,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, time_t secs; int64_t maxorder; const char *sep; + bool has_lsn; WT_ERR(__wt_scr_alloc(session, 0, &buf)); maxorder = 0; @@ -448,11 +470,14 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, sep = ","; } WT_ERR(__wt_buf_catfmt(session, buf, ")")); + + has_lsn = ckptlsn != NULL; if (ckptlsn != NULL) WT_ERR(__wt_buf_catfmt(session, buf, ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")", ckptlsn->l.file, (uintmax_t)ckptlsn->l.offset)); - WT_ERR(__ckpt_set(session, fname, buf->mem)); + + WT_ERR(__ckpt_set(session, fname, buf->mem, has_lsn)); err: __wt_scr_free(session, &buf); return (ret); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index d1aaf901534..5821292f454 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -4727,7 +4727,6 @@ __rec_col_var(WT_SESSION_IMPL *session, page = pageref->page; last = r->last; vpack = &_vpack; - cbt = &r->update_modify_cbt; WT_RET(__rec_split_init(session, r, page, pageref->ref_recno, btree->maxleafpage_precomp)); @@ -4737,6 +4736,8 @@ __rec_col_var(WT_SESSION_IMPL *session, size = 0; upd = NULL; + cbt = &r->update_modify_cbt; + cbt->iface.session = (WT_SESSION *)session; /* * The salvage code may be calling us to reconcile a page where there * were missing records in the column-store name space. If taking the @@ -4856,7 +4857,7 @@ record_loop: /* case WT_UPDATE_MODIFY: cbt->slot = WT_COL_SLOT(page, cip); WT_ERR(__wt_value_return_upd( - session, cbt, upd, + cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; @@ -5101,7 +5102,7 @@ compare: /* */ cbt->slot = UINT32_MAX; WT_ERR(__wt_value_return_upd( - session, cbt, upd, + cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; @@ -5503,6 +5504,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, btree = S2BT(session); cbt = &r->update_modify_cbt; + cbt->iface.session = (WT_SESSION *)session; slvg_skip = salvage == NULL ? 0 : salvage->skip; key = &r->k; @@ -5657,7 +5659,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, switch (upd->type) { case WT_UPDATE_MODIFY: cbt->slot = WT_ROW_SLOT(page, rip); - WT_ERR(__wt_value_return_upd(session, cbt, upd, + WT_ERR(__wt_value_return_upd(cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_ERR(__rec_cell_build_val(session, r, cbt->iface.value.data, @@ -5875,6 +5877,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) btree = S2BT(session); cbt = &r->update_modify_cbt; + cbt->iface.session = (WT_SESSION *)session; key = &r->k; val = &r->v; @@ -5915,7 +5918,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) */ cbt->slot = UINT32_MAX; WT_RET(__wt_value_return_upd( - session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); + cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_RET(__rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, (uint64_t)0)); diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index 15f8a33772f..228cfff7fea 100644 --- a/src/third_party/wiredtiger/src/support/modify.c +++ b/src/third_party/wiredtiger/src/support/modify.c @@ -8,6 +8,57 @@ #include "wt_internal.h" +#define WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) \ + do { \ + const size_t *__p = p; \ + const uint8_t *__data = (const uint8_t *)(__p + (size_t)(nentries)*3); \ + int __i; \ + for (__i = 0; __i < (nentries); ++__i) { \ + memcpy(&(mod).data.size, __p++, sizeof(size_t)); \ + memcpy(&(mod).offset, __p++, sizeof(size_t)); \ + memcpy(&(mod).size, __p++, sizeof(size_t)); \ + (mod).data.data = __data; \ + __data += (mod).data.size; \ + if (__i < (napplied)) \ + continue; + +#define WT_MODIFY_FOREACH_END \ + } \ + } \ + while (0) + +/* + * __wt_modify_idempotent -- + * Check if a modify operation is idempotent. + */ +bool +__wt_modify_idempotent(const void *modify) +{ + WT_MODIFY mod; + const size_t *p; + size_t tmp; + int nentries; + + /* Get the number of modify entries. */ + p = modify; + memcpy(&tmp, p++, sizeof(size_t)); + nentries = (int)tmp; + + WT_MODIFY_FOREACH_BEGIN (mod, p, nentries, 0) + { + /* + * If the number of bytes being replaced doesn't match the + * number of bytes being written, we're resizing and the + * operation isn't idempotent. + */ + if (mod.size != mod.data.size) + return (false); + } + WT_MODIFY_FOREACH_END; + + return (true); +} + /* * __wt_modify_pack -- * Pack a modify structure into a buffer. diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index cf8e464239a..50547034d02 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -91,12 +91,24 @@ __txn_op_log(WT_SESSION_IMPL *session, #endif switch (upd->type) { case WT_UPDATE_MODIFY: - WT_RET(__wt_logop_row_modify_pack( - session, logrec, fileid, &cursor->key, &value)); + /* + * Write full updates to the log for size-changing + * modify operations: they aren't idempotent and + * recovery cannot guarantee that they will be applied + * exactly once. We rely on the cursor value already + * having the modify applied. + */ + if (__wt_modify_idempotent(upd->data)) + WT_RET(__wt_logop_row_modify_pack(session, + logrec, fileid, &cursor->key, &value)); + else + WT_RET( + __wt_logop_row_put_pack(session, logrec, + fileid, &cursor->key, &cursor->value)); break; case WT_UPDATE_STANDARD: - WT_RET(__wt_logop_row_put_pack( - session, logrec, fileid, &cursor->key, &value)); + WT_RET(__wt_logop_row_put_pack(session, + logrec, fileid, &cursor->key, &value)); break; case WT_UPDATE_TOMBSTONE: WT_RET(__wt_logop_row_remove_pack( @@ -110,8 +122,12 @@ __txn_op_log(WT_SESSION_IMPL *session, switch (upd->type) { case WT_UPDATE_MODIFY: - WT_RET(__wt_logop_col_modify_pack( - session, logrec, fileid, recno, &value)); + if (__wt_modify_idempotent(upd->data)) + WT_RET(__wt_logop_col_modify_pack(session, + logrec, fileid, recno, &value)); + else + WT_RET(__wt_logop_col_put_pack(session, + logrec, fileid, recno, &cursor->value)); break; case WT_UPDATE_STANDARD: WT_RET(__wt_logop_col_put_pack( diff --git a/src/third_party/wiredtiger/test/syscall/syscall.py b/src/third_party/wiredtiger/test/syscall/syscall.py index 1fdf157b0fb..23bb54be5ef 100644 --- a/src/third_party/wiredtiger/test/syscall/syscall.py +++ b/src/third_party/wiredtiger/test/syscall/syscall.py @@ -156,7 +156,7 @@ defines_used = [ 'HAVE_FTRUNCATE', 'O_ACCMODE', 'O_APPEND', 'O_ASYNC', 'O_CLOEXEC', 'O_CREAT', 'O_EXCL', 'O_EXLOCK', 'O_NOATIME', 'O_NOFOLLOW', 'O_NONBLOCK', 'O_RDONLY', 'O_RDWR', 'O_SHLOCK', - 'O_TRUNC', 'O_WRONLY' ] + 'O_TRUNC', 'O_WRONLY', 'WT_USE_OPENAT' ] ################################################################ diff --git a/src/third_party/wiredtiger/test/syscall/wt2336_base/base.run b/src/third_party/wiredtiger/test/syscall/wt2336_base/base.run index 328d5b8b7dd..56794dc2777 100644 --- a/src/third_party/wiredtiger/test/syscall/wt2336_base/base.run +++ b/src/third_party/wiredtiger/test/syscall/wt2336_base/base.run @@ -32,11 +32,11 @@ */ #ifdef __linux__ SYSTEM("Linux"); -#define OPEN_EXISTING(name, flags) open(name, flags) +#define OPEN_EXISTING(name, flags) OPEN(name, flags) #else /* __linux__ */ SYSTEM("Darwin"); #define O_NOATIME 0 -#define OPEN_EXISTING(name, flags) open(name, flags, 0) +#define OPEN_EXISTING(name, flags) OPEN(name, flags, 0) #endif /* __linux__ */ #ifdef HAVE_FTRUNCATE @@ -49,14 +49,20 @@ SYSTEM("Darwin"); #define FTRUNCATE(fd, len) /* do nothing */ #endif -TRACE("close,fdatasync,fsync,ftruncate,open,pwrite64,rename"); +#ifdef WT_USE_OPENAT +#define OPEN(...) openat(AT_FDCWD, __VA_ARGS__) +#else +#define OPEN(...) open(__VA_ARGS__) +#endif + +TRACE("close,fdatasync,fsync,ftruncate,open,openat,pwrite64,rename"); RUN(""); ... OUTPUT("--------------wiredtiger_open"); // lock == 3 -lock = open("./WiredTiger.lock", O_RDWR|O_CREAT|O_CLOEXEC, 0666); +lock = OPEN("./WiredTiger.lock", O_RDWR|O_CREAT|O_CLOEXEC, 0666); pwrite64(lock, "WiredTiger lock file\n", 0x15, 0x0); -fd = open("./WiredTiger", O_RDWR|O_CREAT|O_CLOEXEC, 0666); +fd = OPEN("./WiredTiger", O_RDWR|O_CREAT|O_CLOEXEC, 0666); pwrite64(fd, "WiredTiger\nWiredTiger"..., ...); #ifdef __linux__ fdatasync(fd); @@ -65,7 +71,7 @@ close(fd); ... // On Linux, there are calls to open and read "/proc/meminfo" here. -fd = open("./WiredTiger.basecfg.set", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); +fd = OPEN("./WiredTiger.basecfg.set", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); pwrite64(fd, "# Do not modify this file."..., ...); #ifdef __linux__ fdatasync(fd); @@ -74,15 +80,15 @@ close(fd); rename("./WiredTiger.basecfg.set", "./WiredTiger.basecfg"); #ifdef __linux__ -dir = open("./", O_RDONLY); +dir = OPEN("./", O_RDONLY); fdatasync(dir); close(dir); #endif -fd = open("./WiredTiger.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); +fd = OPEN("./WiredTiger.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); #ifdef __linux__ -dir = open("./", O_RDONLY); +dir = OPEN("./", O_RDONLY); fdatasync(dir); close(dir); #endif /* __linux__ */ @@ -96,7 +102,7 @@ close(fd); wt = OPEN_EXISTING("./WiredTiger.wt\0", O_RDWR|O_NOATIME|O_CLOEXEC); FTRUNCATE(wt, 0x1000); -fd = open("./WiredTiger.turtle.set\0", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); +fd = OPEN("./WiredTiger.turtle.set\0", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); pwrite64(fd, "WiredTiger version string\nWiredTiger"..., ...); #ifdef __linux__ fdatasync(fd); @@ -106,10 +112,10 @@ rename("./WiredTiger.turtle.set", "./WiredTiger.turtle"); ... // There is a second open of turtle here, is it important? -fd = open("./WiredTigerLAS.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); +fd = OPEN("./WiredTigerLAS.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); #ifdef __linux__ -dir = open("./", O_RDONLY); +dir = OPEN("./", O_RDONLY); fdatasync(dir); close(dir); #endif /* __linux__ */ @@ -131,10 +137,7 @@ pwrite64(wt, ""..., 0x1000, 0x3000); #ifdef __linux__ fdatasync(wt); #endif /* __linux__ */ -fd = OPEN_EXISTING("./WiredTiger.turtle", O_RDWR|O_CLOEXEC); - -close(fd); -fd = open("./WiredTiger.turtle.set", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); +fd = OPEN("./WiredTiger.turtle.set", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0666); pwrite64(fd, "WiredTiger version string\nWiredTiger"..., ...); #ifdef __linux__ fdatasync(fd); @@ -142,7 +145,7 @@ fdatasync(fd); close(fd); rename("./WiredTiger.turtle.set", "./WiredTiger.turtle"); #ifdef __linux__ -dir = open("./", O_RDONLY); +dir = OPEN("./", O_RDONLY); fdatasync(dir); close(dir); fdatasync(wt); @@ -151,9 +154,9 @@ fdatasync(wt); OUTPUT("--------------open_session"); OUTPUT("--------------create"); -hello = open("./hello.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); +hello = OPEN("./hello.wt", O_RDWR|O_CREAT|O_EXCL|O_NOATIME|O_CLOEXEC, 0666); #ifdef __linux__ -dir = open("./", O_RDONLY); +dir = OPEN("./", O_RDONLY); fdatasync(dir); close(dir); #endif /* __linux__ */ -- cgit v1.2.1