diff options
author | Luke Chen <luke.chen@mongodb.com> | 2018-02-26 17:13:06 +1100 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2018-02-26 17:14:42 +1100 |
commit | b5b3fcc4c38cd289e9c0d3192946ed715d7e576f (patch) | |
tree | 4b3f175a28dd1fd7bde51a1eeaa57d132ad3b54a | |
parent | 0a0597248fa8234414c07ec6cd5a4fef51169148 (diff) | |
download | mongo-b5b3fcc4c38cd289e9c0d3192946ed715d7e576f.tar.gz |
Import wiredtiger: 8f5b5544d8e2ca861956a255cf079b3ad34ca0f3 from branch mongodb-3.8
ref: a6e72378a6..8f5b5544d8
for: 3.7.3
WT-1228 Improve performance of WT_SESSION::open_cursor
WT-3805 Avoid reading lookaside pages in truncate fast path
WT-3829 WiredTiger metadata can be logically inconsistent.
WT-3848 Enhance new prepare transaction API to enforce post conditions
WT-3850 Implement WT_SESSION::prepare_transaction
WT-3867 Bi-weekly WT codebase lint
WT-3901 Corruption of operation tracking log files
WT-3904 Reconsider error path in log server thread
WT-3905 Save the timestamp used for a checkpoint
WT-3912 fast-delete pages should re-instantiate the delete transaction's timestamp.
WT-3923 __wt_txn_context_prepare_check() requires API initialization
WT-3925 Fix test format operation selection code
WT-3926 Allow read_timestamp to be set after begin_transaction
WT-3927 LSM truncate operations are too slow.
WT-3932 WiredTiger memory allocation failure in js_test
WT-3933 test/format failure illegal WT_REF.state rolling back deleted page
88 files changed, 3667 insertions, 1013 deletions
diff --git a/src/third_party/wiredtiger/.gitignore b/src/third_party/wiredtiger/.gitignore index e55cf131f59..49e737fe301 100644 --- a/src/third_party/wiredtiger/.gitignore +++ b/src/third_party/wiredtiger/.gitignore @@ -125,6 +125,7 @@ _wiredtiger.pyd **/test/csuite/test_wt3184_dup_index_collator **/test/csuite/test_wt3338_partial_update **/test/csuite/test_wt3363_checkpoint_op_races +**/test/csuite/test_wt3874_pad_byte_collator **/test/cursor_order/cursor_order **/test/fops/t **/test/format/s_dumpcmp diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 93fdb0e6a4f..a8c869a7ee1 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -337,6 +337,8 @@ file_meta = file_config + [ the file checkpoint entries'''), Config('checkpoint_lsn', '', r''' LSN of the last checkpoint'''), + Config('checkpoint_timestamp', '', r''' + stable timestamp of the last checkpoint'''), Config('id', '', r''' the file's ID number'''), Config('version', '(major=0,minor=0)', r''' @@ -719,6 +721,13 @@ wiredtiger_open_statistics_log_configuration = [ ] session_config = [ + Config('cache_cursors', 'true', r''' + enable caching of cursors for reuse. Any calls to WT_CURSOR::close + for a cursor created in this session will mark the cursor + as cached and keep it available to be reused for later calls + to WT_SESSION::open_cursor. Cached cursors may be eventually + closed.''', + type='boolean'), Config('ignore_cache_size', 'false', r''' when set, operations performed by this session ignore the cache size and are not blocked when the cache is full. Note that use of this @@ -1152,7 +1161,7 @@ methods = { Transactions with higher values are less likely to abort''', min='-100', max='100'), Config('read_timestamp', '', r''' - read using the specified timestamp. The supplied value should not be + read using the specified timestamp. 
The supplied value must not be older than the current oldest timestamp. See @ref transaction_timestamps'''), Config('round_to_oldest', 'false', r''' @@ -1171,8 +1180,8 @@ methods = { 'WT_SESSION.commit_transaction' : Method([ Config('commit_timestamp', '', r''' set the commit timestamp for the current transaction. The supplied - value should not be older than the first commit timestamp set for the - current transaction. The value should also not be older than the + value must not be older than the first commit timestamp set for the + current transaction. The value must also not be older than the current oldest and stable timestamps. See @ref transaction_timestamps'''), Config('sync', '', r''' @@ -1189,7 +1198,7 @@ methods = { 'WT_SESSION.prepare_transaction' : Method([ Config('prepare_timestamp', '', r''' set the prepare timestamp for the updates of the current transaction. - The supplied value should not be older than any active read timestamps. + The supplied value must not be older than any active read timestamps. This configuration option is mandatory. See @ref transaction_timestamps'''), ]), @@ -1197,10 +1206,18 @@ methods = { 'WT_SESSION.timestamp_transaction' : Method([ Config('commit_timestamp', '', r''' set the commit timestamp for the current transaction. The supplied - value should not be older than the first commit timestamp set for the - current transaction. The value should also not be older than the + value must not be older than the first commit timestamp set for the + current transaction. The value must also not be older than the current oldest and stable timestamps. See @ref transaction_timestamps'''), + Config('read_timestamp', '', r''' + read using the specified timestamp. The supplied value must not be + older than the current oldest timestamp. This can only be set once + for a transaction. 
@ref transaction_timestamps'''), + Config('round_to_oldest', 'false', r''' + if read timestamp is earlier than oldest timestamp, + read timestamp will be rounded to oldest timestamp''', + type='boolean'), ]), 'WT_SESSION.rollback_transaction' : Method([]), @@ -1339,7 +1356,7 @@ methods = { \c oldest_timestamp and the read timestamps of all active readers, and \c stable returns the most recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. See @ref transaction_timestamps''', - choices=['all_committed','oldest','pinned','stable']), + choices=['all_committed','oldest','pinned','recovery','stable']), ]), 'WT_CONNECTION.set_timestamp' : Method([ @@ -1349,7 +1366,7 @@ methods = { timestamps greater than the specified value until the next commit moves the tracked commit timestamp forwards. This is only intended for use where the application is rolling back locally committed transactions. - The supplied value should not be older than the current oldest and + The supplied value must not be older than the current oldest and stable timestamps. See @ref transaction_timestamps'''), Config('force', 'false', r''' set timestamps even if they violate normal ordering requirements. @@ -1359,13 +1376,13 @@ methods = { future commits and queries will be no earlier than the specified timestamp. Supplied values must be monotonically increasing, any attempt to set the value to older than the current is silently ignored. - The supplied value should not be newer than the current + The supplied value must not be newer than the current stable timestamp. See @ref transaction_timestamps'''), Config('stable_timestamp', '', r''' checkpoints will not include commits that are newer than the specified timestamp in tables configured with \c log=(enabled=false). Supplied values must be monotonically increasing, any attempt to set the value to - older than the current is silently ignored. The supplied value should + older than the current is silently ignored. 
The supplied value must not be older than the current oldest timestamp. See @ref transaction_timestamps'''), ]), diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py index bfa4459d438..2404755a49d 100644 --- a/src/third_party/wiredtiger/dist/api_err.py +++ b/src/third_party/wiredtiger/dist/api_err.py @@ -58,6 +58,12 @@ errors = [ more than the configured cache size to complete. The operation may be retried; if a transaction is in progress, it should be rolled back and the operation retried in a new transaction.'''), + Error('WT_PREPARE_CONFLICT', -31808, + 'conflict with a prepared update', ''' + This error is generated when the application attempts to update + an already updated record which is in prepared state. An updated + record will be in prepared state, when the transaction that performed + the update is in prepared state.'''), ] # Update the #defines in the wiredtiger.in file. diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 5d2eb7427b6..07f890f450b 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -603,6 +603,7 @@ curtable cust customp cv +cval cx cxa dT diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 20f3c72b2b7..a630ebe3fa9 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -284,11 +284,13 @@ connection_stats = [ ########################################## # Cursor operations ########################################## + CursorStat('cursor_cache', 'cursors cached on close'), CursorStat('cursor_create', 'cursor create calls'), CursorStat('cursor_insert', 'cursor insert calls'), CursorStat('cursor_modify', 'cursor modify calls'), CursorStat('cursor_next', 'cursor next calls'), CursorStat('cursor_prev', 'cursor prev calls'), + CursorStat('cursor_reopen', 'cursors reused 
from cache'), CursorStat('cursor_remove', 'cursor remove calls'), CursorStat('cursor_reserve', 'cursor reserve calls'), CursorStat('cursor_reset', 'cursor reset calls'), @@ -299,6 +301,14 @@ connection_stats = [ CursorStat('cursor_update', 'cursor update calls'), ########################################## + # Cursor sweep + ########################################## + CursorStat('cursor_sweep', 'cursor sweeps'), + CursorStat('cursor_sweep_buckets', 'cursor sweep buckets'), + CursorStat('cursor_sweep_examined', 'cursor sweep cursors examined'), + CursorStat('cursor_sweep_closed', 'cursor sweep cursors closed'), + + ########################################## # Dhandle statistics ########################################## DhandleStat('dh_conn_handle_count', 'connection data handles currently active', 'no_clear,no_scale'), @@ -661,6 +671,7 @@ dsrc_stats = [ ########################################## # Cursor operations ########################################## + CursorStat('cursor_cache', 'cursors cached on close'), CursorStat('cursor_create', 'create calls'), CursorStat('cursor_insert', 'insert calls'), CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), @@ -670,6 +681,7 @@ dsrc_stats = [ CursorStat('cursor_prev', 'prev calls'), CursorStat('cursor_remove', 'remove calls'), CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), + CursorStat('cursor_reopen', 'cursors reused from cache'), CursorStat('cursor_reserve', 'reserve calls'), CursorStat('cursor_reset', 'reset calls'), CursorStat('cursor_restart', 'restarted searches'), @@ -716,6 +728,7 @@ dsrc_stats = [ ########################################## # Session operations ########################################## + SessionStat('session_cursor_cached', 'cached cursor count', 'no_clear,no_scale'), SessionStat('session_compact', 'object compaction'), SessionStat('session_cursor_open', 'open cursor count', 'no_clear,no_scale'), diff --git 
a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index 8e40b9f0b6c..62269b91263 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -900,6 +900,8 @@ transaction_ops(WT_SESSION *session_arg) error_check(session->commit_transaction(session, NULL)); /*! [transaction isolation] */ +#ifdef HAVE_TIMESTAMPS + { /*! [transaction prepare] */ /* * Prepare a transaction which guarantees a subsequent commit will @@ -911,9 +913,13 @@ transaction_ops(WT_SESSION *session_arg) error_check(session->begin_transaction(session, NULL)); cursor->set_key(cursor, "key"); cursor->set_value(cursor, "value"); - session->prepare_transaction(session, "prepare_timestamp=2a"); - error_check(session->commit_transaction(session, NULL)); + error_check(session->prepare_transaction( + session, "prepare_timestamp=2a")); + error_check(session->commit_transaction( + session, "commit_timestamp=2b")); /*! [transaction prepare] */ + } +#endif /*! [session isolation configuration] */ /* Open a session configured for read-uncommitted isolation. 
*/ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 7cf6268d421..e7683ae0963 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "a6e72378a60249228730667a2cba9a90c454b786", + "commit": "8f5b5544d8e2ca861956a255cf079b3ad34ca0f3", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.8" diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i index 64dbf7850b0..0bb5c013c6f 100644 --- a/src/third_party/wiredtiger/lang/java/java_doc.i +++ b/src/third_party/wiredtiger/lang/java/java_doc.i @@ -18,6 +18,8 @@ COPYDOC(__wt_cursor, WT_CURSOR, remove) COPYDOC(__wt_cursor, WT_CURSOR, reserve) COPYDOC(__wt_cursor, WT_CURSOR, close) COPYDOC(__wt_cursor, WT_CURSOR, reconfigure) +COPYDOC(__wt_cursor, WT_CURSOR, cache) +COPYDOC(__wt_cursor, WT_CURSOR, reopen) COPYDOC(__wt_async_op, WT_ASYNC_OP, get_key) COPYDOC(__wt_async_op, WT_ASYNC_OP, get_value) COPYDOC(__wt_async_op, WT_ASYNC_OP, set_key) diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 1372b964dbd..29725e22b2c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1579,7 +1579,7 @@ __cursor_truncate(WT_SESSION_IMPL *session, * may not have a cursor position (if the higher-level truncate code * switched the cursors to have an "external" cursor key, and because * we don't save a copy of the page's write generation information, - * which we need to remove records. + * which we need to remove records). 
* * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some @@ -1593,22 +1593,22 @@ __cursor_truncate(WT_SESSION_IMPL *session, * instantiated the end cursor, so we know that page is pinned in memory * and we can proceed without concern. */ -retry: WT_RET(__wt_btcur_search(start)); +retry: WT_ERR(__wt_btcur_search(start)); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); for (;;) { - if ((ret = rmfunc(session, start, WT_UPDATE_TOMBSTONE)) != 0) - break; + WT_ERR(rmfunc(session, start, WT_UPDATE_TOMBSTONE)); if (stop != NULL && __cursor_equals(start, stop)) - break; - if ((ret = __wt_btcur_next(start, true)) != 0) - break; + return (0); + + WT_ERR(__wt_btcur_next(start, true)); + start->compare = 0; /* Exact match */ } - if (ret == WT_RESTART) { +err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; @@ -1642,31 +1642,31 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, * may not have a cursor position (if the higher-level truncate code * switched the cursors to have an "external" cursor key, and because * we don't save a copy of the page's write generation information, - * which we need to remove records. + * which we need to remove records). * * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some * other thread of control; in that case, repeat the full search to * refresh the page's modification information. 
*/ -retry: WT_RET(__wt_btcur_search(start)); +retry: WT_ERR(__wt_btcur_search(start)); WT_ASSERT(session, F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); for (;;) { value = (const uint8_t *)start->iface.value.data; - if (*value != 0 && - (ret = rmfunc(session, start, WT_UPDATE_TOMBSTONE)) != 0) - break; + if (*value != 0) + WT_ERR(rmfunc(session, start, WT_UPDATE_TOMBSTONE)); if (stop != NULL && __cursor_equals(start, stop)) - break; - if ((ret = __wt_btcur_next(start, true)) != 0) - break; + return (0); + + WT_ERR(__wt_btcur_next(start, true)); + start->compare = 0; /* Exact match */ } - if (ret == WT_RESTART) { +err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index a728341e033..acfb88371dd 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -17,11 +17,13 @@ * * The way cursor truncate works in a row-store object is it explicitly reads * the first and last pages of the truncate range, then walks the tree with a - * flag so the cursor walk code marks any page within the range, that hasn't - * yet been read and which has no overflow items, as deleted, by changing the - * WT_REF state to WT_REF_DELETED. Pages already in the cache or with overflow - * items, have their rows updated/deleted individually. The transaction for the - * delete operation is stored in memory referenced by the WT_REF.page_del field. + * flag so the tree walk code skips reading eligible pages within the range + * and instead just marks them as deleted, by changing their WT_REF state to + * WT_REF_DELETED. Pages ineligible for this fast path include pages already + * in the cache, having overflow items, or requiring lookaside records. + * Ineligible pages are read and have their rows updated/deleted individually. 
+ * The transaction for the delete operation is stored in memory referenced by + * the WT_REF.page_del field. * * Future cursor walks of the tree will skip the deleted page based on the * transaction stored for the delete, but it gets more complicated if a read is @@ -45,8 +47,8 @@ * transaction list is no longer useful. For this reason, when the page is * instantiated by a read, a list of the WT_UPDATE structures on the page is * stored in the WT_REF.page_del field, with the transaction ID, that way the - * session unrolling the delete can find all of the WT_UPDATE structures that - * require update. + * session committing/unrolling the delete can find all WT_UPDATE structures + * that require update. * * One final note: pages can also be marked deleted if emptied and evicted. In * that case, the WT_REF state will be set to WT_REF_DELETED but there will not @@ -65,6 +67,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; + uint32_t previous_state; *skipp = false; @@ -72,7 +75,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { - WT_PUBLISH(ref->state, WT_REF_MEM); + ref->state = WT_REF_MEM; return (0); } @@ -80,15 +83,52 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) ret = __wt_evict(session, ref, false); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); + ret = 0; } /* - * Atomically switch the page's state to lock it. If the page is not - * on-disk, other threads may be using it, no fast delete. + * Fast check to see if it's worth locking, then atomically switch the + * page's state to lock it. 
*/ - if (ref->state != WT_REF_DISK || - !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) + previous_state = ref->state; + switch (previous_state) { + case WT_REF_DISK: + case WT_REF_LIMBO: + case WT_REF_LOOKASIDE: + break; + default: return (0); + } + if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + return (0); + switch (previous_state) { + case WT_REF_DISK: + break; + case WT_REF_LIMBO: + case WT_REF_LOOKASIDE: + if (__wt_las_page_skip_locked(session, ref)) + break; + /* FALLTHROUGH */ + default: + ref->state = previous_state; + return (0); + } + + /* + * If this WT_REF was previously part of a fast-delete operation, there + * may be existing page-delete information. The structure is only read + * after a WT_REF_DELETED state is switched to locked: immediately after + * locking (from a state other than WT_REF_DELETED), free the previous + * version. + * + * Note: changes have been made, we must publish any state change from + * this point on. + */ + if (ref->page_del != NULL) { + WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED); + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } /* * We cannot fast-delete pages that have overflow key/value items as @@ -115,27 +155,24 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); - /* - * Record the change in the transaction structure and set the change's - * transaction ID. - */ + /* Allocate and initialize the page-deleted structure. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); - ref->page_del->txnid = session->txn.id; + ref->page_del->previous_state = previous_state; - WT_ERR(__wt_txn_modify_ref(session, ref)); + WT_ERR(__wt_txn_modify_page_delete(session, ref)); *skipp = true; WT_STAT_CONN_INCR(session, rec_page_delete_fast); WT_STAT_DATA_INCR(session, rec_page_delete_fast); + + /* Publish the page to its new state, ensuring visibility. 
*/ WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); - /* - * Restore the page to on-disk status, we'll have to instantiate it. - */ - WT_PUBLISH(ref->state, WT_REF_DISK); + /* Publish the page to its previous state, ensuring visibility. */ + WT_PUBLISH(ref->state, previous_state); return (ret); } @@ -143,7 +180,7 @@ err: __wt_free(session, ref->page_del); * __wt_delete_page_rollback -- * Abort pages that were deleted without being instantiated. */ -void +int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **upd; @@ -157,21 +194,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) */ for (sleep_count = yield_count = 0;;) { switch (ref->state) { - case WT_REF_DISK: - case WT_REF_LIMBO: - case WT_REF_LOOKASIDE: - case WT_REF_READING: - WT_ASSERT(session, 0); /* Impossible, assert */ - break; case WT_REF_DELETED: /* * If the page is still "deleted", it's as we left it, * reset the state. */ - if (__wt_atomic_casv32( - &ref->state, WT_REF_DELETED, WT_REF_DISK)) - return; - break; + if (!__wt_atomic_casv32(&ref->state, + WT_REF_DELETED, ref->page_del->previous_state)) + break; + goto done; case WT_REF_LOCKED: /* * A possible state, the page is being instantiated. @@ -187,20 +218,21 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * with unresolved transactions, the page isn't going * anywhere. * - * The page is in an in-memory state, walk the list of + * The page is in an in-memory state, which means it + * was instantiated at some point. Walk the list of * update structures and abort them. */ for (upd = ref->page_del->update_list; *upd != NULL; ++upd) (*upd)->txnid = WT_TXN_ABORTED; - - /* - * Discard the memory, the transaction can't abort - * twice. 
- */ - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - return; + goto done; + case WT_REF_DISK: + case WT_REF_LIMBO: + case WT_REF_LOOKASIDE: + case WT_REF_READING: + default: + return (__wt_illegal_value(session, + "illegal WT_REF.state rolling back deleted page")); } /* * We wait for the change in page state, yield before retrying, @@ -211,6 +243,13 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, sleep_count); } + +done: /* + * Now mark the truncate aborted: this must come last because after + * this point there is nothing preventing the page from being evicted. + */ + WT_PUBLISH(ref->page_del->txnid, WT_TXN_ABORTED); + return (0); } /* @@ -261,7 +300,7 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) */ if (skip && ref->page_del != NULL && (visible_all || __wt_txn_visible_all(session, ref->page_del->txnid, - WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) { + WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } @@ -287,7 +326,6 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; - page_del = ref->page_del; /* * Give the page a modify structure. @@ -315,11 +353,12 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * * Second, a truncate call deleted a page and the truncate committed, * but an older transaction in the system forced us to keep the old - * version of the page around, then we crashed and recovered, and now - * we're being forced to read that page. + * version of the page around, then we crashed and recovered or we're + * running inside a checkpoint, and now we're being forced to read that + * page. * - * In the first case, we have a page reference structure, in the second - * second, we don't. 
+ * In the first case, we have a page reference structure, in the second, + * we don't. * * Allocate the per-reference update array; in the case of instantiating * a page, deleted by a running transaction that might eventually abort, @@ -327,19 +366,26 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ + page_del = ref->page_del; if (page_del != NULL) WT_RET(__wt_calloc_def( session, page->entries + 1, &page_del->update_list)); - /* Allocate the per-page update array. */ - WT_ERR(__wt_calloc_def(session, page->entries, &upd_array)); - page->modify->mod_row_update = upd_array; + /* + * Allocate the per-page update array if one doesn't already exist. + * Because deletes may be instantiated after lookaside table updates, + * the update array may already exist. + */ + if (page->modify->mod_row_update == NULL) + WT_ERR(__wt_calloc_def( + session, page->entries, &page->modify->mod_row_update)); /* * Fill in the per-reference update array with references to update * structures, fill in the per-page update array with references to * deleted items. */ + upd_array = page->modify->mod_row_update; for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); upd->type = WT_UPDATE_TOMBSTONE; @@ -348,6 +394,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) upd->txnid = WT_TXN_NONE; /* Globally visible */ else { upd->txnid = page_del->txnid; + __wt_timestamp_set( + &upd->timestamp, &page_del->timestamp); page_del->update_list[i] = upd; } @@ -362,12 +410,10 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) return (0); err: /* - * There's no need to free the page update structures on error, our - * caller will discard the page and do that work for us. 
We could - * similarly leave the per-reference update array alone because it - * won't ever be used by any page that's not in-memory, but cleaning - * it up makes sense, especially if we come back in to this function - * attempting to instantiate this page again. + * The page-delete update structure may have existed before we were + * called, and presumably might be in use by a running transaction. + * The list of update structures cannot have been created before we + * were called, and should not exist if we exit with an error. */ if (page_del != NULL) __wt_free(session, page_del->update_list); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 8219ee0d7ed..d191fec8502 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -465,14 +465,16 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) * and the deletion committed, but older transactions * in the system required the previous version of the * page to remain available, a special deleted-address - * type cell is written. The only reason we'd ever see - * that cell on a page we're reading is if we crashed - * and recovered (otherwise a version of the page w/o - * that cell would have eventually been written). If we - * crash and recover to a page with a deleted-address - * cell, we want to discard the page from the backing - * store (it was never discarded), and, of course, by - * definition no earlier transaction will ever need it. + * type cell is written. We'll see that cell on a page + * if we read from a checkpoint including a deleted + * cell or if we crash/recover and start off from such + * a checkpoint (absent running recovery, a version of + * the page without the deleted cell would eventually + * have been written). 
If we crash and recover to a + * page with a deleted-address cell, we want to discard + * the page from the backing store (it was never + * discarded), and, of course, by definition no earlier + * transaction will ever need it. * * Re-create the state of a deleted page. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 20e6c8c7b4d..4ac0cb2da9b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -67,100 +67,6 @@ __row_instantiate(WT_SESSION_IMPL *session, } /* - * __las_page_skip_locked -- - * Check if we can skip reading a locked page with lookaside entries. - */ -static inline bool -__las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_TXN *txn; - - txn = &session->txn; - - /* - * Skip lookaside pages if reading without a timestamp and all the - * updates in lookaside are in the past. - * - * Lookaside eviction preferentially chooses the newest updates when - * creating page images with no stable timestamp. If a stable timestamp - * has been set, we have to visit the page because eviction chooses old - * version of records in that case. - * - * One case where we may need to visit the page is if lookaside eviction - * is active in tree 2 when a checkpoint has started and is working its - * way through tree 1. In that case, lookaside may have created a page - * image with updates in the future of the checkpoint. - * - * We also need to instantiate a lookaside page if this is an update - * operation in progress. 
- */ - if (ref->page_las->invalid) - return (false); - - if (F_ISSET(txn, WT_TXN_UPDATE)) - return (false); - - if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) - return (false); - - if (WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) - return (false); - - if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest) - return (true); - -#ifdef HAVE_TIMESTAMPS - /* - * Skip lookaside pages if reading as of a timestamp, we evicted new - * versions of data and all the updates are in the past. - */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_newest && - __wt_timestamp_cmp( - &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0) - return (true); - - /* - * Skip lookaside pages if reading as of a timestamp, we evicted old - * versions of data and all the updates are in the future. - */ - if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_newest && - __wt_timestamp_cmp( - &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) - return (true); -#endif - - return (false); -} - -/* - * __las_page_skip -- - * Check if we can skip reading a page with lookaside entries. - */ -static inline bool -__las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) -{ - uint32_t previous_state; - bool skip; - - if ((previous_state = ref->state) != WT_REF_LIMBO && - previous_state != WT_REF_LOOKASIDE) - return (false); - - if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) - return (false); - - skip = __las_page_skip_locked(session, ref); - - /* Restore the state and push the change. */ - ref->state = previous_state; - WT_FULL_BARRIER(); - - return (skip); -} - -/* * __las_page_instantiate_verbose -- * Create a verbose message to display at most once per checkpoint when * performing a lookaside table read. 
@@ -571,23 +477,24 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) ref->page->dsk == NULL || F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE)); - /* - * If reading for a checkpoint, there's no additional work to do, the - * page on disk is correct as written. - */ - if (session->dhandle->checkpoint != NULL) { - WT_ASSERT(session, previous_state == WT_REF_DISK); - goto done; - } - skip_read: switch (previous_state) { case WT_REF_DELETED: - /* If the page was deleted, instantiate that information. */ + /* + * A fast-deleted page may also have lookaside information. The + * delete happened after page eviction (writing the lookaside + * information), first update based on the lookaside table and + * then apply the delete. + */ + if (ref->page_las != NULL) { + WT_ERR(__las_page_instantiate(session, ref, btree->id)); + ref->page_las->eviction_to_lookaside = false; + } + WT_ERR(__wt_delete_page_instantiate(session, ref)); break; case WT_REF_LOOKASIDE: - if (__las_page_skip_locked(session, ref)) { + if (__wt_las_page_skip_locked(session, ref)) { WT_STAT_CONN_INCR( session, cache_read_lookaside_skipped); ref->page_las->eviction_to_lookaside = true; @@ -601,21 +508,21 @@ skip_read: WT_STAT_CONN_INCR(session, cache_read_lookaside_delay); WT_ERR(__las_page_instantiate(session, ref, btree->id)); - - /* - * The page is instantiated so we no longer need the lookaside - * entries. Note we are discarding updates so the page must be - * marked available even if these operations fail. - * - * Don't free WT_REF.page_las, there may be concurrent readers. - */ - WT_TRET(__wt_las_remove_block( - session, btree->id, ref->page_las->las_pageid)); - ref->page_las->eviction_to_lookaside = false; break; } + /* + * We no longer need lookaside entries once the page is instantiated. + * There's no reason for the lookaside remove to fail, but ignore it + * if for some reason it fails, we've got a valid page. + * + * Don't free WT_REF.page_las, there may be concurrent readers. 
+ */ + if (final_state == WT_REF_MEM && ref->page_las != NULL) + WT_IGNORE_RET(__wt_las_remove_block( + session, btree->id, ref->page_las->las_pageid)); + done: WT_PUBLISH(ref->state, final_state); return (ret); @@ -685,7 +592,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * must be resolved before the tree can be * discarded. */ - if (__las_page_skip(session, ref)) { + if (__wt_las_page_skip(session, ref)) { __wt_tree_modify_set(session); return (WT_NOTFOUND); } @@ -778,7 +685,7 @@ read: /* if (current_state == WT_REF_LIMBO && ((!LF_ISSET(WT_READ_CACHE) || LF_ISSET(WT_READ_LOOKASIDE)) && - !__las_page_skip_locked(session, ref))) { + !__wt_las_page_skip_locked(session, ref))) { WT_RET(__wt_hazard_clear(session, ref)); goto read; } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 1f0b9c4b285..569a0247e7b 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -354,6 +354,102 @@ __wt_las_cursor_close( } /* + * __wt_las_page_skip_locked -- + * Check if we can skip reading a page with lookaside entries, where + * the page is already locked. + */ +bool +__wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_TXN *txn; + + txn = &session->txn; + + /* + * Skip lookaside pages if reading without a timestamp and all the + * updates in lookaside are in the past. + * + * Lookaside eviction preferentially chooses the newest updates when + * creating page images with no stable timestamp. If a stable timestamp + * has been set, we have to visit the page because eviction chooses old + * version of records in that case. + * + * One case where we may need to visit the page is if lookaside eviction + * is active in tree 2 when a checkpoint has started and is working its + * way through tree 1. In that case, lookaside may have created a page + * image with updates in the future of the checkpoint. 
+ * + * We also need to instantiate a lookaside page if this is an update + * operation in progress. + */ + if (ref->page_las->invalid) + return (false); + + if (F_ISSET(txn, WT_TXN_UPDATE)) + return (false); + + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + + if (WT_TXNID_LE(txn->snap_min, ref->page_las->las_max_txn)) + return (false); + + if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_newest) + return (true); + +#ifdef HAVE_TIMESTAMPS + /* + * Skip lookaside pages if reading as of a timestamp, we evicted new + * versions of data and all the updates are in the past. + */ + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && + ref->page_las->las_skew_newest && + __wt_timestamp_cmp( + &ref->page_las->onpage_timestamp, &session->txn.read_timestamp) < 0) + return (true); + + /* + * Skip lookaside pages if reading as of a timestamp, we evicted old + * versions of data and all the updates are in the future. + */ + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && + !ref->page_las->las_skew_newest && + __wt_timestamp_cmp( + &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) + return (true); +#endif + + return (false); +} + +/* + * __wt_las_page_skip -- + * Check if we can skip reading a page with lookaside entries, where the + * page needs to be locked before checking. + */ +bool +__wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) +{ + uint32_t previous_state; + bool skip; + + if ((previous_state = ref->state) != WT_REF_LIMBO && + previous_state != WT_REF_LOOKASIDE) + return (false); + + if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) + return (false); + + skip = __wt_las_page_skip_locked(session, ref); + + /* Restore the state and push the change. */ + ref->state = previous_state; + WT_FULL_BARRIER(); + + return (skip); +} + +/* * __las_remove_block -- * Remove all records for a given page from the lookaside store. 
*/ @@ -709,7 +805,7 @@ __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) * Because of the special visibility rules for * lookaside, a new block can appear in between our * search and the block of interest. Keep trying while - * we have a key lower that we expect. + * we have a key lower than we expect. * * There may be no block of lookaside entries if they * have been removed by diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c index 53f62daa5fb..b15bbdf83c7 100644 --- a/src/third_party/wiredtiger/src/config/config.c +++ b/src/third_party/wiredtiger/src/config/config.c @@ -744,9 +744,7 @@ int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value) { - static const WT_CONFIG_ITEM false_value = { - "", 0, 0, WT_CONFIG_ITEM_NUM - }; + WT_CONFIG_ITEM_STATIC_INIT(false_value); *value = false_value; value->val = def; diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 98f3ca6a633..880efb71fd5 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -34,6 +34,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_load_extension[] = { }; static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = { + { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "ignore_cache_size", "boolean", NULL, NULL, NULL, 0 }, { "isolation", "string", NULL, "choices=[\"read-uncommitted\",\"read-committed\"," @@ -45,7 +46,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = { static const WT_CONFIG_CHECK confchk_WT_CONNECTION_query_timestamp[] = { { "get", "string", NULL, "choices=[\"all_committed\",\"oldest\",\"pinned\"," - "\"stable\"]", + "\"recovery\",\"stable\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -450,6 +451,7 @@ static const WT_CONFIG_CHECK 
confchk_WT_SESSION_prepare_transaction[] = { }; static const WT_CONFIG_CHECK confchk_WT_SESSION_reconfigure[] = { + { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "ignore_cache_size", "boolean", NULL, NULL, NULL, 0 }, { "isolation", "string", NULL, "choices=[\"read-uncommitted\",\"read-committed\"," @@ -483,6 +485,8 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_snapshot[] = { static const WT_CONFIG_CHECK confchk_WT_SESSION_timestamp_transaction[] = { { "commit_timestamp", "string", NULL, NULL, NULL, 0 }, + { "read_timestamp", "string", NULL, NULL, NULL, 0 }, + { "round_to_oldest", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -592,6 +596,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "cache_resident", "boolean", NULL, NULL, NULL, 0 }, { "checkpoint", "string", NULL, NULL, NULL, 0 }, { "checkpoint_lsn", "string", NULL, NULL, NULL, 0 }, + { "checkpoint_timestamp", "string", NULL, NULL, NULL, 0 }, { "checksum", "string", NULL, "choices=[\"on\",\"off\",\"uncompressed\"]", NULL, 0 }, @@ -1213,8 +1218,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_CONNECTION_load_extension, 4 }, { "WT_CONNECTION.open_session", - "ignore_cache_size=false,isolation=read-committed", - confchk_WT_CONNECTION_open_session, 2 + "cache_cursors=true,ignore_cache_size=false," + "isolation=read-committed", + confchk_WT_CONNECTION_open_session, 3 }, { "WT_CONNECTION.query_timestamp", "get=all_committed", @@ -1340,8 +1346,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { NULL, 0 }, { "WT_SESSION.reconfigure", - "ignore_cache_size=false,isolation=read-committed", - confchk_WT_SESSION_reconfigure, 2 + "cache_cursors=true,ignore_cache_size=false," + "isolation=read-committed", + confchk_WT_SESSION_reconfigure, 3 }, { "WT_SESSION.rename", "", @@ -1368,8 +1375,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { NULL, 0 }, { "WT_SESSION.timestamp_transaction", - "commit_timestamp=", - 
confchk_WT_SESSION_timestamp_transaction, 1 + "commit_timestamp=,read_timestamp=,round_to_oldest=false", + confchk_WT_SESSION_timestamp_transaction, 3 }, { "WT_SESSION.transaction_sync", "timeout_ms=1200000", @@ -1412,18 +1419,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { "access_pattern_hint=none,allocation_size=4KB,app_metadata=," "assert=(commit_timestamp=none,read_timestamp=none)," "block_allocation=best,block_compressor=,cache_resident=false," - "checkpoint=,checkpoint_lsn=,checksum=uncompressed,collator=," - "columns=,dictionary=0,encryption=(keyid=,name=),format=btree," - "huffman_key=,huffman_value=,id=," - "ignore_in_memory_cache_size=false,internal_item_max=0," - "internal_key_max=0,internal_key_truncate=true," - "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," - "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," - "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0," - "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" - ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," - "value_format=u,version=(major=0,minor=0)", - confchk_file_meta, 40 + "checkpoint=,checkpoint_lsn=,checkpoint_timestamp=," + "checksum=uncompressed,collator=,columns=,dictionary=0," + "encryption=(keyid=,name=),format=btree,huffman_key=," + "huffman_value=,id=,ignore_in_memory_cache_size=false," + "internal_item_max=0,internal_key_max=0," + "internal_key_truncate=true,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," + "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," + "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," + "prefix_compression_min=4,split_deepen_min_child=0," + "split_deepen_per_child=0,split_pct=90,value_format=u," + "version=(major=0,minor=0)", + confchk_file_meta, 41 }, { "index.meta", "app_metadata=,collator=,columns=,extractor=,immutable=false," diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c 
b/src/third_party/wiredtiger/src/conn/api_strerror.c index 63f982deb07..fd488c02f80 100644 --- a/src/third_party/wiredtiger/src/conn/api_strerror.c +++ b/src/third_party/wiredtiger/src/conn/api_strerror.c @@ -38,6 +38,8 @@ __wt_wiredtiger_error(int error) return ("WT_RUN_RECOVERY: recovery must be run to continue"); case WT_CACHE_FULL: return ("WT_CACHE_FULL: operation would overflow cache"); + case WT_PREPARE_CONFLICT: + return ("WT_PREPARE_CONFLICT: conflict with a prepared update"); } /* Windows strerror doesn't support ENOTSUP. */ diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index d7355bffa59..acf88e8231e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -593,7 +593,7 @@ err: WT_DHANDLE_RELEASE(dhandle); */ static int __conn_dhandle_close_one(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, bool mark_dead) + const char *uri, const char *checkpoint, bool removed, bool mark_dead) { WT_DECL_RET; @@ -623,6 +623,8 @@ __conn_dhandle_close_one(WT_SESSION_IMPL *session, if (ret == 0) ret = __wt_meta_track_sub_off(session); } + if (removed) + F_SET(session->dhandle, WT_DHANDLE_DROPPED); if (!WT_META_TRACKING(session)) WT_TRET(__wt_session_release_dhandle(session)); @@ -637,7 +639,7 @@ __conn_dhandle_close_one(WT_SESSION_IMPL *session, */ int __wt_conn_dhandle_close_all( - WT_SESSION_IMPL *session, const char *uri, bool mark_dead) + WT_SESSION_IMPL *session, const char *uri, bool removed, bool mark_dead) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -655,7 +657,8 @@ __wt_conn_dhandle_close_all( * locking the live handle to fail fast if the tree is busy (e.g., with * cursors open or in a checkpoint). 
*/ - WT_ERR(__conn_dhandle_close_one(session, uri, NULL, mark_dead)); + WT_ERR(__conn_dhandle_close_one( + session, uri, NULL, removed, mark_dead)); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { @@ -665,7 +668,8 @@ __wt_conn_dhandle_close_all( continue; WT_ERR(__conn_dhandle_close_one( - session, dhandle->name, dhandle->checkpoint, mark_dead)); + session, dhandle->name, dhandle->checkpoint, removed, + mark_dead)); } err: session->dhandle = NULL; diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 9097e10ef5a..6e27d0f98d6 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -870,7 +870,7 @@ __log_server(void *arg) WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; - uint64_t time_start, time_stop, timediff; + uint64_t retry, time_start, time_stop, timediff; bool did_work, signalled; session = arg; @@ -896,6 +896,7 @@ __log_server(void *arg) * takes to sync out an earlier file. */ did_work = true; + retry = 0; while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * Slots depend on future activity. Force out buffered @@ -940,7 +941,24 @@ __log_server(void *arg) ret = __log_archive_once(session, 0); __wt_writeunlock( session, &log->log_archive_lock); - WT_ERR(ret); + /* + * It is possible that an external + * process on some systems may prevent + * removal. If we get a permission + * error, retry a few times. + */ + if (ret == EACCES && + retry < WT_RETRY_MAX) { + retry++; + ret = 0; + } else { + /* + * Return the error if there is + * one or reset on success. 
+ */ + WT_ERR(ret); + retry = 0; + } } else __wt_verbose(session, WT_VERB_LOG, "%s", "log_archive: Blocked due to open " diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index a2856843354..6fa87d47a91 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -175,6 +175,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) if (!F_ISSET(conn, WT_CONN_LEAK_MEMORY)) if ((s = conn->sessions) != NULL) for (i = 0; i < conn->session_size; ++s, ++i) { + __wt_free(session, s->cursor_cache); __wt_free(session, s->dhhash); __wt_stash_discard_all(session, s); __wt_free(session, s->hazard); diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 679d1474f8a..f25e3b48db4 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -57,7 +57,7 @@ __curbackup_reset(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cb = (WT_CURSOR_BACKUP *)cursor; - CURSOR_API_CALL(cursor, session, reset, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); cb->next = 0; F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); @@ -78,7 +78,7 @@ __curbackup_close(WT_CURSOR *cursor) cb = (WT_CURSOR_BACKUP *)cursor; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); /* * When starting a hot backup, we serialize hot backup cursors and set @@ -124,6 +124,8 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curbackup_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_BACKUP *cb; diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c 
index a7b7b2bf979..98c59392161 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_config.c +++ b/src/third_party/wiredtiger/src/cursor/cur_config.c @@ -15,7 +15,13 @@ static int __curconfig_close(WT_CURSOR *cursor) { - return (__wt_cursor_close(cursor)); + WT_DECL_RET; + WT_SESSION_IMPL *session; + + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); + WT_TRET(__wt_cursor_close(cursor)); + +err: API_END_RET(session, ret); } /* @@ -44,6 +50,8 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curconfig_close); WT_CURSOR_CONFIG *cconfig; WT_CURSOR *cursor; diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index bd0dcd02eef..1eb778ed0c9 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -239,7 +239,7 @@ __curds_reset(WT_CURSOR *cursor) source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - CURSOR_API_CALL(cursor, session, reset, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); @@ -444,7 +444,7 @@ __curds_close(WT_CURSOR *cursor) cds = (WT_CURSOR_DATA_SOURCE *)cursor; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); if (cds->source != NULL) ret = cds->source->close(cds->source); @@ -496,6 +496,8 @@ __wt_curds_open( __curds_remove, /* remove */ __curds_reserve, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c 
b/src/third_party/wiredtiger/src/cursor/cur_dump.c index a15bf2ae642..8853e6f30d6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_dump.c +++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c @@ -338,7 +338,7 @@ __curdump_close(WT_CURSOR *cursor) cdump = (WT_CURSOR_DUMP *)cursor; child = cdump->child; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); if (child != NULL) WT_TRET(child->close(child)); /* We shared the child's URI. */ @@ -374,6 +374,8 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) __curdump_remove, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curdump_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_DUMP *cdump; diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 9d6f031807f..a8255e26d78 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -167,7 +167,7 @@ __curfile_reset(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cbt = (WT_CURSOR_BTREE *)cursor; - CURSOR_API_CALL(cursor, session, reset, cbt->btree); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, cbt->btree); ret = __wt_btcur_reset(cbt); @@ -465,9 +465,20 @@ __curfile_close(WT_CURSOR *cursor) WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; + bool released; cbt = (WT_CURSOR_BTREE *)cursor; - CURSOR_API_CALL(cursor, session, close, cbt->btree); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, cbt->btree); + released = false; + + /* + * If releasing the cursor fails in any way, it will be left + * in a state that allows it to be normally closed. 
+ */ + WT_TRET(__wt_cursor_cache_release(session, cursor, &released)); + if (released) + return (0); + if (F_ISSET(cursor, WT_CURSTD_BULK)) { /* Free the bulk-specific resources. */ cbulk = (WT_CURSOR_BULK *)cbt; @@ -478,6 +489,10 @@ __curfile_close(WT_CURSOR *cursor) WT_TRET(__wt_btcur_close(cbt, false)); /* The URI is owned by the btree handle. */ cursor->internal_uri = NULL; + + WT_ASSERT(session, session->dhandle == NULL || + session->dhandle->session_inuse > 0); + WT_TRET(__wt_cursor_close(cursor)); /* @@ -494,6 +509,55 @@ err: API_END_RET(session, ret); } /* + * __curfile_cache -- + * WT_CURSOR->cache method for the btree cursor type. + */ +static int +__curfile_cache(WT_CURSOR *cursor) +{ + WT_CURSOR_BTREE *cbt; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + cbt = (WT_CURSOR_BTREE *)cursor; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_TRET(__wt_cursor_cache(cursor, cbt->btree->dhandle)); + WT_TRET(__wt_session_release_dhandle(session)); + return (ret); +} + +/* + * __curfile_reopen -- + * WT_CURSOR->reopen method for the btree cursor type. + */ +static int +__curfile_reopen(WT_CURSOR *cursor, bool check_only) +{ + WT_CURSOR_BTREE *cbt; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_SESSION_IMPL *session; + bool is_dead; + + is_dead = false; + cbt = (WT_CURSOR_BTREE *)cursor; + session = (WT_SESSION_IMPL *)cursor->session; + dhandle = cbt->btree->dhandle; + + if (!WT_DHANDLE_CAN_REOPEN(dhandle)) + ret = WT_NOTFOUND; + if (!check_only) { + session->dhandle = dhandle; + WT_TRET(__wt_session_lock_dhandle(session, 0, &is_dead)); + if (is_dead) + WT_TRET(WT_NOTFOUND); + __wt_cursor_reopen(cursor, dhandle); + } + return (ret); +} + +/* * __curfile_create -- * Open a cursor for a given btree handle. 
*/ @@ -520,6 +584,8 @@ __curfile_create(WT_SESSION_IMPL *session, __curfile_remove, /* remove */ __curfile_reserve, /* reserve */ __wt_cursor_reconfigure, /* reconfigure */ + __curfile_cache, /* cache */ + __curfile_reopen, /* reopen */ __curfile_close); /* close */ WT_BTREE *btree; WT_CONFIG_ITEM cval; @@ -528,10 +594,12 @@ __curfile_create(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk; WT_DECL_RET; size_t csize; + bool cacheable; WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0); cbt = NULL; + cacheable = F_ISSET(session, WT_SESSION_CACHE_CURSORS) && !bulk; btree = S2BT(session); WT_ASSERT(session, btree != NULL); @@ -545,6 +613,7 @@ __curfile_create(WT_SESSION_IMPL *session, cursor->internal_uri = btree->dhandle->name; cursor->key_format = btree->key_format; cursor->value_format = btree->value_format; + cursor->checkpoint = session->dhandle->checkpoint; cbt->btree = btree; /* @@ -588,6 +657,7 @@ __curfile_create(WT_SESSION_IMPL *session, session, cfg, "next_random_sample_size", 0, &cval)); if (cval.val != 0) cbt->next_random_sample_size = (u_int)cval.val; + cacheable = false; } /* Underlying btree initialization. */ @@ -603,6 +673,13 @@ __curfile_create(WT_SESSION_IMPL *session, S2C(session)->compat_major >= WT_LOG_V2) cursor->modify = __curfile_modify; + /* + * WiredTiger.wt should not be cached, doing so interferes + * with named checkpoints. 
+ */ + if (cacheable && !WT_STREQ(WT_METAFILE_URI, cursor->internal_uri)) + F_SET(cursor, WT_CURSTD_CACHEABLE); + WT_ERR(__wt_cursor_init( cursor, cursor->internal_uri, owner, cfg, cursorp)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index eb82effe702..9e75442a243 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -180,7 +180,7 @@ __curindex_reset(WT_CURSOR *cursor) u_int i; cindex = (WT_CURSOR_INDEX *)cursor; - JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_TRET(cindex->child->reset(cindex->child)); @@ -360,7 +360,7 @@ __curindex_close(WT_CURSOR *cursor) cindex = (WT_CURSOR_INDEX *)cursor; idx = cindex->index; - JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); if ((cp = cindex->cg_cursors) != NULL) for (i = 0, cp = cindex->cg_cursors; @@ -454,6 +454,8 @@ __wt_curindex_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curindex_close); /* close */ WT_CURSOR_INDEX *cindex; WT_CURSOR *cursor; diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 699774809f5..8f80ecba5f8 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -325,7 +325,7 @@ __curjoin_close(WT_CURSOR *cursor) cjoin = (WT_CURSOR_JOIN *)cursor; - JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); WT_TRET(__wt_schema_release_table(session, cjoin->table)); @@ -539,16 
+539,16 @@ __curjoin_extract_insert(WT_CURSOR *cursor) WT_ITEM ikey; WT_SESSION_IMPL *session; - cextract = (WT_CURJOIN_EXTRACTOR *)cursor; /* * This insert method may be called multiple times during a single * extraction. If we already have a definitive answer to the * membership question, exit early. */ + cextract = (WT_CURJOIN_EXTRACTOR *)cursor; if (cextract->ismember) return (0); - session = (WT_SESSION_IMPL *)cursor->session; + CURSOR_API_CALL(cursor, session, insert, NULL); WT_ITEM_SET(ikey, cursor->key); /* @@ -564,7 +564,7 @@ __curjoin_extract_insert(WT_CURSOR *cursor) else if (ret == 0) cextract->ismember = true; - return (ret); +err: API_END_RET(session, ret); } /* @@ -596,6 +596,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __wt_cursor_notsup); /* close */ WT_DECL_RET; WT_INDEX *idx; @@ -1211,7 +1213,7 @@ __curjoin_reset(WT_CURSOR *cursor) cjoin = (WT_CURSOR_JOIN *)cursor; - JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); if (cjoin->iter != NULL) WT_ERR(__curjoin_iter_reset(cjoin->iter)); @@ -1301,6 +1303,8 @@ __wt_curjoin_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curjoin_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_JOIN *cjoin; diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index 8f664f4c2cd..5c2fbd325f6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -292,13 +292,17 @@ static int __curlog_reset(WT_CURSOR *cursor) { WT_CURSOR_LOG 
*cl; + WT_DECL_RET; + WT_SESSION_IMPL *session; + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); cl = (WT_CURSOR_LOG *)cursor; cl->stepp = cl->stepp_end = NULL; cl->step_count = 0; WT_INIT_LSN(cl->cur_lsn); WT_INIT_LSN(cl->next_lsn); - return (0); + +err: API_END_RET(session, ret); } /* @@ -313,7 +317,7 @@ __curlog_close(WT_CURSOR *cursor) WT_DECL_RET; WT_SESSION_IMPL *session; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); cl = (WT_CURSOR_LOG *)cursor; conn = S2C(session); @@ -362,6 +366,8 @@ __wt_curlog_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curlog_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LOG *cl; diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c index 2536de8c455..c584c9c5dc3 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c +++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c @@ -47,12 +47,11 @@ __schema_source_config(WT_SESSION_IMPL *session, WT_ERR(__wt_scr_alloc(session, cval.len + 10, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str)); srch->set_key(srch, buf->data); - if ((ret = srch->search(srch)) == WT_NOTFOUND) - WT_ERR_MSG(session, EINVAL, - "metadata information for source configuration \"%s\" " - "not found", - (char *)buf->data); - WT_ERR(ret); + if ((ret = srch->search(srch)) != 0) + WT_ERR_MSG(session, ret, + "metadata information for source configuration" + " \"%s\" not found", + (const char *)buf->data); WT_ERR(srch->get_value(srch, &v)); WT_ERR(__wt_strdup(session, v, result)); @@ -73,6 +72,8 @@ static int __schema_create_collapse(WT_SESSION_IMPL *session, WT_CURSOR_METADATA *mdc, const char *key, const char *value, char **value_ret) { + WT_CONFIG cparser; 
+ WT_CONFIG_ITEM cgconf, ckey, cval; WT_CURSOR *c; WT_DECL_ITEM(buf); WT_DECL_RET; @@ -82,6 +83,19 @@ __schema_create_collapse(WT_SESSION_IMPL *session, WT_CURSOR_METADATA *mdc, lastcfg = cfg = &_cfg[3]; /* position on value */ c = NULL; if (key != NULL && WT_PREFIX_SKIP(key, "table:")) { + /* + * Check if the table has declared column groups. If it does, + * don't attempt to open the automatically created column + * group for simple tables. + */ + WT_RET(__wt_config_getones( + session, value, "colgroups", &cgconf)); + + __wt_config_subinit(session, &cparser, &cgconf); + if ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) + goto skip; + WT_RET_NOTFOUND_OK(ret); + c = mdc->create_cursor; WT_ERR(__wt_scr_alloc(session, 0, &buf)); /* @@ -90,12 +104,14 @@ __schema_create_collapse(WT_SESSION_IMPL *session, WT_CURSOR_METADATA *mdc, */ WT_ERR(__wt_buf_fmt(session, buf, "colgroup:%s", key)); c->set_key(c, buf->data); - if ((ret = c->search(c)) == 0) { - WT_ERR(c->get_value(c, &v)); - WT_ERR(__wt_strdup(session, v, --cfg)); - WT_ERR(__schema_source_config(session, c, v, --cfg)); - } else - WT_ERR_NOTFOUND_OK(ret); + if ((ret = c->search(c)) != 0) + WT_ERR_MSG(session, ret, + "metadata information for source configuration" + " \"%s\" not found", + (const char *)buf->data); + WT_ERR(c->get_value(c, &v)); + WT_ERR(__wt_strdup(session, v, --cfg)); + WT_ERR(__schema_source_config(session, c, v, --cfg)); } else if (key != NULL && WT_PREFIX_SKIP(key, "colgroup:")) { if (strchr(key, ':') != NULL) { c = mdc->create_cursor; @@ -104,7 +120,8 @@ __schema_create_collapse(WT_SESSION_IMPL *session, WT_CURSOR_METADATA *mdc, __schema_source_config(session, c, value, --cfg)); } } - firstcfg = cfg; + +skip: firstcfg = cfg; *--firstcfg = WT_CONFIG_BASE(session, WT_SESSION_create); WT_ERR(__wt_config_collapse(session, firstcfg, value_ret)); @@ -263,11 +280,20 @@ __curmetadata_next(WT_CURSOR *cursor) * all schema-level operations reflected in the results. 
Query * at read-uncommitted to avoid confusion caused by the current * transaction state. + * + * Don't exit from the scan if we find an incomplete entry: + * just skip over it. */ - WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, - ret = file_cursor->next(mdc->file_cursor)); - WT_ERR(ret); - WT_ERR(__curmetadata_setkv(mdc, file_cursor)); + for (;;) { + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->next(mdc->file_cursor)); + WT_ERR(ret); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __curmetadata_setkv(mdc, file_cursor)); + if (ret == 0) + break; + WT_ERR_NOTFOUND_OK(ret); + } } err: if (ret != 0) { @@ -299,12 +325,24 @@ __curmetadata_prev(WT_CURSOR *cursor) goto err; } - WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, - ret = file_cursor->prev(file_cursor)); - if (ret == 0) - WT_ERR(__curmetadata_setkv(mdc, file_cursor)); - else if (ret == WT_NOTFOUND) - WT_ERR(__curmetadata_metadata_search(session, cursor)); + /* + * Don't exit from the scan if we find an incomplete entry: + * just skip over it. 
+ */ + for (;;) { + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = file_cursor->prev(file_cursor)); + if (ret == WT_NOTFOUND) { + WT_ERR(__curmetadata_metadata_search(session, cursor)); + break; + } + WT_ERR(ret); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __curmetadata_setkv(mdc, file_cursor)); + if (ret == 0) + break; + WT_ERR_NOTFOUND_OK(ret); + } err: if (ret != 0) { F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA); @@ -327,7 +365,7 @@ __curmetadata_reset(WT_CURSOR *cursor) mdc = (WT_CURSOR_METADATA *)cursor; file_cursor = mdc->file_cursor; - CURSOR_API_CALL(cursor, session, + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, ((WT_CURSOR_BTREE *)file_cursor)->btree); if (F_ISSET(mdc, WT_MDC_POSITIONED) && !F_ISSET(mdc, WT_MDC_ONMETADATA)) @@ -363,7 +401,9 @@ __curmetadata_search(WT_CURSOR *cursor) WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, ret = file_cursor->search(file_cursor)); WT_ERR(ret); - WT_ERR(__curmetadata_setkv(mdc, file_cursor)); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __curmetadata_setkv(mdc, file_cursor)); + WT_ERR(ret); } err: if (ret != 0) { @@ -399,7 +439,9 @@ __curmetadata_search_near(WT_CURSOR *cursor, int *exact) WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, ret = file_cursor->search_near(file_cursor, exact)); WT_ERR(ret); - WT_ERR(__curmetadata_setkv(mdc, file_cursor)); + WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, + ret = __curmetadata_setkv(mdc, file_cursor)); + WT_ERR(ret); } err: if (ret != 0) { @@ -511,7 +553,7 @@ __curmetadata_close(WT_CURSOR *cursor) mdc = (WT_CURSOR_METADATA *)cursor; c = mdc->file_cursor; - CURSOR_API_CALL(cursor, session, close, c == NULL ? + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, c == NULL ? 
NULL : ((WT_CURSOR_BTREE *)c)->btree); if (c != NULL) @@ -555,6 +597,8 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, __curmetadata_remove, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curmetadata_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_METADATA *mdc; diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 39921e11edc..9cd0ee2c484 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -265,7 +265,7 @@ __curstat_reset(WT_CURSOR *cursor) WT_SESSION_IMPL *session; cst = (WT_CURSOR_STAT *)cursor; - CURSOR_API_CALL(cursor, session, reset, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); cst->notinitialized = cst->notpositioned = true; F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); @@ -320,7 +320,7 @@ __curstat_close(WT_CURSOR *cursor) size_t i; cst = (WT_CURSOR_STAT *)cursor; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); if (cst->cfg != NULL) { for (i = 0; cst->cfg[i] != NULL; ++i) @@ -581,6 +581,8 @@ __wt_curstat_open(WT_SESSION_IMPL *session, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curstat_close); /* close */ WT_CONFIG_ITEM cval, sval; WT_CURSOR *cursor; diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index d7b23be75d6..32c4588b9fa 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -21,6 +21,19 @@ __wt_cursor_noop(WT_CURSOR *cursor) } /* + * __wt_cursor_cached -- + * No actions on a closed and cached cursor are allowed. 
+ */ +int +__wt_cursor_cached(WT_CURSOR *cursor) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cursor->session; + WT_RET_MSG(session, ENOTSUP, "Cursor has been closed"); +} + +/* * __wt_cursor_notsup -- * Unsupported cursor actions. */ @@ -135,6 +148,18 @@ __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) } /* + * __wt_cursor_reopen_notsup -- + * Unsupported cursor reopen. + */ +int +__wt_cursor_reopen_notsup(WT_CURSOR *cursor, bool check_only) +{ + WT_UNUSED(check_only); + + return (__wt_cursor_notsup(cursor)); +} + +/* * __wt_cursor_set_notsup -- * Reset the cursor methods to not-supported. */ @@ -557,6 +582,237 @@ err: cursor->saved_err = ret; } /* + * __wt_cursor_cache -- + * Add this cursor to the cache. + */ +int +__wt_cursor_cache(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + uint64_t bucket; + + session = (WT_SESSION_IMPL *)cursor->session; + WT_ASSERT(session, !F_ISSET(cursor, WT_CURSTD_CACHED) && + dhandle != NULL); + + WT_TRET(cursor->reset(cursor)); + + /* + * Acquire a reference while decrementing the in-use counter. + * After this point, the dhandle may be marked dead, but the + * actual handle won't be removed. + */ + session->dhandle = dhandle; + WT_DHANDLE_ACQUIRE(dhandle); + __wt_cursor_dhandle_decr_use(session); + + /* Move the cursor from the open list to the caching hash table. */ + if (cursor->uri_hash == 0) + cursor->uri_hash = __wt_hash_city64( + cursor->uri, strlen(cursor->uri)); + bucket = cursor->uri_hash % WT_HASH_ARRAY_SIZE; + TAILQ_REMOVE(&session->cursors, cursor, q); + TAILQ_INSERT_HEAD(&session->cursor_cache[bucket], cursor, q); + + (void)__wt_atomic_sub32(&S2C(session)->open_cursor_count, 1); + WT_STAT_DATA_DECR(session, session_cursor_open); + WT_STAT_DATA_INCR(session, session_cursor_cached); + F_SET(cursor, WT_CURSTD_CACHED); + return (ret); +} + +/* + * __wt_cursor_reopen -- + * Reopen this cursor from the cached state. 
+ */ +void +__wt_cursor_reopen(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) +{ + WT_SESSION_IMPL *session; + uint64_t bucket; + + session = (WT_SESSION_IMPL *)cursor->session; + WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_CACHED)); + + if (dhandle != NULL) { + session->dhandle = dhandle; + __wt_cursor_dhandle_incr_use(session); + WT_DHANDLE_RELEASE(dhandle); + } + (void)__wt_atomic_add32(&S2C(session)->open_cursor_count, 1); + WT_STAT_DATA_INCR(session, session_cursor_open); + WT_STAT_DATA_DECR(session, session_cursor_cached); + + bucket = cursor->uri_hash % WT_HASH_ARRAY_SIZE; + TAILQ_REMOVE(&session->cursor_cache[bucket], cursor, q); + TAILQ_INSERT_HEAD(&session->cursors, cursor, q); + F_CLR(cursor, WT_CURSTD_CACHED); +} + +/* + * __wt_cursor_cache_release -- + * Put the cursor into a cached state, called during cursor close + * operations. + */ +int +__wt_cursor_cache_release(WT_SESSION_IMPL *session, WT_CURSOR *cursor, + bool *released) +{ + WT_DECL_RET; + + *released = false; + if (!F_ISSET(cursor, WT_CURSTD_CACHEABLE) || + !F_ISSET(session, WT_SESSION_CACHE_CURSORS)) + return (0); + + WT_ASSERT(session, !F_ISSET(cursor, WT_CURSTD_BULK | WT_CURSTD_CACHED)); + + /* + * Do any sweeping first, if there are errors, it will + * be easier to clean up if the cursor is not already cached. + */ + if (--session->cursor_sweep_countdown == 0) + WT_RET(__wt_session_cursor_cache_sweep(session)); + + WT_ERR(cursor->cache(cursor)); + WT_STAT_CONN_INCR(session, cursor_cache); + WT_STAT_DATA_INCR(session, cursor_cache); + WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_CACHED)); + *released = true; + + if (0) { + /* + * If caching fails, we must restore the state of the + * cursor back to open so that the close works from + * a known state. The reopen may also fail, but that + * doesn't matter at this point. 
+ */ +err: WT_TRET(cursor->reopen(cursor, false)); + WT_ASSERT(session, !F_ISSET(cursor, WT_CURSTD_CACHED)); + } + + return (ret); +} + +/* + * __wt_cursor_cache_get -- + * Open a matching cursor from the cache. + */ +int +__wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, + WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) +{ + WT_CONFIG_ITEM cval; + WT_CONFIG_ITEM_STATIC_INIT(false_value); + WT_CURSOR *cursor; + WT_DECL_RET; + uint64_t bucket, hash_value; + bool have_config; + + if (owner != NULL && F_ISSET(owner, WT_CURSTD_CACHEABLE)) + return (WT_NOTFOUND); + have_config = (cfg != NULL && cfg[0] != NULL && cfg[1] != NULL); + if (have_config) { + + /* + * Any cursors that have special configuration cannot + * be cached. There are some exceptions for configurations + * that only differ by a cursor flag, which we can patch + * up if we find a matching cursor. + */ + WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); + if (cval.val) + return (WT_NOTFOUND); + + WT_RET(__wt_config_gets_def(session, cfg, "dump", 0, &cval)); + if (cval.len != 0) + return (WT_NOTFOUND); + + WT_RET(__wt_config_gets_def( + session, cfg, "next_random", 0, &cval)); + if (cval.val != 0) + return (WT_NOTFOUND); + + WT_RET(__wt_config_gets_def( + session, cfg, "readonly", 0, &cval)); + if (cval.val) + return (WT_NOTFOUND); + + /* + * Look for checkpoint last, the value will stay in 'cval'. + */ + WT_RET_NOTFOUND_OK( + __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); + + /* + * The internal checkpoint name is special, don't + * look for it. + */ + if (cval.len != 0 && + WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) + return (WT_NOTFOUND); + } else + cval = false_value; + +#define CHECKPOINT_MATCH(s) \ + ((s == NULL && cval.len == 0) || \ + (s != NULL && WT_STRING_MATCH(s, cval.str, cval.len))) + + /* + * Walk through all cursors, if there is a cached + * cursor that matches uri and configuration, use it. 
+ */ + hash_value = __wt_hash_city64(uri, strlen(uri)); + bucket = hash_value % WT_HASH_ARRAY_SIZE; + TAILQ_FOREACH(cursor, &session->cursor_cache[bucket], q) { + if (cursor->uri_hash == hash_value && + WT_STREQ(cursor->uri, uri) && + CHECKPOINT_MATCH(cursor->checkpoint)) { + if ((ret = cursor->reopen(cursor, false)) != 0) { + F_CLR(cursor, WT_CURSTD_CACHEABLE); + session->dhandle = NULL; + (void)cursor->close(cursor); + return (ret); + } + + F_CLR(cursor, WT_CURSTD_APPEND | WT_CURSTD_OVERWRITE | + WT_CURSTD_RAW); + + if (have_config) { + /* + * For these configuration values, there + * is no difference in the resulting + * cursor other than flag values, so fix + * them up now. + */ + WT_RET(__wt_config_gets_def( + session, cfg, "append", 0, &cval)); + if (cval.val != 0) + F_SET(cursor, WT_CURSTD_APPEND); + + WT_RET(__wt_config_gets_def( + session, cfg, "overwrite", 1, &cval)); + if (cval.val != 0) + F_SET(cursor, WT_CURSTD_OVERWRITE); + + WT_RET(__wt_config_gets_def( + session, cfg, "raw", 0, &cval)); + if (cval.val != 0) + F_SET(cursor, WT_CURSTD_RAW); + } + + WT_STAT_CONN_INCR(session, cursor_reopen); + WT_STAT_DATA_INCR(session, cursor_reopen); + + *cursorp = cursor; + return (0); + } + } + return (WT_NOTFOUND); +} + +/* * __wt_cursor_close -- * WT_CURSOR->close default implementation. */ @@ -573,7 +829,6 @@ __wt_cursor_close(WT_CURSOR *cursor) (void)__wt_atomic_sub32(&S2C(session)->open_cursor_count, 1); WT_STAT_DATA_DECR(session, session_cursor_open); } - __wt_buf_free(session, &cursor->key); __wt_buf_free(session, &cursor->value); @@ -653,10 +908,10 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) WT_DECL_RET; WT_SESSION_IMPL *session; - session = (WT_SESSION_IMPL *)cursor->session; + CURSOR_API_CALL(cursor, session, reconfigure, NULL); /* Reconfiguration resets the cursor. 
*/ - WT_RET(cursor->reset(cursor)); + WT_ERR(cursor->reset(cursor)); /* * append @@ -670,7 +925,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) else F_CLR(cursor, WT_CURSTD_APPEND); } else - WT_RET_NOTFOUND_OK(ret); + WT_ERR_NOTFOUND_OK(ret); } /* @@ -683,9 +938,9 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) else F_CLR(cursor, WT_CURSTD_OVERWRITE); } else - WT_RET_NOTFOUND_OK(ret); + WT_ERR_NOTFOUND_OK(ret); - return (0); +err: API_END_RET(session, ret); } /* @@ -782,6 +1037,7 @@ __wt_cursor_init(WT_CURSOR *cursor, cursor->remove = __wt_cursor_notsup; cursor->reserve = __wt_cursor_notsup; cursor->update = __wt_cursor_notsup; + F_CLR(cursor, WT_CURSTD_CACHEABLE); } /* @@ -805,6 +1061,7 @@ __wt_cursor_init(WT_CURSOR *cursor, */ WT_RET(__wt_curdump_create(cursor, owner, &cdump)); owner = cdump; + F_CLR(cursor, WT_CURSTD_CACHEABLE); } else cdump = NULL; diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index ab2382946bd..495209b7f9f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -36,11 +36,13 @@ static int __curextract_insert(WT_CURSOR *cursor) { WT_CURSOR_EXTRACTOR *cextract; + WT_DECL_RET; WT_ITEM *key, ikey, pkey; WT_SESSION_IMPL *session; + CURSOR_API_CALL(cursor, session, insert, NULL); + cextract = (WT_CURSOR_EXTRACTOR *)cursor; - session = (WT_SESSION_IMPL *)cursor->session; WT_ITEM_SET(ikey, cursor->key); /* @@ -49,14 +51,14 @@ __curextract_insert(WT_CURSOR *cursor) */ WT_ASSERT(session, ikey.size > 0); --ikey.size; - WT_RET(__wt_cursor_get_raw_key(cextract->ctable->cg_cursors[0], &pkey)); + WT_ERR(__wt_cursor_get_raw_key(cextract->ctable->cg_cursors[0], &pkey)); /* * We have the index key in the format we need, and all of the primary * key columns are required: just append them. 
*/ key = &cextract->idxc->key; - WT_RET(__wt_buf_grow(session, key, ikey.size + pkey.size)); + WT_ERR(__wt_buf_grow(session, key, ikey.size + pkey.size)); memcpy((uint8_t *)key->mem, ikey.data, ikey.size); memcpy((uint8_t *)key->mem + ikey.size, pkey.data, pkey.size); key->size = ikey.size + pkey.size; @@ -68,7 +70,9 @@ __curextract_insert(WT_CURSOR *cursor) F_SET(cextract->idxc, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); /* Call the underlying cursor function to update the index. */ - return (cextract->f(cextract->idxc)); + ret = cextract->f(cextract->idxc); + +err: API_END_RET(session, ret); } /* @@ -97,6 +101,8 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __wt_cursor_notsup); /* close */ WT_CURSOR_EXTRACTOR extract_cursor; WT_DECL_RET; @@ -418,7 +424,7 @@ __curtable_reset(WT_CURSOR *cursor) WT_SESSION_IMPL *session; ctable = (WT_CURSOR_TABLE *)cursor; - JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); APPLY_CG(ctable, reset); err: API_END_RET(session, ret); @@ -801,7 +807,7 @@ __curtable_close(WT_CURSOR *cursor) u_int i; ctable = (WT_CURSOR_TABLE *)cursor; - JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL); + JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); if (ctable->cg_cursors != NULL) for (i = 0, cp = ctable->cg_cursors; @@ -958,6 +964,8 @@ __wt_curtable_open(WT_SESSION_IMPL *session, __curtable_remove, /* remove */ __curtable_reserve, /* reserve */ __wt_cursor_reconfigure, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __curtable_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor; diff --git a/src/third_party/wiredtiger/src/docs/error-handling.dox 
b/src/third_party/wiredtiger/src/docs/error-handling.dox index 8e866b8c123..fc5062b45a8 100644 --- a/src/third_party/wiredtiger/src/docs/error-handling.dox +++ b/src/third_party/wiredtiger/src/docs/error-handling.dox @@ -71,6 +71,9 @@ This error is generated when wiredtiger_open is configured to return an error if @par <code>WT_CACHE_FULL</code> This error is only generated when wiredtiger_open is configured to run in-memory, and an insert or update operation requires more than the configured cache size to complete. The operation may be retried; if a transaction is in progress, it should be rolled back and the operation retried in a new transaction. +@par <code>WT_PREPARE_CONFLICT</code> +This error is generated when the application attempts to update a record that has already been updated by a transaction in the prepared state. An updated record is in the prepared state while the transaction that performed the update is in the prepared state. + @if IGNORE_BUILT_BY_API_ERR_END @endif diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 71410a5a731..46d52ee0c68 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -1528,7 +1528,7 @@ retry: while (slot < max_entries) { * candidates and we aren't finding more. 
*/ if (slot < max_entries && (retries < 2 || - (retries < 10 && + (retries < WT_RETRY_MAX && (slot == queue->evict_entries || slot > start_slot)))) { start_slot = slot; ++retries; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 6d904a2004b..90b71659015 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -167,6 +167,16 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) /* Figure out whether reconciliation was done on the page */ clean_page = __wt_page_evict_clean(page); + /* + * Discard all page-deleted information. If a truncate call deleted this + * page, there's memory associated with it we no longer need, eviction + * will have built a new version of the page. + */ + if (ref->page_del != NULL) { + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } + /* Update the reference and discard the page. 
*/ if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 847d6c5ee01..afefbe8ad5c 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -65,10 +65,10 @@ if ((s) != NULL) { \ WT_TRACK_OP_END(s); \ WT_SINGLE_THREAD_CHECK_STOP(s); \ - if (F_ISSET(&(s)->txn, WT_TXN_RUNNING) && \ - (ret) != 0 && \ + if ((ret) != 0 && \ (ret) != WT_NOTFOUND && \ - (ret) != WT_DUPLICATE_KEY) \ + (ret) != WT_DUPLICATE_KEY && \ + F_ISSET(&(s)->txn, WT_TXN_RUNNING)) \ F_SET(&(s)->txn, WT_TXN_ERROR); \ /* \ * No code after this line, otherwise error handling \ @@ -162,19 +162,38 @@ s = (conn)->default_session; \ API_CALL_NOCONF(s, WT_CONNECTION, n, NULL) -#define SESSION_API_CALL(s, n, config, cfg) \ +#define SESSION_API_CALL_PREPARE_ALLOWED(s, n, config, cfg) \ API_CALL(s, WT_SESSION, n, NULL, config, cfg) +#define SESSION_API_CALL(s, n, config, cfg) \ + API_CALL(s, WT_SESSION, n, NULL, config, cfg); \ + WT_ERR(__wt_txn_context_prepare_check((s))) + #define SESSION_API_CALL_NOCONF(s, n) \ API_CALL_NOCONF(s, WT_SESSION, n, NULL) +#define SESSION_API_CALL_NOCONF_PREPARE_NOT_ALLOWED(s, n) \ + API_CALL_NOCONF(s, WT_SESSION, n, NULL); \ + WT_ERR(__wt_txn_context_prepare_check((s))) + #define SESSION_TXN_API_CALL(s, n, config, cfg) \ - TXN_API_CALL(s, WT_SESSION, n, NULL, config, cfg) + TXN_API_CALL(s, WT_SESSION, n, NULL, config, cfg); \ + WT_ERR(__wt_txn_context_prepare_check((s))) #define CURSOR_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ API_CALL_NOCONF(s, WT_CURSOR, n, \ - ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle) + ((bt) == NULL) ? 
NULL : ((WT_BTREE *)(bt))->dhandle); \ + WT_ERR(__wt_txn_context_prepare_check((s))); \ + if (F_ISSET(cur, WT_CURSTD_CACHED)) \ + WT_ERR(__wt_cursor_cached(cur)) + +#define CURSOR_API_CALL_PREPARE_ALLOWED(cur, s, n, bt) \ + (s) = (WT_SESSION_IMPL *)(cur)->session; \ + API_CALL_NOCONF(s, WT_CURSOR, n, \ + ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ + if (F_ISSET(cur, WT_CURSTD_CACHED)) \ + WT_ERR(__wt_cursor_cached(cur)) #define JOINABLE_CURSOR_CALL_CHECK(cur) \ if (F_ISSET(cur, WT_CURSTD_JOINED)) \ @@ -184,10 +203,15 @@ CURSOR_API_CALL(cur, s, n, bt); \ JOINABLE_CURSOR_CALL_CHECK(cur) +#define JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cur, s, n, bt) \ + CURSOR_API_CALL_PREPARE_ALLOWED(cur, s, n, bt); \ + JOINABLE_CURSOR_CALL_CHECK(cur) + #define CURSOR_REMOVE_API_CALL(cur, s, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, \ - ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); + ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ + WT_ERR(__wt_txn_context_prepare_check((s))) #define JOINABLE_CURSOR_REMOVE_API_CALL(cur, s, bt) \ CURSOR_REMOVE_API_CALL(cur, s, bt); \ @@ -197,6 +221,7 @@ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF( \ s, WT_CURSOR, n, ((WT_BTREE *)(bt))->dhandle); \ + WT_ERR(__wt_txn_context_prepare_check((s))); \ if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && \ !F_ISSET((WT_BTREE *)(bt), WT_BTREE_IGNORE_CACHE) && \ __wt_cache_full(s)) \ @@ -204,7 +229,8 @@ #define CURSOR_UPDATE_API_CALL(cur, s, n) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - TXN_API_CALL_NOCONF(s, WT_CURSOR, n, NULL); + TXN_API_CALL_NOCONF(s, WT_CURSOR, n, NULL); \ + WT_ERR(__wt_txn_context_prepare_check((s))) #define JOINABLE_CURSOR_UPDATE_API_CALL(cur, s, n) \ CURSOR_UPDATE_API_CALL(cur, s, n); \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 7fbf27a1fff..7ba73d1b94f 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ 
b/src/third_party/wiredtiger/src/include/btmem.h @@ -779,6 +779,8 @@ struct __wt_page_deleted { volatile uint64_t txnid; /* Transaction ID */ WT_DECL_TIMESTAMP(timestamp) + uint32_t previous_state; /* Previous state */ + WT_UPDATE **update_list; /* List of updates for abort */ }; @@ -987,6 +989,16 @@ struct __wt_update { #define WT_UPDATE_TOMBSTONE 5 /* deleted */ uint8_t type; /* type (one byte to conserve memory) */ + /* + * The update state is used for transaction prepare to manage + * visibility and transitioning update structure state safely. + */ +#define WT_UPDATE_STATE_READY 0 /* Must be 0. Default or + finalized prepare */ +#define WT_UPDATE_STATE_LOCKED 1 /* locked */ +#define WT_UPDATE_STATE_PREPARED 2 /* prepared */ + uint8_t state; /* state (one byte : conserve memory) */ + /* If the update includes a complete value. */ #define WT_UPDATE_DATA_VALUE(upd) \ ((upd)->type == WT_UPDATE_STANDARD || \ @@ -1008,7 +1020,7 @@ struct __wt_update { * WT_UPDATE_SIZE is the expected structure size excluding the payload data -- * we verify the build to ensure the compiler hasn't inserted padding. */ -#define WT_UPDATE_SIZE (21 + WT_TIMESTAMP_SIZE) +#define WT_UPDATE_SIZE (22 + WT_TIMESTAMP_SIZE) /* * The memory size of an update: include some padding because this is such a diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 9c29b72dc67..808c8f7ee7f 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1350,11 +1350,9 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); - /* - * If the page was restored after a truncate, it can't be evicted until - * the truncate completes. - */ - if (ref->page_del != NULL && !__wt_txn_visible_all(session, + /* A truncated page can't be evicted until the truncate completes. 
*/ + if (ref->page_del != NULL && ref->page_del->txnid != WT_TXN_ABORTED && + !__wt_txn_visible_all(session, ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp))) return (false); diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h index 541e811aa33..06847117b7d 100644 --- a/src/third_party/wiredtiger/src/include/config.h +++ b/src/third_party/wiredtiger/src/include/config.h @@ -45,6 +45,11 @@ struct __wt_config_parser_impl { WT_CONFIG_ITEM config_item; }; +#define WT_CONFIG_ITEM_STATIC_INIT(n) \ + static const WT_CONFIG_ITEM n = { \ + "", 0, 0, WT_CONFIG_ITEM_NUM \ + } + /* * DO NOT EDIT: automatically built by dist/api_config.py. * configuration section: BEGIN diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index be21fcb6456..6229ddf7b71 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -6,12 +6,6 @@ * See the file LICENSE for redistribution information. */ -/* - * Default hash table size; we don't need a prime number of buckets - * because we always use a good hash function. - */ -#define WT_HASH_ARRAY_SIZE 512 - /******************************************* * Global per-process structure. 
*******************************************/ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 31dec1d24f6..449c46385f6 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -27,6 +27,8 @@ remove, \ reserve, \ reconfigure, \ + cache, \ + reopen, \ close) \ static const WT_CURSOR n = { \ NULL, /* session */ \ @@ -51,6 +53,10 @@ reserve, \ close, \ reconfigure, \ + cache, \ + reopen, \ + 0, /* uri_hash */ \ + NULL, /* checkpoint */ \ { NULL, NULL }, /* TAILQ_ENTRY q */ \ 0, /* recno key */ \ { 0 }, /* recno raw buffer */ \ diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index a18881d8ea9..f47db3f762c 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -33,6 +33,12 @@ (F_ISSET(dhandle, WT_DHANDLE_DEAD) || \ !F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN)) +/* Check if a handle could be reopened. */ +#define WT_DHANDLE_CAN_REOPEN(dhandle) \ + (!WT_DHANDLE_INACTIVE(dhandle) && \ + F_ISSET(dhandle, WT_DHANDLE_OPEN) && \ + !F_ISSET(dhandle, WT_DHANDLE_DROPPED)) + /* The metadata cursor's data handle. */ #define WT_SESSION_META_DHANDLE(s) \ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) @@ -70,9 +76,10 @@ struct __wt_data_handle { const char **cfg; /* Configuration information */ /* - * Sessions caching a connection's data handle will have a non-zero + * Sessions holding a connection's data handle will have a non-zero * reference count; sessions using a connection's data handle will - * have a non-zero in-use count. + * have a non-zero in-use count. Instances of cached cursors referencing + * the data handle appear in session_cache_ref. 
*/ uint32_t session_ref; /* Sessions referencing this handle */ int32_t session_inuse; /* Sessions using this handle */ @@ -107,10 +114,11 @@ struct __wt_data_handle { #define WT_DHANDLE_DEAD 0x01u /* Dead, awaiting discard */ #define WT_DHANDLE_DISCARD 0x02u /* Close on release */ #define WT_DHANDLE_DISCARD_KILL 0x04u /* Mark dead on release */ -#define WT_DHANDLE_EXCLUSIVE 0x08u /* Exclusive access */ -#define WT_DHANDLE_IS_METADATA 0x10u /* Metadata handle */ -#define WT_DHANDLE_LOCK_ONLY 0x20u /* Handle only used as a lock */ -#define WT_DHANDLE_OPEN 0x40u /* Handle is open */ +#define WT_DHANDLE_DROPPED 0x08u /* Handle is dropped */ +#define WT_DHANDLE_EXCLUSIVE 0x10u /* Exclusive access */ +#define WT_DHANDLE_IS_METADATA 0x20u /* Metadata handle */ +#define WT_DHANDLE_LOCK_ONLY 0x40u /* Handle only used as a lock */ +#define WT_DHANDLE_OPEN 0x80u /* Handle is open */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index a293b1ac516..0e42e554588 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -127,7 +127,7 @@ extern int __wt_debug_tree_all( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RE extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF 
*ref, bool visible_all); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); @@ -207,6 +207,8 @@ extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block( WT_SESSION_IMPL *session, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -269,7 +271,7 @@ extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, co extern int __wt_conn_dhandle_close( WT_SESSION_IMPL *session, bool final, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_dhandle_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL 
*session, const char *uri, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool removed, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool mark_dead) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_handles(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -323,6 +325,7 @@ extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_noop(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cursor_cached(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_notsup(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_get_value_notsup(WT_CURSOR *cursor, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cursor_set_key_notsup(WT_CURSOR *cursor, ...); @@ -332,6 +335,7 @@ extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *e extern int __wt_cursor_modify_notsup(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cursor_reopen_notsup(WT_CURSOR *cursor, bool check_only) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor); extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -346,6 +350,10 @@ extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...) 
WT_GCC_FUNC_DECL_ATTRIB extern int __wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cursor_set_value(WT_CURSOR *cursor, ...); extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap); +extern int __wt_cursor_cache(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_cursor_reopen(WT_CURSOR *cursor, WT_DATA_HANDLE *dhandle); +extern int __wt_cursor_cache_release(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool *released) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_cursor_cache_get(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_close(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_equals(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -531,7 +539,7 @@ extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **va extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_optrack_record_funcid( WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp); extern int __wt_optrack_open_file(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern size_t __wt_optrack_flush_buffer(WT_SESSION_IMPL *s); +extern void __wt_optrack_flush_buffer(WT_SESSION_IMPL *s); extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_filename_construct(WT_SESSION_IMPL *session, const char *path, const char *file_prefix, uintmax_t id_1, uint32_t id_2, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -636,6 +644,7 @@ extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_notsup(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_session_cursor_cache_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_copy_values(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_session_release_resources(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -832,12 +841,14 @@ extern int __wt_txn_recover(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE extern int __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_timestamp_to_hex_string( WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_verbose_timestamp(WT_SESSION_IMPL *session, const wt_timestamp_t *ts, const char *msg); +extern int __wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name, 
wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_query_timestamp( WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *ts, WT_CONFIG_ITEM *cval, bool cmp_oldest, bool cmp_stable, bool cmp_commit) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 0ddeb046353..a3a81c21569 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -77,6 +77,12 @@ #define WT_ENCRYPT_LEN_SIZE sizeof(uint32_t) /* + * Default hash table size; we don't need a prime number of buckets + * because we always use a good hash function. + */ +#define WT_HASH_ARRAY_SIZE 512 + +/* * __wt_calloc_def, __wt_calloc_one -- * Most calloc calls don't need separate count or sizeof arguments. 
*/ diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 05c0733d4ce..acbbbcaff83 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -189,6 +189,24 @@ __wt_snprintf_len_incr( } /* + * __wt_txn_context_prepare_check -- + * Return an error if the current transaction is in the prepare state. + */ +static inline int +__wt_txn_context_prepare_check( WT_SESSION_IMPL *session) +{ +#ifdef HAVE_TIMESTAMPS + if (F_ISSET(&session->txn, WT_TXN_PREPARE)) + WT_RET_MSG(session, EINVAL, + "%s: not permitted in a prepared transaction", + session->name); +#else + WT_UNUSED(session); +#endif + return (0); +} + +/* * __wt_txn_context_check -- * Complain if a transaction is/isn't running. */ diff --git a/src/third_party/wiredtiger/src/include/optrack.h b/src/third_party/wiredtiger/src/include/optrack.h index 8593a20d2c4..4dbd928a6df 100644 --- a/src/third_party/wiredtiger/src/include/optrack.h +++ b/src/third_party/wiredtiger/src/include/optrack.h @@ -57,7 +57,7 @@ struct __wt_optrack_record { __tr->op_type = optype; \ \ if (++(s)->optrackbuf_ptr == WT_OPTRACK_MAXRECS) { \ - (s)->optrack_offset += __wt_optrack_flush_buffer(s); \ + __wt_optrack_flush_buffer(s); \ (s)->optrackbuf_ptr = 0; \ } \ } while (0) diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index c31619f2f96..79ee3971a73 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -34,9 +34,11 @@ (ret) = __wt_errno(); \ } while (0) +#define WT_RETRY_MAX 10 + #define WT_SYSCALL_RETRY(call, ret) do { \ int __retry; \ - for (__retry = 0; __retry < 10; ++__retry) { \ + for (__retry = 0; __retry < WT_RETRY_MAX; ++__retry) { \ WT_SYSCALL(call, ret); \ switch (ret) { \ case EAGAIN: \ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 
0f3b2488e9f..1ece86b6ce0 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -37,6 +37,14 @@ struct __wt_hazard { #define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle) #define S2BT_SAFE(session) ((session)->dhandle == NULL ? NULL : S2BT(session)) +typedef TAILQ_HEAD(__wt_cursor_list, __wt_cursor) WT_CURSOR_LIST; + +/* Number of cursors cached to trigger sweep. */ +#define WT_SESSION_CURSOR_SWEEP_COUNTDOWN 20 + +/* Maximum number of buckets to visit during sweep. */ +#define WT_SESSION_CURSOR_SWEEP_MAX 32 + /* * WT_SESSION_IMPL -- * Implementation of WT_SESSION. @@ -68,8 +76,9 @@ struct __wt_session_impl { time_t last_sweep; /* Last sweep for dead handles */ struct timespec last_epoch; /* Last epoch time returned */ - /* Cursors closed with the session */ - TAILQ_HEAD(__cursors, __wt_cursor) cursors; + WT_CURSOR_LIST cursors; /* Cursors closed with the session */ + uint32_t cursor_sweep_position; /* Position in cursor_cache for sweep */ + uint32_t cursor_sweep_countdown;/* Countdown to cursor sweep */ WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ @@ -149,28 +158,30 @@ struct __wt_session_impl { u_int stat_bucket; /* Statistics bucket offset */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_SESSION_CAN_WAIT 0x000001u -#define WT_SESSION_IGNORE_CACHE_SIZE 0x000002u -#define WT_SESSION_INTERNAL 0x000004u -#define WT_SESSION_LOCKED_CHECKPOINT 0x000008u -#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x000010u -#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x000020u -#define WT_SESSION_LOCKED_METADATA 0x000040u -#define WT_SESSION_LOCKED_PASS 0x000080u -#define WT_SESSION_LOCKED_SCHEMA 0x000100u -#define WT_SESSION_LOCKED_SLOT 0x000200u -#define WT_SESSION_LOCKED_TABLE_READ 0x000400u -#define WT_SESSION_LOCKED_TABLE_WRITE 0x000800u -#define WT_SESSION_LOCKED_TURTLE 0x001000u -#define WT_SESSION_LOGGING_INMEM 0x002000u -#define WT_SESSION_LOOKASIDE_CURSOR 0x004000u -#define 
WT_SESSION_NO_DATA_HANDLES 0x008000u -#define WT_SESSION_NO_LOGGING 0x010000u -#define WT_SESSION_NO_RECONCILE 0x020000u -#define WT_SESSION_NO_SCHEMA_LOCK 0x040000u -#define WT_SESSION_QUIET_CORRUPT_FILE 0x080000u -#define WT_SESSION_READ_WONT_NEED 0x100000u -#define WT_SESSION_SERVER_ASYNC 0x200000u +#define WT_SESSION_CACHE_CURSORS 0x000001u +#define WT_SESSION_CAN_WAIT 0x000002u +#define WT_SESSION_IGNORE_CACHE_SIZE 0x000004u +#define WT_SESSION_INTERNAL 0x000008u +#define WT_SESSION_LOCKED_CHECKPOINT 0x000010u +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x000020u +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x000040u +#define WT_SESSION_LOCKED_METADATA 0x000080u +#define WT_SESSION_LOCKED_PASS 0x000100u +#define WT_SESSION_LOCKED_SCHEMA 0x000200u +#define WT_SESSION_LOCKED_SLOT 0x000400u +#define WT_SESSION_LOCKED_TABLE_READ 0x000800u +#define WT_SESSION_LOCKED_TABLE_WRITE 0x001000u +#define WT_SESSION_LOCKED_TURTLE 0x002000u +#define WT_SESSION_LOGGING_INMEM 0x004000u +#define WT_SESSION_LOOKASIDE_CURSOR 0x008000u +#define WT_SESSION_NO_DATA_HANDLES 0x010000u +#define WT_SESSION_NO_LOGGING 0x020000u +#define WT_SESSION_NO_RECONCILE 0x040000u +#define WT_SESSION_NO_SCHEMA_LOCK 0x080000u +#define WT_SESSION_QUIET_CORRUPT_FILE 0x100000u +#define WT_SESSION_READ_WONT_NEED 0x200000u +#define WT_SESSION_SCHEMA_TXN 0x400000u +#define WT_SESSION_SERVER_ASYNC 0x800000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; @@ -187,6 +198,12 @@ struct __wt_session_impl { */ WT_RAND_STATE rnd; /* Random number generation state */ + /* + * Hash tables are allocated lazily as sessions are used to keep the + * size of this structure from growing too large. 
+ */ + WT_CURSOR_LIST *cursor_cache; /* Hash table of cached cursors */ + /* Hashed handle reference list array */ TAILQ_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index e5cfb534db5..7ef63cb0eaf 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -458,7 +458,13 @@ struct __wt_connection_stats { int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; + int64_t cursor_sweep_buckets; + int64_t cursor_sweep_closed; + int64_t cursor_sweep_examined; + int64_t cursor_sweep; int64_t cursor_update; + int64_t cursor_cache; + int64_t cursor_reopen; int64_t cursor_truncate; int64_t dh_conn_handle_count; int64_t dh_sweep_ref; @@ -767,6 +773,8 @@ struct __wt_dsrc_stats { int64_t cursor_insert_bytes; int64_t cursor_remove_bytes; int64_t cursor_update_bytes; + int64_t cursor_cache; + int64_t cursor_reopen; int64_t cursor_insert; int64_t cursor_modify; int64_t cursor_next; @@ -793,6 +801,7 @@ struct __wt_dsrc_stats { int64_t rec_pages; int64_t rec_pages_eviction; int64_t rec_page_delete; + int64_t session_cursor_cached; int64_t session_compact; int64_t session_cursor_open; int64_t txn_update_conflict; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 19eaf87cbd3..2be6f72210c 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -104,6 +104,7 @@ struct __wt_txn_global { WT_DECL_TIMESTAMP(commit_timestamp) WT_DECL_TIMESTAMP(oldest_timestamp) WT_DECL_TIMESTAMP(pinned_timestamp) + WT_DECL_TIMESTAMP(recovery_timestamp) WT_DECL_TIMESTAMP(stable_timestamp) bool has_commit_timestamp; bool has_oldest_timestamp; @@ -172,16 +173,15 @@ struct __wt_txn_op { uint32_t fileid; enum { WT_TXN_OP_BASIC, - WT_TXN_OP_BASIC_TS, WT_TXN_OP_INMEM, - WT_TXN_OP_REF, + WT_TXN_OP_REF_DELETE, 
WT_TXN_OP_TRUNCATE_COL, WT_TXN_OP_TRUNCATE_ROW } type; union { /* WT_TXN_OP_BASIC, WT_TXN_OP_INMEM */ WT_UPDATE *upd; - /* WT_TXN_OP_REF */ + /* WT_TXN_OP_REF_DELETE */ WT_REF *ref; /* WT_TXN_OP_TRUNCATE_COL */ struct { @@ -236,6 +236,12 @@ struct __wt_txn { */ WT_DECL_TIMESTAMP(first_commit_timestamp) + /* + * Timestamp copied into updates created by this transaction, when this + * transaction is prepared. + */ + WT_DECL_TIMESTAMP(prepare_timestamp) + /* Read updates committed as of this timestamp. */ WT_DECL_TIMESTAMP(read_timestamp) @@ -271,15 +277,16 @@ struct __wt_txn { #define WT_TXN_HAS_TS_READ 0x00020u #define WT_TXN_IGNORE_PREPARE 0x00040u #define WT_TXN_NAMED_SNAPSHOT 0x00080u -#define WT_TXN_PUBLIC_TS_COMMIT 0x00100u -#define WT_TXN_PUBLIC_TS_READ 0x00200u -#define WT_TXN_READONLY 0x00400u -#define WT_TXN_RUNNING 0x00800u -#define WT_TXN_SYNC_SET 0x01000u -#define WT_TXN_TS_COMMIT_ALWAYS 0x02000u -#define WT_TXN_TS_COMMIT_KEYS 0x04000u -#define WT_TXN_TS_COMMIT_NEVER 0x08000u -#define WT_TXN_UPDATE 0x10000u +#define WT_TXN_PREPARE 0x00100u +#define WT_TXN_PUBLIC_TS_COMMIT 0x00200u +#define WT_TXN_PUBLIC_TS_READ 0x00400u +#define WT_TXN_READONLY 0x00800u +#define WT_TXN_RUNNING 0x01000u +#define WT_TXN_SYNC_SET 0x02000u +#define WT_TXN_TS_COMMIT_ALWAYS 0x04000u +#define WT_TXN_TS_COMMIT_KEYS 0x08000u +#define WT_TXN_TS_COMMIT_NEVER 0x10000u +#define WT_TXN_UPDATE 0x20000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index fbd89195746..d3ba5c7796a 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -204,6 +204,32 @@ __wt_txn_unmodify(WT_SESSION_IMPL *session) } } +#ifdef HAVE_TIMESTAMPS +/* + * __wt_txn_update_needs_timestamp -- + * Decide whether to copy a commit timestamp into an update. 
If the op + * structure doesn't have a populated update or ref field or in prepared + * state there won't be any check for an existing timestamp. + */ +static inline bool +__wt_txn_update_needs_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) +{ + WT_TXN *txn; + + txn = &session->txn; + /* + * Updates in the metadata never get timestamps (either now or at + * commit): metadata cannot be read at a point in time, only the most + * recently committed data matches files on disk. + */ + return (op->fileid != WT_METAFILE_ID && + F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + (op->u.upd == NULL || + __wt_timestamp_iszero(&(op->u.upd->timestamp)) || + F_ISSET(txn, WT_TXN_PREPARE))); +} +#endif + /* * __wt_txn_modify -- * Mark a WT_UPDATE object modified by the current transaction. @@ -224,21 +250,8 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? WT_TXN_OP_INMEM : WT_TXN_OP_BASIC; #ifdef HAVE_TIMESTAMPS - /* - * Mark the update with a timestamp, if we have one. - * - * Updates in the metadata never get timestamps (either now or at - * commit): metadata cannot be read at a point in time, only the most - * recently committed data matches files on disk. - */ - if (WT_IS_METADATA(session->dhandle)) { - if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) - op->type = WT_TXN_OP_BASIC_TS; - } else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + if (__wt_txn_update_needs_timestamp(session, op)) __wt_timestamp_set(&upd->timestamp, &txn->commit_timestamp); - if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) - op->type = WT_TXN_OP_BASIC_TS; - } #endif op->u.upd = upd; upd->txnid = session->txn.id; @@ -246,18 +259,34 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) } /* - * __wt_txn_modify_ref -- - * Remember a WT_REF object modified by the current transaction. + * __wt_txn_modify_page_delete -- + * Remember a page fast-deleted by the current transaction. 
*/ static inline int -__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) { + WT_DECL_RET; + WT_TXN *txn; WT_TXN_OP *op; + txn = &session->txn; + WT_RET(__txn_next_op(session, &op)); - op->type = WT_TXN_OP_REF; + op->type = WT_TXN_OP_REF_DELETE; + +#ifdef HAVE_TIMESTAMPS + if (__wt_txn_update_needs_timestamp(session, op)) + __wt_timestamp_set( + &ref->page_del->timestamp, &txn->commit_timestamp); +#endif op->u.ref = ref; - return (__wt_txn_log_op(session, NULL)); + ref->page_del->txnid = txn->id; + + WT_ERR(__wt_txn_log_op(session, NULL)); + return (0); + +err: __wt_txn_unmodify(session); + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index fef1a935983..cacb64eae91 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -655,6 +655,13 @@ struct __wt_cursor { * Protected fields, only to be used by cursor implementations. */ #if !defined(SWIG) && !defined(DOXYGEN) + int __F(cache)(WT_CURSOR *cursor); /* Cache the cursor */ + /* Reopen a cached cursor */ + int __F(reopen)(WT_CURSOR *cursor, bool check_only); + + uint64_t uri_hash; /* Hash of URI */ + const char *checkpoint; /* Checkpoint, if any */ + /* * !!! * Explicit representations of structures from queue.h. @@ -680,21 +687,23 @@ struct __wt_cursor { const char *internal_uri; /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CURSTD_APPEND 0x0001u -#define WT_CURSTD_BULK 0x0002u -#define WT_CURSTD_DUMP_HEX 0x0004u -#define WT_CURSTD_DUMP_JSON 0x0008u -#define WT_CURSTD_DUMP_PRINT 0x0010u -#define WT_CURSTD_JOINED 0x0020u -#define WT_CURSTD_KEY_EXT 0x0040u /* Key points out of the tree. */ -#define WT_CURSTD_KEY_INT 0x0080u /* Key points into the tree. 
*/ -#define WT_CURSTD_META_INUSE 0x0100u -#define WT_CURSTD_OPEN 0x0200u -#define WT_CURSTD_OVERWRITE 0x0400u -#define WT_CURSTD_RAW 0x0800u -#define WT_CURSTD_RAW_SEARCH 0x1000u -#define WT_CURSTD_VALUE_EXT 0x2000u /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x4000u /* Value points into the tree. */ +#define WT_CURSTD_APPEND 0x00001u +#define WT_CURSTD_BULK 0x00002u +#define WT_CURSTD_CACHEABLE 0x00004u +#define WT_CURSTD_CACHED 0x00008u +#define WT_CURSTD_DUMP_HEX 0x00010u +#define WT_CURSTD_DUMP_JSON 0x00020u +#define WT_CURSTD_DUMP_PRINT 0x00040u +#define WT_CURSTD_JOINED 0x00080u +#define WT_CURSTD_KEY_EXT 0x00100u /* Key points out of tree. */ +#define WT_CURSTD_KEY_INT 0x00200u /* Key points into tree. */ +#define WT_CURSTD_META_INUSE 0x00400u +#define WT_CURSTD_OPEN 0x00800u +#define WT_CURSTD_OVERWRITE 0x01000u +#define WT_CURSTD_RAW 0x02000u +#define WT_CURSTD_RAW_SEARCH 0x04000u +#define WT_CURSTD_VALUE_EXT 0x08000u /* Value points out of tree. */ +#define WT_CURSTD_VALUE_INT 0x10000u /* Value points into tree. */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) @@ -946,6 +955,11 @@ struct __wt_session { * * @param session the session handle * @configstart{WT_SESSION.reconfigure, see dist/api_data.py} + * @config{cache_cursors, enable caching of cursors for reuse. Any + * calls to WT_CURSOR::close for a cursor created in this session will + * mark the cursor as cached and keep it available to be reused for + * later calls to WT_SESSION::open_cursor. Cached cursors may be + * eventually closed., a boolean flag; default \c true.} * @config{ignore_cache_size, when set\, operations performed by this * session ignore the cache size and are not blocked when the cache is * full. Note that use of this option for operations that create cache @@ -1730,7 +1744,7 @@ struct __wt_session { * conflicts. 
Transactions with higher values are less likely to * abort., an integer between -100 and 100; default \c 0.} * @config{read_timestamp, read using the specified timestamp. The - * supplied value should not be older than the current oldest timestamp. + * supplied value must not be older than the current oldest timestamp. * See @ref transaction_timestamps., a string; default empty.} * @config{round_to_oldest, if read timestamp is earlier than oldest * timestamp\, read timestamp will be rounded to oldest timestamp., a @@ -1760,8 +1774,8 @@ struct __wt_session { * @param session the session handle * @configstart{WT_SESSION.commit_transaction, see dist/api_data.py} * @config{commit_timestamp, set the commit timestamp for the current - * transaction. The supplied value should not be older than the first - * commit timestamp set for the current transaction. The value should + * transaction. The supplied value must not be older than the first + * commit timestamp set for the current transaction. The value must * also not be older than the current oldest and stable timestamps. See * @ref transaction_timestamps., a string; default empty.} * @config{sync, override whether to sync log records when the @@ -1782,10 +1796,12 @@ struct __wt_session { * Prepare the current transaction. * * A transaction must be in progress when this method is called. - * Preparing a transaction will guarantee subsequent commit will + * + * Preparing a transaction will guarantee a subsequent commit will * succeed. Only commit and rollback are allowed on a transaction after - * it has been prepared. At the moment, prepare transaction is designed - * to support MongoDB exclusively. + * it has been prepared. The transaction prepare API is designed to + * support MongoDB exclusively, and guarantees update conflicts have + * been resolved, but does not guarantee durability. 
* * @requires_transaction * @@ -1794,7 +1810,7 @@ struct __wt_session { * @param session the session handle * @configstart{WT_SESSION.prepare_transaction, see dist/api_data.py} * @config{prepare_timestamp, set the prepare timestamp for the updates - * of the current transaction. The supplied value should not be older + * of the current transaction. The supplied value must not be older * than any active read timestamps. This configuration option is * mandatory. See @ref transaction_timestamps., a string; default * empty.} @@ -1830,10 +1846,17 @@ struct __wt_session { * @param session the session handle * @configstart{WT_SESSION.timestamp_transaction, see dist/api_data.py} * @config{commit_timestamp, set the commit timestamp for the current - * transaction. The supplied value should not be older than the first - * commit timestamp set for the current transaction. The value should + * transaction. The supplied value must not be older than the first + * commit timestamp set for the current transaction. The value must * also not be older than the current oldest and stable timestamps. See * @ref transaction_timestamps., a string; default empty.} + * @config{read_timestamp, read using the specified timestamp. The + * supplied value must not be older than the current oldest timestamp. + * This can only be set once for a transaction. @ref + * transaction_timestamps., a string; default empty.} + * @config{round_to_oldest, if read timestamp is earlier than oldest + * timestamp\, read timestamp will be rounded to oldest timestamp., a + * boolean flag; default \c false.} * @configend * @errors */ @@ -2355,6 +2378,11 @@ struct __wt_connection { * connection's event handler is used. See @ref event_message_handling * for more information. * @configstart{WT_CONNECTION.open_session, see dist/api_data.py} + * @config{cache_cursors, enable caching of cursors for reuse. 
Any + * calls to WT_CURSOR::close for a cursor created in this session will + * mark the cursor as cached and keep it available to be reused for + * later calls to WT_SESSION::open_cursor. Cached cursors may be + * eventually closed., a boolean flag; default \c true.} * @config{ignore_cache_size, when set\, operations performed by this * session ignore the cache size and are not blocked when the cache is * full. Note that use of this option for operations that create cache @@ -2397,7 +2425,7 @@ struct __wt_connection { * set with WT_CONNECTION::set_timestamp. See @ref * transaction_timestamps., a string\, chosen from the following * options: \c "all_committed"\, \c "oldest"\, \c "pinned"\, \c - * "stable"; default \c all_committed.} + * "recovery"\, \c "stable"; default \c all_committed.} * @configend * @errors * If there is no matching timestamp (e.g., if this method is called @@ -2423,7 +2451,7 @@ struct __wt_connection { * than the specified value until the next commit moves the tracked * commit timestamp forwards. This is only intended for use where the * application is rolling back locally committed transactions. The - * supplied value should not be older than the current oldest and stable + * supplied value must not be older than the current oldest and stable * timestamps. See @ref transaction_timestamps., a string; default * empty.} * @config{force, set timestamps even if they violate normal ordering @@ -2432,14 +2460,14 @@ struct __wt_connection { * @config{oldest_timestamp, future commits and queries will be no * earlier than the specified timestamp. Supplied values must be * monotonically increasing\, any attempt to set the value to older than - * the current is silently ignored. The supplied value should not be + * the current is silently ignored. The supplied value must not be * newer than the current stable timestamp. 
See @ref * transaction_timestamps., a string; default empty.} * @config{stable_timestamp, checkpoints will not include commits that * are newer than the specified timestamp in tables configured with \c * log=(enabled=false). Supplied values must be monotonically * increasing\, any attempt to set the value to older than the current - * is silently ignored. The supplied value should not be older than the + * is silently ignored. The supplied value must not be older than the * current oldest timestamp. See @ref transaction_timestamps., a * string; default empty.} * @configend @@ -3559,6 +3587,14 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp) * transaction. */ #define WT_CACHE_FULL (-31807) +/*! + * Conflict with a prepared update. + * This error is generated when the application attempts to update an already + * updated record which is in prepared state. An updated record will be in + * prepared state, when the transaction that performed the update is in prepared + * state. + */ +#define WT_PREPARE_CONFLICT (-31808) /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. @@ -5135,449 +5171,461 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CURSOR_SEARCH 1143 /*! cursor: cursor search near calls */ #define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1144 +/*! cursor: cursor sweep buckets */ +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1145 +/*! cursor: cursor sweep cursors closed */ +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1146 +/*! cursor: cursor sweep cursors examined */ +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1147 +/*! cursor: cursor sweeps */ +#define WT_STAT_CONN_CURSOR_SWEEP 1148 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1145 +#define WT_STAT_CONN_CURSOR_UPDATE 1149 +/*! cursor: cursors cached on close */ +#define WT_STAT_CONN_CURSOR_CACHE 1150 +/*! cursor: cursors reused from cache */ +#define WT_STAT_CONN_CURSOR_REOPEN 1151 /*! 
cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1146 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1152 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1147 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1153 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1148 +#define WT_STAT_CONN_DH_SWEEP_REF 1154 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1149 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1155 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1150 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1156 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1151 +#define WT_STAT_CONN_DH_SWEEP_TOD 1157 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1152 +#define WT_STAT_CONN_DH_SWEEPS 1158 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1153 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1159 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1154 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1160 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1155 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1161 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1156 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1162 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1157 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1163 /*! * lock: commit timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1158 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1164 /*! 
* lock: commit timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1159 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1165 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1160 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1166 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1161 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1167 /*! * lock: dhandle lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1162 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1168 /*! * lock: dhandle lock internal thread time waiting for the dhandle lock * (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1163 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1169 /*! lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1164 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1170 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1165 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1171 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1166 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1172 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1167 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1173 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1168 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1174 /*! 
* lock: read timestamp queue lock application thread time waiting for * the dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1169 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1175 /*! * lock: read timestamp queue lock internal thread time waiting for the * dhandle lock (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1170 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1176 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1171 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1177 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1172 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1178 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1173 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1179 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1174 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1180 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1175 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1181 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1176 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1182 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1177 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1183 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1178 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1184 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1179 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1185 /*! 
* lock: txn global lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1180 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1186 /*! * lock: txn global lock internal thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1181 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1187 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1182 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1188 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1183 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1189 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1184 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1190 /*! log: force checkpoint calls slept */ -#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1185 +#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1191 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1186 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1192 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1187 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1193 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1188 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1194 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1189 +#define WT_STAT_CONN_LOG_FLUSH 1195 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1190 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1196 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1191 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1197 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1192 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1198 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1193 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1199 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1194 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1200 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1195 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1201 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1196 +#define WT_STAT_CONN_LOG_SCANS 1202 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1197 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1203 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1198 +#define WT_STAT_CONN_LOG_WRITE_LSN 1204 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1199 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1205 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1200 +#define WT_STAT_CONN_LOG_SYNC 1206 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1201 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1207 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1202 +#define WT_STAT_CONN_LOG_SYNC_DIR 1208 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1203 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1209 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1204 +#define WT_STAT_CONN_LOG_WRITES 1210 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1205 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1211 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1206 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1212 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1207 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1213 /*! 
log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1208 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1214 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1209 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1215 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1210 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1216 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1211 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1217 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1212 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1218 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1213 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1219 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1214 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1220 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1215 +#define WT_STAT_CONN_LOG_SLOT_RACES 1221 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1216 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1222 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1217 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1223 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1218 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1224 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1219 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1225 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1220 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1226 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1221 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1227 /*! 
log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1222 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1228 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1223 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1229 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1224 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1230 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1225 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1231 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1226 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1232 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1227 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1233 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1228 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1234 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1229 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1235 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1230 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1236 /*! perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1231 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1237 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1232 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1238 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1233 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1239 /*! 
perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1234 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1240 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1235 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1241 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1236 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1242 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1237 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1243 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1238 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1244 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1239 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1245 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1240 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1246 /*! perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1241 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1247 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1242 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1248 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1243 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1249 /*! 
perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1244 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1250 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1245 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1251 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1246 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1252 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1247 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1253 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1248 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1254 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1249 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1255 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1250 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1256 /*! perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1251 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1257 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1252 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1258 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1253 +#define WT_STAT_CONN_REC_PAGES 1259 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1254 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1260 /*! 
reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1255 +#define WT_STAT_CONN_REC_PAGE_DELETE 1261 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1256 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1262 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1257 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1263 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1258 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1264 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1259 +#define WT_STAT_CONN_SESSION_OPEN 1265 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1260 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1266 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1261 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1267 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1262 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1268 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1263 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1269 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1264 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1270 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1265 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1271 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1266 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1272 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1267 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1273 /*! 
session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1268 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1274 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1269 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1275 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1270 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1276 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1271 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1277 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1272 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1278 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1273 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1279 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1274 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1280 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1275 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1281 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1276 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1282 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1277 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1283 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1278 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1284 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1279 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1285 /*! 
thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1280 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1286 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1281 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1287 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1282 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1288 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1283 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1289 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1284 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1290 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1285 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1291 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1286 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1292 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1287 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1293 /*! thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1288 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1294 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1289 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1295 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1290 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1296 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1291 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1297 /*! 
thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1292 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1298 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1293 +#define WT_STAT_CONN_PAGE_SLEEP 1299 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1294 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1300 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1295 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1301 /*! * thread-yield: tree descend one level yielded for split page index * update */ -#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1296 +#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1302 /*! transaction: commit timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1297 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1303 /*! transaction: commit timestamp queue inserts to tail */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_TAIL 1298 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_TAIL 1304 /*! transaction: commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1299 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1305 /*! transaction: commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1300 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1306 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1301 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1307 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1302 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1308 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1303 +#define WT_STAT_CONN_TXN_QUERY_TS 1309 /*! 
transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1304 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1310 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1305 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1311 /*! transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1306 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1312 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1307 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1313 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1308 +#define WT_STAT_CONN_TXN_SET_TS 1314 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1309 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1315 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1310 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1316 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1311 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1317 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1312 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1318 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1313 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1319 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1314 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1320 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1315 +#define WT_STAT_CONN_TXN_BEGIN 1321 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1316 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1322 /*! 
transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1317 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1323 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1318 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1324 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1319 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1325 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1320 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1326 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1321 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1327 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1322 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1328 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1323 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1329 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1324 +#define WT_STAT_CONN_TXN_CHECKPOINT 1330 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1325 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1331 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1326 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1332 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1327 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1333 /*! 
* transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1328 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1334 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1329 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1335 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1330 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1336 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1331 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1337 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1332 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1338 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1333 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1339 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1334 +#define WT_STAT_CONN_TXN_SYNC 1340 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1335 +#define WT_STAT_CONN_TXN_COMMIT 1341 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1336 +#define WT_STAT_CONN_TXN_ROLLBACK 1342 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1337 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1343 /*! * @} @@ -5906,67 +5954,73 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2107 /*! cursor: cursor-update value bytes updated */ #define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2108 +/*! cursor: cursors cached on close */ +#define WT_STAT_DSRC_CURSOR_CACHE 2109 +/*! 
cursor: cursors reused from cache */ +#define WT_STAT_DSRC_CURSOR_REOPEN 2110 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2109 +#define WT_STAT_DSRC_CURSOR_INSERT 2111 /*! cursor: modify calls */ -#define WT_STAT_DSRC_CURSOR_MODIFY 2110 +#define WT_STAT_DSRC_CURSOR_MODIFY 2112 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2111 +#define WT_STAT_DSRC_CURSOR_NEXT 2113 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2112 +#define WT_STAT_DSRC_CURSOR_PREV 2114 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2113 +#define WT_STAT_DSRC_CURSOR_REMOVE 2115 /*! cursor: reserve calls */ -#define WT_STAT_DSRC_CURSOR_RESERVE 2114 +#define WT_STAT_DSRC_CURSOR_RESERVE 2116 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2115 +#define WT_STAT_DSRC_CURSOR_RESET 2117 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2116 +#define WT_STAT_DSRC_CURSOR_RESTART 2118 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2117 +#define WT_STAT_DSRC_CURSOR_SEARCH 2119 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2118 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2120 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2119 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2121 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2120 +#define WT_STAT_DSRC_CURSOR_UPDATE 2122 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2121 +#define WT_STAT_DSRC_REC_DICTIONARY 2123 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2122 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2124 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2123 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2125 /*! 
reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2124 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2126 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2125 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2127 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2126 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2128 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2127 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2129 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2128 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2130 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2129 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2131 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2130 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2132 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2131 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2133 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2132 +#define WT_STAT_DSRC_REC_PAGES 2134 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2133 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2135 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2134 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2136 +/*! session: cached cursor count */ +#define WT_STAT_DSRC_SESSION_CURSOR_CACHED 2137 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2135 +#define WT_STAT_DSRC_SESSION_COMPACT 2138 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2136 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2139 /*! 
transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2137 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2140 /*! * @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index d737c09c391..7050a66a558 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -1188,7 +1188,7 @@ __clsm_reset(WT_CURSOR *cursor) * we want to do is give up our position. */ clsm = (WT_CURSOR_LSM *)cursor; - CURSOR_API_CALL(cursor, session, reset, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_TRET(__clsm_reset_cursors(clsm, NULL)); @@ -1735,7 +1735,7 @@ __wt_clsm_close(WT_CURSOR *cursor) * closing, and the cursor may never have been used. */ clsm = (WT_CURSOR_LSM *)cursor; - CURSOR_API_CALL(cursor, session, close, NULL); + CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL); WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks)); __clsm_free_chunks(session, clsm); @@ -1776,6 +1776,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, __clsm_remove, /* remove */ __clsm_reserve, /* reserve */ __wt_cursor_reconfigure, /* reconfigure */ + __wt_cursor_notsup, /* cache */ + __wt_cursor_reopen_notsup, /* reopen */ __wt_clsm_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LSM *clsm; diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index a2edee8aac4..67604399a2e 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -375,6 +375,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, time_t secs; int64_t maxorder; const char *sep; + char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 2]; WT_ERR(__wt_scr_alloc(session, 0, &buf)); maxorder = 0; @@ -452,6 +453,21 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_catfmt(session, buf, 
",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")", ckptlsn->l.file, (uintmax_t)ckptlsn->l.offset)); + hex_timestamp[0] = '0'; + hex_timestamp[1] = '\0'; +#ifdef HAVE_TIMESTAMPS + /* + * We need to record the timestamp of the checkpoint in the metadata's + * checkpoint record. Although the read_timestamp remains set for the + * duration of the checkpoint, we set and unset the flag based on the + * file's durability. Record the timestamp if the flag is set. + */ + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ)) + WT_ERR(__wt_timestamp_to_hex_string(session, hex_timestamp, + &session->txn.read_timestamp)); +#endif + WT_ERR(__wt_buf_catfmt(session, buf, + ",checkpoint_timestamp=\"%s\"", hex_timestamp)); WT_ERR(__ckpt_set(session, fname, buf->mem)); err: __wt_scr_free(session, &buf); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 6c92434aa2c..3acfae9ab2e 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -114,8 +114,13 @@ __wt_meta_track_discard(WT_SESSION_IMPL *session) int __wt_meta_track_on(WT_SESSION_IMPL *session) { - if (session->meta_track_nest++ == 0) + if (session->meta_track_nest++ == 0) { + if (!F_ISSET(&session->txn, WT_TXN_RUNNING)) { + WT_RET(__wt_txn_begin(session, NULL)); + F_SET(session, WT_SESSION_SCHEMA_TXN); + } WT_RET(__meta_track_next(session, NULL)); + } return (0); } @@ -257,16 +262,21 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) session->meta_track_next = session->meta_track_sub = NULL; /* - * If there were no operations logged, return now and avoid unnecessary - * metadata checkpoints. For example, this happens if attempting to - * create a data source that already exists (or drop one that doesn't). + * If there were no operations logged, skip unnecessary metadata + * checkpoints. 
For example, this happens if attempting to create a + * data source that already exists (or drop one that doesn't). */ if (trk == trk_orig) - return (0); + goto err; /* Unrolling doesn't require syncing the metadata. */ if (unroll) - goto done; + goto err; + + if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) { + F_CLR(session, WT_SESSION_SCHEMA_TXN); + WT_ERR(__wt_txn_commit(session, NULL)); + } /* * If we don't have the metadata cursor (e.g, we're in the process of @@ -274,7 +284,7 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) */ if (!need_sync || session->meta_cursor == NULL || F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - goto done; + goto err; /* If we're logging, make sure the metadata update was flushed. */ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) @@ -303,7 +313,7 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) ret = __wt_checkpoint_sync(session, NULL)); } -done: /* +err: /* * Undo any tracked operations on failure. * Apply any tracked operations post-commit. */ @@ -315,6 +325,18 @@ done: /* } else for (; trk_orig < trk; trk_orig++) WT_TRET(__meta_track_apply(session, trk_orig)); + + if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) { + F_CLR(session, WT_SESSION_SCHEMA_TXN); + /* + * We should have committed above unless we're unrolling, there + * was an error or the operation was a noop. 
+ */ + WT_ASSERT(session, unroll || saved_ret != 0 || + session->txn.mod_count == 0); + WT_TRET(__wt_txn_rollback(session, NULL)); + } + if (ret != 0) WT_PANIC_RET(session, ret, "failed to apply or unroll all tracked operations"); diff --git a/src/third_party/wiredtiger/src/optrack/optrack.c b/src/third_party/wiredtiger/src/optrack/optrack.c index dd630785cd5..1be466d179e 100644 --- a/src/third_party/wiredtiger/src/optrack/optrack.c +++ b/src/third_party/wiredtiger/src/optrack/optrack.c @@ -100,20 +100,21 @@ err: WT_TRET(__wt_close(session, &session->optrack_fh)); * __wt_optrack_flush_buffer -- * Flush optrack buffer. Returns the number of bytes flushed to the file. */ -size_t +void __wt_optrack_flush_buffer(WT_SESSION_IMPL *s) { - WT_DECL_RET; - if (s->optrack_fh == NULL) if (__wt_optrack_open_file(s)) - return (0); + return; - ret = s->optrack_fh->handle->fh_write(s->optrack_fh->handle, + /* + * We're not using the standard write path deliberately, that's quite + * a bit of additional code (including atomic operations), and this + * work should be as light-weight as possible. 
+ */ + if (s->optrack_fh->handle->fh_write(s->optrack_fh->handle, (WT_SESSION *)s, (wt_off_t)s->optrack_offset, - s->optrackbuf_ptr * sizeof(WT_OPTRACK_RECORD), s->optrack_buf); - if (ret == 0) - return (s->optrackbuf_ptr * sizeof(WT_OPTRACK_RECORD)); - else - return (0); + s->optrackbuf_ptr * sizeof(WT_OPTRACK_RECORD), s->optrack_buf) == 0) + s->optrack_offset += + s->optrackbuf_ptr * sizeof(WT_OPTRACK_RECORD); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 044d31b8fbd..f4d0fc0b1ef 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1676,11 +1676,8 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * Minor memory cleanup: if a truncate call deleted this page * and we were ever forced to instantiate the page in memory, * we would have built a list of updates in the page reference - * in order to be able to abort the truncate. It's a cheap - * test to make that memory go away, we do it here because - * there's really nowhere else we do the checks. In short, if - * we have such a list, and the backing address blocks are - * gone, there can't be any transaction that can abort. + * in order to be able to commit/rollback the truncate. We just + * passed a visibility test, discard the update list. */ if (page_del != NULL) { __wt_free(session, ref->page_del->update_list); diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c index 12b577d0ea6..64438f1e0c7 100644 --- a/src/third_party/wiredtiger/src/schema/schema_drop.c +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -30,7 +30,7 @@ __drop_file( WT_RET(__wt_schema_backup_check(session, filename)); /* Close all btree handles associated with this file. 
*/ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, - ret = __wt_conn_dhandle_close_all(session, uri, force)); + ret = __wt_conn_dhandle_close_all(session, uri, true, force)); WT_RET(ret); /* Remove the metadata entry (ignore missing items). */ diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index 0a8a80a214e..128f0ff3f86 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -32,7 +32,7 @@ __rename_file( WT_RET(__wt_schema_backup_check(session, newfile)); /* Close any btree handles in the file. */ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, - ret = __wt_conn_dhandle_close_all(session, uri, false)); + ret = __wt_conn_dhandle_close_all(session, uri, true, false)); WT_ERR(ret); /* diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c index 1012749efc3..407550bfdba 100644 --- a/src/third_party/wiredtiger/src/schema/schema_worker.c +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all( - session, uri, false)); + session, uri, false, false)); WT_ERR(ret); } diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index be3a5d93473..f58fa4319e6 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -50,6 +50,67 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) } /* + * __wt_session_cursor_cache_sweep -- + * Sweep the cursor cache. 
+ */ +int +__wt_session_cursor_cache_sweep(WT_SESSION_IMPL *session) +{ + WT_CURSOR *cursor, *cursor_tmp; + WT_CURSOR_LIST *cached_list; + WT_DECL_RET; + uint32_t position; + int i, t_ret, nbuckets, nexamined, nclosed; + bool productive; + + if (!F_ISSET(session, WT_SESSION_CACHE_CURSORS)) + return (0); + + position = session->cursor_sweep_position; + productive = true; + nbuckets = nexamined = nclosed = 0; + + /* Turn off caching so that cursor close doesn't try to cache. */ + F_CLR(session, WT_SESSION_CACHE_CURSORS); + for (i = 0; i < WT_SESSION_CURSOR_SWEEP_MAX && productive; i++) { + ++nbuckets; + cached_list = &session->cursor_cache[position]; + position = (position + 1) % WT_HASH_ARRAY_SIZE; + TAILQ_FOREACH_SAFE(cursor, cached_list, q, cursor_tmp) { + /* + * First check to see if the cursor could be reopened. + */ + ++nexamined; + t_ret = cursor->reopen(cursor, true); + if (t_ret != 0) { + WT_TRET_NOTFOUND_OK(t_ret); + WT_TRET_NOTFOUND_OK( + cursor->reopen(cursor, false)); + WT_TRET(cursor->close(cursor)); + ++nclosed; + } + } + + /* + * We continue sweeping as long as we have some good average + * productivity. At a minimum, we look at two buckets. + */ + productive = (nclosed >= i); + } + + session->cursor_sweep_position = position; + session->cursor_sweep_countdown = WT_SESSION_CURSOR_SWEEP_COUNTDOWN; + F_SET(session, WT_SESSION_CACHE_CURSORS); + + WT_STAT_CONN_INCR(session, cursor_sweep); + WT_STAT_CONN_INCRV(session, cursor_sweep_buckets, nbuckets); + WT_STAT_CONN_INCRV(session, cursor_sweep_examined, nexamined); + WT_STAT_CONN_INCRV(session, cursor_sweep_closed, nclosed); + + return (ret); +} + +/* * __wt_session_copy_values -- * Copy values into all positioned cursors, so that they don't keep * transaction IDs pinned. @@ -168,6 +229,55 @@ __session_clear(WT_SESSION_IMPL *session) } /* + * __session_close_cursors -- + * Close all cursors in a list. 
+ */ +static int +__session_close_cursors(WT_SESSION_IMPL *session, WT_CURSOR_LIST *cursors) +{ + WT_CURSOR *cursor, *cursor_tmp; + WT_DECL_RET; + + /* Close all open cursors. */ + WT_TAILQ_SAFE_REMOVE_BEGIN(cursor, cursors, q, cursor_tmp) { + if (F_ISSET(cursor, WT_CURSTD_CACHED)) + /* + * Put the cached cursor in an open state + * that allows it to be closed. + */ + WT_TRET_NOTFOUND_OK(cursor->reopen(cursor, false)); + else if (session->event_handler->handle_close != NULL && + !WT_STREQ(cursor->internal_uri, WT_LAS_URI)) + /* + * Notify the user that we are closing the cursor + * handle via the registered close callback. + */ + WT_TRET(session->event_handler->handle_close( + session->event_handler, &session->iface, cursor)); + + WT_TRET(cursor->close(cursor)); + } WT_TAILQ_SAFE_REMOVE_END + + return (ret); +} + +/* + * __session_close_cached_cursors -- + * Fully close all cached cursors. + */ +static int +__session_close_cached_cursors(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + int i; + + for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) + WT_TRET(__session_close_cursors(session, + &session->cursor_cache[i])); + return (ret); +} + +/* * __session_close -- * WT_SESSION->close method. */ @@ -175,16 +285,18 @@ static int __session_close(WT_SESSION *wt_session, const char *config) { WT_CONNECTION_IMPL *conn; - WT_CURSOR *cursor, *cursor_tmp; WT_DECL_RET; WT_SESSION_IMPL *session; conn = (WT_CONNECTION_IMPL *)wt_session->connection; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, close, config, cfg); + SESSION_API_CALL_PREPARE_ALLOWED(session, close, config, cfg); WT_UNUSED(cfg); + /* Close all open cursors while the cursor cache is disabled. */ + F_CLR(session, WT_SESSION_CACHE_CURSORS); + /* Rollback any active transaction. 
*/ if (F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_TRET(__session_rollback_transaction(wt_session, NULL)); @@ -197,17 +309,8 @@ __session_close(WT_SESSION *wt_session, const char *config) __wt_txn_release_snapshot(session); /* Close all open cursors. */ - WT_TAILQ_SAFE_REMOVE_BEGIN(cursor, &session->cursors, q, cursor_tmp) { - /* - * Notify the user that we are closing the cursor handle - * via the registered close callback. - */ - if (session->event_handler->handle_close != NULL && - !WT_STREQ(cursor->internal_uri, WT_LAS_URI)) - WT_TRET(session->event_handler->handle_close( - session->event_handler, wt_session, cursor)); - WT_TRET(cursor->close(cursor)); - } WT_TAILQ_SAFE_REMOVE_END + WT_TRET(__session_close_cursors(session, &session->cursors)); + WT_TRET(__session_close_cached_cursors(session)); WT_ASSERT(session, session->ncursors == 0); @@ -224,17 +327,14 @@ __session_close(WT_SESSION *wt_session, const char *config) __wt_txn_destroy(session); /* - * Close the file where we tracked long operations. Do this before - * releasing resources, as we do scratch buffer management when we flush - * optrack buffers to disk + * Close the file where we tracked long operations. Do this before + * releasing resources, as we do scratch buffer management when we + * flush optrack buffers to disk. 
*/ if (F_ISSET(conn, WT_CONN_OPTRACK)) { if (session->optrackbuf_ptr > 0) { - WT_IGNORE_RET((int)__wt_optrack_flush_buffer(session)); - WT_IGNORE_RET(__wt_close(session, - &session->optrack_fh)); - /* Indicate that the file is closed */ - session->optrack_fh = NULL; + __wt_optrack_flush_buffer(session); + WT_TRET(__wt_close(session, &session->optrack_fh)); } /* Free the operation tracking buffer */ @@ -291,7 +391,12 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, reconfigure, config, cfg); + /* + * Indicated as allowed in prepared state, even though not allowed, + * so that running transaction check below take precedence. + */ + SESSION_API_CALL_PREPARE_ALLOWED( + session, reconfigure, config, cfg); /* * Note that this method only checks keys that are passed in by the @@ -315,6 +420,17 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) } WT_ERR_NOTFOUND_OK(ret); + ret = __wt_config_getones(session, config, "cache_cursors", &cval); + if (ret == 0) { + if (cval.val) + F_SET(session, WT_SESSION_CACHE_CURSORS); + else { + F_CLR(session, WT_SESSION_CACHE_CURSORS); + WT_ERR(__session_close_cached_cursors(session)); + } + } + WT_ERR_NOTFOUND_OK(ret); + err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -417,6 +533,16 @@ __session_open_cursor_int(WT_SESSION_IMPL *session, const char *uri, if (*cursorp == NULL) return (__wt_bad_object_type(session, uri)); + if (owner != NULL) { + /* + * We support caching simple cursors that have no + * children. If this cursor is a child, we're not going + * to cache this child or its parent. 
+ */ + F_CLR(owner, WT_CURSTD_CACHEABLE); + F_CLR(*cursorp, WT_CURSTD_CACHEABLE); + } + /* * When opening simple tables, the table code calls this function on the * underlying data source, in which case the application's URI has been @@ -439,6 +565,15 @@ int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { + WT_DECL_RET; + + if (F_ISSET(session, WT_SESSION_CACHE_CURSORS)) { + if ((ret = __wt_cursor_cache_get( + session, uri, owner, cfg, cursorp)) == 0) + return (0); + WT_RET_NOTFOUND_OK(ret); + } + return (__session_open_cursor_int(session, uri, owner, NULL, cfg, cursorp)); } @@ -461,6 +596,13 @@ __session_open_cursor(WT_SESSION *wt_session, session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, open_cursor, config, cfg); + if (to_dup == NULL && F_ISSET(session, WT_SESSION_CACHE_CURSORS)) { + if ((ret = __wt_cursor_cache_get( + session, uri, NULL, cfg, cursorp)) == 0) + return (0); + WT_RET_NOTFOUND_OK(ret); + } + statjoin = (to_dup != NULL && uri != NULL && WT_STREQ(uri, "statistics:join")); if ((to_dup == NULL && uri == NULL) || @@ -730,7 +872,7 @@ __session_log_printf(WT_SESSION *wt_session, const char *fmt, ...) va_list ap; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL_NOCONF(session, log_printf); + SESSION_API_CALL_NOCONF_PREPARE_NOT_ALLOWED(session, log_printf); va_start(ap, fmt); ret = __wt_log_vprintf(session, fmt, ap); @@ -884,6 +1026,8 @@ __session_reset(WT_SESSION *wt_session) WT_TRET(__wt_session_reset_cursors(session, true)); + WT_TRET(__wt_session_cursor_cache_sweep(session)); + /* Release common session resources. */ WT_TRET(__wt_session_release_resources(session)); @@ -1231,22 +1375,23 @@ __wt_session_range_truncate(WT_SESSION_IMPL *session, * what records currently appear in the object. For this reason, do a * search-near, rather than a search. 
Additionally, we have to correct * after calling search-near, to position the start/stop cursors on the - * next record greater than/less than the original key. + * next record greater than/less than the original key. If we fail to + * find a key in a search-near, there are no keys in the table. If we + * fail to move forward or backward in a range, there are no keys in + * the range. In either of those cases, we're done. */ - if (start != NULL) { - WT_ERR(start->search_near(start, &cmp)); - if (cmp < 0 && (ret = start->next(start)) != 0) { + if (start != NULL) + if ((ret = start->search_near(start, &cmp)) != 0 || + (cmp < 0 && (ret = start->next(start)) != 0)) { WT_ERR_NOTFOUND_OK(ret); goto done; } - } - if (stop != NULL) { - WT_ERR(stop->search_near(stop, &cmp)); - if (cmp > 0 && (ret = stop->prev(stop)) != 0) { + if (stop != NULL) + if ((ret = stop->search_near(stop, &cmp)) != 0 || + (cmp > 0 && (ret = stop->prev(stop)) != 0)) { WT_ERR_NOTFOUND_OK(ret); goto done; } - } /* * We always truncate in the forward direction because the underlying @@ -1282,7 +1427,7 @@ err: /* */ if (local_start) WT_TRET(start->close(start)); - else + else if (start != NULL) WT_TRET(start->reset(start)); if (stop != NULL) WT_TRET(stop->reset(stop)); @@ -1469,7 +1614,12 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, begin_transaction, config, cfg); + /* + * Indicated as allowed in prepared state, even though not allowed, + * so that running transaction check below take precedence. 
+ */ + SESSION_API_CALL_PREPARE_ALLOWED( + session, begin_transaction, config, cfg); WT_STAT_CONN_INCR(session, txn_begin); WT_ERR(__wt_txn_context_check(session, false)); @@ -1491,7 +1641,8 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config) WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, commit_transaction, config, cfg); + SESSION_API_CALL_PREPARE_ALLOWED( + session, commit_transaction, config, cfg); WT_STAT_CONN_INCR(session, txn_commit); WT_ERR(__wt_txn_context_check(session, true)); @@ -1522,20 +1673,48 @@ __session_prepare_transaction(WT_SESSION *wt_session, const char *config) { WT_DECL_RET; WT_SESSION_IMPL *session; + WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, prepare_transaction, config, cfg); WT_ERR(__wt_txn_context_check(session, true)); - WT_TRET(__wt_txn_prepare(session, cfg)); - /* - * Below code to be corrected as part of prepare functionality - * implementation, coded as below to avoid setting error to transaction. + * A failed transaction cannot be prepared, as it cannot guarantee + * a subsequent commit. */ + txn = &session->txn; + if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) + WT_ERR_MSG(session, EINVAL, + "failed transaction requires rollback%s%s", + txn->rollback_reason == NULL ? "" : ": ", + txn->rollback_reason == NULL ? "" : txn->rollback_reason); + + WT_ERR(__wt_txn_prepare(session, cfg)); + +err: API_END_RET(session, ret); + +} + +/* + * __session_prepare_transaction_readonly -- + * WT_SESSION->prepare_transaction method; readonly version. 
+ */ +static int +__session_prepare_transaction_readonly( + WT_SESSION *wt_session, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; -err: API_END_RET_NO_TXN_ERROR(session, ret); + WT_UNUSED(config); + + session = (WT_SESSION_IMPL *)wt_session; + SESSION_API_CALL_NOCONF(session, prepare_transaction); + + ret = __wt_session_notsup(session); +err: API_END_RET(session, ret); } /* @@ -1549,7 +1728,8 @@ __session_rollback_transaction(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, rollback_transaction, config, cfg); + SESSION_API_CALL_PREPARE_ALLOWED( + session, rollback_transaction, config, cfg); WT_STAT_CONN_INCR(session, txn_rollback); WT_ERR(__wt_txn_context_check(session, true)); @@ -1595,7 +1775,7 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) uint64_t pinned; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL_NOCONF(session, pinned_range); + SESSION_API_CALL_NOCONF_PREPARE_NOT_ALLOWED(session, pinned_range); txn_state = WT_SESSION_TXN_STATE(session); @@ -1644,7 +1824,12 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) uint64_t time_start, time_stop; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, transaction_sync, config, cfg); + /* + * Indicated as allowed in prepared state, even though not allowed, + * so that running transaction check below take precedence. + */ + SESSION_API_CALL_PREPARE_ALLOWED( + session, transaction_sync, config, cfg); WT_STAT_CONN_INCR(session, txn_sync); conn = S2C(session); @@ -1738,7 +1923,12 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) session = (WT_SESSION_IMPL *)wt_session; WT_STAT_CONN_INCR(session, txn_checkpoint); - SESSION_API_CALL(session, checkpoint, config, cfg); + /* + * Indicated as allowed in prepared state, even though not allowed, + * so that running transaction check below take precedence. 
+ */ + SESSION_API_CALL_PREPARE_ALLOWED( + session, checkpoint, config, cfg); WT_ERR(__wt_inmem_unsupported_op(session, NULL)); @@ -1913,7 +2103,7 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_verify, __session_begin_transaction, __session_commit_transaction, - __session_prepare_transaction, + __session_prepare_transaction_readonly, __session_rollback_transaction, __session_timestamp_transaction, __session_checkpoint_readonly, @@ -1974,16 +2164,27 @@ __open_session(WT_CONNECTION_IMPL *conn, TAILQ_INIT(&session_ret->cursors); TAILQ_INIT(&session_ret->dhandles); + /* - * If we don't have one, allocate the dhandle hash array. + * If we don't have them, allocate the cursor and dhandle hash arrays. * Allocate the table hash array as well. */ + if (session_ret->cursor_cache == NULL) + WT_ERR(__wt_calloc_def( + session, WT_HASH_ARRAY_SIZE, &session_ret->cursor_cache)); if (session_ret->dhhash == NULL) - WT_ERR(__wt_calloc(session, WT_HASH_ARRAY_SIZE, - sizeof(struct __dhandles_hash), &session_ret->dhhash)); + WT_ERR(__wt_calloc_def( + session, WT_HASH_ARRAY_SIZE, &session_ret->dhhash)); + + /* Initialize the dhandle hash array. */ for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) TAILQ_INIT(&session_ret->dhhash[i]); + /* Initialize the cursor cache hash buckets and sweep trigger. */ + for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) + TAILQ_INIT(&session_ret->cursor_cache[i]); + session_ret->cursor_sweep_countdown = WT_SESSION_CURSOR_SWEEP_COUNTDOWN; + /* Initialize transaction support: default to read-committed. 
*/ session_ret->isolation = WT_ISO_READ_COMMITTED; WT_ERR(__wt_txn_init(session, session_ret)); diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 97c22cd5031..40a07be0174 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -112,6 +112,8 @@ static const char * const __stats_dsrc_desc[] = { "cursor: cursor-insert key and value bytes inserted", "cursor: cursor-remove key bytes removed", "cursor: cursor-update value bytes updated", + "cursor: cursors cached on close", + "cursor: cursors reused from cache", "cursor: insert calls", "cursor: modify calls", "cursor: next calls", @@ -138,6 +140,7 @@ static const char * const __stats_dsrc_desc[] = { "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", "reconciliation: pages deleted", + "session: cached cursor count", "session: object compaction", "session: open cursor count", "transaction: update conflicts", @@ -292,6 +295,8 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cursor_insert_bytes = 0; stats->cursor_remove_bytes = 0; stats->cursor_update_bytes = 0; + stats->cursor_cache = 0; + stats->cursor_reopen = 0; stats->cursor_insert = 0; stats->cursor_modify = 0; stats->cursor_next = 0; @@ -318,6 +323,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->rec_pages = 0; stats->rec_pages_eviction = 0; stats->rec_page_delete = 0; + /* not clearing session_cursor_cached */ stats->session_compact = 0; /* not clearing session_cursor_open */ stats->txn_update_conflict = 0; @@ -473,6 +479,8 @@ __wt_stat_dsrc_aggregate_single( to->cursor_insert_bytes += from->cursor_insert_bytes; to->cursor_remove_bytes += from->cursor_remove_bytes; to->cursor_update_bytes += from->cursor_update_bytes; + to->cursor_cache += from->cursor_cache; + to->cursor_reopen += from->cursor_reopen; to->cursor_insert += from->cursor_insert; to->cursor_modify += 
from->cursor_modify; to->cursor_next += from->cursor_next; @@ -500,6 +508,7 @@ __wt_stat_dsrc_aggregate_single( to->rec_pages += from->rec_pages; to->rec_pages_eviction += from->rec_pages_eviction; to->rec_page_delete += from->rec_page_delete; + to->session_cursor_cached += from->session_cursor_cached; to->session_compact += from->session_compact; to->session_cursor_open += from->session_cursor_open; to->txn_update_conflict += from->txn_update_conflict; @@ -686,6 +695,8 @@ __wt_stat_dsrc_aggregate( to->cursor_insert_bytes += WT_STAT_READ(from, cursor_insert_bytes); to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes); to->cursor_update_bytes += WT_STAT_READ(from, cursor_update_bytes); + to->cursor_cache += WT_STAT_READ(from, cursor_cache); + to->cursor_reopen += WT_STAT_READ(from, cursor_reopen); to->cursor_insert += WT_STAT_READ(from, cursor_insert); to->cursor_modify += WT_STAT_READ(from, cursor_modify); to->cursor_next += WT_STAT_READ(from, cursor_next); @@ -719,6 +730,8 @@ __wt_stat_dsrc_aggregate( to->rec_pages += WT_STAT_READ(from, rec_pages); to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); to->rec_page_delete += WT_STAT_READ(from, rec_page_delete); + to->session_cursor_cached += + WT_STAT_READ(from, session_cursor_cached); to->session_compact += WT_STAT_READ(from, session_compact); to->session_cursor_open += WT_STAT_READ(from, session_cursor_open); to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict); @@ -870,7 +883,13 @@ static const char * const __stats_connection_desc[] = { "cursor: cursor restarted searches", "cursor: cursor search calls", "cursor: cursor search near calls", + "cursor: cursor sweep buckets", + "cursor: cursor sweep cursors closed", + "cursor: cursor sweep cursors examined", + "cursor: cursor sweeps", "cursor: cursor update calls", + "cursor: cursors cached on close", + "cursor: cursors reused from cache", "cursor: truncate calls", "data-handle: connection data handles currently active", 
"data-handle: connection sweep candidate became referenced", @@ -1250,7 +1269,13 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cursor_restart = 0; stats->cursor_search = 0; stats->cursor_search_near = 0; + stats->cursor_sweep_buckets = 0; + stats->cursor_sweep_closed = 0; + stats->cursor_sweep_examined = 0; + stats->cursor_sweep = 0; stats->cursor_update = 0; + stats->cursor_cache = 0; + stats->cursor_reopen = 0; stats->cursor_truncate = 0; /* not clearing dh_conn_handle_count */ stats->dh_sweep_ref = 0; @@ -1670,7 +1695,14 @@ __wt_stat_connection_aggregate( to->cursor_restart += WT_STAT_READ(from, cursor_restart); to->cursor_search += WT_STAT_READ(from, cursor_search); to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); + to->cursor_sweep_buckets += WT_STAT_READ(from, cursor_sweep_buckets); + to->cursor_sweep_closed += WT_STAT_READ(from, cursor_sweep_closed); + to->cursor_sweep_examined += + WT_STAT_READ(from, cursor_sweep_examined); + to->cursor_sweep += WT_STAT_READ(from, cursor_sweep); to->cursor_update += WT_STAT_READ(from, cursor_update); + to->cursor_cache += WT_STAT_READ(from, cursor_cache); + to->cursor_reopen += WT_STAT_READ(from, cursor_reopen); to->cursor_truncate += WT_STAT_READ(from, cursor_truncate); to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count); to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 627bafa7483..b221f211ef7 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -63,6 +63,31 @@ __snapsort(uint64_t *array, uint32_t size) } /* + * __txn_remove_from_global_table -- + * Remove the txn id from the global txn table. 
+ */ +static inline void +__txn_remove_from_global_table(WT_SESSION_IMPL *session) +{ + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + +#ifndef HAVE_DIAGNOSTIC + WT_UNUSED(txn); + WT_UNUSED(txn_global); +#endif + WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); + WT_ASSERT(session, txn->id != WT_TXN_NONE && + txn_state->id != WT_TXN_NONE); + WT_PUBLISH(txn_state->id, WT_TXN_NONE); +} + +/* * __txn_sort_snapshot -- * Sort a snapshot for faster searching and set the min/max bounds. */ @@ -437,84 +462,12 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__wt_txn_named_snapshot_get(session, &cval)); - WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); - if (cval.len > 0) { -#ifdef HAVE_TIMESTAMPS - wt_timestamp_t ts; - WT_TXN_GLOBAL *txn_global; - char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1]; - bool round_to_oldest; - - txn_global = &S2C(session)->txn_global; - WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); - - /* - * Prepare transactions are supported only in timestamp build. - */ - WT_RET(__wt_config_gets_def(session, - cfg, "ignore_prepare", 0, &cval)); - if (cval.val) - F_SET(txn, WT_TXN_IGNORE_PREPARE); - - /* - * Read the configuration here to reduce the span of the - * critical section. - */ - WT_RET(__wt_config_gets_def(session, - cfg, "round_to_oldest", 0, &cval)); - round_to_oldest = cval.val; - /* - * This code is not using the timestamp validate function to - * avoid a race between checking and setting transaction - * timestamp. 
- */ - WT_RET(__wt_timestamp_to_hex_string(session, - hex_timestamp[0], &ts)); - __wt_readlock(session, &txn_global->rwlock); - if (__wt_timestamp_cmp(&ts, &txn_global->oldest_timestamp) < 0) - { - WT_RET(__wt_timestamp_to_hex_string(session, - hex_timestamp[1], &txn_global->oldest_timestamp)); - /* - * If given read timestamp is earlier than oldest - * timestamp then round the read timestamp to - * oldest timestamp. - */ - if (round_to_oldest) - __wt_timestamp_set(&txn->read_timestamp, - &txn_global->oldest_timestamp); - else { - __wt_readunlock(session, &txn_global->rwlock); - WT_RET_MSG(session, EINVAL, "read timestamp " - "%s older than oldest timestamp %s", - hex_timestamp[0], hex_timestamp[1]); - } - } else { - __wt_timestamp_set(&txn->read_timestamp, &ts); - /* - * Reset to avoid a verbose message as read - * timestamp is not rounded to oldest timestamp. - */ - round_to_oldest = false; - } + /* Check if prepared updates should be ignored during reads. */ + WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval)); + if (cval.val) + F_SET(txn, WT_TXN_IGNORE_PREPARE); - __wt_txn_set_read_timestamp(session); - __wt_readunlock(session, &txn_global->rwlock); - txn->isolation = WT_ISO_SNAPSHOT; - if (round_to_oldest) { - /* - * This message is generated here to reduce the span of - * critical section. 
- */ - __wt_verbose(session, WT_VERB_TIMESTAMP, "Read " - "timestamp %s : Rounded to oldest timestamp %s", - hex_timestamp[0], hex_timestamp[1]); - } -#else - WT_RET_MSG(session, EINVAL, "read_timestamp requires a " - "version of WiredTiger built with timestamp support"); -#endif - } + WT_RET(__wt_txn_parse_read_timestamp(session, cfg)); return (0); } @@ -554,18 +507,17 @@ __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; txn = &session->txn; txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { - WT_ASSERT(session, txn_state->id == WT_TXN_NONE); + WT_ASSERT(session, + WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); txn->id = txn_global->checkpoint_state.id = txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; @@ -576,12 +528,13 @@ __wt_txn_release(WT_SESSION_IMPL *session) */ txn_global->checkpoint_id = 0; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { - WT_ASSERT(session, - !WT_TXNID_LT(txn->id, txn_global->last_running)); - - WT_ASSERT(session, txn_state->id != WT_TXN_NONE && - txn->id != WT_TXN_NONE); - WT_PUBLISH(txn_state->id, WT_TXN_NONE); + /* + * If transaction is prepared, this would have been done in + * prepare. + */ + if (!F_ISSET(txn, WT_TXN_PREPARE)) { + __txn_remove_from_global_table(session); + } txn->id = WT_TXN_NONE; } @@ -650,8 +603,7 @@ __txn_commit_timestamp_validate(WT_SESSION_IMPL *session) * are at a later timestamp or use timestamps inconsistently. */ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) - if (op->type == WT_TXN_OP_BASIC_TS || - op->type == WT_TXN_OP_BASIC) { + if (op->type == WT_TXN_OP_BASIC) { /* * Skip over any aborted update structures or ones * from our own transaction. 
@@ -750,7 +702,9 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif - } + } else if (F_ISSET(txn, WT_TXN_PREPARE)) + WT_ERR_MSG(session, EINVAL, + "commit_timestamp is required for a prepared transaction"); #ifdef HAVE_TIMESTAMPS WT_ERR(__txn_commit_timestamp_validate(session)); @@ -805,8 +759,10 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * We are about to release the snapshot: copy values into any * positioned cursors so they don't point to updates that could be * freed once we don't have a snapshot. + * If this transaction is prepared, then copying values would have been + * done during prepare. */ - if (session->ncursors > 0) { + if (session->ncursors > 0 && !F_ISSET(txn, WT_TXN_PREPARE)) { WT_DIAGNOSTIC_YIELD; WT_ERR(__wt_session_copy_values(session)); } @@ -838,7 +794,6 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { switch (op->type) { case WT_TXN_OP_BASIC: - case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: /* * Switch reserved operations to abort to @@ -860,22 +815,50 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) } #ifdef HAVE_TIMESTAMPS - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - op->type != WT_TXN_OP_BASIC_TS) { - WT_ASSERT(session, - op->fileid != WT_METAFILE_ID); - __wt_timestamp_set(&op->u.upd->timestamp, - &txn->commit_timestamp); + if (__wt_txn_update_needs_timestamp(session, op)) { + if (F_ISSET(txn, WT_TXN_PREPARE)) { + WT_ASSERT(session, op->u.upd != NULL); + /* + * In case of a prepared transaction, + * the order of modification of the + * prepare timestamp to the commit + * timestamp in the update chain will + * not affect the data visibility, a + * reader will encounter a prepared + * update resulting in prepare conflict. 
+ * + * As updating timestamp might not be an + * atomic operation, we will manage + * using state. + */ + FLD_SET(op->u.upd->state, + WT_UPDATE_STATE_LOCKED); + __wt_timestamp_set( + &op->u.upd->timestamp, + &txn->commit_timestamp); + FLD_SET(op->u.upd->state, + WT_UPDATE_STATE_READY); + } else + __wt_timestamp_set( + &op->u.upd->timestamp, + &txn->commit_timestamp); } #endif break; - case WT_TXN_OP_REF: + case WT_TXN_OP_REF_DELETE: #ifdef HAVE_TIMESTAMPS - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + if (__wt_txn_update_needs_timestamp(session, op)) { + WT_UPDATE **upd; + __wt_timestamp_set( &op->u.ref->page_del->timestamp, &txn->commit_timestamp); + for (upd = op->u.ref->page_del->update_list; + *upd != NULL; ++upd) + __wt_timestamp_set(&(*upd)->timestamp, + &txn->commit_timestamp); + } #endif break; @@ -973,15 +956,115 @@ err: /* int __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_UNUSED(cfg); - #ifdef HAVE_TIMESTAMPS - WT_RET_MSG(session, ENOTSUP, "prepare_transaction is not supported"); + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_OP *op; + wt_timestamp_t ts; + u_int i; + + txn = &session->txn; + WT_TRET(__wt_txn_context_check(session, true)); + + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + /* Transaction should not have a commit timestamp set. */ + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); + + /* Look for a prepare timestamp. */ + WT_ERR( + __wt_config_gets_def(session, cfg, "prepare_timestamp", 0, &cval)); + if (cval.len != 0) { + WT_ERR(__wt_txn_parse_timestamp(session, + "prepare", &ts, &cval)); + + /* TODO : Validate prepare timestamp. */ + + __wt_timestamp_set(&txn->prepare_timestamp, &ts); + + } else + WT_ERR_MSG(session, EINVAL, "prepare timestamp is required"); + +#ifdef HAVE_DIAGNOSTIC + /* + * Transaction should not have updated any of the logged tables. 
+ */ + WT_ASSERT(session, txn->logrec == NULL); +#endif + + /* + * We are about to release the snapshot: copy values into any + * positioned cursors so they don't point to updates that could be + * freed once we don't have a snapshot. + */ + if (session->ncursors > 0) { + WT_DIAGNOSTIC_YIELD; + WT_ERR(__wt_session_copy_values(session)); + } + + /* Process updates. */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { + switch (op->type) { + case WT_TXN_OP_BASIC: + case WT_TXN_OP_INMEM: + /* + * Switch reserved operation to abort to simplify + * obsolete update list truncation. + */ + if (op->u.upd->type == WT_UPDATE_RESERVE) { + op->u.upd->txnid = WT_TXN_ABORTED; + break; + } + + /* + * Assert to make sure the lookaside writes are not + * happening here. + */ + WT_ASSERT(session, + !(S2C(session)->cache->las_fileid != 0 && + op->fileid == S2C(session)->cache->las_fileid)); + + /* Set prepare timestamp. */ + if (op->fileid != WT_METAFILE_ID) + __wt_timestamp_set(&op->u.upd->timestamp, &ts); + + FLD_SET(op->u.upd->state, WT_UPDATE_STATE_PREPARED); + break; + + case WT_TXN_OP_REF_DELETE: + __wt_timestamp_set( + &op->u.ref->page_del->timestamp, &ts); + break; + case WT_TXN_OP_TRUNCATE_COL: + case WT_TXN_OP_TRUNCATE_ROW: + /* Other operations don't need timestamps. */ + break; + } + } + + /* Set transaction state to prepare. */ + F_SET(&session->txn, WT_TXN_PREPARE); + + /* Release our snapshot in case it is keeping data pinned. */ + __wt_txn_release_snapshot(session); + + /* + * Clear the transaction's ID from the global table, to facilitate + * prepared data visibility, but not from local txn structure. + */ + if (F_ISSET(txn, WT_TXN_HAS_ID)) { + __txn_remove_from_global_table(session); + } + +err: return (ret); #else + WT_UNUSED(cfg); WT_RET_MSG(session, ENOTSUP, "prepare_transaction requires a version " "of WiredTiger built with timestamp support"); #endif } + /* * __wt_txn_rollback -- * Roll back the current transaction. 
@@ -1014,7 +1097,6 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) switch (op->type) { case WT_TXN_OP_BASIC: - case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: WT_ASSERT(session, op->u.upd->txnid == txn->id); WT_ASSERT(session, @@ -1022,15 +1104,15 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) op->fileid != S2C(session)->cache->las_fileid); op->u.upd->txnid = WT_TXN_ABORTED; break; - case WT_TXN_OP_REF: - __wt_delete_page_rollback(session, op->u.ref); + case WT_TXN_OP_REF_DELETE: + WT_TRET(__wt_delete_page_rollback(session, op->u.ref)); break; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: /* * Nothing to do: these operations are only logged for * recovery. The in-memory changes will be rolled back - * with a combination of WT_TXN_OP_REF and + * with a combination of WT_TXN_OP_REF_DELETE and * WT_TXN_OP_INMEM operations. */ break; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 616816f0e8d..0a1636ecef6 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -612,7 +612,8 @@ __checkpoint_fail_reset(WT_SESSION_IMPL *session) * Start the transaction for a checkpoint and gather handles. */ static int -__checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) +__checkpoint_prepare( + WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; @@ -646,6 +647,10 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_RET(__wt_txn_id_check(session)); + /* Keep track of handles acquired for locking. */ + WT_RET(__wt_meta_track_on(session)); + *trackingp = true; + /* * Mark the connection as clean. 
If some data gets modified after * generating checkpoint transaction id, connection will be reset to @@ -706,7 +711,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) __wt_timestamp_set( &txn->read_timestamp, &txn_global->stable_timestamp); F_SET(txn, WT_TXN_HAS_TS_READ); - } + } else + __wt_timestamp_set_zero(&txn->read_timestamp); #else WT_UNUSED(use_timestamp); #endif @@ -820,10 +826,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) generation = __wt_gen_next(session, WT_GEN_CHECKPOINT); WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation); - /* Keep track of handles acquired for locking. */ - WT_ERR(__wt_meta_track_on(session)); - tracking = true; - /* * We want to skip checkpointing clean handles whenever possible. That * is, when the checkpoint is not named or forced. However, we need to @@ -839,7 +841,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Hold the schema lock while starting the transaction and gathering * handles so the set we get is complete and correct. 
*/ - WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, cfg)); + WT_WITH_SCHEMA_LOCK(session, + ret = __checkpoint_prepare(session, &tracking, cfg)); WT_ERR(ret); WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index d31a9e27583..c7a5f2e03d1 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -165,9 +165,8 @@ __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op) { switch (op->type) { case WT_TXN_OP_BASIC: - case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: - case WT_TXN_OP_REF: + case WT_TXN_OP_REF_DELETE: case WT_TXN_OP_TRUNCATE_COL: break; @@ -246,11 +245,10 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) switch (op->type) { case WT_TXN_OP_BASIC: - case WT_TXN_OP_BASIC_TS: ret = __txn_op_log(session, logrec, op, cbt); break; case WT_TXN_OP_INMEM: - case WT_TXN_OP_REF: + case WT_TXN_OP_REF_DELETE: /* Nothing to log, we're done. */ break; case WT_TXN_OP_TRUNCATE_COL: diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index e058d08cc17..0f6b99a341e 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -24,6 +24,7 @@ typedef struct { u_int max_fileid; /* Maximum file ID seen. */ WT_LSN max_lsn; /* Maximum checkpoint LSN seen. */ u_int nfiles; /* Number of files in the metadata. */ + WT_DECL_TIMESTAMP(max_timestamp) WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. 
*/ @@ -346,6 +347,7 @@ static int __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) { WT_CONFIG_ITEM cval; + WT_DECL_TIMESTAMP(ckpt_timestamp) WT_LSN lsn; uint32_t fileid, lsnfile, lsnoffset; @@ -362,6 +364,29 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) r->nfiles = fileid + 1; } +#ifdef HAVE_TIMESTAMPS + /* + * If we're reading the config for the metadata from the turtle file + * save the stable timestamp of the last checkpoint for later query. + * This gets saved in the connection. + */ + WT_CLEAR(cval); + WT_RET_NOTFOUND_OK(__wt_config_getones(r->session, + config, "checkpoint_timestamp", &cval)); + if (cval.len != 0) { + __wt_verbose(r->session, WT_VERB_RECOVERY, + "%s: Recovery timestamp %.*s", + uri, (int)cval.len, cval.str); + WT_RET(__wt_txn_parse_timestamp_raw(r->session, "recovery", + &ckpt_timestamp, &cval)); + /* + * Keep track of the largest checkpoint timestamp seen. + */ + if (__wt_timestamp_cmp(&ckpt_timestamp, &r->max_timestamp) > 0) + __wt_timestamp_set(&r->max_timestamp, &ckpt_timestamp); + } +#endif + WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri)); WT_RET( __wt_config_getones(r->session, config, "checkpoint_lsn", &cval)); @@ -472,6 +497,10 @@ __wt_txn_recover(WT_SESSION_IMPL *session) false, WT_SESSION_NO_LOGGING, &session)); r.session = session; WT_MAX_LSN(&r.max_lsn); +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set_zero(&conn->txn_global.recovery_timestamp); + __wt_timestamp_set_zero(&r.max_timestamp); +#endif F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); @@ -565,6 +594,15 @@ __wt_txn_recover(WT_SESSION_IMPL *session) r.files[0].c = NULL; WT_ERR(metac->close(metac)); +#ifdef HAVE_TIMESTAMPS + /* + * After recovering the metadata, set the recovery timestamp to the + * largest one we recovered. 
+ */ + __wt_timestamp_set( + &conn->txn_global.recovery_timestamp, &r.max_timestamp); +#endif + /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index d2d07b9e6d7..d31b3995092 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -279,7 +279,7 @@ __txn_rollback_to_stable_btree_walk( /* Review deleted page saved to the ref */ if (ref->page_del != NULL && __wt_timestamp_cmp( rollback_timestamp, &ref->page_del->timestamp) < 0) - __wt_delete_page_rollback(session, ref); + WT_RET(__wt_delete_page_rollback(session, ref)); if (!__wt_page_is_modified(ref->page)) continue; diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index d07bfecd47c..8a7e0dad83e 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -82,12 +82,12 @@ __wt_verbose_timestamp(WT_SESSION_IMPL *session, } /* - * __wt_txn_parse_timestamp -- - * Decodes and sets a timestamp. + * __wt_txn_parse_timestamp_raw -- + * Decodes and sets a timestamp. Don't do any checking. */ int -__wt_txn_parse_timestamp(WT_SESSION_IMPL *session, - const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) +__wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name, + wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) { __wt_timestamp_set_zero(timestamp); @@ -172,7 +172,19 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, ts.data, ts.size); } #endif - if (__wt_timestamp_iszero(timestamp)) + return (0); +} + +/* + * __wt_txn_parse_timestamp -- + * Decodes and sets a timestamp checking it is non-zero. 
+ */ +int +__wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, + wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) +{ + WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval)); + if (cval->len != 0 && __wt_timestamp_iszero(timestamp)) WT_RET_MSG(session, EINVAL, "Failed to parse %s timestamp '%.*s': zero not permitted", name, (int)cval->len, cval->str); @@ -254,7 +266,10 @@ __txn_global_query_timestamp( __wt_timestamp_cmp(&txn->read_timestamp, &ts) < 0) __wt_timestamp_set(&ts, &txn->read_timestamp); __wt_readunlock(session, &txn_global->read_timestamp_rwlock); - } else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { + } else if (WT_STRING_MATCH("recovery", cval.str, cval.len)) + /* Read-only value forever. No lock needed. */ + __wt_timestamp_set(&ts, &txn_global->recovery_timestamp); + else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { if (!txn_global->has_stable_timestamp) return (WT_NOTFOUND); WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, @@ -617,7 +632,7 @@ __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, /* * __wt_txn_set_timestamp -- - * Set a transaction's timestamp. + * Parse a request to set a timestamp in a transaction. */ int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) @@ -625,19 +640,14 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_CONFIG_ITEM cval; WT_DECL_RET; - /* - * Look for a commit timestamp. - */ + /* Look for a commit timestamp. 
*/ ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval); if (ret == 0 && cval.len != 0) { #ifdef HAVE_TIMESTAMPS WT_TXN *txn = &session->txn; wt_timestamp_t ts; - if (!F_ISSET(txn, WT_TXN_RUNNING)) - WT_RET_MSG(session, EINVAL, - "Transaction must be running " - "to set a commit_timestamp"); + WT_TRET(__wt_txn_context_check(session, true)); WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); WT_RET(__wt_timestamp_validate(session, "commit", &ts, &cval, true, true, true)); @@ -650,6 +660,108 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) } WT_RET_NOTFOUND_OK(ret); + /* Look for a read timestamp. */ + WT_RET(__wt_txn_parse_read_timestamp(session, cfg)); + + return (0); +} + +/* + * __wt_txn_parse_read_timestamp -- + * Parse a request to set a transaction's read_timestamp. + */ +int +__wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG_ITEM cval; + WT_TXN *txn; + + txn = &session->txn; + + WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); + if (cval.len > 0) { +#ifdef HAVE_TIMESTAMPS + wt_timestamp_t ts; + WT_TXN_GLOBAL *txn_global; + char hex_timestamp[2][2 * WT_TIMESTAMP_SIZE + 1]; + bool round_to_oldest; + + txn_global = &S2C(session)->txn_global; + WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); + + /* Read timestamps imply / require snapshot isolation. */ + if (!F_ISSET(txn, WT_TXN_RUNNING)) + txn->isolation = WT_ISO_SNAPSHOT; + else if (txn->isolation != WT_ISO_SNAPSHOT) + WT_RET_MSG(session, EINVAL, "setting a read_timestamp" + " requires a transaction running at snapshot" + " isolation"); + + /* Read timestamps can't change once set. */ + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + WT_RET_MSG(session, EINVAL, "a read_timestamp" + " may only be set once per transaction"); + + /* + * Read the configuration here to reduce the span of the + * critical section. 
+ */ + WT_RET(__wt_config_gets_def(session, + cfg, "round_to_oldest", 0, &cval)); + round_to_oldest = cval.val; + /* + * This code is not using the timestamp validate function to + * avoid a race between checking and setting transaction + * timestamp. + */ + WT_RET(__wt_timestamp_to_hex_string(session, + hex_timestamp[0], &ts)); + __wt_readlock(session, &txn_global->rwlock); + if (__wt_timestamp_cmp( + &ts, &txn_global->oldest_timestamp) < 0) { + WT_RET(__wt_timestamp_to_hex_string(session, + hex_timestamp[1], &txn_global->oldest_timestamp)); + /* + * If given read timestamp is earlier than oldest + * timestamp then round the read timestamp to + * oldest timestamp. + */ + if (round_to_oldest) + __wt_timestamp_set(&txn->read_timestamp, + &txn_global->oldest_timestamp); + else { + __wt_readunlock(session, &txn_global->rwlock); + WT_RET_MSG(session, EINVAL, "read timestamp " + "%s older than oldest timestamp %s", + hex_timestamp[0], hex_timestamp[1]); + } + } else { + __wt_timestamp_set(&txn->read_timestamp, &ts); + /* + * Reset to avoid a verbose message as read + * timestamp is not rounded to oldest timestamp. + */ + round_to_oldest = false; + } + + __wt_txn_set_read_timestamp(session); + __wt_readunlock(session, &txn_global->rwlock); + if (round_to_oldest) { + /* + * This message is generated here to reduce the span of + * critical section. 
+ */ + __wt_verbose(session, WT_VERB_TIMESTAMP, "Read " + "timestamp %s : Rounded to oldest timestamp %s", + hex_timestamp[0], hex_timestamp[1]); + } +#else + WT_UNUSED(txn); + WT_RET_MSG(session, EINVAL, "read_timestamp requires a " + "version of WiredTiger built with timestamp support"); +#endif + } + return (0); } diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index 2cf9a69110c..3e37e2e291e 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -70,7 +70,6 @@ static const char * const uri_local = "table:local"; static const char * const uri_oplog = "table:oplog"; static const char * const uri_collection = "table:collection"; -static const char * const stable_store = "table:stable"; static const char * const ckpt_file = "checkpoint_done"; static bool compat, inmem, use_ts; @@ -118,7 +117,6 @@ usage(void) static WT_THREAD_RET thread_ts_run(void *arg) { - WT_CURSOR *cur_stable; WT_SESSION *session; THREAD_DATA *td; uint64_t i, last_ts, oldest_ts, this_ts; @@ -128,9 +126,6 @@ thread_ts_run(void *arg) last_ts = 0; testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session)); - testutil_check(session->open_cursor( - session, stable_store, NULL, NULL, &cur_stable)); - /* * Every N records we will record our stable timestamp into the stable * table. That will define our threshold where we expect to find records @@ -170,9 +165,6 @@ thread_ts_run(void *arg) testutil_check( td->conn->set_timestamp(td->conn, tscfg)); last_ts = oldest_ts; - cur_stable->set_key(cur_stable, td->info); - cur_stable->set_value(cur_stable, oldest_ts); - testutil_check(cur_stable->insert(cur_stable)); } else ts_wait: __wt_sleep(0, 1000); } @@ -392,8 +384,6 @@ run_workload(uint32_t nth) * Don't log the stable timestamp table so that we know what timestamp * was stored at the checkpoint. 
*/ - testutil_check(session->create(session, stable_store, - "key_format=Q,value_format=Q,log=(enabled=false)")); testutil_check(session->close(session, NULL)); /* @@ -507,12 +497,12 @@ main(int argc, char *argv[]) FILE *fp; REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH]; WT_CONNECTION *conn; - WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_stable; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog; WT_RAND_STATE rnd; WT_SESSION *session; pid_t pid; uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key; - uint64_t stable_fp, stable_val, val[MAX_TH+1]; + uint64_t stable_fp, stable_val; uint32_t i, nth, timeout; int ch, status, ret; const char *working_dir; @@ -653,7 +643,7 @@ main(int argc, char *argv[]) */ testutil_check(__wt_snprintf(buf, sizeof(buf), "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && " - "cp -p WiredTigerLog.* ../%s.SAVE", + "cp -p * ../%s.SAVE", home, home, home)); if ((status = system(buf)) < 0) testutil_die(status, "system: %s", buf); @@ -673,26 +663,17 @@ main(int argc, char *argv[]) uri_local, NULL, NULL, &cur_local)); testutil_check(session->open_cursor(session, uri_oplog, NULL, NULL, &cur_oplog)); - testutil_check(session->open_cursor(session, - stable_store, NULL, NULL, &cur_stable)); /* * Find the biggest stable timestamp value that was saved. 
*/ stable_val = 0; - memset(val, 0, sizeof(val)); - while (cur_stable->next(cur_stable) == 0) { - testutil_check(cur_stable->get_key(cur_stable, &key)); - testutil_check(cur_stable->get_value(cur_stable, &val[key])); - if (val[key] > stable_val) - stable_val = val[key]; - - if (use_ts) - printf("Stable: key %" PRIu64 " value %" PRIu64 "\n", - key, val[key]); - } - if (use_ts) + if (use_ts) { + testutil_check( + conn->query_timestamp(conn, buf, "get=recovery")); + sscanf(buf, "%" SCNx64, &stable_val); printf("Got stable_val %" PRIu64 "\n", stable_val); + } count = 0; absent_coll = absent_local = absent_oplog = 0; @@ -761,11 +742,11 @@ main(int argc, char *argv[]) * larger than the saved one. */ if (!inmem && - stable_fp != 0 && stable_fp <= val[i]) { + stable_fp != 0 && stable_fp <= stable_val) { printf("%s: COLLECTION no record with " "key %" PRIu64 " record ts %" PRIu64 " <= stable ts %" PRIu64 "\n", - fname, key, stable_fp, val[i]); + fname, key, stable_fp, stable_val); absent_coll++; } if (c_rep[i].first_miss == INVALID_KEY) @@ -779,6 +760,18 @@ main(int argc, char *argv[]) */ c_rep[i].exist_key = key; fatal = true; + } else if (!inmem && + stable_fp != 0 && stable_fp > stable_val) { + /* + * If we found a record, the stable timestamp + * written to our file better be no larger + * than the checkpoint one. + */ + printf("%s: COLLECTION record with " + "key %" PRIu64 " record ts %" PRIu64 + " > stable ts %" PRIu64 "\n", + fname, key, stable_fp, stable_val); + fatal = true; } /* * The local table should always have all data. diff --git a/src/third_party/wiredtiger/test/format/CONFIG.stress b/src/third_party/wiredtiger/test/format/CONFIG.stress new file mode 100644 index 00000000000..0b5251d7952 --- /dev/null +++ b/src/third_party/wiredtiger/test/format/CONFIG.stress @@ -0,0 +1,7 @@ +# A reasonable configuration for stress testing. 
+cache_minimum=20 +huffman_key=0 +huffman_value=0 +rows=1000000 +runs=100 +timer=4 diff --git a/src/third_party/wiredtiger/test/format/bdb.c b/src/third_party/wiredtiger/test/format/bdb.c index adf32713cd2..c5d11dcefc7 100644 --- a/src/third_party/wiredtiger/test/format/bdb.c +++ b/src/third_party/wiredtiger/test/format/bdb.c @@ -32,6 +32,10 @@ static DBT key, value; static WT_ITEM keyitem; +#define bdb_die(ret, fmt, ...) \ + testutil_die(0, "%s/%d: %s: " fmt, \ + __func__, __LINE__, db_strerror(ret), __VA_ARGS__); + static int bdb_compare_reverse(DB *dbp, const DBT *k1, const DBT *k2 #if DB_VERSION_MAJOR >= 6 @@ -69,7 +73,7 @@ bdb_open(void) DB_CREATE | DB_INIT_LOCK | DB_INIT_MPOOL | DB_PRIVATE, 0) == 0); assert(db_create(&db, dbenv, 0) == 0); - if (g.type == ROW && g.c_reverse) + if (g.c_reverse) assert(db->set_bt_compare(db, bdb_compare_reverse) == 0); assert(db->open( @@ -127,7 +131,7 @@ bdb_np(bool next, if ((ret = dbc->get(dbc, &key, &value, next ? DB_NEXT : DB_PREV)) != 0) { if (ret != DB_NOTFOUND) - testutil_die(ret, "dbc.get: %s: {%.*s}", + bdb_die(ret, "dbc.get: %s: {%.*s}", next ? 
"DB_NEXT" : "DB_PREV", (int)key.size, (char *)key.data); *notfoundp = 1; @@ -152,7 +156,7 @@ bdb_read(uint64_t keyno, void *valuep, size_t *valuesizep, int *notfoundp) *notfoundp = 0; if ((ret = dbc->get(dbc, &key, &value, DB_SET)) != 0) { if (ret != DB_NOTFOUND) - testutil_die(ret, "dbc.get: DB_SET: {%.*s}", + bdb_die(ret, "dbc.get: DB_SET: {%.*s}", (int)key.size, (char *)key.data); *notfoundp = 1; } else { @@ -174,7 +178,7 @@ bdb_update(const void *arg_key, size_t arg_key_size, value.size = (u_int32_t)arg_value_size; if ((ret = dbc->put(dbc, &key, &value, DB_KEYFIRST)) != 0) - testutil_die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}", + bdb_die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}", (int)key.size, (char *)key.data, (int)value.size, (char *)value.data); } @@ -196,10 +200,82 @@ bdb_remove(uint64_t keyno, int *notfoundp) if (*notfoundp) return; - if ((ret = dbc->del(dbc, 0)) != 0) { - if (ret != DB_NOTFOUND) - testutil_die(ret, "dbc.del: {%.*s}", + /* Deleting a fixed-length item is the same as setting the bits to 0. */ + if (g.type == FIX) + bdb_update(key.data, key.size, "", 1); + else + if ((ret = dbc->del(dbc, 0)) != 0) { + if (ret != DB_NOTFOUND) + bdb_die(ret, "dbc.del: {%.*s}", + (int)key.size, (char *)key.data); + *notfoundp = 1; + } +} + +void +bdb_truncate(uint64_t start, uint64_t stop) +{ + DBC *dbc = g.dbc; + size_t len; + int cmp, ret, notfound; + + /* Deleting a fixed-length item is the same as setting the bits to 0. */ + if (g.type == FIX) { + /* + * If we're deleting from/to the start/end of the database, + * correct for the number of records we have. 
+ */ + if (start == 0) + start = 1; + if (stop == 0) + stop = g.rows; + for (; start <= stop; ++start) + bdb_remove(start, &notfound); + return; + } + + if (start == 0) { + ret = dbc->get(dbc, &key, &value, DB_FIRST); + if (ret != 0 && ret != DB_NOTFOUND) + bdb_die(ret, "%s", "dbc.get: DB_FIRST"); + } else { + key_gen(&keyitem, start); + key.data = (void *)keyitem.data; + key.size = (u_int32_t)keyitem.size; + ret = dbc->get(dbc, &key, &value, DB_SET_RANGE); + if (ret != 0 && ret != DB_NOTFOUND) + bdb_die(ret, "dbc.get: DB_SET: {%.*s}", (int)key.size, (char *)key.data); - *notfoundp = 1; } + if (ret == DB_NOTFOUND) + return; + + if (stop == 0) { + do { + ret = dbc->del(dbc, 0); + if (ret != 0 && ret != DB_NOTFOUND) + bdb_die(ret, "dbc.del: {%.*s}", + (int)key.size, (char *)key.data); + } while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0); + } else { + key_gen(&keyitem, stop); + do { + len = WT_MIN(key.size, keyitem.size); + cmp = memcmp(key.data, keyitem.data, len); + if (g.c_reverse) { + if (cmp < 0 || + (cmp == 0 && key.size < keyitem.size)) + break; + } else + if (cmp > 0 || + (cmp == 0 && key.size > keyitem.size)) + break; + ret = dbc->del(dbc, 0); + if (ret != 0 && ret != DB_NOTFOUND) + bdb_die(ret, "dbc.del: {%.*s}", + (int)key.size, (char *)key.data); + } while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0); + } + if (ret != 0 && ret != DB_NOTFOUND) + bdb_die(ret, "%s", "dbc.get: DB_NEXT"); } diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 709eb9a4a26..8d85d331c89 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -149,8 +149,11 @@ config_setup(void) if (DATASOURCE("kvsbdb") && access(KVS_BDB_PATH, R_OK) != 0) testutil_die(errno, "kvsbdb shared library: %s", KVS_BDB_PATH); - /* Some data-sources don't support user-specified collations. 
*/ - if (DATASOURCE("kvsbdb")) + /* + * Only row-store tables support collation order. + * Some data-sources don't support user-specified collations. + */ + if (g.type != ROW || DATASOURCE("kvsbdb")) config_single("reverse=off", 0); /* @@ -185,6 +188,13 @@ config_setup(void) if (g.c_cache_minimum != 0 && g.c_cache < g.c_cache_minimum) g.c_cache = g.c_cache_minimum; + /* + * Turn off truncate for LSM runs (some configurations with truncate + * always results in a timeout). + */ + if (!config_is_perm("truncate") && DATASOURCE("lsm")) + config_single("truncate=off", 0); + /* Give Helium configuration a final review. */ if (DATASOURCE("helium")) config_helium_reset(); @@ -588,7 +598,7 @@ config_pct(void) /* Cursor modify isn't possible for fixed-length column store. */ if (g.type == FIX) { - if (config_is_perm("modify_pct")) + if (config_is_perm("modify_pct") && g.c_modify_pct != 0) testutil_die(EINVAL, "WT_CURSOR.modify not supported by fixed-length " "column store"); @@ -603,7 +613,7 @@ config_pct(void) */ if (g.c_isolation_flag == ISOLATION_READ_UNCOMMITTED) { if (config_is_perm("isolation")) { - if (config_is_perm("modify_pct")) + if (config_is_perm("modify_pct") && g.c_modify_pct != 0) testutil_die(EINVAL, "WT_CURSOR.modify not supported with " "read-uncommitted transactions"); diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 565df91d46b..3ea93e28b99 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -334,6 +334,10 @@ static CONFIG c[] = { "percent operations done inside an explicit transaction", 0x0, 1, 100, 100, &g.c_txn_freq, NULL }, + { "truncate", /* 100% */ + "enable truncation", + C_BOOL, 100, 0, 0, &g.c_truncate, NULL }, + { "value_max", "maximum size of values", 0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL }, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h 
index a80c7de5c92..8c14a9f43fb 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -122,7 +122,9 @@ typedef struct { WT_RAND_STATE rnd; /* Global RNG state */ - uint64_t timestamp; /* Counter for timestamps. */ + uint64_t timestamp; /* Counter for timestamps */ + + uint64_t truncate_cnt; /* Counter for truncation */ /* * We have a list of records that are appended, but not yet "resolved", @@ -209,6 +211,7 @@ typedef struct { uint32_t c_timer; uint32_t c_txn_freq; uint32_t c_txn_timestamps; + uint32_t c_truncate; uint32_t c_value_max; uint32_t c_value_min; uint32_t c_verify; @@ -270,18 +273,22 @@ typedef struct { uint64_t commit_timestamp; /* last committed timestamp */ uint64_t read_timestamp; /* read timestamp */ - bool quit; /* thread should quit */ + volatile bool quit; /* thread should quit */ uint64_t search; /* operation counts */ uint64_t insert; uint64_t update; uint64_t remove; + uint64_t truncate; uint64_t ops; - uint64_t keyno; /* current key, value */ - WT_ITEM *key, _key; + uint64_t keyno; /* key */ + WT_ITEM *key, _key; /* key, value */ WT_ITEM *value, _value; + uint64_t last; /* truncate range */ + WT_ITEM *lastkey, _lastkey; + #define TINFO_RUNNING 1 /* Running */ #define TINFO_COMPLETE 2 /* Finished */ #define TINFO_JOINED 3 /* Resolved */ @@ -295,6 +302,7 @@ void bdb_np(bool, void *, size_t *, void *, size_t *, int *); void bdb_open(void); void bdb_read(uint64_t, void *, size_t *, int *); void bdb_remove(uint64_t, int *); +void bdb_truncate(uint64_t, uint64_t); void bdb_update(const void *, size_t, const void *, size_t); #endif diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index bc11c2ba8f8..a250c295a77 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -32,6 +32,7 @@ static int col_insert(TINFO *, WT_CURSOR *); static int col_modify(TINFO *, WT_CURSOR *, bool); 
static int col_remove(TINFO *, WT_CURSOR *, bool); static int col_reserve(TINFO *, WT_CURSOR *, bool); +static int col_truncate(TINFO *, WT_CURSOR *); static int col_update(TINFO *, WT_CURSOR *, bool); static int nextprev(TINFO *, WT_CURSOR *, bool); static WT_THREAD_RET ops(void *); @@ -40,6 +41,7 @@ static int row_insert(TINFO *, WT_CURSOR *, bool); static int row_modify(TINFO *, WT_CURSOR *, bool); static int row_remove(TINFO *, WT_CURSOR *, bool); static int row_reserve(TINFO *, WT_CURSOR *, bool); +static int row_truncate(TINFO *, WT_CURSOR *); static int row_update(TINFO *, WT_CURSOR *, bool); static void table_append_init(void); @@ -269,10 +271,11 @@ wts_ops(int lastrun) free(tinfo_list); } -typedef enum { INSERT, MODIFY, READ, REMOVE, UPDATE } thread_op; +typedef enum { INSERT, MODIFY, READ, REMOVE, TRUNCATE, UPDATE } thread_op; typedef struct { thread_op op; /* Operation */ uint64_t keyno; /* Row number */ + uint64_t last; /* Inclusive end of a truncate range */ void *kdata; /* If an insert, the generated key */ size_t ksize; @@ -283,10 +286,10 @@ typedef struct { size_t vmemsize; } SNAP_OPS; -#define SNAP_TRACK(op, keyno, key, value) do { \ +#define SNAP_TRACK(op, tinfo) do { \ if (snap != NULL && \ (size_t)(snap - snap_list) < WT_ELEMENTS(snap_list)) \ - snap_track(snap++, op, keyno, key, value); \ + snap_track(snap++, op, tinfo); \ } while (0) /* @@ -294,28 +297,30 @@ typedef struct { * Add a single snapshot isolation returned value to the list. */ static void -snap_track( - SNAP_OPS *snap, thread_op op, uint64_t keyno, WT_ITEM *key, WT_ITEM *value) +snap_track(SNAP_OPS *snap, thread_op op, TINFO *tinfo) { - snap->op = op; - snap->keyno = keyno; + WT_ITEM *ip; - testutil_assert(key == NULL || (op == INSERT && g.type == ROW)); - if (key != NULL) { - if (snap->kmemsize < key->size) { - snap->kdata = drealloc(snap->kdata, key->size); - snap->kmemsize = key->size; + snap->op = op; + snap->keyno = tinfo->keyno; + snap->last = op == TRUNCATE ? 
tinfo->last : 0; + + if (op == INSERT && g.type == ROW) { + ip = tinfo->key; + if (snap->kmemsize < ip->size) { + snap->kdata = drealloc(snap->kdata, ip->size); + snap->kmemsize = ip->size; } - memcpy(snap->kdata, key->data, snap->ksize = key->size); + memcpy(snap->kdata, ip->data, snap->ksize = ip->size); } - testutil_assert(value != NULL || op == REMOVE); - if (value != NULL) { - if (snap->vmemsize < value->size) { - snap->vdata = drealloc(snap->vdata, value->size); - snap->vmemsize = value->size; + if (op != REMOVE && op != TRUNCATE) { + ip = tinfo->value; + if (snap->vmemsize < ip->size) { + snap->vdata = drealloc(snap->vdata, ip->size); + snap->vmemsize = ip->size; } - memcpy(snap->vdata, value->data, snap->vsize = value->size); + memcpy(snap->vdata, ip->data, snap->vsize = ip->size); } } @@ -332,9 +337,33 @@ snap_check(WT_CURSOR *cursor, uint8_t bitfield; for (; start < stop; ++start) { + /* + * We don't test all of the records in a truncate range, only + * the first because that matches the rest of the isolation + * checks. If a truncate range was from the start of the table, + * switch to the record at the end. + */ + if (start->op == TRUNCATE && start->keyno == 0) { + start->keyno = start->last; + testutil_assert(start->keyno != 0); + } + /* Check for subsequent changes to this record. 
*/ - for (p = start + 1; p < stop && p->keyno != start->keyno; ++p) - ; + for (p = start + 1; p < stop; ++p) { + if (p->keyno == start->keyno) + break; + + if (p->op != TRUNCATE) + continue; + if (g.c_reverse && + (p->keyno == 0 || p->keyno >= start->keyno) && + (p->last == 0 || p->last <= start->keyno)) + break; + if (!g.c_reverse && + (p->keyno == 0 || p->keyno <= start->keyno) && + (p->last == 0 || p->last >= start->keyno)) + break; + } if (p != stop) continue; @@ -359,7 +388,9 @@ snap_check(WT_CURSOR *cursor, break; } } - if ((ret = cursor->search(cursor)) == 0) { + + switch (ret = cursor->search(cursor)) { + case 0: if (g.type == FIX) { testutil_check( cursor->get_value(cursor, &bitfield)); @@ -368,16 +399,23 @@ snap_check(WT_CURSOR *cursor, } else testutil_check( cursor->get_value(cursor, value)); - } else - if (ret != WT_NOTFOUND) - return (ret); + break; + case WT_NOTFOUND: + break; + case WT_ROLLBACK: + return (WT_ROLLBACK); + default: + testutil_die(ret, "WT_CURSOR.search"); + } /* Check for simple matches. 
*/ - if (ret == 0 && start->op != REMOVE && + if (ret == 0 && + start->op != REMOVE && start->op != TRUNCATE && value->size == start->vsize && memcmp(value->data, start->vdata, value->size) == 0) continue; - if (ret == WT_NOTFOUND && start->op == REMOVE) + if (ret == WT_NOTFOUND && + (start->op == REMOVE || start->op == TRUNCATE)) continue; /* @@ -389,7 +427,7 @@ snap_check(WT_CURSOR *cursor, if (ret == WT_NOTFOUND && start->vsize == 1 && *(uint8_t *)start->vdata == 0) continue; - if (start->op == REMOVE && + if ((start->op == REMOVE || start->op == TRUNCATE) && value->size == 1 && *(uint8_t *)value->data == 0) continue; } @@ -551,10 +589,10 @@ ops(void *arg) WT_DECL_RET; WT_SESSION *session; thread_op op; - uint64_t reset_op, session_op; - uint32_t rnd; + uint64_t reset_op, session_op, truncate_op; + uint32_t range, rnd; u_int i, j, iso_config; - bool intxn, next, positioned, readonly; + bool greater_than, intxn, next, positioned, readonly; tinfo = arg; @@ -571,6 +609,8 @@ ops(void *arg) key_gen_init(tinfo->key); tinfo->value = &tinfo->_value; val_gen_init(tinfo->value); + tinfo->lastkey = &tinfo->_lastkey; + key_gen_init(tinfo->lastkey); /* Set the first operation where we'll create sessions and cursors. */ cursor = NULL; @@ -579,6 +619,9 @@ ops(void *arg) /* Set the first operation where we'll reset the session. */ reset_op = mmrand(&tinfo->rnd, 100, 10000); + /* Set the first operation where we'll truncate a range. */ + truncate_op = g.c_truncate == 0 ? + UINT64_MAX : mmrand(&tinfo->rnd, 100, 10000); for (intxn = false; !tinfo->quit; ++tinfo->ops) { /* Periodically open up a new session and cursors. */ @@ -683,7 +726,13 @@ ops(void *arg) op = READ; if (!readonly) { i = mmrand(&tinfo->rnd, 1, 100); - if (i < g.c_delete_pct) + if (i < g.c_delete_pct && tinfo->ops > truncate_op) { + op = TRUNCATE; + + /* Pick the next truncate operation. 
*/ + truncate_op += + mmrand(&tinfo->rnd, 20000, 100000); + } else if (i < g.c_delete_pct) op = REMOVE; else if (i < g.c_delete_pct + g.c_insert_pct) op = INSERT; @@ -707,8 +756,7 @@ ops(void *arg) ret = read_row(tinfo, cursor); if (ret == 0) { positioned = true; - SNAP_TRACK( - READ, tinfo->keyno, NULL, tinfo->value); + SNAP_TRACK(READ, tinfo); } else { if (ret == WT_ROLLBACK && intxn) goto deadlock; @@ -763,9 +811,7 @@ ops(void *arg) positioned = false; if (ret == 0) { ++tinfo->insert; - SNAP_TRACK(INSERT, tinfo->keyno, - g.type == ROW ? tinfo->key : NULL, - tinfo->value); + SNAP_TRACK(INSERT, tinfo); } else { if (ret == WT_ROLLBACK && intxn) goto deadlock; @@ -791,8 +837,7 @@ ops(void *arg) } if (ret == 0) { positioned = true; - SNAP_TRACK( - MODIFY, tinfo->keyno, NULL, tinfo->value); + SNAP_TRACK(MODIFY, tinfo); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) @@ -806,8 +851,7 @@ ops(void *arg) ret = read_row(tinfo, cursor); if (ret == 0) { positioned = true; - SNAP_TRACK( - READ, tinfo->keyno, NULL, tinfo->value); + SNAP_TRACK(READ, tinfo); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) @@ -816,15 +860,14 @@ ops(void *arg) } break; case REMOVE: +remove_instead_of_truncate: switch (g.type) { case ROW: - ret = - row_remove(tinfo, cursor, positioned); + ret = row_remove(tinfo, cursor, positioned); break; case FIX: case VAR: - ret = - col_remove(tinfo, cursor, positioned); + ret = col_remove(tinfo, cursor, positioned); break; } if (ret == 0) { @@ -833,7 +876,7 @@ ops(void *arg) * Don't set positioned: it's unchanged from the * previous state, but not necessarily set. */ - SNAP_TRACK(REMOVE, tinfo->keyno, NULL, NULL); + SNAP_TRACK(REMOVE, tinfo); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) @@ -841,6 +884,79 @@ ops(void *arg) testutil_assert(ret == WT_NOTFOUND); } break; + case TRUNCATE: + /* + * A maximum of 2 truncation operations at a time, more + * than that can lead to serious thrashing. 
+ */ + if (__wt_atomic_addv64(&g.truncate_cnt, 1) > 2) { + (void)__wt_atomic_subv64(&g.truncate_cnt, 1); + goto remove_instead_of_truncate; + } + + if (!positioned) + tinfo->keyno = + mmrand(&tinfo->rnd, 1, (u_int)g.rows); + + /* + * Truncate up to 5% of the table. If the range overlaps + * the beginning/end of the table, set the key to 0 (the + * truncate function then sets a cursor to NULL so that + * code is tested). + * + * This gets tricky: there are 2 directions (truncating + * from lower keys to the current position or from + * the current position to higher keys), and collation + * order (truncating from lower keys to higher keys or + * vice-versa). + */ + greater_than = mmrand(&tinfo->rnd, 0, 1) == 1; + range = mmrand(&tinfo->rnd, 1, (u_int)g.rows / 20); + tinfo->last = tinfo->keyno; + if (greater_than) { + if (g.c_reverse) { + if (tinfo->keyno <= range) + tinfo->last = 0; + else + tinfo->last -= range; + } else { + tinfo->last += range; + if (tinfo->last > g.rows) + tinfo->last = 0; + } + } else { + if (g.c_reverse) { + tinfo->keyno += range; + if (tinfo->keyno > g.rows) + tinfo->keyno = 0; + } else { + if (tinfo->keyno <= range) + tinfo->keyno = 0; + else + tinfo->keyno -= range; + } + } + switch (g.type) { + case ROW: + ret = row_truncate(tinfo, cursor); + break; + case FIX: + case VAR: + ret = col_truncate(tinfo, cursor); + break; + } + positioned = false; + (void)__wt_atomic_subv64(&g.truncate_cnt, 1); + + if (ret == 0) { + ++tinfo->truncate; + SNAP_TRACK(TRUNCATE, tinfo); + } else { + testutil_assert(ret == WT_ROLLBACK); + if (intxn) + goto deadlock; + } + break; case UPDATE: update_instead_of_chosen_op: ++tinfo->update; @@ -855,8 +971,7 @@ update_instead_of_chosen_op: } if (ret == 0) { positioned = true; - SNAP_TRACK( - UPDATE, tinfo->keyno, NULL, tinfo->value); + SNAP_TRACK(UPDATE, tinfo); } else { positioned = false; if (ret == WT_ROLLBACK && intxn) @@ -937,6 +1052,7 @@ deadlock: ++tinfo->deadlock; } key_gen_teardown(tinfo->key); 
val_gen_teardown(tinfo->value); + key_gen_teardown(tinfo->lastkey); tinfo->state = TINFO_COMPLETE; return (WT_THREAD_RET_VALUE); @@ -1242,7 +1358,8 @@ mismatch: if (g.type == ROW) { } else { if ((p = (char *)strchr(bdb_key.data, '.')) != NULL) *p = '\0'; - fprintf(stderr, "\t%.*s != %" PRIu64 "\n", + fprintf(stderr, + "\t" "bdb-key %.*s != wt-key %" PRIu64 "\n", (int)bdb_key.size, (char *)bdb_key.data, keyno); } print_item("bdb-value", &bdb_value); @@ -1463,6 +1580,72 @@ col_modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } /* + * row_truncate -- + * Truncate rows in a row-store file. + */ +static int +row_truncate(TINFO *tinfo, WT_CURSOR *cursor) +{ + WT_CURSOR *c2; + WT_DECL_RET; + WT_SESSION *session; + + session = cursor->session; + + /* + * The code assumes we're never truncating the entire object, assert + * that fact. + */ + testutil_assert(tinfo->keyno != 0 || tinfo->last != 0); + + c2 = NULL; + if (tinfo->keyno == 0) { + key_gen(tinfo->key, tinfo->last); + cursor->set_key(cursor, tinfo->key); + ret = session->truncate(session, NULL, NULL, cursor, NULL); + } else if (tinfo->last == 0) { + key_gen(tinfo->key, tinfo->keyno); + cursor->set_key(cursor, tinfo->key); + ret = session->truncate(session, NULL, cursor, NULL, NULL); + } else { + key_gen(tinfo->key, tinfo->keyno); + cursor->set_key(cursor, tinfo->key); + + testutil_check( + session->open_cursor(session, g.uri, NULL, NULL, &c2)); + key_gen(tinfo->lastkey, tinfo->last); + cursor->set_key(c2, tinfo->lastkey); + + ret = session->truncate(session, NULL, cursor, c2, NULL); + testutil_check(c2->close(c2)); + } + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, session, + "%-10s%" PRIu64 ", %" PRIu64, + "truncate", + tinfo->keyno, tinfo->last); + + switch (ret) { + case 0: + break; + case WT_CACHE_FULL: + case WT_ROLLBACK: + return (WT_ROLLBACK); + default: + testutil_die(ret, + "row_truncate: row %" PRIu64 "-%" PRIu64, + tinfo->keyno, tinfo->last); + } + +#ifdef HAVE_BERKELEY_DB + 
if (SINGLETHREADED) + bdb_truncate(tinfo->keyno, tinfo->last); +#endif + return (0); +} + +/* * row_update -- * Update a row in a row-store file. */ @@ -1497,12 +1680,72 @@ row_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (0); + if (SINGLETHREADED) + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); +#endif + return (0); +} - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); +/* + * col_truncate -- + * Truncate rows in a column-store file. + */ +static int +col_truncate(TINFO *tinfo, WT_CURSOR *cursor) +{ + WT_CURSOR *c2; + WT_DECL_RET; + WT_SESSION *session; + + session = cursor->session; + + /* + * The code assumes we're never truncating the entire object, assert + * that fact. + */ + testutil_assert(tinfo->keyno != 0 || tinfo->last != 0); + + c2 = NULL; + if (tinfo->keyno == 0) { + cursor->set_key(cursor, tinfo->last); + ret = session->truncate(session, NULL, NULL, cursor, NULL); + } else if (tinfo->last == 0) { + cursor->set_key(cursor, tinfo->keyno); + ret = session->truncate(session, NULL, cursor, NULL, NULL); + } else { + cursor->set_key(cursor, tinfo->keyno); + + testutil_check( + session->open_cursor(session, g.uri, NULL, NULL, &c2)); + cursor->set_key(c2, tinfo->last); + + ret = session->truncate(session, NULL, cursor, c2, NULL); + testutil_check(c2->close(c2)); + } + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, session, + "%-10s%" PRIu64 "-%" PRIu64, + "truncate", + tinfo->keyno, tinfo->last); + + switch (ret) { + case 0: + break; + case WT_CACHE_FULL: + case WT_ROLLBACK: + return (WT_ROLLBACK); + default: + testutil_die(ret, + "col_truncate: row %" PRIu64 "-%" PRIu64, + tinfo->keyno, tinfo->last); + } + +#ifdef HAVE_BERKELEY_DB + if (SINGLETHREADED) + bdb_truncate(tinfo->keyno, tinfo->last); #endif return (0); } @@ -1549,13 +1792,12 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, 
bool positioned) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (0); - - key_gen(tinfo->key, tinfo->keyno); - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); + if (SINGLETHREADED) { + key_gen(tinfo->key, tinfo->keyno); + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); + } #endif return (0); } @@ -1699,12 +1941,10 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (0); - - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); + if (SINGLETHREADED) + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); #endif return (0); } @@ -1751,13 +1991,12 @@ col_insert(TINFO *tinfo, WT_CURSOR *cursor) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (0); - - key_gen(tinfo->key, tinfo->keyno); - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); + if (SINGLETHREADED) { + key_gen(tinfo->key, tinfo->keyno); + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); + } #endif return (0); } @@ -1795,14 +2034,11 @@ row_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (ret); - - { - int notfound; + if (SINGLETHREADED) { + int notfound; - bdb_remove(tinfo->keyno, &notfound); - (void)notfound_chk("row_remove", ret, notfound, tinfo->keyno); + bdb_remove(tinfo->keyno, &notfound); + (void)notfound_chk("row_remove", ret, notfound, tinfo->keyno); } #endif return (ret); @@ -1839,17 +2075,7 @@ col_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } #ifdef HAVE_BERKELEY_DB - if (!SINGLETHREADED) - return (ret); - - /* - * Deleting a fixed-length item is the same as setting the bits to 0; - * do the same thing for the BDB store.
- */ - if (g.type == FIX) { - key_gen(tinfo->key, tinfo->keyno); - bdb_update(tinfo->key->data, tinfo->key->size, "", 1); - } else { + if (SINGLETHREADED) { int notfound; bdb_remove(tinfo->keyno, &notfound); diff --git a/src/third_party/wiredtiger/test/suite/test_cursor13.py b/src/third_party/wiredtiger/test/suite/test_cursor13.py new file mode 100644 index 00000000000..faab6477c48 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_cursor13.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE.
+ +import wiredtiger, wttest, time +from wiredtiger import stat +from wtscenario import make_scenarios +import test_cursor01, test_cursor02, test_cursor03 +import test_checkpoint01, test_checkpoint02 +from wtdataset import SimpleDataSet, ComplexDataSet, ComplexLSMDataSet +from helper import confirm_does_not_exist +from suite_random import suite_random + +# Cursor caching tests +class test_cursor13_base(wttest.WiredTigerTestCase): + conn_config = 'statistics=(fast)' + stat_cursor_cache = 0 + stat_cursor_reopen = 0 + + def setUpSessionOpen(self, conn): + return conn.open_session('cache_cursors=true') + + def caching_stats(self): + stat_cursor = self.session.open_cursor('statistics:', None, None) + cache = stat_cursor[stat.conn.cursor_cache][2] + reopen = stat_cursor[stat.conn.cursor_reopen][2] + stat_cursor.close() + return [cache, reopen] + + def sweep_stats(self): + stat_cursor = self.session.open_cursor('statistics:', None, None) + sweep = stat_cursor[stat.conn.cursor_sweep][2] + buckets = stat_cursor[stat.conn.cursor_sweep_buckets][2] + examined = stat_cursor[stat.conn.cursor_sweep_examined][2] + closed = stat_cursor[stat.conn.cursor_sweep_closed][2] + stat_cursor.close() + return [sweep, buckets, examined, closed] + + def assert_cursor_cached(self, expect_change): + stats = self.caching_stats() + if expect_change: + self.assertGreater(stats[0], self.stat_cursor_cache) + self.stat_cursor_cache = stats[0] + else: + self.assertEqual(stats[0], self.stat_cursor_cache) + + def assert_cursor_reopened(self, expect_change): + stats = self.caching_stats() + if expect_change: + self.assertGreater(stats[1], self.stat_cursor_reopen) + self.stat_cursor_reopen = stats[1] + else: + self.assertEqual(stats[1], self.stat_cursor_reopen) + + def cursor_stats_init(self): + stats = self.caching_stats() + self.stat_cursor_cache = stats[0] + self.stat_cursor_reopen = stats[1] + +# Override other cursor tests with cursors cached. 
+class test_cursor13_01(test_cursor01.test_cursor01, test_cursor13_base): + pass + +class test_cursor13_02(test_cursor02.test_cursor02, test_cursor13_base): + pass + +class test_cursor13_03(test_cursor03.test_cursor03, test_cursor13_base): + pass + +class test_cursor13_ckpt01(test_checkpoint01.test_checkpoint, + test_cursor13_base): + pass + +class test_cursor13_ckpt02(test_checkpoint01.test_checkpoint_cursor, + test_cursor13_base): + pass + +class test_cursor13_ckpt03(test_checkpoint01.test_checkpoint_target, + test_cursor13_base): + pass + +class test_cursor13_ckpt04(test_checkpoint01.test_checkpoint_cursor_update, + test_cursor13_base): + pass + +class test_cursor13_ckpt05(test_checkpoint01.test_checkpoint_last, + test_cursor13_base): + pass + +class test_cursor13_ckpt06(test_checkpoint01.test_checkpoint_empty, + test_cursor13_base): + pass + +class test_cursor13_ckpt2(test_checkpoint02.test_checkpoint02, + test_cursor13_base): + pass + +class test_cursor13_reopens(test_cursor13_base): + scenarios = make_scenarios([ + ('file', dict(uri='file:cursor13_reopen1', dstype=None)), + ('table', dict(uri='table:cursor13_reopen2', dstype=None)), + ('sfile', dict(uri='file:cursor13_reopen3', dstype=SimpleDataSet)), + ('stable', dict(uri='table:cursor13_reopen4', dstype=SimpleDataSet)), + ('ctable', dict(uri='table:cursor13_reopen5', dstype=ComplexDataSet)), + ('clsm', dict(uri='table:cursor13_reopen6', dstype=ComplexLSMDataSet)) + ]) + + def basic_populate(self, uri, caching_enabled): + cursor = self.session.open_cursor(uri) + cursor['A'] = 'B' + cursor.close() + self.assert_cursor_cached(caching_enabled) + cursor = self.session.open_cursor(uri) + self.assert_cursor_reopened(caching_enabled) + cursor['B'] = 'C' + cursor.close() + self.assert_cursor_cached(caching_enabled) + + def basic_check(self, cursor): + count = 0 + for x,y in cursor: + if count == 0: + self.assertEqual('A', x) + self.assertEqual('B', y) + elif count == 1: + self.assertEqual('B', x) + 
self.assertEqual('C', y) + count += 1 + self.assertEqual(count, 2) + + def basic_reopen(self, nopens, create, caching_enabled): + session = self.session + if create: + session.create(self.uri, 'key_format=S,value_format=S') + self.basic_populate(self.uri, caching_enabled) + # At this point, we've cached one cursor. + + # Reopen/close many times, with multiple cursors + for opens in range(0, nopens): + # We expect a cursor to be reopened if we did the + # create operation above or if this is the second or later + # time through the loop. + c = session.open_cursor(self.uri) + self.assert_cursor_reopened(caching_enabled and \ + (opens != 0 or create)) + + # With one cursor for this URI already open, we'll only + # get a reopened cursor if this is the second or later + # time through the loop. + c2 = session.open_cursor(self.uri) + self.assert_cursor_reopened(caching_enabled and opens != 0) + + self.basic_check(c) + self.basic_check(c2) + c.close() + self.assert_cursor_cached(caching_enabled) + c2.close() + self.assert_cursor_cached(caching_enabled) + + def dataset_reopen(self, caching_enabled): + ds = self.dstype(self, self.uri, 100) + ds.populate() + self.assert_cursor_cached(caching_enabled) + ds.check() + self.assert_cursor_reopened(caching_enabled) + + def test_reopen(self): + self.cursor_stats_init() + if self.dstype == None: + self.basic_reopen(100, True, True) + else: + self.dataset_reopen(True) + + def test_reconfig(self): + if self.dstype == None: + self.cursor_stats_init() + self.basic_reopen(10, True, True) + self.session.reconfigure('cache_cursors=false') + self.cursor_stats_init() + self.basic_reopen(10, False, False) + self.session.reconfigure('cache_cursors=true') + self.cursor_stats_init() + self.basic_reopen(10, False, True) + +class test_cursor13_drops(test_cursor13_base): + def open_and_drop(self, uri, cursor_session, drop_session, nopens, ntrials): + for i in range(0, ntrials): + cursor_session.create(uri, 'key_format=S,value_format=S') + for i in 
range(0, nopens): + c = cursor_session.open_cursor(uri) + c.close() + # The cursor cache is unaffected by the drop, and nothing + # in the cache should prevent the drop from occuring. + drop_session.drop(uri) + confirm_does_not_exist(self, uri) + + def test_open_and_drop(self): + session = self.session + for uri in [ 'file:test_cursor13_drops', 'table:test_cursor13_drops' ]: + self.open_and_drop(uri, session, session, 0, 5) + self.open_and_drop(uri, session, session, 1, 5) + self.open_and_drop(uri, session, session, 3, 5) + + # It should still work with different sessions + session2 = self.conn.open_session(None) + self.open_and_drop(uri, session2, session, 0, 5) + self.open_and_drop(uri, session2, session, 1, 5) + self.open_and_drop(uri, session2, session, 3, 5) + session2.close() + + def test_open_index_and_drop(self): + # We should also be able to detect cached cursors + # for indices + session = self.session + uri = 'table:test_cursor13_drops' + ds = ComplexDataSet(self, uri, 100) + ds.create() + indexname = ds.index_name(0) + c = session.open_cursor(indexname) + # The index is really open, so we cannot drop the main table. + self.assertRaises(wiredtiger.WiredTigerError, + lambda: session.drop(uri)) + c.close() + session.drop(uri) + confirm_does_not_exist(self, uri) + + # Same test for indices, but with cursor held by another session. + # TODO: try with session that DOES have cache_cursors and another + # that does not. 
+ session2 = self.conn.open_session(None) + ds = ComplexDataSet(self, uri, 100) + ds.create() + indexname = ds.index_name(0) + c = session2.open_cursor(indexname) + self.assertRaises(wiredtiger.WiredTigerError, + lambda: session.drop(uri)) + c.close() + session.drop(uri) + confirm_does_not_exist(self, uri) + session2.close() + + def test_cursor_drops(self): + session = self.session + uri = 'table:test_cursor13_drops' + idxuri = 'index:test_cursor13_drops:index1' + config = 'key_format=S,value_format=S,columns=(k,v1)' + + for i in range(0, 2): + session.create(uri, config) + session.drop(uri) + + for i in range(0, 2): + session.create(uri, config) + cursor = session.open_cursor(uri, None) + cursor['A'] = 'B' + self.assertRaises(wiredtiger.WiredTigerError, + lambda: session.drop(uri)) + cursor.close() + session.drop(uri) + + for i in range(0, 2): + session.create(uri, config) + session.create(idxuri, 'columns=(v1)') + cursor = session.open_cursor(uri, None) + cursor['A'] = 'B' + self.assertRaises(wiredtiger.WiredTigerError, + lambda: session.drop(uri)) + cursor.close() + session.drop(uri) + + for i in range(0, 2): + session.create(uri, config) + session.create(idxuri, 'columns=(v1)') + cursor = session.open_cursor(uri, None) + cursor['A'] = 'B' + cursor.close() + cursor = session.open_cursor(idxuri, None) + self.assertRaises(wiredtiger.WiredTigerError, + lambda: session.drop(uri)) + cursor.close() + session.drop(uri) + +class test_cursor13_sweep(test_cursor13_base): + aggressive_sweep = False + scenarios = make_scenarios([ + ('file', dict(uri='file:cursor13_sweep_a')), + ('table', dict(uri='table:cursor13_sweep_b')) + ]) + + deep = 3 + nuris = 100 + nopens = 500000 + def uriname(self, i): + return self.uri + '.' + str(i) + + def test_cursor_sweep(self): + rand = suite_random() + + # Create a large number (self.nuris) of uris, and for each one, + # create some number (self.deep) of cached cursors. 
+ urimap = {} + for i in xrange(0, self.nuris): + uri = self.uriname(i) + cursors = [] + self.session.create(uri, None) + for j in xrange(0, self.deep): + cursors.append(self.session.open_cursor(uri, None)) + for c in cursors: + c.close() + + # Each map entry has a list of the open cursors. + # We start with none + urimap[uri] = [] + + # At this point, we'll randomly open/close lots of cursors, keeping + # track of how many of each. As long as we don't have more than [deep] + # cursors open for each uri, we should always be taking then from + # the set of cached cursors. + self.cursor_stats_init() + begin_stats = self.caching_stats() + #self.tty('stats before = ' + str(begin_stats)) + + opencount = 0 + closecount = 0 + + while opencount < self.nopens: + uri = self.uriname(rand.rand_range(0, self.nuris)) + cursors = urimap[uri] + ncursors = len(cursors) + + # Keep the range of open cursors between 0 and [deep], + # with some random fluctuation + if ncursors == 0: + do_open = True + elif ncursors == self.deep: + do_open = False + else: + do_open = (rand.rand_range(0, 2) == 0) + if do_open: + cursors.append(self.session.open_cursor(uri, None)) + opencount += 1 + else: + i = rand.rand_range(0, ncursors) + cursors.pop(i).close() + closecount += 1 + + end_stats = self.caching_stats() + + #self.tty('opens = ' + str(opencount) + ', closes = ' + str(closecount)) + #self.tty('stats after = ' + str(end_stats)) + self.assertEquals(end_stats[0] - begin_stats[0], closecount) + self.assertEquals(end_stats[1] - begin_stats[1], opencount) diff --git a/src/third_party/wiredtiger/test/suite/test_metadata_cursor02.py b/src/third_party/wiredtiger/test/suite/test_metadata_cursor02.py new file mode 100644 index 00000000000..bf1f8b95f14 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_metadata_cursor02.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. 
+# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_metadata_cursor02.py +# Metadata cursor operations with invalid metadata +# +# Test metadata cursor semantics when the underlying metadata is invalid. +# This can happen after a crash, or if part of a table is dropped separate +# from dropping the whole table. 
+class test_metadata_cursor02(wttest.WiredTigerTestCase): + """ + Test metadata cursor operations with invalid metadata + """ + table_name1 = 'table:t1' + table_name2 = 'table:t2' + table_name3 = 'table:t3' + tables = [table_name1, table_name2, table_name3] + + scenarios = make_scenarios([ + ('plain', {'metauri' : 'metadata:'}), + ('create', {'metauri' : 'metadata:create'}), + ], [ + ('drop_colgroup', {'drop' : 'colgroup'}), + ('drop_file', {'drop' : 'file'}), + ]) + + # Create tables + def create_tables(self): + # Reopen to make sure we can drop anything left over from the last run + self.reopen_conn() + for name in self.tables: + self.session.drop(name, 'force=true') + self.session.create(name, 'key_format=S,value_format=S') + + # Forward iteration. + def test_missing(self): + for name in self.tables: + self.create_tables() + + # Invalidate the table by dropping part of it + if self.drop == 'colgroup': + self.session.drop('colgroup:' + name[-2:]) + else: + self.session.drop('file:' + name[-2:] + '.wt') + + cursor = self.session.open_cursor(self.metauri) + is_create_cursor = self.metauri.endswith('create') + count = 0 + for k, v in cursor: + self.pr('Found metadata entry: ' + k) + if k.startswith('table:'): + count += 1 + cursor.close() + + if is_create_cursor: + self.captureerr.checkAdditionalPattern(self, 'metadata information.*not found') + + # Should include the metadata and the two valid tables + self.assertEqual(count, self.metauri.endswith('create') and 2 or 3) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_metadata_cursor03.py b/src/third_party/wiredtiger/test/suite/test_metadata_cursor03.py new file mode 100644 index 00000000000..e7c4e40dc6a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_metadata_cursor03.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. 
+# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import sys, wiredtiger, wttest +from wtscenario import make_scenarios + +# test_metadata_cursor03.py +# Test atomic schema operations on create. +class test_metadata03(wttest.WiredTigerTestCase): + conn_config = 'log=(enabled)' + types = [ + ('file', dict(uri='file:', use_cg=False, use_index=False)), + ('lsm', dict(uri='lsm:', use_cg=False, use_index=False)), + ('table-cg', dict(uri='table:', use_cg=True, use_index=False)), + ('table-index', dict(uri='table:', use_cg=False, use_index=True)), + ('table-simple', dict(uri='table:', use_cg=False, use_index=False)), + ] + scenarios = make_scenarios(types) + + # Count actual log records in the log. Log cursors walk the individual + # operations of a transaction as well as the entire record. 
Skip counting + # any individual commit operations and only count entire records. + def count_logrecs(self): + count = 0 + c = self.session.open_cursor('log:', None, None) + while c.next() == 0: + # lsn.file, lsn.offset, opcount + keys = c.get_key() + # Only count whole records, which is when opcount is zero. + # If opcount is not zero it is an operation of a commit. + if keys[2] == 0: + count += 1 + c.close() + return count + + def verify_logrecs(self, origcnt): + # + # Walk through all the log and make sure that creating any table + # only writes two log records to the log. The two records are the + # commit entry itself and the sync record for the metadata file. + # + count = self.count_logrecs() + self.assertTrue(count == origcnt + 2) + + # Test that creating and dropping tables does not write individual + # log records. + def test_metadata03_create(self): + uri = self.uri + 'table0' + create_params = 'key_format=i,value_format=S,' + + cgparam = '' + if self.use_cg or self.use_index: + cgparam = 'columns=(k,v),' + if self.use_cg: + cgparam += 'colgroups=(g0),' + + # Create main table. + origcnt = self.count_logrecs() + self.session.create(uri, create_params + cgparam) + self.verify_logrecs(origcnt) + # Add in column group or index tables. + if self.use_cg: + # Create. + cgparam = 'columns=(v),' + suburi = 'colgroup:table0:g0' + origcnt = self.count_logrecs() + self.session.create(suburi, cgparam) + self.verify_logrecs(origcnt) + + if self.use_index: + # Create. + suburi = 'index:table0:i0' + origcnt = self.count_logrecs() + self.session.create(suburi, cgparam) + self.verify_logrecs(origcnt) + + # Dropping the main table will also drop all index or column group tables. 
+ origcnt = self.count_logrecs() + self.session.drop(uri) + self.verify_logrecs(origcnt) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare01.py b/src/third_party/wiredtiger/test/suite/test_prepare01.py index f4ef7248228..0039e9106f0 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare01.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare01.py @@ -110,15 +110,14 @@ class test_prepare01(wttest.WiredTigerTestCase): committed = 0 cursor = self.session.open_cursor(self.uri, None) self.check(cursor, 0, 0) - msg = "/prepare_transaction is not supported/" + # Currently ignore_prepare is not realized yet, hence no effect. self.session.begin_transaction("ignore_prepare=false") for i in xrange(self.nentries): if i > 0 and i % (self.nentries / 37) == 0: self.check(cursor, committed, i) - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.prepare_transaction(), msg) - self.session.commit_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.commit_transaction("commit_timestamp=3a") committed = i self.session.begin_transaction() @@ -133,9 +132,9 @@ class test_prepare01(wttest.WiredTigerTestCase): cursor.insert() self.check(cursor, committed, self.nentries) - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.prepare_transaction(), msg) - self.session.commit_transaction() + + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.commit_transaction("commit_timestamp=3a") self.check(cursor, self.nentries, self.nentries) # Test that read-committed is the default isolation level. 
@@ -154,10 +153,9 @@ class test_read_committed_default(wttest.WiredTigerTestCase): cursor = self.session.open_cursor(self.uri, None) self.session.begin_transaction() cursor['key: aaa'] = 'value: aaa' - msg = "/prepare_transaction is not supported/" - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.prepare_transaction(), msg) - self.session.commit_transaction() + + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.commit_transaction("commit_timestamp=3a") self.session.begin_transaction() cursor['key: bbb'] = 'value: bbb' @@ -165,14 +163,14 @@ class test_read_committed_default(wttest.WiredTigerTestCase): cursor = s.open_cursor(self.uri, None) s.begin_transaction("isolation=read-committed") self.assertEqual(self.cursor_count(cursor), 1) - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.prepare_transaction(), msg) - s.commit_transaction() - s.begin_transaction(None) + + s.prepare_transaction("prepare_timestamp=4a") + s.commit_transaction("commit_timestamp=5a") + s.begin_transaction() self.assertEqual(self.cursor_count(cursor), 1) - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.prepare_transaction(), msg) - s.commit_transaction() + s.prepare_transaction("prepare_timestamp=7a") + + s.commit_transaction("commit_timestamp=8a") s.close() if __name__ == '__main__': diff --git a/src/third_party/wiredtiger/test/suite/test_prepare02.py b/src/third_party/wiredtiger/test/suite/test_prepare02.py new file mode 100644 index 00000000000..e2971ee4ca5 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare02.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_prepare02.py +# Prepare : check post conditions to prepare operation +# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_prepare02(wttest.WiredTigerTestCase, suite_subprocess): + def test_prepare_session_operations(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + self.session.create("table:mytable", "key_format=S,value_format=S") + cursor = self.session.open_cursor("table:mytable", None) + + # Test the session methods that are forbidden after the transaction is + # prepared. + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + msg = "/ not permitted in a/" + # + # The operations listed below are not supported in the prepared state. 
+ # + # The operations are listed in the same order as they are declared in + # the session structure. Any function missing below is allowed in the + # prepared state. + # + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.reconfigure(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor("table:mytable", None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.alter("table:mytable", + "access_pattern_hint=random"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.create("table:mytable1", + "key_format=S,value_format=S"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.compact("table:mytable"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.drop("table:mytable", None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.join(cursor, cursor, + "compare=gt,count=10"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.log_flush("sync=on"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.log_printf("Printing to log file"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.rebalance("table:mytable", None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.rename("table:mytable", "table:mynewtable", + None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.reset(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.salvage("table:mytable", None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.truncate("table:mytable", + None, None, None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.upgrade("table:mytable", 
None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.verify("table:mytable", None), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.begin_transaction(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.prepare_transaction("prepare_timestamp=2a"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.timestamp_transaction( + "commit_timestamp=2a"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.checkpoint(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.snapshot("name=test"), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.session.transaction_sync(), msg) + self.session.rollback_transaction() + + # Commit after prepare is permitted. + self.session.begin_transaction() + c1 = self.session.open_cursor("table:mytable", None) + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.commit_transaction("commit_timestamp=2b") + + # Rollback after prepare is permitted. + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.rollback_transaction() + + # Close after prepare is permitted. + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.session.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare03.py b/src/third_party/wiredtiger/test/suite/test_prepare03.py new file mode 100644 index 00000000000..d9838ae7f82 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare03.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_prepare03.py +# Prepare transaction: check post conditions for cursor operations + +# The test pattern is to invoke cursor operations in the prepared transaction +# state to ensure they fail, then repeat the same operations in the +# non-prepared state to ensure they pass normally. 
+class test_prepre03(wttest.WiredTigerTestCase): + """ + Test basic operations + """ + table_name = 'test_prepare_cursor' + nentries = 10 + + scenarios = make_scenarios([ + ('file-col', dict(tablekind='col',uri='file', format='key_format=r,value_format=S')), + ('file-fix', dict(tablekind='fix',uri='file', format='key_format=r,value_format=8t')), + ('file-row', dict(tablekind='row',uri='file', format='key_format=S,value_format=S')), + ('lsm-row', dict(tablekind='row',uri='lsm', format='key_format=S,value_format=S')), + ('table-col', dict(tablekind='col',uri='table', format='key_format=r,value_format=S')), + ('table-fix', dict(tablekind='fix',uri='table', format='key_format=r,value_format=8t')), + ('table-row', dict(tablekind='row',uri='table', format='key_format=S,value_format=S')) + ]) + + def genkey(self, i): + if self.tablekind == 'row': + return 'key' + str(i) + else: + return long(i+1) + + def genvalue(self, i): + if self.tablekind == 'fix': + return int(i & 0xff) + else: + return 'value' + str(i) + + def assertCursorHasNoKeyValue(self, cursor): + keymsg = '/requires key be set/' + valuemsg = '/requires value be set/' + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, cursor.get_key, keymsg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, cursor.get_value, valuemsg) + + # Create the table and test cursor operations. 
+ def test_prepare_cursor(self): + tablearg = self.uri + ':' + self.table_name + create_args = self.format + preparemsg = "/ not permitted in a prepared transaction/" + + self.pr('creating session: ' + create_args) + self.session.create(tablearg, create_args) + self.pr('creating cursor') + cursor = self.session.open_cursor(tablearg, None, None) + self.assertCursorHasNoKeyValue(cursor) + self.assertEqual(cursor.uri, tablearg) + + # Check insert operation + for i in range(0, self.nentries): + self.session.begin_transaction() + cursor.set_key(self.genkey(i)) + cursor.set_value(self.genvalue(i)) + self.session.prepare_transaction("prepare_timestamp=2a") + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.insert(), preparemsg) + self.session.commit_transaction("commit_timestamp=2b") + cursor.insert() + + # Check next, get_key, get_value operations. + cursor.reset() + self.assertCursorHasNoKeyValue(cursor) + + i = 0 + while True: + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.next(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.get_key(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.get_value(), preparemsg) + self.session.commit_transaction("commit_timestamp=2b") + nextret = cursor.next() + if nextret != 0: + break + key = cursor.get_key() + value = cursor.get_value() + self.assertEqual(key, self.genkey(i)) + self.assertEqual(value, self.genvalue(i)) + i += 1 + + self.assertEqual(i, self.nentries) + self.assertEqual(nextret, wiredtiger.WT_NOTFOUND) + self.assertCursorHasNoKeyValue(cursor) + + # Check prev operation + cursor.reset() + self.assertCursorHasNoKeyValue(cursor) + + i = self.nentries - 1 + while True: + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + 
self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.prev(), preparemsg) + self.session.commit_transaction("commit_timestamp=2b") + prevret = cursor.prev() + if prevret != 0: + break + key = cursor.get_key() + value = cursor.get_value() + self.assertEqual(key, self.genkey(i)) + self.assertEqual(value, self.genvalue(i)) + i -= 1 + + self.assertEqual(i, -1) + self.assertEqual(prevret, wiredtiger.WT_NOTFOUND) + self.assertCursorHasNoKeyValue(cursor) + + # Check search, update, remove, reserve, reconfigure operations. + cursor.reset() + self.assertCursorHasNoKeyValue(cursor) + + # Search for a specific key. + # Verify we get the expected error and then later we can update and + # remove it. + cursor.set_key(self.genkey(self.nentries/2)) + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.search(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.update(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.remove(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.reserve(), preparemsg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.reconfigure(), preparemsg) + self.session.commit_transaction("commit_timestamp=2b") + cursor.search() + cursor.set_value(self.genvalue(self.nentries + self.nentries/2)) + cursor.update() + cursor.remove() + + # Check search_near operation + cursor.set_key(self.genkey(self.nentries)) + self.session.begin_transaction() + self.session.prepare_transaction("prepare_timestamp=2a") + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:cursor.search_near(), preparemsg) + self.session.commit_transaction("commit_timestamp=2b") + # There is a bug with search_near operation when no key is set. + # This fix is being tracked in WT-3918. 
+ if self.uri == 'lsm': + cursor.set_key(self.genkey(self.nentries)) + cursor.search_near() + cursor.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp01.py b/src/third_party/wiredtiger/test/suite/test_timestamp01.py index 0610b8aea7b..d0b60e2a529 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp01.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp01.py @@ -45,7 +45,7 @@ class test_timestamp01(wttest.WiredTigerTestCase, suite_subprocess): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.timestamp_transaction( 'commit_timestamp=' + timestamp_str(1 << 5000)), - '/must be running/') + '/only permitted in a running/') # Zero is not permitted self.session.begin_transaction() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp09.py b/src/third_party/wiredtiger/test/suite/test_timestamp09.py index 9b7d88bf64e..6fbb15e38b8 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp09.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp09.py @@ -37,7 +37,7 @@ def timestamp_str(t): return '%x' % t class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): - tablename = 'test_timestamp08' + tablename = 'test_timestamp09' uri = 'table:' + tablename def test_timestamp_api(self): @@ -55,7 +55,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): # In a single transaction it is illegal to set a commit timestamp # older than the first commit timestamp used for this transaction. - # Check both timestamp_transaction and commit_transaction API. + # Check both timestamp_transaction and commit_transaction APIs. 
self.session.begin_transaction() self.session.timestamp_transaction( 'commit_timestamp=' + timestamp_str(3)) @@ -77,7 +77,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): '/older than the first commit timestamp/') # Commit timestamp >= Oldest timestamp - # Check both timestamp_transaction and commit_transaction API. + # Check both timestamp_transaction and commit_transaction APIs. self.session.begin_transaction() c[3] = 3 self.session.commit_transaction( @@ -131,7 +131,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): '/oldest timestamp 0*6 must not be later than stable timestamp 0*5/') # Commit timestamp >= Stable timestamp. - # Check both timestamp_transaction and commit_transaction API. + # Check both timestamp_transaction and commit_transaction APIs. # Oldest and stable timestamp are set to 5 at the moment. self.conn.set_timestamp('stable_timestamp=' + timestamp_str(6)) self.session.begin_transaction() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp10.py b/src/third_party/wiredtiger/test/suite/test_timestamp10.py new file mode 100644 index 00000000000..a238c4cbeec --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp10.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. 
We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_timestamp10.py +# Timestamps: Saving and querying the checkpoint recovery timestamp +# + +import fnmatch, os, shutil +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_timestamp10(wttest.WiredTigerTestCase, suite_subprocess): + conn_config = 'config_base=false,create,log=(enabled)' + coll1_uri = 'table:collection10.1' + coll2_uri = 'table:collection10.2' + coll3_uri = 'table:collection10.3' + oplog_uri = 'table:oplog10' + + def copy_dir(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # close the original connection. + self.close_conn() + + def test_timestamp_recovery(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + # + # Create several collection-like tables that are checkpoint durability. 
+ # Add data to each of them separately and checkpoint so that each one + # has a different stable timestamp. + # + self.session.create(self.oplog_uri, 'key_format=i,value_format=i') + self.session.create(self.coll1_uri, 'key_format=i,value_format=i,log=(enabled=false)') + self.session.create(self.coll2_uri, 'key_format=i,value_format=i,log=(enabled=false)') + self.session.create(self.coll3_uri, 'key_format=i,value_format=i,log=(enabled=false)') + c_op = self.session.open_cursor(self.oplog_uri) + c = [] + c.append(self.session.open_cursor(self.coll1_uri)) + c.append(self.session.open_cursor(self.coll2_uri)) + c.append(self.session.open_cursor(self.coll3_uri)) + + # Begin by adding some data. + nentries = 10 + table_cnt = 3 + for table in range(1,table_cnt+1): + curs = c[table - 1] + start = nentries * table + end = start + nentries + ts = (end - 3) + for i in range(start,end): + self.session.begin_transaction() + c_op[i] = i + curs[i] = i + self.pr("i: " + str(i)) + self.session.commit_transaction( + 'commit_timestamp=' + timestamp_str(i)) + # Set the oldest and stable timestamp a bit earlier than the data + # we inserted. Take a checkpoint to the stable timestamp. + self.pr("stable ts: " + str(ts)) + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(ts) + + ',stable_timestamp=' + timestamp_str(ts)) + # This forces a different checkpoint timestamp for each table. + self.session.checkpoint() + + # Copy to a new database and then recover. + self.copy_dir(".", "RESTART") + self.copy_dir(".", "SAVE") + new_conn = self.wiredtiger_open("RESTART", self.conn_config) + # Query the recovery timestamp and verify the data in the new database. 
+ new_session = new_conn.open_session() + q = new_conn.query_timestamp('get=recovery') + self.pr("query recovery ts: " + q) + self.assertTimestampsEqual(new_conn.query_timestamp('get=recovery'), timestamp_str(ts)) + + c_op = new_session.open_cursor(self.oplog_uri) + c = [] + c.append(new_session.open_cursor(self.coll1_uri)) + c.append(new_session.open_cursor(self.coll2_uri)) + c.append(new_session.open_cursor(self.coll3_uri)) + for table in range(1,table_cnt+1): + curs = c[table - 1] + start = nentries * table + end = start + nentries + ts = (end - 3) + for i in range(start,end): + self.assertEquals(c_op[i], i) + curs.set_key(i) + # Earlier tables have all the data because later checkpoints + # will save the last bit of data. Only the last table will + # be missing some. + if i <= ts or table != table_cnt: + self.assertEquals(curs[i], i) + else: + self.assertEqual(curs.search(), wiredtiger.WT_NOTFOUND) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_truncate01.py b/src/third_party/wiredtiger/test/suite/test_truncate01.py index b26c9cf8fe1..c2876081724 100644 --- a/src/third_party/wiredtiger/test/suite/test_truncate01.py +++ b/src/third_party/wiredtiger/test/suite/test_truncate01.py @@ -32,7 +32,7 @@ import wiredtiger, wttest from helper import confirm_empty -from wtdataset import SimpleDataSet, ComplexDataSet +from wtdataset import SimpleDataSet, ComplexDataSet, simple_key from wtscenario import make_scenarios # Test truncation arguments. @@ -175,6 +175,39 @@ class test_truncate_cursor_end(wttest.WiredTigerTestCase): self.assertEquals(c2.close(), 0) self.session.drop(uri) +# Test truncation of empty objects. 
+class test_truncate_empty(wttest.WiredTigerTestCase): + name = 'test_truncate_empty' + + types = [ + ('file', dict(type='file:')), + ('table', dict(type='table:')) + ] + keyfmt = [ + ('integer', dict(keyfmt='i')), + ('recno', dict(keyfmt='r')), + ('string', dict(keyfmt='S')), + ] + scenarios = make_scenarios(types, keyfmt) + + # Test truncation of empty objects using a cursor + def test_truncate_empty_cursor(self): + uri = self.type + self.name + self.session.create(uri, + ',key_format=' + self.keyfmt + ',value_format=S') + c1 = self.session.open_cursor(uri, None) + c1.set_key(simple_key(c1, 1000)) + c2 = self.session.open_cursor(uri, None) + c2.set_key(simple_key(c2, 2000)) + self.assertEquals(self.session.truncate(None, c1, c2, None), 0) + + # Test truncation of empty objects using a URI + def test_truncate_empty_uri(self): + uri = self.type + self.name + self.session.create(uri, + ',key_format=' + self.keyfmt + ',value_format=S') + self.assertEquals(self.session.truncate(uri, None, None, None), 0) + # Test session.truncate. class test_truncate_cursor(wttest.WiredTigerTestCase): name = 'test_truncate' diff --git a/src/third_party/wiredtiger/test/suite/test_txn17.py b/src/third_party/wiredtiger/test/suite/test_txn17.py index a15acb4aed3..d634a8cae8a 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn17.py +++ b/src/third_party/wiredtiger/test/suite/test_txn17.py @@ -46,7 +46,7 @@ class test_txn17(wttest.WiredTigerTestCase, suite_subprocess): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.timestamp_transaction( 'commit_timestamp=' + timestamp_str(1 << 5000)), - '/must be running/') + '/only permitted in a running/') # Cannot call commit on a non-running transaction. 
self.assertRaisesWithMessage(wiredtiger.WiredTigerError, diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h index 1054412adfe..7387615c84b 100644 --- a/src/third_party/wiredtiger/test/utility/test_util.h +++ b/src/third_party/wiredtiger/test/utility/test_util.h @@ -142,10 +142,16 @@ typedef struct { /* * error_check -- - * Complain and quit if a function call fails. The same as testutil_check, - * but with a different name because it appears in the documentation. + * Complain and quit if a function call fails. A special name because it + * appears in the documentation. Ignore ENOTSUP to allow library calls which + * might not be included in any particular build. */ -#define error_check(call) testutil_check(call) +#define error_check(call) do { \ + int __r; \ + if ((__r = (call)) != 0 && __r != ENOTSUP) \ + testutil_die( \ + __r, "%s/%d: %s", __func__, __LINE__, #call); \ +} while (0) /* * scan_end_check -- diff --git a/src/third_party/wiredtiger/test/utility/thread.c b/src/third_party/wiredtiger/test/utility/thread.c index 4f70c562687..f12970182f0 100644 --- a/src/third_party/wiredtiger/test/utility/thread.c +++ b/src/third_party/wiredtiger/test/utility/thread.c @@ -185,7 +185,7 @@ op_bulk_unique(void *arg) /* Generate a unique object name. */ testutil_check(__wt_snprintf( - new_uri, sizeof(new_uri), "%s.%u", + new_uri, sizeof(new_uri), "%s.%" PRIu64, opts->uri, __wt_atomic_add64(&opts->unique_id, 1))); testutil_check(session->create(session, new_uri, NULL)); @@ -294,7 +294,7 @@ op_create_unique(void *arg) /* Generate a unique object name. 
*/ testutil_check(__wt_snprintf( - new_uri, sizeof(new_uri), "%s.%u", + new_uri, sizeof(new_uri), "%s.%" PRIu64, opts->uri, __wt_atomic_add64(&opts->unique_id, 1))); testutil_check(session->create(session, new_uri, NULL)); diff --git a/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py index 380cab50eba..0837bc723dd 100755 --- a/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py +++ b/src/third_party/wiredtiger/tools/optrack/wt_optrack_decode.py @@ -100,7 +100,8 @@ def funcIDtoName(funcID): if (functionMap.has_key(funcID)): return functionMap[funcID]; else: - return "NULL"; + print("Could not find the name for func " + str(funcID)); + return "NULL"; # # The format of the record is written down in src/include/optrack.h |