diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-04-22 09:50:35 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-22 00:27:44 +0000 |
commit | 0a5ff5f074b671308c843b4ba6d7195b2591e7d4 (patch) | |
tree | 2668db86a44cabd966cd7fabe8a11aeadfd1de83 /src/third_party | |
parent | c593d0fb6eec6b4f3c7ae02a5c3de73ad6e3af95 (diff) | |
download | mongo-0a5ff5f074b671308c843b4ba6d7195b2591e7d4.tar.gz |
Import wiredtiger: 7b994a862e899a12eb7c3ac814c9fada7d8d1ab9 from branch mongodb-4.4
ref: 9bd1ece797..7b994a862e
for: 4.5.1
WT-4701 Switch test/format to use WiredTiger locking primitives
WT-5766 Separate out internal and shared transaction data
WT-5791 Prepare checkpoint can finish in 0msec and reset prepare min stat
WT-5794 Remove skew_newest option from reconciliation
WT-5833 Fix caching issue for overflow key/value items
WT-5919 Disallow logging archival testing with log-based incremental backup
WT-5946 Eviction server handles can deadlock when opening HS cursors
WT-5968 Make the WT_SESSION_IMPL.txn field an allocated structure
WT-5986 Create script for emulating multiversion tests
WT-6016 Fill source code comments where lines start with parentheticals
WT-6020 __rec_append_orig_value() cleanup
WT-6026 Fix s_all breakage on format.h
Diffstat (limited to 'src/third_party')
87 files changed, 1204 insertions, 1115 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py b/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py new file mode 100755 index 00000000000..be98187e542 --- /dev/null +++ b/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# + +from runner import * +from wiredtiger import * +from workgen import * + +def show(tname, s): + print('') + print('<><><><> ' + tname + ' <><><><>') + c = s.open_cursor(tname, None) + for k,v in c: + print('key: ' + k) + print('value: ' + v) + print('<><><><><><><><><><><><>') + c.close() + +def create_compat_config(args): + if args.release == "4.4": + return ',compatibility=(release="3.3", require_min="3.2.0")' + elif args.release == "4.2": + return ',compatibility=(release="3.2", require_max="3.3.0")' + else: + return '' + +context = Context() +context.parser.add_argument("--release", dest="release", type=str, + choices=["4.2", "4.4"], help="The WiredTiger version") +context.initialize() # parse the arguments. +conn = context.wiredtiger_open("create,cache_size=1G," + create_compat_config(context.args)) + +s = conn.open_session() +tname = 'table:simple' +s.create(tname, 'key_format=S,value_format=S') + +ops = Operation(Operation.OP_INSERT, Table(tname), Key(Key.KEYGEN_APPEND, 10), Value(40)) +thread = Thread(ops) +workload = Workload(context, thread) +workload.run(conn) +show(tname, s) + +thread = Thread(ops * 5) +workload = Workload(context, thread) +workload.run(conn) +show(tname, s) diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index 70e7ea75e7c..9e589e2a55f 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -537,9 +537,8 @@ config_opt_file(WTPERF *wtperf, const char *filename) ; /* - * Find the end of the line; if there's no trailing newline, the - * the line is too long for the buffer or the file was corrupted - * (there's no terminating newline in the file). + * Find the end of the line; if there's no trailing newline, the line is too long for the + * buffer or the file was corrupted (there's no terminating newline in the file). 
*/ for (rtrim = line; *rtrim && *rtrim != '\n'; rtrim++) ; diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py index 18f368eaad0..cec0c07194e 100644 --- a/src/third_party/wiredtiger/dist/log_data.py +++ b/src/third_party/wiredtiger/dist/log_data.py @@ -95,6 +95,6 @@ optypes = [ LogOperationType('txn_timestamp', 'txn_timestamp', [('uint64', 'time_sec'), ('uint64', 'time_nsec'), ('uint64', 'commit_ts'), ('uint64', 'durable_ts'), - ('uint64', 'first_ts'), ('uint64', 'prepare_ts'), - ('uint64', 'read_ts')]), + ('uint64', 'first_commit_ts'), ('uint64', 'prepare_ts'), + ('uint64', 'read_ts'), ('uint64', 'pinned_read_ts')]), ] diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py index f30de0e4794..482fded4fff 100644 --- a/src/third_party/wiredtiger/dist/s_comment.py +++ b/src/third_party/wiredtiger/dist/s_comment.py @@ -103,12 +103,13 @@ for line in sys.stdin: if (len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and (sline[2].islower() or sline[2] == '_') and sline.endswith('--')): function_desc = True - # We're only reformatting block comments where each line begins with a - # space and an alphabetic character after the asterisk. The only - # exceptions are function descriptions. + # We're only reformatting block comments where each line begins with a space and an + # alphabetic character after the asterisk, or a parenthetical. The only exceptions + # are function descriptions. block = block and \ - (len(sline) >= 3 and sline.startswith('*') and - sline[1] == ' ' and sline[2].isalpha()) or function_desc + len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and \ + (sline[2].isalpha() or (len(sline) >= 5 and \ + (sline[2] == '(' and sline[3].isalpha() and sline[4] != ')'))) or function_desc # Trim asterisks at the beginning of each line in a multiline comment. 
if sline.startswith('*'): sline = sline[1:] diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index 64d3c7da5a4..7360bf92862 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -567,8 +567,8 @@ session_ops_create(WT_SESSION *session) /*! [Create a table with columns] */ /* - * Create a table with columns: keys are record numbers, values are - * (string, signed 32-bit integer, unsigned 16-bit integer). + * Create a table with columns: keys are record numbers, values are (string, signed 32-bit + * integer, unsigned 16-bit integer). */ error_check(session->create(session, "table:mytable", "key_format=r,value_format=SiH," diff --git a/src/third_party/wiredtiger/examples/c/ex_extractor.c b/src/third_party/wiredtiger/examples/c/ex_extractor.c index ed6c7b671a6..b4c74c35cfd 100644 --- a/src/third_party/wiredtiger/examples/c/ex_extractor.c +++ b/src/third_party/wiredtiger/examples/c/ex_extractor.c @@ -73,11 +73,9 @@ my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, const WT_ITEM *key, con */ for (year = term_start; year <= term_end; ++year) { /* - * Note that the extract callback is called for all operations - * that update the table, not just inserts. The user sets the - * key and uses the cursor->insert() method to return the index - * key(s). WiredTiger will perform the required operation - * (such as a remove()). + * Note that the extract callback is called for all operations that update the table, not + * just inserts. The user sets the key and uses the cursor->insert() method to return the + * index key(s). WiredTiger will perform the required operation (such as a remove()). 
*/ fprintf( stderr, "EXTRACTOR: index op for year %" PRIu16 ": %s %s\n", year, first_name, last_name); diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c index 1778fa55b9f..28f1d41bcfc 100644 --- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c @@ -52,23 +52,21 @@ typedef struct { } LZ4_COMPRESSOR; /* - * LZ4 decompression requires the exact compressed byte count returned by the - * LZ4_compress_default and LZ4_compress_destSize functions. WiredTiger doesn't - * track that value, store it in the destination buffer. + * LZ4 decompression requires the exact compressed byte count returned by the LZ4_compress_default + * and LZ4_compress_destSize functions. WiredTiger doesn't track that value, store it in the + * destination buffer. * - * Additionally, LZ4_compress_destSize may compress into the middle of a record, - * and after decompression we return the length to the last record successfully - * decompressed, not the number of bytes decompressed; store that value in the - * destination buffer as well. + * Additionally, LZ4_compress_destSize may compress into the middle of a record, and after + * decompression we return the length to the last record successfully decompressed, not the number + * of bytes decompressed; store that value in the destination buffer as well. * - * (Since raw compression has been removed from WiredTiger, the lz4 compression - * code no longer calls LZ4_compress_destSize. Some support remains to support - * existing compressed objects.) + * (Since raw compression has been removed from WiredTiger, the lz4 compression code no longer calls + * LZ4_compress_destSize. Some support remains to support existing compressed objects.) * * Use fixed-size, 4B values (WiredTiger never writes buffers larger than 4GB). 
* - * The unused field is available for a mode flag if one is needed in the future, - * we guarantee it's 0. + * The unused field is available for a mode flag if one is needed in the future, we guarantee it's + * 0. */ typedef struct { uint32_t compressed_len; /* True compressed length */ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index bf8fe77a34a..f367175962d 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "9bd1ece7971714f947b47e589b0af5d7ee97a29d" + "commit": "7b994a862e899a12eb7c3ac814c9fada7d8d1ab9" } diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 1f33e1092c7..638796e4459 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -334,12 +334,7 @@ __desc_read(WT_SESSION_IMPL *session, uint32_t allocsize, WT_BLOCK *block) * to be returning across the API boundary. */ if (block->size < allocsize) { - /* - * We use the "ignore history store tombstone" flag as of verify so we need to check that - * we're not performing a verify. 
- */ - if (F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS) && - !F_ISSET(S2BT(session), WT_BTREE_VERIFY)) + if (F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE)) ret = ENOENT; else { ret = WT_ERROR; diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index e02436c836b..9087d643bbb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -273,11 +273,9 @@ restart_read: } /* - * If we're at the same slot as the last reference and there's - * no matching insert list item, re-use the return information - * (so encoded items with large repeat counts aren't repeatedly - * decoded). Otherwise, unpack the cell and build the return - * information. + * If we're at the same slot as the last reference and there's no matching insert list item, + * re-use the return information (so encoded items with large repeat counts aren't + * repeatedly decoded). Otherwise, unpack the cell and build the return information. */ if (cbt->cip_saved != cip) { cell = WT_COL_PTR(page, cip); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index ffa3cf409b1..6d187bd3057 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -413,11 +413,9 @@ restart_read: } /* - * If we're at the same slot as the last reference and there's - * no matching insert list item, re-use the return information - * (so encoded items with large repeat counts aren't repeatedly - * decoded). Otherwise, unpack the cell and build the return - * information. + * If we're at the same slot as the last reference and there's no matching insert list item, + * re-use the return information (so encoded items with large repeat counts aren't + * repeatedly decoded). Otherwise, unpack the cell and build the return information. 
*/ if (cbt->cip_saved != cip) { cell = WT_COL_PTR(page, cip); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 6b0ad0e936b..78437202d3d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -89,7 +89,7 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool search_operation) * functions, but in the case of a search, we will see different results based on the cursor's * initial location. See WT-5134 for the details. */ - if (search_operation && session->txn.isolation == WT_ISO_READ_COMMITTED) + if (search_operation && session->txn->isolation == WT_ISO_READ_COMMITTED) return (false); /* @@ -1471,11 +1471,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) * time and the failure is unlikely to be detected. Require explicit transactions for modify * operations. */ - if (session->txn.isolation != WT_ISO_SNAPSHOT) + if (session->txn->isolation != WT_ISO_SNAPSHOT) WT_ERR_MSG(session, ENOTSUP, "not supported in read-committed or read-uncommitted " "transactions"); - if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT)) + if (F_ISSET(session->txn, WT_TXN_AUTOCOMMIT)) WT_ERR_MSG(session, ENOTSUP, "not supported in implicit transactions"); if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) || !F_ISSET(cursor, WT_CURSTD_VALUE_INT)) diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 69651d9b5ca..e7974765964 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -306,9 +306,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) /* * Give the page a modify structure. * - * Mark tree dirty, unless the handle is read-only. - * (We'd like to free the deleted pages, but if the handle is read-only, - * we're not able to do so.) + * Mark tree dirty, unless the handle is read-only. 
(We'd like to free the deleted pages, but if + * the handle is read-only, we're not able to do so.) */ WT_RET(__wt_page_modify_init(session, page)); if (!F_ISSET(btree, WT_BTREE_READONLY)) diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 2878391fd53..6c695f5418f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -190,9 +190,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Free the insert array. * - * Row-store tables have one additional slot in the insert array - * (the insert array has an extra slot to hold keys that sort - * before keys found on the original page). + * Row-store tables have one additional slot in the insert array (the insert array has an + * extra slot to hold keys that sort before keys found on the original page). */ if (mod->mod_row_insert != NULL) __free_skip_array(session, mod->mod_row_insert, page->entries + 1, update_ignore); @@ -203,10 +202,9 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) break; } - /* Free the overflow on-page, reuse and transaction-cache skiplists. */ + /* Free the overflow on-page and reuse skiplists. */ __wt_ovfl_reuse_free(session, page); __wt_ovfl_discard_free(session, page); - __wt_ovfl_discard_remove(session, page); __wt_free(session, page->modify->ovfl_track); __wt_spin_destroy(session, &page->modify->page_lock); @@ -261,12 +259,11 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa } /* - * Optionally free row-store WT_REF key allocation. Historic versions of - * this code looked in a passed-in page argument, but that is dangerous, - * some of our error-path callers create WT_REF structures without ever - * setting WT_REF.home or having a parent page to which the WT_REF will - * be linked. 
Those WT_REF structures invariably have instantiated keys, - * (they obviously cannot be on-page keys), and we must free the memory. + * Optionally free row-store WT_REF key allocation. Historic versions of this code looked in a + * passed-in page argument, but that is dangerous, some of our error-path callers create WT_REF + * structures without ever setting WT_REF.home or having a parent page to which the WT_REF will + * be linked. Those WT_REF structures invariably have instantiated keys, (they obviously cannot + * be on-page keys), and we must free the memory. */ switch (page_type) { case WT_PAGE_ROW_INT: diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index 0ea80819048..72523b695de 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -47,8 +47,6 @@ __wt_ovfl_read( WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) { WT_DECL_RET; - WT_OVFL_TRACK *track; - size_t i; *decoded = false; @@ -60,22 +58,15 @@ __wt_ovfl_read( return (__ovfl_read(session, unpack->data, unpack->size, store)); /* - * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow value, but there was still - * a reader in the system that might need it, the on-page cell type will have been reset to - * WT_CELL_VALUE_OVFL_RM and we will be passed a page so we can check the on-page cell. - * - * Acquire the overflow lock, and retest the on-page cell's value inside the lock. + * WT_CELL_VALUE_OVFL_RM cells: if reconciliation deletes an overflow value, the on-page cell + * type is reset to WT_CELL_VALUE_OVFL_RM. Any values required by an existing reader will be + * copied into the HS file, which means this value should never be read. It's possible to race + * with checkpoints doing that work, lock before testing the removed flag. 
*/ __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { - track = page->modify->ovfl_track; - for (i = 0; i < track->remove_next; ++i) - if (track->remove[i].cell == unpack->cell) { - store->data = track->remove[i].data; - store->size = track->remove[i].size; - break; - } - WT_ASSERT(session, i < track->remove_next); + WT_ASSERT(session, __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)); + ret = __wt_buf_setstr(session, store, "WT_CELL_VALUE_OVFL_RM"); *decoded = true; } else ret = __ovfl_read(session, unpack->data, unpack->size, store); @@ -85,109 +76,35 @@ __wt_ovfl_read( } /* - * __wt_ovfl_discard_remove -- - * Free the on-page overflow value cache. - */ -void -__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_OVFL_TRACK *track; - uint32_t i; - - if (page->modify != NULL && (track = page->modify->ovfl_track) != NULL) { - for (i = 0; i < track->remove_next; ++i) - __wt_free(session, track->remove[i].data); - __wt_free(session, page->modify->ovfl_track->remove); - track->remove_allocated = 0; - track->remove_next = 0; - } -} - -/* - * __ovfl_cache -- - * Cache an overflow value. - */ -static int -__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) -{ - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_OVFL_TRACK *track; - - /* Read the overflow value. */ - WT_RET(__wt_scr_alloc(session, 1024, &tmp)); - WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp)); - - /* Allocating tracking structures as necessary. */ - if (page->modify->ovfl_track == NULL) - WT_ERR(__wt_ovfl_track_init(session, page)); - track = page->modify->ovfl_track; - - /* Copy the overflow item into place. 
*/ - WT_ERR( - __wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove)); - track->remove[track->remove_next].cell = unpack->cell; - WT_ERR(__wt_memdup(session, tmp->data, tmp->size, &track->remove[track->remove_next].data)); - track->remove[track->remove_next].size = tmp->size; - ++track->remove_next; - -err: - __wt_scr_free(session, &tmp); - return (ret); -} - -/* * __wt_ovfl_remove -- * Remove an overflow value. */ int -__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting) +__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) { /* * This function solves two problems in reconciliation. * - * The first problem is snapshot readers needing on-page overflow values - * that have been removed. The scenario is as follows: - * - * - reconciling a leaf page that references an overflow item - * - the item is updated and the update committed - * - a checkpoint runs, freeing the backing overflow blocks - * - a snapshot transaction wants the original version of the item - * - * In summary, we may need the original version of an overflow item for - * a snapshot transaction after the item was deleted from a page that's - * subsequently been checkpointed, where the checkpoint must know about - * the freed blocks. We don't have any way to delay a free of the - * underlying blocks until a particular set of transactions exit (and - * this shouldn't be a common scenario), so cache the overflow value in - * memory. - * - * This gets hard because the snapshot transaction reader might: - * - search the WT_UPDATE list and not find an useful entry + * The first problem is snapshot readers needing on-page overflow values that have been removed. + * If the overflow value is required by a reader, it will be copied into the HS file before the + * backing blocks are removed. 
However, this gets hard because the snapshot transaction reader + * might: + * - search the update list and not find a useful entry * - read the overflow value's address from the on-page cell * - go to sleep - * - checkpoint runs, caches the overflow value, frees the blocks + * - checkpoint runs, frees the backing blocks * - another thread allocates and overwrites the blocks - * - the reader wakes up and reads the wrong value + * - the reader wakes up and uses the on-page cell to read the blocks * - * Use a read/write lock and the on-page cell to fix the problem: hold - * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to - * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow - * item. + * Use a read/write lock and the on-page cell to fix the problem: get a write lock when changing + * the cell type from WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, get a read lock when reading + * an overflow item. * - * The read/write lock is per btree, but it could be per page or even - * per overflow item. We don't do any of that because overflow values - * are supposed to be rare and we shouldn't see contention for the lock. + * The read/write lock is per btree (but could be per page or even per overflow item). We don't + * bother because overflow values are supposed to be rare and contention isn't expected. * - * We only have to do this for checkpoints: in any eviction mode, there - * can't be threads sitting in our update lists. - */ - if (!evicting) - WT_RET(__ovfl_cache(session, page, unpack)); - - /* - * The second problem is to only remove the underlying blocks once, solved by the - * WT_CELL_VALUE_OVFL_RM flag. + * The second problem is to only remove the underlying blocks once, also solved by checking the + * flag before doing any work. * * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the underlying overflow value's * blocks to be freed when reconciliation completes. 
@@ -213,11 +130,11 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) __wt_cell_unpack(session, page, cell, unpack); /* - * Finally remove overflow key/value objects, called when reconciliation finishes after - * successfully writing a page. + * Remove overflow key/value objects, called when reconciliation finishes after successfully + * reconciling a page. * - * Keys must have already been instantiated and value objects must have already been cached (if - * they might potentially still be read by any running transaction). + * Keys must have already been instantiated and value objects must have already been written to + * the HS file (if they might potentially still be read by any running transaction). * * Acquire the overflow lock to avoid racing with a thread reading the backing overflow blocks. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index f6e49e27557..ac588bf901d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -336,9 +336,9 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t repeat_off = 0; /* - * Walk the page, building references: the page contains unsorted value - * items. The value items are on-page (WT_CELL_VALUE), overflow items - * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). + * Walk the page, building references: the page contains unsorted value items. The value items + * are on-page (WT_CELL_VALUE), overflow items (WT_CELL_VALUE_OVFL) or deleted items + * (WT_CELL_DEL). 
*/ indx = 0; cip = page->pg_var; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 554c4e047d6..5c8c0ea871a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -257,7 +257,7 @@ read: */ if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) WT_RET(__wt_cache_eviction_check( - session, true, !F_ISSET(&session->txn, WT_TXN_HAS_ID), NULL)); + session, true, !F_ISSET(session->txn, WT_TXN_HAS_ID), NULL)); WT_RET(__page_read(session, ref, flags)); /* We just read a page, don't evict it before we have a chance to use it. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 9cad990fe84..851a407f165 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -30,7 +30,7 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_REF *ref) u_int i; mod = ref->page->modify; - txn = &session->txn; + txn = session->txn; /* * We can skip some dirty pages during a checkpoint. The requirements: @@ -362,7 +362,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) conn = S2C(session); btree = S2BT(session); prev = walk = NULL; - txn = &session->txn; + txn = session->txn; tried_eviction = false; time_start = time_stop = 0; is_hs = false; @@ -381,7 +381,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; - saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; + saved_pinned_id = WT_SESSION_TXN_SHARED(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) time_start = __wt_clock(session); @@ -567,7 +567,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * pages. That happens prior to the final metadata checkpoint. 
*/ if (F_ISSET(walk, WT_REF_FLAG_LEAF) && page->read_gen == WT_READGEN_WONT_NEED && - !tried_eviction && F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)) { + !tried_eviction && F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) { ret = __wt_page_release_evict(session, walk, 0); walk = NULL; WT_ERR_ERROR_OK(ret, EBUSY, false); diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 687013be783..b2062354f7c 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -79,23 +79,18 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U mod = page->modify; /* - * If modifying a record not previously modified, but which is in the - * same update slot as a previously modified record, cursor.ins will - * not be set because there's no list of update records for this recno, - * but cursor.ins_head will be set to point to the correct update slot. - * Acquire the necessary insert information, then create a new update - * entry and link it into the existing list. We get here if a page has - * a single cell representing multiple records (the records have the - * same value), and then a record in the cell is updated or removed, - * creating the update list for the cell, and then a cursor iterates - * into that same cell to update/remove a different record. We find the - * correct slot in the update array, but we don't find an update list - * (because it doesn't exist), and don't have the information we need - * to do the insert. Normally, we wouldn't care (we could fail and do - * a search for the record which would configure everything for the - * insert), but range truncation does this pattern for every record in - * the cell, and the performance is terrible. For that reason, catch it - * here. 
+ * If modifying a record not previously modified, but which is in the same update slot as a + * previously modified record, cursor.ins will not be set because there's no list of update + * records for this recno, but cursor.ins_head will be set to point to the correct update slot. + * Acquire the necessary insert information, then create a new update entry and link it into the + * existing list. We get here if a page has a single cell representing multiple records (the + * records have the same value), and then a record in the cell is updated or removed, creating + * the update list for the cell, and then a cursor iterates into that same cell to update/remove + * a different record. We find the correct slot in the update array, but we don't find an update + * list (because it doesn't exist), and don't have the information we need to do the insert. + * Normally, we wouldn't care (we could fail and do a search for the record which would + * configure everything for the insert), but range truncation does this pattern for every record + * in the cell, and the performance is terrible. For that reason, catch it here. */ if (cbt->ins == NULL && cbt->ins_head != NULL) { cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 13a570f8501..a6d56c9499d 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -156,9 +156,8 @@ descend: /* * Reference the slot used for next step down the tree. * - * Base is the smallest index greater than recno and may be the - * (last + 1) index. The slot for descent is the one before - * base. + * Base is the smallest index greater than recno and may be the (last + 1) index. The slot + * for descent is the one before base. 
*/ if (recno != descent->ref_recno) { /* diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 6a1f4af0619..98ae6f66daf 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -315,14 +315,12 @@ restart: } /* - * Binary search of an internal page. There are three versions - * (keys with no application-specified collation order, in long - * and short versions, and keys with an application-specified - * collation order), because doing the tests and error handling - * inside the loop costs about 5%. + * Binary search of an internal page. There are three versions (keys with no + * application-specified collation order, in long and short versions, and keys with an + * application-specified collation order), because doing the tests and error handling inside + * the loop costs about 5%. * - * Reference the comment above about the 0th key: we continue to - * special-case it. + * Reference the comment above about the 0th key: we continue to special-case it. */ base = 1; limit = pindex->entries - 1; @@ -542,20 +540,17 @@ leaf_match: /* * We didn't find an exact match in the WT_ROW array. * - * Base is the smallest index greater than key and may be the 0th index - * or the (last + 1) index. Set the slot to be the largest index less - * than the key if that's possible (if base is the 0th index it means - * the application is inserting a key before any key found on the page). + * Base is the smallest index greater than key and may be the 0th index or the (last + 1) index. + * Set the slot to be the largest index less than the key if that's possible (if base is the 0th + * index it means the application is inserting a key before any key found on the page). * - * It's still possible there is an exact match, but it's on an insert - * list. 
Figure out which insert chain to search and then set up the - * return information assuming we'll find nothing in the insert list - * (we'll correct as needed inside the search routine, depending on - * what we find). + * It's still possible there is an exact match, but it's on an insert list. Figure out which + * insert chain to search and then set up the return information assuming we'll find nothing in + * the insert list (we'll correct as needed inside the search routine, depending on what we + * find). * - * If inserting a key smaller than any key found in the WT_ROW array, - * use the extra slot of the insert array, otherwise the insert array - * maps one-to-one to the WT_ROW array. + * If inserting a key smaller than any key found in the WT_ROW array, use the extra slot of the + * insert array, otherwise the insert array maps one-to-one to the WT_ROW array. */ if (base == 0) { cbt->compare = 1; diff --git a/src/third_party/wiredtiger/src/conn/api_calc_modify.c b/src/third_party/wiredtiger/src/conn/api_calc_modify.c index 86912dfbd79..56391910c89 100644 --- a/src/third_party/wiredtiger/src/conn/api_calc_modify.c +++ b/src/third_party/wiredtiger/src/conn/api_calc_modify.c @@ -146,12 +146,11 @@ __wt_calc_modify(WT_SESSION_IMPL *wt_session, const WT_ITEM *oldv, const WT_ITEM goto end; /* - * Walk through the post-image, maintaining start / end markers - * separated by a gap in the pre-image. If the current point in the - * post-image matches either marker, try to extend the match to find a - * (large) range of matching bytes. If the end of the range is reached - * in the post-image without finding a good match, double the size of - * the gap, update the markers and keep trying. + * Walk through the post-image, maintaining start / end markers separated by a gap in the + * pre-image. If the current point in the post-image matches either marker, try to extend the + * match to find a (large) range of matching bytes. 
If the end of the range is reached in the + * post-image without finding a good match, double the size of the gap, update the markers and + * keep trying. */ hstart = hend = 0; i = gap = 0; diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index f0f8819007c..ffdfe3bc398 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1033,7 +1033,7 @@ err: * never referenced that file. */ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i) - if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && F_ISSET(&s->txn, WT_TXN_RUNNING)) { + if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && F_ISSET(s->txn, WT_TXN_RUNNING)) { wt_session = &s->iface; WT_TRET(wt_session->rollback_transaction(wt_session, NULL)); } @@ -1067,11 +1067,9 @@ err: * After the async and LSM threads have exited, we won't open more files for the application. * However, the sweep server is still running and it can close file handles at the same time the * final checkpoint is reviewing open data handles (forcing checkpoint to reopen handles). Shut - * down the sweep server and then flag the system should not open anything new. + * down the sweep server. */ WT_TRET(__wt_sweep_destroy(session)); - F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); - WT_FULL_BARRIER(); /* * Shut down the checkpoint and capacity server threads: we don't want to throttle writes and @@ -1920,7 +1918,7 @@ __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursors) "read-committed" : (s->isolation == WT_ISO_READ_UNCOMMITTED ? 
"read-uncommitted" : "snapshot"))); WT_ERR(__wt_msg(session, " Transaction:")); - WT_ERR(__wt_verbose_dump_txn_one(session, &s->txn, 0, NULL)); + WT_ERR(__wt_verbose_dump_txn_one(session, s, 0, NULL)); } else { WT_ERR(__wt_msg(session, " Number of positioned cursors: %u", s->ncursors)); TAILQ_FOREACH (cursor, &s->cursors, q) { @@ -2645,13 +2643,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(__conn_write_base_config(session, cfg)); /* - * Check on the turtle and metadata files, creating them if necessary - * (which avoids application threads racing to create the metadata file - * later). Once the metadata file exists, get a reference to it in - * the connection's session. + * Check on the turtle and metadata files, creating them if necessary (which avoids application + * threads racing to create the metadata file later). Once the metadata file exists, get a + * reference to it in the connection's session. * - * THE TURTLE FILE MUST BE THE LAST FILE CREATED WHEN INITIALIZING THE - * DATABASE HOME, IT'S WHAT WE USE TO DECIDE IF WE'RE CREATING OR NOT. + * THE TURTLE FILE MUST BE THE LAST FILE CREATED WHEN INITIALIZING THE DATABASE HOME, IT'S WHAT + * WE USE TO DECIDE IF WE'RE CREATING OR NOT. */ WT_ERR(__wt_turtle_init(session)); diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 1c391200cb1..e6da8446753 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -367,13 +367,12 @@ __wt_conn_dhandle_close(WT_SESSION_IMPL *session, bool final, bool mark_dead) } /* - * If marking the handle dead, do so after closing the underlying btree. - * (Don't do it before that, the block manager asserts there are never - * two references to a block manager object, and re-opening the handle - * can succeed once we mark this handle dead.) 
+ * If marking the handle dead, do so after closing the underlying btree. (Don't do it before + * that, the block manager asserts there are never two references to a block manager object, and + * re-opening the handle can succeed once we mark this handle dead.) * - * Check discard too, code we call to clear the cache expects the data - * handle dead flag to be set when discarding modified pages. + * Check discard too, code we call to clear the cache expects the data handle dead flag to be + * set when discarding modified pages. */ if (marked_dead || discard) F_SET(dhandle, WT_DHANDLE_DEAD); @@ -426,8 +425,7 @@ __wt_conn_dhandle_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t fla WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS) || - !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); /* Turn off eviction. */ if (dhandle->type == WT_DHANDLE_TYPE_BTREE) @@ -527,9 +525,8 @@ __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, return (0); /* - * We need to pull the handle into the session handle cache and make - * sure it's referenced to stop other internal code dropping the handle - * (e.g in LSM when cleaning up obsolete chunks). + * We need to pull the handle into the session handle cache and make sure it's referenced to + * stop other internal code dropping the handle (e.g in LSM when cleaning up obsolete chunks). */ if ((ret = __wt_session_get_dhandle(session, dhandle->name, dhandle->checkpoint, NULL, 0)) != 0) return (ret == EBUSY ? 
0 : ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 28864d2f4ec..db2b085e8c5 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -83,6 +83,9 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) TAILQ_INIT(&conn->blockhash[i]); /* Block handle hash lists */ TAILQ_INIT(&conn->blockqh); /* Block manager list */ + conn->ckpt_prep_min = UINT64_MAX; + conn->ckpt_time_min = UINT64_MAX; + return (0); } diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index f59ce5d25d8..3c28ac121ad 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -97,6 +97,10 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); + /* There should be no more file opens after this point. */ + F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); + WT_FULL_BARRIER(); + /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); @@ -202,11 +206,9 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_logmgr_create(session, cfg)); /* - * Run recovery. - * NOTE: This call will start (and stop) eviction if recovery is - * required. Recovery must run before the history store table is created - * (because recovery will update the metadata), and before eviction is - * started for real. + * Run recovery. NOTE: This call will start (and stop) eviction if recovery is required. + * Recovery must run before the history store table is created (because recovery will update the + * metadata), and before eviction is started for real. 
*/ WT_RET(__wt_txn_recover(session)); diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 05d6f111b2c..455f10ea905 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -135,15 +135,13 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) FLD_SET(conn->stat_flags, WT_STAT_ON_CLOSE); /* - * We don't allow the log path to be reconfigured for security reasons. - * (Applications passing input strings directly to reconfigure would - * expose themselves to a potential security problem, the utility of - * reconfiguring a statistics log path isn't worth the security risk.) + * We don't allow the log path to be reconfigured for security reasons. (Applications passing + * input strings directly to reconfigure would expose themselves to a potential security + * problem, the utility of reconfiguring a statistics log path isn't worth the security risk.) * - * See above for the details, but during reconfiguration we're loading - * the path value from the saved configuration information, and it's - * required during reconfiguration because we potentially stopped and - * are restarting, the server. + * See above for the details, but during reconfiguration we're loading the path value from the + * saved configuration information, and it's required during reconfiguration because we + * potentially stopped and are restarting, the server. */ WT_RET(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); WT_ERR(__wt_scr_alloc(session, 0, &tmp)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 8ab0a51a401..eeb7ffa514c 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -66,17 +66,14 @@ __sweep_expire_one(WT_SESSION_IMPL *session) /* * Acquire an exclusive lock on the handle and mark it dead. 
* - * The close would require I/O if an update cannot be written - * (updates in a no-longer-referenced file might not yet be - * globally visible if sessions have disjoint sets of files - * open). In that case, skip it: we'll retry the close the - * next time, after the transaction state has progressed. + * The close would require I/O if an update cannot be written (updates in a no-longer-referenced + * file might not yet be globally visible if sessions have disjoint sets of files open). In that + * case, skip it: we'll retry the close the next time, after the transaction state has + * progressed. * - * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want - * opens to block on us and then retry rather than returning an - * EBUSY error to the application. This is done holding the - * handle list lock so that connection-level handle searches - * never need to retry. + * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want opens to block on us and then retry + * rather than returning an EBUSY error to the application. This is done holding the handle list + * lock so that connection-level handle searches never need to retry. */ WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index e1864f70350..05dc7e2ff9b 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -678,20 +678,15 @@ __backup_start( } if (!target_list) { /* - * It's important to first gather the log files to be copied - * (which internally starts a new log file), followed by - * choosing a checkpoint to reference in the WiredTiger.backup - * file. + * It's important to first gather the log files to be copied (which internally starts a new + * log file), followed by choosing a checkpoint to reference in the WiredTiger.backup file. 
* - * Applications may have logic that takes a checkpoint, followed - * by performing a write that should only appear in the new - * checkpoint. This ordering prevents choosing the prior - * checkpoint, but including the write in the log files - * returned. + * Applications may have logic that takes a checkpoint, followed by performing a write that + * should only appear in the new checkpoint. This ordering prevents choosing the prior + * checkpoint, but including the write in the log files returned. * - * It is also possible, and considered legal, to choose the new - * checkpoint, but not include the log file that contains the - * log entry for taking the new checkpoint. + * It is also possible, and considered legal, to choose the new checkpoint, but not include + * the log file that contains the log entry for taking the new checkpoint. */ WT_ERR(__backup_log_append(session, cb, true)); WT_ERR(__backup_all(session)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index acb513ebcc6..7dfb3bca218 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -457,12 +457,11 @@ err: CURSOR_UPDATE_API_END(session, ret); /* - * The application might do a WT_CURSOR.get_value call when we return, - * so we need a value and the underlying functions didn't set one up. - * For various reasons, those functions may not have done a search and - * any previous value in the cursor might race with WT_CURSOR.reserve - * (and in cases like LSM, the reserve never encountered the original - * key). For simplicity, repeat the search here. + * The application might do a WT_CURSOR.get_value call when we return, so we need a value and + * the underlying functions didn't set one up. 
For various reasons, those functions may not have + * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in + * cases like LSM, the reserve never encountered the original key). For simplicity, repeat the + * search here. */ return (ret == 0 ? cursor->search(cursor) : ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 24776ef3ca5..0f1fab36bf8 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -226,11 +226,9 @@ __curindex_search(WT_CURSOR *cursor) JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL); /* - * We are searching using the application-specified key, which - * (usually) doesn't contain the primary key, so it is just a prefix of - * any matching index key. Do a search_near, step to the next entry if - * we land on one that is too small, then check that the prefix - * matches. + * We are searching using the application-specified key, which (usually) doesn't contain the + * primary key, so it is just a prefix of any matching index key. Do a search_near, step to the + * next entry if we land on one that is too small, then check that the prefix matches. */ __wt_cursor_set_raw_key(child, &cursor->key); WT_ERR(child->search_near(child, &cmp)); @@ -297,15 +295,12 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL); /* - * We are searching using the application-specified key, which - * (usually) doesn't contain the primary key, so it is just a prefix of - * any matching index key. That said, if there is an exact match, we - * want to find the first matching index entry and set exact equal to - * zero. + * We are searching using the application-specified key, which (usually) doesn't contain the + * primary key, so it is just a prefix of any matching index key. 
That said, if there is an + * exact match, we want to find the first matching index entry and set exact equal to zero. * - * Do a search_near, and if we find an entry that is too small, step to - * the next one. In the unlikely event of a search past the end of the - * tree, go back to the last key. + * Do a search_near, and if we find an entry that is too small, step to the next one. In the + * unlikely event of a search past the end of the tree, go back to the last key. */ __wt_cursor_set_raw_key(child, &cursor->key); WT_ERR(child->search_near(child, &cmp)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 36e5ca17b02..bb2497f3d19 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -926,7 +926,7 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, bool iterab * doing any needed check during the iteration. */ if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { - if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) + if (session->txn->isolation == WT_ISO_READ_UNCOMMITTED) WT_ERR_MSG(session, EINVAL, "join cursors with Bloom filters cannot be " "used with read-uncommitted isolation"); diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 38ad4f3b31d..a65bb55a8ba 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -949,11 +949,11 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) * read-uncommitted transaction, or outside of an explicit transaction. Disallow here as well, * for consistency. 
*/ - if (session->txn.isolation != WT_ISO_SNAPSHOT) + if (session->txn->isolation != WT_ISO_SNAPSHOT) WT_ERR_MSG(session, ENOTSUP, "not supported in read-committed or read-uncommitted " "transactions"); - if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT)) + if (F_ISSET(session->txn, WT_TXN_AUTOCOMMIT)) WT_ERR_MSG(session, ENOTSUP, "not supported in implicit transactions"); WT_ERR(__cursor_checkkey(cursor)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 8af441c7d02..4fd78188c39 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -128,8 +128,7 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, WT_CURSOR *cur, WT_RET(__wt_schema_project_merge( session, ctable->cg_cursors, idx->key_plan, idx->key_format, &cur->key)); /* - * The index key is now set and the value is empty - * (it starts clear and is never set). + * The index key is now set and the value is empty (it starts clear and is never set). */ F_SET(cur, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); WT_RET(f(cur)); @@ -704,12 +703,11 @@ err: CURSOR_UPDATE_API_END(session, ret); /* - * The application might do a WT_CURSOR.get_value call when we return, - * so we need a value and the underlying functions didn't set one up. - * For various reasons, those functions may not have done a search and - * any previous value in the cursor might race with WT_CURSOR.reserve - * (and in cases like LSM, the reserve never encountered the original - * key). For simplicity, repeat the search here. + * The application might do a WT_CURSOR.get_value call when we return, so we need a value and + * the underlying functions didn't set one up. For various reasons, those functions may not have + * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in + * cases like LSM, the reserve never encountered the original key). 
For simplicity, repeat the + * search here. */ return (ret == 0 ? cursor->search(cursor) : ret); } @@ -1039,20 +1037,18 @@ __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, WT_ERR(__curtable_open_colgroups(ctable, cfg)); /* - * We'll need to squirrel away a copy of the cursor configuration for - * if/when we open indices. + * We'll need to squirrel away a copy of the cursor configuration for if/when we open indices. * - * cfg[0] is the baseline configuration for the cursor open and we can - * acquire another copy from the configuration structures, so it would - * be reasonable not to copy it here: but I'd rather be safe than sorry. + * cfg[0] is the baseline configuration for the cursor open and we can acquire another copy from + * the configuration structures, so it would be reasonable not to copy it here: but I'd rather + * be safe than sorry. * * cfg[1] is the application configuration. * - * Underlying indices are always opened without dump or readonly; that - * information is appended to cfg[1] so later "fast" configuration calls - * (checking only cfg[0] and cfg[1]) work. I don't expect to see more - * than two configuration strings here, but it's written to compact into - * two configuration strings, a copy of cfg[0] and the rest in cfg[1]. + * Underlying indices are always opened without dump or readonly; that information is appended + * to cfg[1] so later "fast" configuration calls (checking only cfg[0] and cfg[1]) work. I don't + * expect to see more than two configuration strings here, but it's written to compact into two + * configuration strings, a copy of cfg[0] and the rest in cfg[1]. 
*/ WT_ERR(__wt_calloc_def(session, 3, &ctable->cfg)); WT_ERR(__wt_strdup(session, cfg[0], &ctable->cfg[0])); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index d134d7e504f..455e8c15bef 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -272,11 +272,24 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + uint32_t session_flags; bool did_work, was_intr; + bool is_owner; conn = S2C(session); cache = conn->cache; + /* + * Cache a history store cursor to avoid deadlock: if an eviction thread marks a file + * busy and then opens a different file (in this case, the HS file), it can deadlock with a + * thread waiting for the first file to drain from the eviction queue. See WT-5946 for details. + */ + if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); + WT_RET(__wt_hs_cursor_close(session, session_flags, is_owner)); + } + if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We @@ -466,6 +479,13 @@ __wt_evict_create(WT_SESSION_IMPL *session) conn = S2C(session); + /* + * In case recovery has allocated some transaction IDs, bump to the current state. This will + * prevent eviction threads from pinning anything as they start up and read metadata in order to + * open cursors. + */ + WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + WT_ASSERT(session, conn->evict_threads_min > 0); /* Set first, the thread might run before we finish up. 
*/ F_SET(conn, WT_CONN_EVICTION_RUN); @@ -2265,7 +2285,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d WT_DECL_RET; WT_TRACK_OP_DECL; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; uint64_t elapsed, time_start, time_stop; uint64_t initial_progress, max_progress; bool app_thread; @@ -2276,7 +2296,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d cache = conn->cache; time_start = time_stop = 0; txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); /* * It is not safe to proceed if the eviction server threads aren't setup yet. @@ -2303,7 +2323,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); if (app_thread) - WT_TRET(__wt_msg(session, "%s", session->txn.rollback_reason)); + WT_TRET(__wt_msg(session, "%s", session->txn->rollback_reason)); } WT_ERR(ret); } @@ -2316,7 +2336,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d * below 100%, limit the work to 5 evictions and return. If that's not the case, we can do * more. */ - if (!busy && txn_state->pinned_id != WT_TXN_NONE && + if (!busy && txn_shared->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_progress = busy ? 
5 : 20; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 00718e20f70..ec93cf88a75 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -608,8 +608,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool * * Don't set any other flags for internal pages: there are no update lists to be saved and * restored, changes can't be written into the history store table, nor can we re-create - * internal - * pages in memory. + * internal pages in memory. * * For leaf pages: * diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index 0f63d510be0..fd90d168b6c 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -846,23 +846,16 @@ err: } /* - * __hs_save_read_timestamp -- - * Save the currently running transaction's read timestamp into a variable. - */ -static void -__hs_save_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *saved_timestamp) -{ - *saved_timestamp = session->txn.read_timestamp; -} - -/* * __hs_restore_read_timestamp -- - * Reset the currently running transaction's read timestamp with a previously saved one. + * Reset the currently running transaction's read timestamp with the original read timestamp. 
*/ static void -__hs_restore_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t saved_timestamp) +__hs_restore_read_timestamp(WT_SESSION_IMPL *session) { - session->txn.read_timestamp = saved_timestamp; + WT_TXN_SHARED *txn_shared; + + txn_shared = WT_SESSION_TXN_SHARED(session); + session->txn->read_timestamp = txn_shared->pinned_read_timestamp; } /* @@ -886,7 +879,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA WT_TXN *txn; WT_UPDATE *mod_upd, *upd; wt_timestamp_t durable_timestamp, durable_timestamp_tmp, hs_start_ts, hs_start_ts_tmp; - wt_timestamp_t hs_stop_ts, hs_stop_ts_tmp, read_timestamp, saved_timestamp; + wt_timestamp_t hs_stop_ts, hs_stop_ts_tmp, read_timestamp; size_t notused, size; uint64_t hs_counter, hs_counter_tmp, upd_type_full; uint32_t hs_btree_id, session_flags; @@ -900,14 +893,20 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA mod_upd = upd = NULL; orig_hs_value_buf = NULL; __wt_modify_vector_init(session, &modifies); - txn = &session->txn; - __hs_save_read_timestamp(session, &saved_timestamp); + txn = session->txn; notused = size = 0; hs_btree_id = S2BT(session)->id; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_NOT_READ(modify, false); is_owner = false; + /* + * We temporarily move the read timestamp forwards to read modify records in the history store. + * Outside of that window, it should always be equal to the original read timestamp. + */ + WT_ASSERT( + session, txn->read_timestamp == WT_SESSION_TXN_SHARED(session)->pinned_read_timestamp); + /* Row-store key is as passed to us, create the column-store key as needed. */ WT_ASSERT( session, (key == NULL && recno != WT_RECNO_OOB) || (key != NULL && recno == WT_RECNO_OOB)); @@ -983,7 +982,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA * timestamp should be equivalent to the stop timestamp of the record that we're * currently on. 
*/ - session->txn.read_timestamp = hs_stop_ts_tmp; + session->txn->read_timestamp = hs_stop_ts_tmp; /* * Find the base update to apply the reverse deltas. If our cursor next fails to find an @@ -1031,7 +1030,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA mod_upd = NULL; } /* After we're done looping over modifies, reset the read timestamp. */ - __hs_restore_read_timestamp(session, saved_timestamp); + __hs_restore_read_timestamp(session); WT_STAT_CONN_INCR(session, cache_hs_read_squash); } @@ -1061,7 +1060,7 @@ err: * Restore the read timestamp if we encountered an error while processing a modify. There's no * harm in doing this multiple times. */ - __hs_restore_read_timestamp(session, saved_timestamp); + __hs_restore_read_timestamp(session); WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); __wt_free_update_list(session, &mod_upd); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 81118e421d2..e4455d62b03 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -86,39 +86,39 @@ while (0) /* An API call wrapped in a transaction if necessary. 
*/ -#define TXN_API_CALL(s, h, n, bt, config, cfg) \ - do { \ - bool __autotxn = false, __update = false; \ - API_CALL(s, h, n, bt, config, cfg); \ - __wt_txn_timestamp_flags(s); \ - __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \ - if (__autotxn) \ - F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \ - __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \ - if (__update) \ - F_SET(&(s)->txn, WT_TXN_UPDATE); +#define TXN_API_CALL(s, h, n, bt, config, cfg) \ + do { \ + bool __autotxn = false, __update = false; \ + API_CALL(s, h, n, bt, config, cfg); \ + __wt_txn_timestamp_flags(s); \ + __autotxn = !F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \ + if (__autotxn) \ + F_SET((s)->txn, WT_TXN_AUTOCOMMIT); \ + __update = !F_ISSET((s)->txn, WT_TXN_UPDATE); \ + if (__update) \ + F_SET((s)->txn, WT_TXN_UPDATE); /* An API call wrapped in a transaction if necessary. */ -#define TXN_API_CALL_NOCONF(s, h, n, dh) \ - do { \ - bool __autotxn = false, __update = false; \ - API_CALL_NOCONF(s, h, n, dh); \ - __wt_txn_timestamp_flags(s); \ - __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \ - if (__autotxn) \ - F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \ - __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \ - if (__update) \ - F_SET(&(s)->txn, WT_TXN_UPDATE); +#define TXN_API_CALL_NOCONF(s, h, n, dh) \ + do { \ + bool __autotxn = false, __update = false; \ + API_CALL_NOCONF(s, h, n, dh); \ + __wt_txn_timestamp_flags(s); \ + __autotxn = !F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \ + if (__autotxn) \ + F_SET((s)->txn, WT_TXN_AUTOCOMMIT); \ + __update = !F_ISSET((s)->txn, WT_TXN_UPDATE); \ + if (__update) \ + F_SET((s)->txn, WT_TXN_UPDATE); /* End a transactional API call, optional retry on deadlock. 
*/ #define TXN_API_END_RETRY(s, ret, retry) \ API_END(s, ret); \ if (__update) \ - F_CLR(&(s)->txn, WT_TXN_UPDATE); \ + F_CLR((s)->txn, WT_TXN_UPDATE); \ if (__autotxn) { \ - if (F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT)) \ - F_CLR(&(s)->txn, WT_TXN_AUTOCOMMIT); \ + if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \ + F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \ else if ((ret) == 0) \ (ret) = __wt_txn_commit((s), NULL); \ else { \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index bffb93036d6..6985cce0508 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -447,15 +447,6 @@ struct __wt_page_modify { WT_CELL **discard; size_t discard_entries; size_t discard_allocated; - - /* Cached overflow value cell/update address pairs. */ - struct { - WT_CELL *cell; - uint8_t *data; - size_t size; - } * remove; - size_t remove_allocated; - uint32_t remove_next; } * ovfl_track; #define WT_PAGE_LOCK(s, p) __wt_spin_lock((s), &(p)->modify->page_lock) @@ -485,8 +476,7 @@ struct __wt_page_modify { #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ uint8_t rec_result; /* Reconciliation state */ -#define WT_PAGE_RS_HS 0x1 -#define WT_PAGE_RS_RESTORED 0x2 +#define WT_PAGE_RS_RESTORED 0x1 uint8_t restore_state; /* Created by restoring updates */ }; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index b0ff54c70b2..7cbfddbd381 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -555,8 +555,8 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* Check if this is the largest transaction ID to update the page. 
*/ - if (WT_TXNID_LT(page->modify->update_txn, session->txn.id)) - page->modify->update_txn = session->txn.id; + if (WT_TXNID_LT(page->modify->update_txn, session->txn->id)) + page->modify->update_txn = session->txn->id; } /* diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 698cea9447c..64f1efe1201 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -170,9 +170,9 @@ struct __wt_cache { uint32_t evict_aggressive_score; /* - * Score of how often LRU queues are empty on refill. This score varies - * between 0 (if the queue hasn't been empty for a long time) and 100 - * (if the queue has been empty the last 10 times we filled up. + * Score of how often LRU queues are empty on refill. This score varies between 0 (if the queue + * hasn't been empty for a long time) and 100 (if the queue has been empty the last 10 times we + * filled up). */ uint32_t evict_empty_score; diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index b96f079f5bd..d8d11943c94 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -374,7 +374,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool readonly, bo { WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; double pct_full; if (didworkp != NULL) @@ -387,9 +387,9 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool readonly, bo * sure there is free space in the cache. 
*/ txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); - busy = busy || txn_state->id != WT_TXN_NONE || session->nhazard > 0 || - (txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); + txn_shared = WT_SESSION_TXN_SHARED(session); + busy = busy || txn_shared->id != WT_TXN_NONE || session->nhazard > 0 || + (txn_shared->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); /* * LSM sets the "ignore cache size" flag when holding the LSM tree lock, in that case, or when diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 4d8b83b4d34..14de00f80c3 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -219,9 +219,8 @@ __cursor_reset(WT_CURSOR_BTREE *cbt) cbt->page_deleted_count = 0; /* - * Release any page references we're holding. This can trigger eviction - * (e.g., forced eviction of big pages), so it's important to do after - * releasing our snapshot above. + * Release any page references we're holding. This can trigger eviction (e.g., forced eviction + * of big pages), so it's important to do after releasing our snapshot above. * * Clear the reference regardless, so we don't try the release twice. 
*/ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 56363846dfe..0888eeee453 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -928,14 +928,15 @@ extern int __wt_logop_row_truncate_unpack(WT_SESSION_IMPL *session, const uint8_ const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, - uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, - uint64_t prepare_ts, uint64_t read_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, + uint64_t first_commit_ts, uint64_t prepare_ts, uint64_t read_ts, uint64_t pinned_read_ts) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logop_txn_timestamp_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, - uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint64_t *durable_tsp, uint64_t *first_commit_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp, + uint64_t *pinned_read_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, @@ -1126,8 +1127,8 @@ extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE 
*page, WT_CEL WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, - bool evicting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1530,8 +1531,8 @@ extern int __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursor WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn, int error_code, - const char *error_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_SESSION_IMPL *txn_session, + int error_code, const char *error_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_update(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) @@ -1701,7 +1702,6 @@ extern void __wt_optrack_record_funcid( WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp); extern void __wt_os_stdio(WT_SESSION_IMPL *session); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); -extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern void 
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol); @@ -1759,8 +1759,8 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session); extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op); +extern void __wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session); -extern void __wt_txn_publish_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_release(WT_SESSION_IMPL *session); extern void __wt_txn_release_resources(WT_SESSION_IMPL *session); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 2eefed10cf7..43581c7cc1f 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -25,10 +25,7 @@ struct __wt_reconcile { uint64_t orig_btree_checkpoint_gen; uint64_t orig_txn_checkpoint_gen; - /* - * Track the oldest running transaction and whether to skew history store to the newest update. - */ - bool hs_skew_newest; + /* Track the oldest running transaction. */ uint64_t last_running; /* Track the page's min/maximum transactions. */ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 98be0b299ce..bd877622ca1 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -129,7 +129,7 @@ struct __wt_session_impl { WT_ITEM err; /* Error buffer */ WT_TXN_ISOLATION isolation; - WT_TXN txn; /* Transaction state */ + WT_TXN *txn; /* Transaction state */ #define WT_SESSION_BG_SYNC_MSEC 1200000 WT_LSN bg_sync_lsn; /* Background sync operation LSN. 
*/ @@ -145,11 +145,10 @@ struct __wt_session_impl { /* * Operations acting on handles. * - * The preferred pattern is to gather all of the required handles at - * the beginning of an operation, then drop any other locks, perform - * the operation, then release the handles. This cannot be easily - * merged with the list of checkpoint handles because some operations - * (such as compact) do checkpoints internally. + * The preferred pattern is to gather all of the required handles at the beginning of an + * operation, then drop any other locks, perform the operation, then release the handles. This + * cannot be easily merged with the list of checkpoint handles because some operations (such as + * compact) do checkpoints internally. */ WT_DATA_HANDLE **op_handle; /* Handle list */ u_int op_handle_next; /* Next empty slot */ @@ -190,8 +189,9 @@ struct __wt_session_impl { #define WT_SESSION_QUIET_CORRUPT_FILE 0x02000000u #define WT_SESSION_READ_WONT_NEED 0x04000000u #define WT_SESSION_RESOLVING_TXN 0x08000000u -#define WT_SESSION_SCHEMA_TXN 0x10000000u -#define WT_SESSION_SERVER_ASYNC 0x20000000u +#define WT_SESSION_ROLLBACK_TO_STABLE 0x10000000u +#define WT_SESSION_SCHEMA_TXN 0x20000000u +#define WT_SESSION_SERVER_ASYNC 0x40000000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; @@ -271,9 +271,3 @@ struct __wt_session_impl { WT_SESSION_STATS stats; }; - -/* - * Rollback to stable should ignore tombstones in the history store since it needs to scan the - * entire table sequentially. 
- */ -#define WT_SESSION_ROLLBACK_TO_STABLE_FLAGS (WT_SESSION_IGNORE_HS_TOMBSTONE) diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i index 759b8338370..cff5e0850ea 100644 --- a/src/third_party/wiredtiger/src/include/time.i +++ b/src/third_party/wiredtiger/src/include/time.i @@ -163,7 +163,7 @@ __wt_op_timer_start(WT_SESSION_IMPL *session) uint64_t timeout_us; /* Timer can be configured per-transaction, and defaults to per-connection. */ - if ((timeout_us = session->txn.operation_timeout_us) == 0) + if (session->txn == NULL || (timeout_us = session->txn->operation_timeout_us) == 0) timeout_us = S2C(session)->operation_timeout_us; if (timeout_us == 0) session->operation_start_us = session->operation_timeout_us = 0; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index aedc94a96a2..fd54e279171 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -46,7 +46,7 @@ typedef enum { #define WT_TXNID_LT(t1, t2) ((t1) < (t2)) -#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) +#define WT_SESSION_TXN_SHARED(s) (&S2C(s)->txn_global.txn_shared_list[(s)->id]) #define WT_SESSION_IS_CHECKPOINT(s) ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) @@ -72,39 +72,59 @@ typedef enum { /* * Perform an operation at the specified isolation level. * - * This is fiddly: we can't cope with operations that begin transactions - * (leaving an ID allocated), and operations must not move our published - * snap_min forwards (or updates we need could be freed while this operation is - * in progress). Check for those cases: the bugs they cause are hard to debug. + * This is fiddly: we can't cope with operations that begin transactions (leaving an ID allocated), + * and operations must not move our published snap_min forwards (or updates we need could be freed + * while this operation is in progress). 
Check for those cases: the bugs they cause are hard to + * debug. */ -#define WT_WITH_TXN_ISOLATION(s, iso, op) \ - do { \ - WT_TXN_ISOLATION saved_iso = (s)->isolation; \ - WT_TXN_ISOLATION saved_txn_iso = (s)->txn.isolation; \ - WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(s); \ - WT_TXN_STATE saved_state = *txn_state; \ - (s)->txn.forced_iso++; \ - (s)->isolation = (s)->txn.isolation = (iso); \ - op; \ - (s)->isolation = saved_iso; \ - (s)->txn.isolation = saved_txn_iso; \ - WT_ASSERT((s), (s)->txn.forced_iso > 0); \ - (s)->txn.forced_iso--; \ - WT_ASSERT((s), txn_state->id == saved_state.id && \ - (txn_state->metadata_pinned == saved_state.metadata_pinned || \ - saved_state.metadata_pinned == WT_TXN_NONE) && \ - (txn_state->pinned_id == saved_state.pinned_id || \ - saved_state.pinned_id == WT_TXN_NONE)); \ - txn_state->metadata_pinned = saved_state.metadata_pinned; \ - txn_state->pinned_id = saved_state.pinned_id; \ +#define WT_WITH_TXN_ISOLATION(s, iso, op) \ + do { \ + WT_TXN_ISOLATION saved_iso = (s)->isolation; \ + WT_TXN_ISOLATION saved_txn_iso = (s)->txn->isolation; \ + WT_TXN_SHARED *txn_shared = WT_SESSION_TXN_SHARED(s); \ + WT_TXN_SHARED saved_txn_shared = *txn_shared; \ + (s)->txn->forced_iso++; \ + (s)->isolation = (s)->txn->isolation = (iso); \ + op; \ + (s)->isolation = saved_iso; \ + (s)->txn->isolation = saved_txn_iso; \ + WT_ASSERT((s), (s)->txn->forced_iso > 0); \ + (s)->txn->forced_iso--; \ + WT_ASSERT((s), txn_shared->id == saved_txn_shared.id && \ + (txn_shared->metadata_pinned == saved_txn_shared.metadata_pinned || \ + saved_txn_shared.metadata_pinned == WT_TXN_NONE) && \ + (txn_shared->pinned_id == saved_txn_shared.pinned_id || \ + saved_txn_shared.pinned_id == WT_TXN_NONE)); \ + txn_shared->metadata_pinned = saved_txn_shared.metadata_pinned; \ + txn_shared->pinned_id = saved_txn_shared.pinned_id; \ } while (0) -struct __wt_txn_state { +struct __wt_txn_shared { WT_CACHE_LINE_PAD_BEGIN volatile uint64_t id; volatile uint64_t pinned_id; 
volatile uint64_t metadata_pinned; - volatile bool is_allocating; + + /* + * The first commit or durable timestamp used for this transaction. Determines its position in + * the durable queue and prevents the all_durable timestamp moving past this point. + */ + wt_timestamp_t pinned_durable_timestamp; + + /* + * Set to the first read timestamp used in the transaction. As part of our history store + * mechanism, we can move the read timestamp forward so we need to keep track of the original + * read timestamp to know what history should be pinned in front of oldest. + */ + wt_timestamp_t pinned_read_timestamp; + + TAILQ_ENTRY(__wt_txn_shared) read_timestampq; + TAILQ_ENTRY(__wt_txn_shared) durable_timestampq; + /* Set if need to clear from the durable queue */ + + volatile uint8_t is_allocating; + uint8_t clear_durable_q; + uint8_t clear_read_q; /* Set if need to clear from the read queue */ WT_CACHE_LINE_PAD_END }; @@ -144,12 +164,12 @@ struct __wt_txn_global { /* List of transactions sorted by durable timestamp. */ WT_RWLOCK durable_timestamp_rwlock; - TAILQ_HEAD(__wt_txn_dts_qh, __wt_txn) durable_timestamph; + TAILQ_HEAD(__wt_txn_dts_qh, __wt_txn_shared) durable_timestamph; uint32_t durable_timestampq_len; /* List of transactions sorted by read timestamp. 
*/ WT_RWLOCK read_timestamp_rwlock; - TAILQ_HEAD(__wt_txn_rts_qh, __wt_txn) read_timestamph; + TAILQ_HEAD(__wt_txn_rts_qh, __wt_txn_shared) read_timestamph; uint32_t read_timestampq_len; /* @@ -163,14 +183,14 @@ struct __wt_txn_global { */ volatile bool checkpoint_running; /* Checkpoint running */ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ - WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */ + WT_TXN_SHARED checkpoint_txn_shared; /* Checkpoint's txn shared state */ wt_timestamp_t checkpoint_timestamp; /* Checkpoint's timestamp */ volatile uint64_t debug_ops; /* Debug mode op counter */ uint64_t debug_rollback; /* Debug mode rollback */ volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ - WT_TXN_STATE *states; /* Per-session transaction states */ + WT_TXN_SHARED *txn_shared_list; /* Per-session shared transaction states */ }; typedef enum __wt_txn_isolation { @@ -288,12 +308,6 @@ struct __wt_txn { /* Read updates committed as of this timestamp. */ wt_timestamp_t read_timestamp; - TAILQ_ENTRY(__wt_txn) durable_timestampq; - TAILQ_ENTRY(__wt_txn) read_timestampq; - /* Set if need to clear from the durable queue */ - bool clear_durable_q; - bool clear_read_q; /* Set if need to clear from the read queue */ - /* Array of modifications by this transaction. */ WT_TXN_OP *mod; size_t mod_alloc; @@ -322,7 +336,7 @@ struct __wt_txn { * WT_TXN_HAS_TS_DURABLE -- * The transaction has an explicitly set durable timestamp (that is, it * hasn't been mirrored from its commit timestamp value). - * WT_TXN_TS_PUBLISHED -- + * WT_TXN_SHARED_TS_DURABLE -- * The transaction has been published to the durable queue. Setting this * flag lets us know that, on release, we need to mark the transaction for * clearing. 
@@ -339,20 +353,26 @@ struct __wt_txn { #define WT_TXN_HAS_TS_READ 0x000080u #define WT_TXN_IGNORE_PREPARE 0x000100u #define WT_TXN_PREPARE 0x000200u -#define WT_TXN_PUBLIC_TS_READ 0x000400u -#define WT_TXN_READONLY 0x000800u -#define WT_TXN_RUNNING 0x001000u -#define WT_TXN_SYNC_SET 0x002000u -#define WT_TXN_TS_COMMIT_ALWAYS 0x004000u -#define WT_TXN_TS_COMMIT_KEYS 0x008000u -#define WT_TXN_TS_COMMIT_NEVER 0x010000u -#define WT_TXN_TS_DURABLE_ALWAYS 0x020000u -#define WT_TXN_TS_DURABLE_KEYS 0x040000u -#define WT_TXN_TS_DURABLE_NEVER 0x080000u -#define WT_TXN_TS_PUBLISHED 0x100000u +#define WT_TXN_READONLY 0x000400u +#define WT_TXN_RUNNING 0x000800u +#define WT_TXN_SHARED_TS_DURABLE 0x001000u +#define WT_TXN_SHARED_TS_READ 0x002000u +#define WT_TXN_SYNC_SET 0x004000u +#define WT_TXN_TS_COMMIT_ALWAYS 0x008000u +#define WT_TXN_TS_COMMIT_KEYS 0x010000u +#define WT_TXN_TS_COMMIT_NEVER 0x020000u +#define WT_TXN_TS_DURABLE_ALWAYS 0x040000u +#define WT_TXN_TS_DURABLE_KEYS 0x080000u +#define WT_TXN_TS_DURABLE_NEVER 0x100000u #define WT_TXN_TS_ROUND_PREPARED 0x200000u #define WT_TXN_TS_ROUND_READ 0x400000u #define WT_TXN_UPDATE 0x800000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; + + /* + * Zero or more bytes of value (the payload) immediately follows the WT_UPDATE structure. We use + * a C99 flexible array member which has the semantics we want. 
+ */ + uint64_t __snapshot[]; }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 9d154b892ca..574eece2e5f 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -13,7 +13,7 @@ static inline int __wt_txn_context_prepare_check(WT_SESSION_IMPL *session) { - if (F_ISSET(&session->txn, WT_TXN_PREPARE)) + if (F_ISSET(session->txn, WT_TXN_PREPARE)) WT_RET_MSG(session, EINVAL, "not permitted in a prepared transaction"); return (0); } @@ -25,9 +25,9 @@ __wt_txn_context_prepare_check(WT_SESSION_IMPL *session) static inline int __wt_txn_context_check(WT_SESSION_IMPL *session, bool requires_txn) { - if (requires_txn && !F_ISSET(&session->txn, WT_TXN_RUNNING)) + if (requires_txn && !F_ISSET(session->txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "only permitted in a running transaction"); - if (!requires_txn && F_ISSET(&session->txn, WT_TXN_RUNNING)) + if (!requires_txn && F_ISSET(session->txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "not permitted in a running transaction"); return (0); } @@ -41,7 +41,7 @@ __wt_txn_err_set(WT_SESSION_IMPL *session, int ret) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; /* Ignore standard errors that don't fail the transaction. 
*/ if (ret == WT_NOTFOUND || ret == WT_DUPLICATE_KEY || ret == WT_PREPARE_CONFLICT) @@ -78,17 +78,17 @@ __wt_txn_timestamp_flags(WT_SESSION_IMPL *session) if (btree == NULL) return; if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS)) - F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS); + F_SET(session->txn, WT_TXN_TS_COMMIT_ALWAYS); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS)) - F_SET(&session->txn, WT_TXN_TS_COMMIT_KEYS); + F_SET(session->txn, WT_TXN_TS_COMMIT_KEYS); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER)) - F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER); + F_SET(session->txn, WT_TXN_TS_COMMIT_NEVER); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_ALWAYS)) - F_SET(&session->txn, WT_TXN_TS_DURABLE_ALWAYS); + F_SET(session->txn, WT_TXN_TS_DURABLE_ALWAYS); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_KEYS)) - F_SET(&session->txn, WT_TXN_TS_DURABLE_KEYS); + F_SET(session->txn, WT_TXN_TS_DURABLE_KEYS); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_NEVER)) - F_SET(&session->txn, WT_TXN_TS_DURABLE_NEVER); + F_SET(session->txn, WT_TXN_TS_DURABLE_NEVER); } /* @@ -101,7 +101,7 @@ __wt_txn_op_set_recno(WT_SESSION_IMPL *session, uint64_t recno) WT_TXN *txn; WT_TXN_OP *op; - txn = &session->txn; + txn = session->txn; WT_ASSERT(session, txn->mod_count > 0 && recno != WT_RECNO_OOB); op = txn->mod + txn->mod_count - 1; @@ -132,7 +132,7 @@ __wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key) WT_TXN *txn; WT_TXN_OP *op; - txn = &session->txn; + txn = session->txn; WT_ASSERT(session, txn->mod_count > 0 && key->data != NULL); @@ -163,7 +163,7 @@ __txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; /* * In case of a prepared transaction, the order of modification of the prepare timestamp to * commit timestamp in the update chain will not affect the data visibility, a reader will @@ -190,7 +190,7 @@ 
__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp) *opp = NULL; - txn = &session->txn; + txn = session->txn; /* * We're about to perform an update. Make sure we have allocated a transaction ID. @@ -219,7 +219,7 @@ __wt_txn_unmodify(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_OP *op; - txn = &session->txn; + txn = session->txn; if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, txn->mod_count > 0); --txn->mod_count; @@ -241,7 +241,7 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm wt_timestamp_t ts; uint8_t prepare_state, previous_state; - txn = &session->txn; + txn = session->txn; /* * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or @@ -285,7 +285,7 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref WT_UPDATE **updp; uint8_t previous_state; - txn = &session->txn; + txn = session->txn; /* * Lock the ref to ensure we don't race with eviction freeing the page deleted update list or @@ -314,7 +314,7 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) WT_UPDATE *upd; wt_timestamp_t *timestamp; - txn = &session->txn; + txn = session->txn; /* * Updates in the metadata never get timestamps (either now or at commit): metadata cannot be @@ -366,7 +366,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_TXN *txn; WT_TXN_OP *op; - txn = &session->txn; + txn = session->txn; if (F_ISSET(txn, WT_TXN_READONLY)) { if (F_ISSET(txn, WT_TXN_IGNORE_PREPARE)) @@ -393,7 +393,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) /* History store bypasses transactions, transaction modify should never be called on it. 
*/ WT_ASSERT(session, !WT_IS_HS(S2BT(session))); - upd->txnid = session->txn.id; + upd->txnid = session->txn->id; __wt_txn_op_set_timestamp(session, op); return (0); @@ -410,7 +410,7 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) WT_TXN *txn; WT_TXN_OP *op; - txn = &session->txn; + txn = session->txn; WT_RET(__txn_next_op(session, &op)); op->type = WT_TXN_OP_REF_DELETE; @@ -472,7 +472,7 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * If there is no active checkpoint or this handle is up to date with the active checkpoint then * it's safe to ignore the checkpoint ID in the visibility check. */ - checkpoint_pinned = txn_global->checkpoint_state.pinned_id; + checkpoint_pinned = txn_global->checkpoint_txn_shared.pinned_id; if (checkpoint_pinned == WT_TXN_NONE || WT_TXNID_LT(oldest_id, checkpoint_pinned)) return (oldest_id); @@ -593,7 +593,7 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) WT_TXN *txn; bool found; - txn = &session->txn; + txn = session->txn; /* Changes with no associated transaction are always visible. */ if (id == WT_TXN_NONE) @@ -642,13 +642,13 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; if (!__txn_visible_id(session, id)) return (false); /* Transactions read their writes, regardless of timestamps. */ - if (F_ISSET(&session->txn, WT_TXN_HAS_ID) && id == session->txn.id) + if (F_ISSET(session->txn, WT_TXN_HAS_ID) && id == session->txn->id) return (true); /* Timestamp check. */ @@ -694,7 +694,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) /* Ignore the prepared update, if transaction configuration says so. */ if (prepare_state == WT_PREPARE_INPROGRESS) return ( - F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ? WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE); + F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE) ? 
WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE); return (WT_VISIBLE_TRUE); } @@ -876,7 +876,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; txn->isolation = session->isolation; txn->txn_logsync = S2C(session)->txn_logsync; @@ -916,7 +916,7 @@ __wt_txn_autocommit_check(WT_SESSION_IMPL *session) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; if (F_ISSET(txn, WT_TXN_AUTOCOMMIT)) { F_CLR(txn, WT_TXN_AUTOCOMMIT); return (__wt_txn_begin(session, NULL)); @@ -933,10 +933,10 @@ static inline int __wt_txn_idle_cache_check(WT_SESSION_IMPL *session) { WT_TXN *txn; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; - txn_state = WT_SESSION_TXN_STATE(session); + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); /* * Check the published snap_min because read-uncommitted never sets WT_TXN_HAS_SNAPSHOT. We @@ -945,7 +945,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session) * necessary. */ if (F_ISSET(txn, WT_TXN_RUNNING) && !F_ISSET(txn, WT_TXN_HAS_ID) && - txn_state->pinned_id == WT_TXN_NONE) + txn_shared->pinned_id == WT_TXN_NONE) WT_RET(__wt_cache_eviction_check(session, false, true, NULL)); return (0); @@ -959,11 +959,11 @@ static inline uint64_t __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) { WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; uint64_t id; txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); /* * Allocating transaction IDs involves several steps. @@ -985,12 +985,12 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) * well defined, we must use an atomic increment here. 
*/ if (publish) { - WT_PUBLISH(txn_state->is_allocating, true); - WT_PUBLISH(txn_state->id, txn_global->current); + WT_PUBLISH(txn_shared->is_allocating, true); + WT_PUBLISH(txn_shared->id, txn_global->current); id = __wt_atomic_addv64(&txn_global->current, 1) - 1; - session->txn.id = id; - WT_PUBLISH(txn_state->id, id); - WT_PUBLISH(txn_state->is_allocating, false); + session->txn->id = id; + WT_PUBLISH(txn_shared->id, id); + WT_PUBLISH(txn_shared->is_allocating, false); } else id = __wt_atomic_addv64(&txn_global->current, 1) - 1; @@ -1006,7 +1006,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); @@ -1038,20 +1038,21 @@ __wt_txn_search_check(WT_SESSION_IMPL *session) WT_BTREE *btree; WT_TXN *txn; - txn = &session->txn; btree = S2BT(session); + txn = session->txn; + /* * If the user says a table should always use a read timestamp, verify this transaction has one. * Same if it should never have a read timestamp. 
*/ if (!F_ISSET(S2C(session), WT_CONN_RECOVERING) && FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) && - !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + !F_ISSET(txn, WT_TXN_SHARED_TS_READ)) WT_RET_MSG(session, EINVAL, "read_timestamp required and " "none set on this transaction"); if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) && - F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + F_ISSET(txn, WT_TXN_SHARED_TS_READ)) WT_RET_MSG(session, EINVAL, "no read_timestamp required and " "timestamp set on this transaction"); @@ -1072,7 +1073,7 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE bool ignore_prepare_set, rollback; rollback = false; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; if (txn->isolation != WT_ISO_SNAPSHOT) @@ -1130,7 +1131,7 @@ __wt_txn_read_last(WT_SESSION_IMPL *session) { WT_TXN *txn; - txn = &session->txn; + txn = session->txn; /* * Release the snap_min ID we put in the global table. @@ -1152,11 +1153,11 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); /* * We are about to read data, which means we need to protect against @@ -1176,10 +1177,10 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * positioned on a value, it can't be freed. 
*/ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { - if (txn_state->pinned_id == WT_TXN_NONE) - txn_state->pinned_id = txn_global->last_running; - if (txn_state->metadata_pinned == WT_TXN_NONE) - txn_state->metadata_pinned = txn_state->pinned_id; + if (txn_shared->pinned_id == WT_TXN_NONE) + txn_shared->pinned_id = txn_global->last_running; + if (txn_shared->metadata_pinned == WT_TXN_NONE) + txn_shared->metadata_pinned = txn_shared->pinned_id; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h index 23f1d16df95..5f131b7ab0c 100644 --- a/src/third_party/wiredtiger/src/include/verify_build.h +++ b/src/third_party/wiredtiger/src/include/verify_build.h @@ -65,7 +65,7 @@ __wt_verify_build(void) WT_STATIC_ASSERT( \ sizeof(s) > WT_CACHE_LINE_ALIGNMENT || sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0) WT_PADDING_CHECK(WT_LOGSLOT); - WT_PADDING_CHECK(WT_TXN_STATE); + WT_PADDING_CHECK(WT_TXN_SHARED); /* * The btree code encodes key/value pairs in size_t's, and requires at least 8B size_t's. 
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 31b8b740ed9..204e6fd0eb9 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -325,8 +325,8 @@ struct __wt_txn_op; typedef struct __wt_txn_op WT_TXN_OP; struct __wt_txn_printlog_args; typedef struct __wt_txn_printlog_args WT_TXN_PRINTLOG_ARGS; -struct __wt_txn_state; -typedef struct __wt_txn_state WT_TXN_STATE; +struct __wt_txn_shared; +typedef struct __wt_txn_shared WT_TXN_SHARED; struct __wt_update; typedef struct __wt_update WT_UPDATE; union __wt_lsn; diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 963b1998289..87e4bda2a8a 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -1191,10 +1191,8 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) */ if (create_log) { /* - * Increment the missed pre-allocated file counter only - * if a hot backup is not in progress. We are deliberately - * not using pre-allocated log files during backup - * (see comment above). + * Increment the missed pre-allocated file counter only if a hot backup is not in progress. + * We are deliberately not using pre-allocated log files during backup (see comment above). */ if (!conn->hot_backup) log->prep_missed++; @@ -1430,10 +1428,9 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, bool salvag /* * Truncate the log file to the given LSN. * - * It's possible the underlying file system doesn't support truncate - * (there are existing examples), which is fine, but we don't want to - * repeatedly do the setup work just to find that out every time. 
Check - * before doing work, and if there's a not-supported error, turn off + * It's possible the underlying file system doesn't support truncate (there are existing + * examples), which is fine, but we don't want to repeatedly do the setup work just to find that + * out every time. Check before doing work, and if there's a not-supported error, turn off * future truncates. */ WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh)); diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c index 615f4238aa3..af5eea8a1a6 100644 --- a/src/third_party/wiredtiger/src/log/log_auto.c +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -711,22 +711,23 @@ __wt_logop_prev_lsn_print( int __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_t time_sec, - uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, - uint64_t prepare_ts, uint64_t read_ts) + uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_commit_ts, + uint64_t prepare_ts, uint64_t read_ts, uint64_t pinned_read_ts) { - const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ); + const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQQ); size_t size; uint32_t optype, recsize; optype = WT_LOGOP_TXN_TIMESTAMP; WT_RET(__wt_struct_size(session, &size, fmt, optype, 0, time_sec, time_nsec, commit_ts, - durable_ts, first_ts, prepare_ts, read_ts)); + durable_ts, first_commit_ts, prepare_ts, read_ts, pinned_read_ts)); __wt_struct_size_adjust(session, &size); WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); recsize = (uint32_t)size; WT_RET(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, size, fmt, optype, - recsize, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts)); + recsize, time_sec, time_nsec, commit_ts, durable_ts, first_commit_ts, prepare_ts, read_ts, + pinned_read_ts)); logrec->size += (uint32_t)size; return (0); @@ -735,14 +736,15 @@ 
__wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_ int __wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp, - uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp) + uint64_t *first_commit_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp, uint64_t *pinned_read_tsp) { WT_DECL_RET; - const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ); + const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQQ); uint32_t optype, size; if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &optype, &size, - time_secp, time_nsecp, commit_tsp, durable_tsp, first_tsp, prepare_tsp, read_tsp)) != 0) + time_secp, time_nsecp, commit_tsp, durable_tsp, first_commit_tsp, prepare_tsp, read_tsp, + pinned_read_tsp)) != 0) WT_RET_MSG(session, ret, "logop_txn_timestamp: unpack failure"); WT_ASSERT(session, optype == WT_LOGOP_TXN_TIMESTAMP); @@ -758,21 +760,25 @@ __wt_logop_txn_timestamp_print( uint64_t time_nsec; uint64_t commit_ts; uint64_t durable_ts; - uint64_t first_ts; + uint64_t first_commit_ts; uint64_t prepare_ts; uint64_t read_ts; + uint64_t pinned_read_ts; WT_RET(__wt_logop_txn_timestamp_unpack(session, pp, end, &time_sec, &time_nsec, &commit_ts, - &durable_ts, &first_ts, &prepare_ts, &read_ts)); + &durable_ts, &first_commit_ts, &prepare_ts, &read_ts, &pinned_read_ts)); WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"txn_timestamp\",\n")); WT_RET(__wt_fprintf(session, args->fs, " \"time_sec\": %" PRIu64 ",\n", time_sec)); WT_RET(__wt_fprintf(session, args->fs, " \"time_nsec\": %" PRIu64 ",\n", time_nsec)); WT_RET(__wt_fprintf(session, args->fs, " \"commit_ts\": %" PRIu64 ",\n", commit_ts)); WT_RET(__wt_fprintf(session, args->fs, " \"durable_ts\": %" PRIu64 ",\n", durable_ts)); - WT_RET(__wt_fprintf(session, args->fs, " \"first_ts\": %" PRIu64 ",\n", first_ts)); + WT_RET(__wt_fprintf( + session, args->fs, " 
\"first_commit_ts\": %" PRIu64 ",\n", first_commit_ts)); WT_RET(__wt_fprintf(session, args->fs, " \"prepare_ts\": %" PRIu64 ",\n", prepare_ts)); - WT_RET(__wt_fprintf(session, args->fs, " \"read_ts\": %" PRIu64 "", read_ts)); + WT_RET(__wt_fprintf(session, args->fs, " \"read_ts\": %" PRIu64 ",\n", read_ts)); + WT_RET( + __wt_fprintf(session, args->fs, " \"pinned_read_ts\": %" PRIu64 "", pinned_read_ts)); return (0); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 18a26bebff0..6052d20025f 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -104,10 +104,10 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } else { primary = clsm->chunks[clsm->nchunks - 1]->cursor; primary_chunk = clsm->primary_chunk; - WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID)); + WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && (primary_chunk->switch_txn == WT_TXN_NONE || - WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn))); + WT_TXNID_LT(session->txn->id, primary_chunk->switch_txn))); } /* @@ -160,7 +160,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; - txn = &session->txn; + txn = session->txn; /* Merge cursors never update. 
*/ if (F_ISSET(clsm, WT_CLSM_MERGE)) @@ -209,7 +209,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); - pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; + pinned_id = WT_SESSION_TXN_SHARED(session)->pinned_id; for (i = clsm->nchunks - 2; clsm->nupdates < clsm->nchunks; clsm->nupdates++, i--) { switch_txn = clsm->chunks[i]->switch_txn; if (WT_TXNID_LT(switch_txn, pinned_id)) @@ -429,7 +429,7 @@ __clsm_open_cursors(WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_ c = &clsm->iface; cursor = NULL; session = (WT_SESSION_IMPL *)c->session; - txn = &session->txn; + txn = session->txn; chunk = NULL; locked = false; lsm_tree = clsm->lsm_tree; @@ -832,7 +832,7 @@ __clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp * and stepping forward / back. In that case, keep going until we see a key in the expected * range. */ - if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED) + if (session->txn->isolation != WT_ISO_READ_UNCOMMITTED) return (0); WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, cursor, *cmpp)); @@ -1386,9 +1386,9 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, const WT_ITEM *key, co lsm_tree = clsm->lsm_tree; - WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID) && clsm->primary_chunk != NULL && + WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_ID) && clsm->primary_chunk != NULL && (clsm->primary_chunk->switch_txn == WT_TXN_NONE || - WT_TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn))); + WT_TXNID_LE(session->txn->id, clsm->primary_chunk->switch_txn))); /* * Clear the existing cursor position. 
Don't clear the primary cursor: we're about to use it @@ -1618,12 +1618,11 @@ err: CURSOR_UPDATE_API_END(session, ret); /* - * The application might do a WT_CURSOR.get_value call when we return, - * so we need a value and the underlying functions didn't set one up. - * For various reasons, those functions may not have done a search and - * any previous value in the cursor might race with WT_CURSOR.reserve - * (and in cases like LSM, the reserve never encountered the original - * key). For simplicity, repeat the search here. + * The application might do a WT_CURSOR.get_value call when we return, so we need a value and + * the underlying functions didn't set one up. For various reasons, those functions may not have + * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in + * cases like LSM, the reserve never encountered the original key). For simplicity, repeat the + * search here. */ return (ret == 0 ? cursor->search(cursor) : ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index ad71a1ee3dd..ed4a5b43c4b 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -403,10 +403,10 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LS * Set read-uncommitted: we have already checked that all of the updates in this chunk are * globally visible, use the cheapest possible check in reconciliation. 
*/ - saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_READ_UNCOMMITTED; + saved_isolation = session->txn->isolation; + session->txn->isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_sync_file(session, WT_SYNC_WRITE_LEAVES); - session->txn.isolation = saved_isolation; + session->txn->isolation = saved_isolation; WT_ERR(ret); __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri); diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index c9f985a10b7..66415b2cd62 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -328,11 +328,10 @@ __wt_meta_block_metadata(WT_SESSION_IMPL *session, const char *config, WT_CKPT * filecfg[1] = config; /* - * Find out if this file is encrypted. If encrypting, encrypt and encode. - * The metadata has to be encrypted because it contains private data - * (for example, column names). We pass the block manager text that - * describes the metadata (the encryption information), and the - * possibly encrypted metadata encoded as a hexadecimal string. + * Find out if this file is encrypted. If encrypting, encrypt and encode. The metadata has to be + * encrypted because it contains private data (for example, column names). We pass the block + * manager text that describes the metadata (the encryption information), and the possibly + * encrypted metadata encoded as a hexadecimal string. 
*/ WT_ERR(__wt_btree_config_encryptor(session, filecfg, &kencryptor)); if (kencryptor == NULL) { diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index bc51dcb15e4..5b2710e8aba 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -111,7 +111,7 @@ int __wt_meta_track_on(WT_SESSION_IMPL *session) { if (session->meta_track_nest++ == 0) { - if (!F_ISSET(&session->txn, WT_TXN_RUNNING)) { + if (!F_ISSET(session->txn, WT_TXN_RUNNING)) { #ifdef WT_ENABLE_SCHEMA_TXN WT_RET(__wt_txn_begin(session, NULL)); __wt_errx(session, "TRACK: Using internal schema txn"); @@ -282,11 +282,11 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) * If this operation is part of a running transaction, that should be included in the * checkpoint. */ - ckpt_session->txn.id = session->txn.id; + ckpt_session->txn->id = session->txn->id; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_METADATA)); WT_WITH_DHANDLE(ckpt_session, WT_SESSION_META_DHANDLE(session), WT_WITH_METADATA_LOCK(ckpt_session, ret = __wt_checkpoint(ckpt_session, NULL))); - ckpt_session->txn.id = WT_TXN_NONE; + ckpt_session->txn->id = WT_TXN_NONE; if (ret == 0) WT_WITH_DHANDLE( session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL)); @@ -316,7 +316,7 @@ err: * We should have committed above unless we're unrolling, there was an error or the * operation was a noop. */ - WT_ASSERT(session, unroll || saved_ret != 0 || session->txn.mod_count == 0); + WT_ASSERT(session, unroll || saved_ret != 0 || session->txn->mod_count == 0); #ifdef WT_ENABLE_SCHEMA_TXN __wt_err(session, saved_ret, "TRACK: Abort internal schema txn"); WT_TRET(__wt_txn_rollback(session, NULL)); @@ -521,7 +521,7 @@ __wt_meta_track_init(WT_SESSION_IMPL *session) * Sessions default to read-committed isolation, we rely on that for the correctness of * metadata checkpoints. 
*/ - WT_ASSERT(session, conn->meta_ckpt_session->txn.isolation == WT_ISO_READ_COMMITTED); + WT_ASSERT(session, conn->meta_ckpt_session->txn->isolation == WT_ISO_READ_COMMITTED); } return (0); diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c index 597dfaf81dd..276aae62796 100644 --- a/src/third_party/wiredtiger/src/os_win/os_fs.c +++ b/src/third_party/wiredtiger/src/os_win/os_fs.c @@ -109,13 +109,11 @@ __win_fs_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char WT_ERR(__wt_to_utf16_string(session, to, &to_wide)); /* - * We want an atomic rename, but that's not guaranteed by MoveFileExW - * (or by any MSDN API). Don't set the MOVEFILE_COPY_ALLOWED flag to - * prevent the system from falling back to a copy and delete process. - * Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small - * as possible, just in case. WiredTiger renames are done in a single - * directory and we expect that to be an atomic metadata update on any - * modern filesystem. + * We want an atomic rename, but that's not guaranteed by MoveFileExW (or by any MSDN API). + * Don't set the MOVEFILE_COPY_ALLOWED flag to prevent the system from falling back to a copy + * and delete process. Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small as + * possible, just in case. WiredTiger renames are done in a single directory and we expect that + * to be an atomic metadata update on any modern filesystem. */ WT_WINCALL_RETRY(MoveFileExW(from_wide->data, to_wide->data, MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH), diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 5218aa52451..ffa4c94f1b2 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -968,7 +968,7 @@ compare: * they're no longer useful. 
*/ if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, page, vpack, F_ISSET(r, WT_REC_EVICT))); + WT_ERR(__wt_ovfl_remove(session, page, vpack)); } /* Walk any append list. */ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index ed8c6e3f80c..d65768aba49 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -879,7 +879,7 @@ __wt_rec_row_leaf( } else { /* The first time we find an overflow record, discard the underlying blocks. */ if (F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW) && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, page, vpack, F_ISSET(r, WT_REC_EVICT))); + WT_ERR(__wt_ovfl_remove(session, page, vpack)); switch (upd->type) { case WT_UPDATE_MODIFY: diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index b4dd84d58ee..e25b02a3104 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -63,67 +63,65 @@ __rec_append_orig_value( WT_ASSERT(session, upd != NULL && unpack != NULL && unpack->type != WT_CELL_DEL); - for (;; upd = upd->next) { - /* Done if at least one self-contained update is globally visible. */ - if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) - return (0); + append = tombstone = NULL; + total_size = 0; + /* Review the current update list, checking conditions that mean no work is needed. */ + for (;; upd = upd->next) { /* - * If the update is restored from the history store for the rollback to stable operation we - * don't need the on-disk value anymore and we're done. + * Done if the update was restored from the history store for the rollback to stable + * operation. 
*/ if (F_ISSET(upd, WT_UPDATE_RESTORED_FOR_ROLLBACK)) return (0); - /* On page value already on chain */ - if (unpack != NULL && unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid) + /* Done if the on page value already appears on the update list. */ + if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid) + return (0); + + /* + * Done if at least one self-contained update is globally visible. It's tempting to pull + * this test out of the loop and only test the oldest self-contained update for global + * visibility (as visibility tests are expensive). However, when running at lower isolation + * levels, or when an application intentionally commits in out of timestamp order, it's + * possible for an update on the chain to be globally visible and followed by an (earlier) + * update that is not yet globally visible. + */ + if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) return (0); - /* Leave reference at the last item in the chain. */ + /* Leave reference pointing to the last item in the update list. */ if (upd->next == NULL) break; } - /* - * We need the original on-page value for some reader: get a copy and append it to the end of - * the update list with a transaction ID that guarantees its visibility. - * - * If we don't have a value cell, it's an insert/append list key/value pair which simply doesn't - * exist for some reader; place a deleted record at the end of the update list. - * - * If the an update is out of order so it masks the value in the cell, don't append. - */ - append = tombstone = NULL; /* -Wconditional-uninitialized */ - total_size = size = 0; /* -Wconditional-uninitialized */ + /* Done if the stop time pair of the onpage cell is globally visible. */ + if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) && + __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) + return (0); + + /* We need the original on-page value for some reader: get a copy. 
*/ + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD)); + total_size += size; + append->txnid = unpack->start_txn; + append->start_ts = unpack->start_ts; + append->durable_ts = unpack->durable_start_ts; /* - * We need to append a TOMBSTONE before the onpage value if the onpage value has a valid - * stop pair. - * - * Imagine a case we insert and delete a value respectively at timestamp 0 and 10, and later - * insert it again at 20. We need the TOMBSTONE to tell us there is no value between 10 and - * 20. + * Additionally, we need to append a tombstone before the onpage value we're about to append to + * the list, if the onpage value has a valid stop pair. Imagine a case where we insert and + * delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need + * the tombstone to tell us there is no value between 10 and 20. */ if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) { - /* No need to append anything if the stop time pair is globally visible. 
*/ - if (__wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) - return (0); WT_ERR(__wt_update_alloc(session, NULL, &tombstone, &size, WT_UPDATE_TOMBSTONE)); + total_size += size; tombstone->txnid = unpack->stop_txn; tombstone->start_ts = unpack->stop_ts; tombstone->durable_ts = unpack->durable_stop_ts; - total_size += size; - } - - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); - WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = unpack->start_txn; - append->start_ts = unpack->start_ts; - append->durable_ts = unpack->durable_start_ts; - total_size += size; - if (tombstone != NULL) { tombstone->next = append; append = tombstone; } @@ -133,13 +131,12 @@ __rec_append_orig_value( __wt_cache_page_inmem_incr(session, page, total_size); + if (0) { err: - __wt_scr_free(session, &tmp); - /* Free append when tombstone allocation fails */ - if (ret != 0) { __wt_free(session, append); __wt_free(session, tombstone); } + __wt_scr_free(session, &tmp); return (ret); } @@ -156,8 +153,8 @@ __rec_need_save_upd( /* * Save updates for any reconciliation that doesn't involve history store (in-memory database - * and fixed length column store), except when the maximum timestamp and txnid are globally - * visible. + * and fixed length column store), except when the selected stop time pair or the selected start + * time pair is globally visible. */ if (!F_ISSET(r, WT_REC_HS) && !F_ISSET(r, WT_REC_IN_MEMORY) && r->page->type != WT_PAGE_COL_FIX) return (false); @@ -296,7 +293,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * checkpoint in a concurrent session. 
*/ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || upd == NULL || - upd->txnid == WT_TXN_NONE || upd->txnid != S2C(session)->txn_global.checkpoint_state.id || + upd->txnid == WT_TXN_NONE || + upd->txnid != S2C(session)->txn_global.checkpoint_txn_shared.id || WT_SESSION_IS_CHECKPOINT(session)); /* If all of the updates were aborted, quit. */ @@ -464,11 +462,17 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v /* * Returning an update means the original on-page value might be lost, and that's a problem if - * there's a reader that needs it. This call makes a copy of the on-page value. We do that any - * time there are saved updates and during reconciliation of a backing overflow record that will - * be physically removed once it's no longer needed. + * there's a reader that needs it, make a copy of the on-page value. We do that any time there + * are saved updates (we may need the original on-page value to terminate the update chain, for + * example, in the case of an update that modifies the original value). Additionally, make a + * copy of the on-page value if the value is an overflow item and anything other than the + * on-page cell is being written. This is because the value's backing overflow blocks aren't + * part of the page, and they are physically removed by checkpoint writing this page, that is, + * the checkpoint doesn't include the overflow blocks so they're removed and future readers of + * this page won't be able to find them. 
*/ - if (vpack != NULL && vpack->type != WT_CELL_DEL && upd_select->upd != NULL && upd_saved) + if (upd_select->upd != NULL && vpack != NULL && vpack->type != WT_CELL_DEL && + (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW))) WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack)); err: diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index e7cdc847da8..ff4fe361abe 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -42,12 +42,12 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage /* * Sanity check flags. * - * If we try to do eviction using transaction visibility, we had better - * have a snapshot. This doesn't apply to checkpoints: there are - * (rare) cases where we write data at read-uncommitted isolation. + * If we try to do eviction using transaction visibility, we had better have a snapshot. This + * doesn't apply to checkpoints: there are (rare) cases where we write data at read-uncommitted + * isolation. */ WT_ASSERT(session, !LF_ISSET(WT_REC_EVICT) || LF_ISSET(WT_REC_VISIBLE_ALL) || - F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); + F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)); /* It's an error to be called with a clean page. */ WT_ASSERT(session, __wt_page_is_modified(page)); @@ -225,11 +225,10 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u __rec_cleanup(session, r); /* - * When threads perform eviction, don't cache block manager structures - * (even across calls), we can have a significant number of threads - * doing eviction at the same time with large items. Ignore checkpoints, - * once the checkpoint completes, all unnecessary session resources will - * be discarded. 
+ * When threads perform eviction, don't cache block manager structures (even across calls), we + * can have a significant number of threads doing eviction at the same time with large items. + * Ignore checkpoints, once the checkpoint completes, all unnecessary session resources will be + * discarded. */ if (!WT_SESSION_IS_CHECKPOINT(session)) { /* @@ -242,14 +241,6 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u WT_TRET(__rec_destroy_session(session)); } - - /* - * We track removed overflow objects in case there's a reader in transit when they're removed. - * Any form of eviction locks out readers, we can discard them all. - */ - if (LF_ISSET(WT_REC_EVICT)) - __wt_ovfl_discard_remove(session, page); - WT_RET(ret); /* @@ -308,15 +299,6 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) */ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT) || (F_ISSET(r, WT_REC_HS | WT_REC_IN_MEMORY) || page->type == WT_PAGE_COL_FIX)); - - /* - * We have written the page, but something prevents it from being evicted. If we wrote the - * newest versions of updates, the on-disk page may contain records that are newer than what - * checkpoint would write. Make sure that checkpoint visits the page and (if necessary) - * fixes things up. - */ - if (r->hs_skew_newest) - mod->first_dirty_txn = WT_TXN_FIRST; } else { /* * Track the page's maximum transaction ID (used to decide if we can evict a clean page and @@ -518,52 +500,25 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO * checkpoints into account. */ if (WT_IS_METADATA(session->dhandle)) { - WT_ORDERED_READ(ckpt_txn, txn_global->checkpoint_state.id); + WT_ORDERED_READ(ckpt_txn, txn_global->checkpoint_txn_shared.id); if (ckpt_txn != WT_TXN_NONE && WT_TXNID_LT(ckpt_txn, r->last_running)) r->last_running = ckpt_txn; } - /* - * Decide whether to skew on-page values towards newer or older versions. 
This is a heuristic - * attempting to minimize the number of pages that need to be rewritten by future checkpoints. - * - * We usually prefer to skew to newer versions, the logic being that by the time the next - * checkpoint runs, it is likely that all the updates we choose will be stable. However, if - * checkpointing with a timestamp (indicated by a stable_timestamp being set), and there is a - * checkpoint already running, or this page was read with history store history, or the stable - * timestamp hasn't changed since last time this page was successfully, skew oldest instead. - */ - if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) && - __wt_random(&session->rnd) % 3 == 0) - r->hs_skew_newest = false; - else - r->hs_skew_newest = LF_ISSET(WT_REC_HS) && LF_ISSET(WT_REC_VISIBLE_ALL); - - if (r->hs_skew_newest && !__wt_btree_immediately_durable(session) && - txn_global->has_stable_timestamp && - ((btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT) && - txn_global->stable_is_pinned) || - FLD_ISSET(page->modify->restore_state, WT_PAGE_RS_HS) || - page->modify->last_stable_timestamp == txn_global->stable_timestamp)) - r->hs_skew_newest = false; - /* When operating on the history store table, we should never try history store eviction. */ WT_ASSERT(session, !F_ISSET(btree, WT_BTREE_HS) || !LF_ISSET(WT_REC_HS)); /* - * History store table eviction is configured when eviction gets aggressive, - * adjust the flags for cases we don't support. + * History store table eviction is configured when eviction gets aggressive, adjust the flags + * for cases we don't support. * - * We don't yet support fixed-length column-store combined with the - * history store table. 
It's not hard to do, but the underlying function - * that reviews which updates can be written to the evicted page and - * which updates need to be written to the history store table needs access - * to the original value from the page being evicted, and there's no - * code path for that in the case of fixed-length column-store objects. - * (Row-store and variable-width column-store objects provide a - * reference to the unpacked on-page cell for this purpose, but there - * isn't an on-page cell for fixed-length column-store objects.) For - * now, turn it off. + * We don't yet support fixed-length column-store combined with the history store table. It's + * not hard to do, but the underlying function that reviews which updates can be written to the + * evicted page and which updates need to be written to the history store table needs access to + * the original value from the page being evicted, and there's no code path for that in the case + * of fixed-length column-store objects. (Row-store and variable-width column-store objects + * provide a reference to the unpacked on-page cell for this purpose, but there isn't an on-page + * cell for fixed-length column-store objects.) For now, turn it off. */ if (page->type == WT_PAGE_COL_FIX) LF_CLR(WT_REC_HS); @@ -755,23 +710,20 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) switch (page->type) { case WT_PAGE_COL_FIX: /* - * Column-store pages can grow if there are missing records - * (that is, we lost a chunk of the range, and have to write - * deleted records). Fixed-length objects are a problem, if - * there's a big missing range, we could theoretically have to - * write large numbers of missing objects. + * Column-store pages can grow if there are missing records (that is, we lost a chunk of the + * range, and have to write deleted records). Fixed-length objects are a problem, if there's + * a big missing range, we could theoretically have to write large numbers of missing + * objects. 
*/ page_size = (uint32_t)WT_ALIGN( WT_FIX_ENTRIES_TO_BYTES(btree, r->salvage->take + r->salvage->missing), btree->allocsize); break; case WT_PAGE_COL_VAR: /* - * Column-store pages can grow if there are missing records - * (that is, we lost a chunk of the range, and have to write - * deleted records). Variable-length objects aren't usually a - * problem because we can write any number of deleted records - * in a single page entry because of the RLE, we just need to - * ensure that additional entry fits. + * Column-store pages can grow if there are missing records (that is, we lost a chunk of the + * range, and have to write deleted records). Variable-length objects aren't usually a + * problem because we can write any number of deleted records in a single page entry because + * of the RLE, we just need to ensure that additional entry fits. */ break; case WT_PAGE_ROW_LEAF: @@ -946,15 +898,14 @@ __wt_rec_split_init( } /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. + * Ensure the disk image buffer is large enough for the max object, as corrected by the + * underlying block manager. * - * Since we want to support split_size values larger than the page size - * (to allow for adjustments based on the compression), this buffer - * should be the greater of split_size and page_size, then aligned to - * the next allocation size boundary. The latter shouldn't be an issue, - * but it's a possible scenario if, for example, the compression engine - * is expected to give us 5x compression and gives us nothing at all. + * Since we want to support split_size values larger than the page size (to allow for + * adjustments based on the compression), this buffer should be the greater of split_size and + * page_size, then aligned to the next allocation size boundary. 
The latter shouldn't be an + * issue, but it's a possible scenario if, for example, the compression engine is expected to + * give us 5x compression and gives us nothing at all. */ corrected_page_size = r->page_size; WT_RET(bm->write_size(bm, session, &corrected_page_size)); @@ -1626,12 +1577,11 @@ __rec_split_write_reuse( return (false); /* - * Quit if evicting with no previously written block to compare against. - * (In other words, if there's eviction pressure and the page was never - * written by a checkpoint, calculating a checksum is worthless.) + * Quit if evicting with no previously written block to compare against. (In other words, if + * there's eviction pressure and the page was never written by a checkpoint, calculating a + * checksum is worthless.) * - * Quit if evicting and a previous check failed, once there's a miss no - * future block will match. + * Quit if evicting and a previous check failed, once there's a miss no future block will match. */ if (F_ISSET(r, WT_REC_EVICT)) { if (mod->rec_result != WT_PM_REC_MULTIBLOCK || mod->mod_multi_entries < r->multi_next) diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c index c6e750bea9e..16365ba94c5 100644 --- a/src/third_party/wiredtiger/src/schema/schema_util.c +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -84,7 +84,7 @@ __wt_schema_internal_session(WT_SESSION_IMPL *session, WT_SESSION_IMPL **int_ses * flags from the original. */ *int_sessionp = session; - if (F_ISSET(&session->txn, WT_TXN_RUNNING)) { + if (F_ISSET(session->txn, WT_TXN_RUNNING)) { /* We should not have a schema txn running now. 
*/ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_TXN)); WT_RET( diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 627a204c9b2..24acc8da2d9 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -132,8 +132,8 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) * We have to do this with a transaction ID pinned unless the cursor is reading from a * checkpoint. */ - WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->pinned_id != WT_TXN_NONE || + WT_TXN_SHARED *txn_shared = WT_SESSION_TXN_SHARED(session); + WT_ASSERT(session, txn_shared->pinned_id != WT_TXN_NONE || (WT_PREFIX_MATCH(cursor->uri, "file:") && F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); #endif @@ -184,14 +184,12 @@ static void __session_clear(WT_SESSION_IMPL *session) { /* - * There's no serialization support around the review of the hazard - * array, which means threads checking for hazard pointers first check - * the active field (which may be 0) and then use the hazard pointer - * (which cannot be NULL). + * There's no serialization support around the review of the hazard array, which means threads + * checking for hazard pointers first check the active field (which may be 0) and then use the + * hazard pointer (which cannot be NULL). * - * Additionally, the session structure can include information that - * persists past the session's end-of-life, stored as part of page - * splits. + * Additionally, the session structure can include information that persists past the session's + * end-of-life, stored as part of page splits. * * For these reasons, be careful when clearing the session structure. */ @@ -274,13 +272,13 @@ __session_close(WT_SESSION *wt_session, const char *config) F_CLR(session, WT_SESSION_CACHE_CURSORS); /* Rollback any active transaction. 
*/ - if (F_ISSET(&session->txn, WT_TXN_RUNNING)) + if (F_ISSET(session->txn, WT_TXN_RUNNING)) WT_TRET(__session_rollback_transaction(wt_session, NULL)); /* * Also release any pinned transaction ID from a non-transactional operation. */ - if (conn->txn_global.states != NULL) + if (conn->txn_global.txn_shared_list != NULL) __wt_txn_release_snapshot(session); /* Close all open cursors. */ @@ -1644,7 +1642,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config) WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; + txn = session->txn; SESSION_API_CALL_PREPARE_ALLOWED(session, commit_transaction, config, cfg); WT_STAT_CONN_INCR(session, txn_commit); @@ -1748,7 +1746,7 @@ __session_rollback_transaction(WT_SESSION *wt_session, const char *config) SESSION_API_CALL_PREPARE_ALLOWED(session, rollback_transaction, config, cfg); WT_STAT_CONN_INCR(session, txn_rollback); - txn = &session->txn; + txn = session->txn; if (F_ISSET(txn, WT_TXN_PREPARE)) { WT_STAT_CONN_INCR(session, txn_prepare_rollback); WT_STAT_CONN_DECR(session, txn_prepare_active); @@ -1816,19 +1814,19 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) { WT_DECL_RET; WT_SESSION_IMPL *session; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; uint64_t pinned; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL_PREPARE_NOT_ALLOWED_NOCONF(session, transaction_pinned_range); - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); /* Assign pinned to the lesser of id or snap_min */ - if (txn_state->id != WT_TXN_NONE && WT_TXNID_LT(txn_state->id, txn_state->pinned_id)) - pinned = txn_state->id; + if (txn_shared->id != WT_TXN_NONE && WT_TXNID_LT(txn_shared->id, txn_shared->pinned_id)) + pinned = txn_shared->id; else - pinned = txn_state->pinned_id; + pinned = txn_shared->pinned_id; if (pinned == WT_TXN_NONE) *prange = 0; diff --git a/src/third_party/wiredtiger/src/support/hazard.c 
b/src/third_party/wiredtiger/src/support/hazard.c index 34101eb2242..45580b7d0cc 100644 --- a/src/third_party/wiredtiger/src/support/hazard.c +++ b/src/third_party/wiredtiger/src/support/hazard.c @@ -156,14 +156,13 @@ __wt_hazard_set_func(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp } /* - * The page isn't available, it's being considered for eviction - * (or being evicted, for all we know). If the eviction server - * sees our hazard pointer before evicting the page, it will - * return the page to use, no harm done, if it doesn't, it will - * go ahead and complete the eviction. + * The page isn't available, it's being considered for eviction (or being evicted, for all we + * know). If the eviction server sees our hazard pointer before evicting the page, it will + * return the page to use, no harm done, if it doesn't, it will go ahead and complete the + * eviction. * - * We don't bother publishing this update: the worst case is we - * prevent some random page from being evicted. + * We don't bother publishing this update: the worst case is we prevent some random page from + * being evicted. */ hp->ref = NULL; *busyp = true; @@ -244,15 +243,13 @@ __wt_hazard_close(WT_SESSION_IMPL *session) #endif /* - * Clear any hazard pointers because it's not a correctness problem - * (any hazard pointer we find can't be real because the session is - * being closed when we're called). We do this work because session - * close isn't that common that it's an expensive check, and we don't - * want to let a hazard pointer lie around, keeping a page from being - * evicted. + * Clear any hazard pointers because it's not a correctness problem (any hazard pointer we find + * can't be real because the session is being closed when we're called). We do this work because + * session close isn't that common that it's an expensive check, and we don't want to let a + * hazard pointer lie around, keeping a page from being evicted. 
* - * We don't panic: this shouldn't be a correctness issue (at least, I - * can't think of a reason it would be). + * We don't panic: this shouldn't be a correctness issue (at least, I can't think of a reason it + * would be). */ for (hp = session->hazard; hp < session->hazard + session->hazard_inuse; ++hp) if (hp->ref != NULL) { diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c index 768024617ae..2bd9b6daea6 100644 --- a/src/third_party/wiredtiger/src/support/huffman.c +++ b/src/third_party/wiredtiger/src/support/huffman.c @@ -310,10 +310,9 @@ __wt_huffman_open( WT_RET(__wt_calloc_one(session, &huffman)); /* - * The frequency table is 4B pairs of symbol and frequency. The symbol - * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX - * (a frequency of 0 means the value is never expected to appear in the - * input). Validate the symbols are within range. + * The frequency table is 4B pairs of symbol and frequency. The symbol is either 1 or 2 bytes + * and the frequency ranges from 1 to UINT32_MAX (a frequency of 0 means the value is never + * expected to appear in the input). Validate the symbols are within range. */ if (numbytes != 1 && numbytes != 2) WT_ERR_MSG(session, EINVAL, diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c index e6f5e9dbbb0..f245502bc9f 100644 --- a/src/third_party/wiredtiger/src/support/pow.c +++ b/src/third_party/wiredtiger/src/support/pow.c @@ -101,11 +101,10 @@ bool __wt_ispo2(uint32_t v) { /* - * Only numbers that are powers of two will satisfy the relationship - * (v & (v - 1) == 0). + * Only numbers that are powers of two will satisfy the relationship (v & (v - 1) == 0). * - * However n must be positive, this returns 0 as a power of 2; to fix - * that, use: (! (v & (v - 1)) && v) + * However n must be positive, this returns 0 as a power of 2; to fix that, use: (! 
(v & (v - + * 1)) && v) */ return ((v & (v - 1)) == 0); } diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c index b2da78afa8e..aca2a3d9aa7 100644 --- a/src/third_party/wiredtiger/src/support/thread_group.c +++ b/src/third_party/wiredtiger/src/support/thread_group.c @@ -176,8 +176,7 @@ __thread_group_resize(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, uint32_t for (i = group->max; i < new_max; i++) { WT_ERR(__wt_calloc_one(session, &thread)); /* - * Threads get their own session and hs table cursor - * (if the hs table is open). + * Threads get their own session and hs table cursor (if the hs table is open). */ session_flags = LF_ISSET(WT_THREAD_CAN_WAIT) ? WT_SESSION_CAN_WAIT : 0; WT_ERR( diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 39b9db2ef69..58607c7cf2c 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -71,20 +71,20 @@ __txn_remove_from_global_table(WT_SESSION_IMPL *session) #ifdef HAVE_DIAGNOSTIC WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); - WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE); + WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_shared->id != WT_TXN_NONE); #else - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); #endif - WT_PUBLISH(txn_state->id, WT_TXN_NONE); + WT_PUBLISH(txn_shared->id, WT_TXN_NONE); } /* @@ -96,7 +96,7 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max) { WT_TXN *txn; - txn = &session->txn; + txn = 
session->txn; if (n > 1) __snapsort(txn->snapshot, n); @@ -118,22 +118,22 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); - WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE || - session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE)); + WT_ASSERT(session, txn_shared->pinned_id == WT_TXN_NONE || + session->txn->isolation == WT_ISO_READ_UNCOMMITTED || + !__wt_txn_visible_all(session, txn_shared->pinned_id, WT_TS_NONE)); - txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE; + txn_shared->metadata_pinned = txn_shared->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); /* Clear a checkpoint's pinned ID. */ if (WT_SESSION_IS_CHECKPOINT(session)) { - txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + txn_global->checkpoint_txn_shared.pinned_id = WT_TXN_NONE; txn_global->checkpoint_timestamp = 0; } @@ -150,14 +150,14 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s, *txn_state; + WT_TXN_SHARED *s, *txn_shared; uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id; uint32_t i, n, session_cnt; conn = S2C(session); - txn = &session->txn; + txn = session->txn; txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); n = 0; /* Fast path if we already have the current snapshot. */ @@ -179,14 +179,14 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) * changes the checkpoint has written to the metadata. We don't have to keep the checkpoint's * changes pinned so don't including it in the published pinned ID. 
*/ - if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) { + if ((id = txn_global->checkpoint_txn_shared.id) != WT_TXN_NONE) { txn->snapshot[n++] = id; - txn_state->metadata_pinned = id; + txn_shared->metadata_pinned = id; } /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - txn_state->pinned_id = current_id; + txn_shared->pinned_id = current_id; /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); goto done; @@ -194,7 +194,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) { /* * Build our snapshot of any concurrent transaction IDs. * @@ -209,7 +209,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) * this case, we ignore this transaction because it would * not be visible to the current snapshot. 
*/ - while (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && + while (s != txn_shared && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, current_id)) { /* * If the transaction is still allocating its ID, then we spin here until it gets its @@ -240,7 +240,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) */ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->pinned_id = pinned_id; + txn_shared->pinned_id = pinned_id; done: __wt_readunlock(session, &txn_global->rwlock); @@ -258,7 +258,7 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; + WT_TXN_SHARED *s; uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id; uint32_t i, session_cnt; @@ -269,12 +269,12 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last /* The oldest ID cannot change while we are holding the scan lock. */ prev_oldest_id = txn_global->oldest_id; last_running = oldest_id = txn_global->current; - if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE) + if ((metadata_pinned = txn_global->checkpoint_txn_shared.id) == WT_TXN_NONE) metadata_pinned = oldest_id; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) { /* Update the last running transaction ID. 
*/ while ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) { @@ -422,7 +422,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64, - oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); + oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn->snap_min); } } @@ -442,7 +442,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; wt_timestamp_t read_ts; - txn = &session->txn; + txn = session->txn; WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval)); if (cval.len != 0) @@ -516,7 +516,7 @@ __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config) WT_DECL_RET; WT_TXN *txn; - txn = &session->txn; + txn = session->txn; ret = __wt_config_getones(session, config, "isolation", &cval); if (ret == 0 && cval.len != 0) { @@ -540,7 +540,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; WT_ASSERT(session, txn->mod_count == 0); @@ -548,8 +548,8 @@ __wt_txn_release(WT_SESSION_IMPL *session) /* Clear the transaction's ID from the global table. 
*/ if (WT_SESSION_IS_CHECKPOINT(session)) { - WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); - txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE; + WT_ASSERT(session, WT_SESSION_TXN_SHARED(session)->id == WT_TXN_NONE); + txn->id = txn_global->checkpoint_txn_shared.id = WT_TXN_NONE; /* * Be extra careful to cleanup everything for checkpoints: once the global checkpoint ID is @@ -563,7 +563,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) if (!F_ISSET(txn, WT_TXN_PREPARE)) __txn_remove_from_global_table(session); else - WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); + WT_ASSERT(session, WT_SESSION_TXN_SHARED(session)->id == WT_TXN_NONE); txn->id = WT_TXN_NONE; } @@ -613,7 +613,7 @@ __txn_search_prepared_op( *updp = NULL; - txn = &session->txn; + txn = session->txn; cursor = *cursorp; if (cursor == NULL || ((WT_CURSOR_BTREE *)cursor)->btree->id != op->btree->id) { @@ -669,7 +669,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit, WT_TXN *txn; WT_UPDATE *upd; - txn = &session->txn; + txn = session->txn; WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd)); @@ -735,7 +735,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) u_int i; bool op_zero_ts, upd_zero_ts; - txn = &session->txn; + txn = session->txn; cursor = NULL; durable_op_timestamp = prev_op_timestamp = WT_TS_NONE; @@ -817,7 +817,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) upd_zero_ts = prev_op_timestamp == WT_TS_NONE; if (op_zero_ts != upd_zero_ts) { WT_ERR(__wt_verbose_dump_update(session, upd)); - WT_ERR(__wt_verbose_dump_txn_one(session, &session->txn, EINVAL, + WT_ERR(__wt_verbose_dump_txn_one(session, session, EINVAL, "per-key timestamps used inconsistently, dumping relevant information")); } /* @@ -893,7 +893,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) u_int i; bool locked, prepare, readonly, update_durable_ts; - txn = &session->txn; + txn = session->txn; conn = 
S2C(session); cursor = NULL; txn_global = &conn->txn_global; @@ -1163,7 +1163,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) int64_t txn_prepared_updates_count; u_int i; - txn = &session->txn; + txn = session->txn; txn_prepared_updates_count = 0; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); @@ -1261,7 +1261,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) WT_STAT_CONN_INCR(session, txn_prepared_updates_count); /* Set transaction state to prepare. */ - F_SET(&session->txn, WT_TXN_PREPARE); + F_SET(session->txn, WT_TXN_PREPARE); /* Release our snapshot in case it is keeping data pinned. */ __wt_txn_release_snapshot(session); @@ -1294,7 +1294,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) WT_UNUSED(cfg); cursor = NULL; - txn = &session->txn; + txn = session->txn; prepare = F_ISSET(txn, WT_TXN_PREPARE); readonly = txn->mod_count == 0; @@ -1389,7 +1389,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) int __wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason) { - session->txn.rollback_reason = reason; + session->txn->rollback_reason = reason; return (WT_ROLLBACK); } @@ -1402,18 +1402,15 @@ __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) { WT_TXN *txn; - txn = &session_ret->txn; + /* Allocate the WT_TXN structure, including a variable length array of snapshot information. 
*/ + WT_RET(__wt_calloc(session, 1, + sizeof(WT_TXN) + sizeof(txn->snapshot[0]) * S2C(session)->session_size, &session_ret->txn)); + txn = session_ret->txn; + txn->snapshot = txn->__snapshot; txn->id = WT_TXN_NONE; - WT_RET(__wt_calloc_def(session, S2C(session_ret)->session_size, &txn->snapshot)); - -#ifdef HAVE_DIAGNOSTIC - if (S2C(session_ret)->txn_global.states != NULL) { - WT_TXN_STATE *txn_state; - txn_state = WT_SESSION_TXN_STATE(session_ret); - WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); - } -#endif + WT_ASSERT(session, S2C(session_ret)->txn_global.txn_shared_list == NULL || + WT_SESSION_TXN_SHARED(session_ret)->pinned_id == WT_TXN_NONE); /* * Take care to clean these out in case we are reusing the transaction for eviction. @@ -1443,7 +1440,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) conn = S2C(session); txn_global = &conn->txn_global; stats = conn->stats; - checkpoint_pinned = txn_global->checkpoint_state.pinned_id; + checkpoint_pinned = txn_global->checkpoint_txn_shared.pinned_id; WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); @@ -1471,11 +1468,13 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) checkpoint_pinned == WT_TXN_NONE ? 
0 : txn_global->current - checkpoint_pinned); WT_STAT_SET(session, stats, txn_checkpoint_prep_max, conn->ckpt_prep_max); - WT_STAT_SET(session, stats, txn_checkpoint_prep_min, conn->ckpt_prep_min); + if (conn->ckpt_prep_min != UINT64_MAX) + WT_STAT_SET(session, stats, txn_checkpoint_prep_min, conn->ckpt_prep_min); WT_STAT_SET(session, stats, txn_checkpoint_prep_recent, conn->ckpt_prep_recent); WT_STAT_SET(session, stats, txn_checkpoint_prep_total, conn->ckpt_prep_total); WT_STAT_SET(session, stats, txn_checkpoint_time_max, conn->ckpt_time_max); - WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min); + if (conn->ckpt_time_min != UINT64_MAX) + WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min); WT_STAT_SET(session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent); WT_STAT_SET(session, stats, txn_checkpoint_time_total, conn->ckpt_time_total); WT_STAT_SET(session, stats, txn_durable_queue_len, txn_global->durable_timestampq_len); @@ -1491,7 +1490,8 @@ __wt_txn_release_resources(WT_SESSION_IMPL *session) { WT_TXN *txn; - txn = &session->txn; + if ((txn = session->txn) == NULL) + return; WT_ASSERT(session, txn->mod_count == 0); __wt_free(session, txn->mod); @@ -1507,7 +1507,7 @@ void __wt_txn_destroy(WT_SESSION_IMPL *session) { __wt_txn_release_resources(session); - __wt_free(session, session->txn.snapshot); + __wt_free(session, session->txn); } /* @@ -1519,7 +1519,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; + WT_TXN_SHARED *s; u_int i; WT_UNUSED(cfg); @@ -1539,9 +1539,9 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RWLOCK_INIT_TRACKED(session, &txn_global->read_timestamp_rwlock, read_timestamp); TAILQ_INIT(&txn_global->read_timestamph); - WT_RET(__wt_calloc_def(session, conn->session_size, &txn_global->states)); + WT_RET(__wt_calloc_def(session, conn->session_size, 
&txn_global->txn_shared_list)); - for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) + for (i = 0, s = txn_global->txn_shared_list; i < conn->session_size; i++, s++) s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE; return (0); @@ -1568,7 +1568,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->visibility_rwlock); - __wt_free(session, txn_global->states); + __wt_free(session, txn_global->txn_shared_list); } /* @@ -1669,11 +1669,11 @@ int __wt_txn_is_blocking(WT_SESSION_IMPL *session) { WT_TXN *txn; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; uint64_t global_oldest; - txn = &session->txn; - txn_state = WT_SESSION_TXN_STATE(session); + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); global_oldest = S2C(session)->txn_global.oldest_id; /* We can't roll back prepared transactions. */ @@ -1691,7 +1691,7 @@ __wt_txn_is_blocking(WT_SESSION_IMPL *session) /* * Check if either the transaction's ID or its pinned ID is equal to the oldest transaction ID. */ - return (txn_state->id == global_oldest || txn_state->pinned_id == global_oldest ? + return (txn_shared->id == global_oldest || txn_shared->pinned_id == global_oldest ? 
__wt_txn_rollback_required( session, "oldest pinned transaction ID rolled back for eviction") : 0); @@ -1703,12 +1703,17 @@ __wt_txn_is_blocking(WT_SESSION_IMPL *session) */ int __wt_verbose_dump_txn_one( - WT_SESSION_IMPL *session, WT_TXN *txn, int error_code, const char *error_string) + WT_SESSION_IMPL *session, WT_SESSION_IMPL *txn_session, int error_code, const char *error_string) { + WT_TXN *txn; + WT_TXN_SHARED *txn_shared; char buf[512]; - char ts_string[5][WT_TS_INT_STRING_SIZE]; + char ts_string[7][WT_TS_INT_STRING_SIZE]; const char *iso_tag; + txn = txn_session->txn; + txn_shared = WT_SESSION_TXN_SHARED(txn_session); + WT_NOT_READ(iso_tag, "INVALID"); switch (txn->isolation) { case WT_ISO_READ_COMMITTED: @@ -1734,6 +1739,8 @@ __wt_verbose_dump_txn_one( ", first_commit_timestamp: %s" ", prepare_timestamp: %s" ", read_timestamp: %s" + ", pinned_durable_timestamp: %s" + ", pinned_read_timestamp: %s" ", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]" ", full checkpoint: %s" ", rollback reason: %s" @@ -1743,8 +1750,10 @@ __wt_verbose_dump_txn_one( __wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]), __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]), __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]), - __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), txn->ckpt_lsn.l.file, - txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false", + __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), + __wt_timestamp_to_string(txn_shared->pinned_durable_timestamp, ts_string[5]), + __wt_timestamp_to_string(txn_shared->pinned_read_timestamp, ts_string[6]), + txn->ckpt_lsn.l.file, txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false", txn->rollback_reason == NULL ? 
"" : txn->rollback_reason, txn->flags, iso_tag)); /* @@ -1769,7 +1778,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *sess; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; + WT_TXN_SHARED *s; uint64_t id; uint32_t i, session_cnt; char ts_string[WT_TS_INT_STRING_SIZE]; @@ -1808,9 +1817,9 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) __wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? "yes" : "no")); WT_RET( __wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT))); - WT_RET( - __wt_msg(session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_state.pinned_id)); - WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id)); + WT_RET(__wt_msg( + session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_txn_shared.pinned_id)); + WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txn_shared.id)); WT_ORDERED_READ(session_cnt, conn->session_cnt); WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); @@ -1821,7 +1830,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) * handles is not thread safe, so some information may change while traversing if other threads * are active at the same time, which is OK since this is diagnostic code. */ - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) { /* Skip sessions with no active transaction */ if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) continue; @@ -1829,7 +1838,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "ID: %" PRIu64 ", pinned ID: %" PRIu64 ", metadata pinned ID: %" PRIu64 ", name: %s", id, s->pinned_id, s->metadata_pinned, sess->name == NULL ? 
"EMPTY" : sess->name)); - WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn, 0, NULL)); + WT_RET(__wt_verbose_dump_txn_one(session, sess, 0, NULL)); } return (0); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index c72f107bf08..af97d01a0fb 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -269,18 +269,16 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) return (0); /* - * We may have raced between starting the checkpoint transaction and - * some operation completing on the handle that updated the metadata - * (e.g., closing a bulk load cursor). All such operations either have - * exclusive access to the handle or hold the schema lock. We are now - * holding the schema lock and have an open btree handle, so if we - * can't update the metadata, then there has been some state change - * invisible to the checkpoint transaction. + * We may have raced between starting the checkpoint transaction and some operation completing + * on the handle that updated the metadata (e.g., closing a bulk load cursor). All such + * operations either have exclusive access to the handle or hold the schema lock. We are now + * holding the schema lock and have an open btree handle, so if we can't update the metadata, + * then there has been some state change invisible to the checkpoint transaction. 
*/ if (!WT_IS_METADATA(session->dhandle)) { WT_CURSOR *meta_cursor; - WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); + WT_ASSERT(session, !F_ISSET(session->txn, WT_TXN_ERROR)); WT_RET(__wt_metadata_cursor(session, &meta_cursor)); meta_cursor->set_key(meta_cursor, session->dhandle->name); ret = __wt_curfile_insert_check(meta_cursor); @@ -465,7 +463,7 @@ __checkpoint_stats(WT_SESSION_IMPL *session) if (msec > conn->ckpt_time_max) conn->ckpt_time_max = msec; - if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min) + if (msec < conn->ckpt_time_min) conn->ckpt_time_min = msec; conn->ckpt_time_recent = msec; conn->ckpt_time_total += msec; @@ -475,7 +473,7 @@ __checkpoint_stats(WT_SESSION_IMPL *session) if (msec > conn->ckpt_prep_max) conn->ckpt_prep_max = msec; - if (conn->ckpt_prep_min == 0 || msec < conn->ckpt_prep_min) + if (msec < conn->ckpt_prep_min) conn->ckpt_prep_min = msec; conn->ckpt_prep_recent = msec; conn->ckpt_prep_total += msec; @@ -531,15 +529,15 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN_SHARED *txn_shared; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL}; bool use_timestamp; conn = S2C(session); - txn = &session->txn; + txn = session->txn; txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn_shared = WT_SESSION_TXN_SHARED(session); WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); use_timestamp = (cval.val != 0); @@ -585,21 +583,21 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ * time and only write to the metadata. 
*/ __wt_writelock(session, &txn_global->rwlock); - txn_global->checkpoint_state = *txn_state; - txn_global->checkpoint_state.pinned_id = txn->snap_min; + txn_global->checkpoint_txn_shared = *txn_shared; + txn_global->checkpoint_txn_shared.pinned_id = txn->snap_min; /* * Sanity check that the oldest ID hasn't moved on before we have cleared our entry. */ - WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); + WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_shared->id) && + WT_TXNID_LE(txn_global->oldest_id, txn_shared->pinned_id)); /* * Clear our entry from the global transaction session table. Any operation that needs to know * about the ID for this checkpoint will consider the checkpoint ID in the global structure. * Most operations can safely ignore the checkpoint ID (see the visible all check for details). */ - txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; + txn_shared->id = txn_shared->pinned_id = txn_shared->metadata_pinned = WT_TXN_NONE; /* * Set the checkpoint transaction's timestamp, if requested. @@ -608,7 +606,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ * the stable timestamp. 
*/ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | - WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ)); + WT_TXN_SHARED_TS_DURABLE | WT_TXN_SHARED_TS_READ)); if (use_timestamp) { /* @@ -625,7 +623,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ } else { if (!F_ISSET(conn, WT_CONN_RECOVERING)) txn_global->meta_ckpt_timestamp = WT_TS_NONE; - txn->read_timestamp = WT_TS_NONE; + txn->read_timestamp = txn_shared->pinned_read_timestamp = WT_TS_NONE; } __wt_writeunlock(session, &txn_global->rwlock); @@ -756,7 +754,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); cache = conn->cache; hs_dhandle = NULL; - txn = &session->txn; + txn = session->txn; txn_global = &conn->txn_global; saved_isolation = session->isolation; full = idle = logging = tracking = use_timestamp = false; @@ -961,7 +959,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Now that the metadata is stable, re-open the metadata file for regular eviction by clearing * the checkpoint_pinned flag. */ - txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + txn_global->checkpoint_txn_shared.pinned_id = WT_TXN_NONE; if (full) { __checkpoint_stats(session); @@ -1632,7 +1630,7 @@ fake: * that case, we need to sync the file here or we could roll forward the metadata in recovery * and open a checkpoint that isn't yet durable. */ - if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING)) + if (WT_IS_METADATA(dhandle) || !F_ISSET(session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); WT_ERR(__wt_meta_ckptlist_set(session, dhandle->name, btree->ckpt, &ckptlsn)); @@ -1704,7 +1702,7 @@ __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) bool with_timestamp; btree = S2BT(session); - txn = &session->txn; + txn = session->txn; /* Are we using a read timestamp for this checkpoint transaction? 
*/ with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ); diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c index b0ad3acfd9a..645d410efad 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ext.c +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -21,7 +21,7 @@ __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) session = (WT_SESSION_IMPL *)wt_session; /* Ignore failures: the only case is running out of transaction IDs. */ WT_IGNORE_RET(__wt_txn_id_check(session)); - return (session->txn.id); + return (session->txn->id); } /* @@ -37,7 +37,7 @@ __wt_ext_transaction_isolation_level(WT_EXTENSION_API *wt_api, WT_SESSION *wt_se (void)wt_api; /* Unused parameters */ session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; + txn = session->txn; if (txn->isolation == WT_ISO_READ_COMMITTED) return (WT_TXN_ISO_READ_COMMITTED); @@ -59,7 +59,7 @@ __wt_ext_transaction_notify(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT (void)wt_api; /* Unused parameters */ session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; + txn = session->txn; /* * XXX For now, a single slot for notifications: I'm not bothering with more than one because diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 195c4e336e5..b5f5dab0077 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -206,7 +206,7 @@ __txn_logrec_init(WT_SESSION_IMPL *session) uint32_t rectype; const char *fmt; - txn = &session->txn; + txn = session->txn; rectype = WT_LOGREC_COMMIT; fmt = WT_UNCHECKED_STRING(Iq); @@ -255,7 +255,7 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) uint32_t fileid; conn = S2C(session); - txn = &session->txn; + txn = session->txn; if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || F_ISSET(session, WT_SESSION_NO_LOGGING) || @@ -314,7 +314,7 @@ 
__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_UNUSED(cfg); - txn = &session->txn; + txn = session->txn; /* * If there are no log records there is nothing to do. */ @@ -394,10 +394,12 @@ __wt_txn_ts_log(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_ITEM *logrec; WT_TXN *txn; - wt_timestamp_t commit, durable, first, prepare, read; + WT_TXN_SHARED *txn_shared; + wt_timestamp_t commit, durable, first_commit, pinned_read, prepare, read; conn = S2C(session); - txn = &session->txn; + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || F_ISSET(session, WT_SESSION_NO_LOGGING) || @@ -417,21 +419,23 @@ __wt_txn_ts_log(WT_SESSION_IMPL *session) WT_RET(__txn_logrec_init(session)); logrec = txn->logrec; - commit = durable = first = prepare = read = WT_TS_NONE; + commit = durable = first_commit = pinned_read = prepare = read = WT_TS_NONE; if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { commit = txn->commit_timestamp; - first = txn->first_commit_timestamp; + first_commit = txn->first_commit_timestamp; } if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) durable = txn->durable_timestamp; if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) prepare = txn->prepare_timestamp; - if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) { read = txn->read_timestamp; + pinned_read = txn_shared->pinned_read_timestamp; + } __wt_epoch(session, &t); return (__wt_logop_txn_timestamp_pack(session, logrec, (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec, - commit, durable, first, prepare, read)); + commit, durable, first_commit, prepare, read, pinned_read)); } /* @@ -455,7 +459,7 @@ __wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_ conn = S2C(session); txn_global = &conn->txn_global; - txn = &session->txn; + txn = session->txn; ckpt_lsn = &txn->ckpt_lsn; /* diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c 
b/src/third_party/wiredtiger/src/txn/txn_recover.c index 920c9d67371..ed493f7765f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -112,7 +112,7 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * WT_DECL_RET; WT_ITEM key, start_key, stop_key, value; WT_SESSION_IMPL *session; - wt_timestamp_t commit, durable, first, prepare, read; + wt_timestamp_t commit, durable, first_commit, pinned_read, prepare, read; uint64_t recno, start_recno, stop_recno, t_nsec, t_sec; uint32_t fileid, mode, optype, opsize; @@ -141,9 +141,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * WT_ERR_NOTFOUND_OK(ret, false); else { /* - * Build/insert a complete value during recovery rather - * than using cursor modify to create a partial update - * (for no particular reason than simplicity). + * Build/insert a complete value during recovery rather than using cursor modify to + * create a partial update (for no particular reason than simplicity). */ WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); @@ -203,9 +202,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * WT_ERR_NOTFOUND_OK(ret, false); else { /* - * Build/insert a complete value during recovery rather - * than using cursor modify to create a partial update - * (for no particular reason than simplicity). + * Build/insert a complete value during recovery rather than using cursor modify to + * create a partial update (for no particular reason than simplicity). */ WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); @@ -268,8 +266,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t * * Timestamp records are informational only. We have to unpack it to properly move forward * in the log record to the next operation, but otherwise ignore. 
*/ - WT_ERR(__wt_logop_txn_timestamp_unpack( - session, pp, end, &t_sec, &t_nsec, &commit, &durable, &first, &prepare, &read)); + WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec, &t_nsec, &commit, &durable, + &first_commit, &prepare, &read, &pinned_read)); break; default: WT_ERR(__wt_illegal_value(session, optype)); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index e7ea00e8e69..b238d2dfd3a 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -1131,9 +1131,14 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, F_MASK(session, WT_SESSION_NO_LOGGING), &session)); - F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS); + /* + * Rollback to stable should ignore tombstones in the history store since it needs to scan the + * entire table sequentially. + */ + F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE); ret = __rollback_to_stable(session, cfg); - F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS); + F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE); + WT_RET(ret); /* * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index a10745aa411..a000e86cc87 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -141,13 +141,13 @@ __wt_txn_parse_timestamp( /* * __txn_get_read_timestamp -- * Get the read timestamp from the transaction. Additionally return bool to specify whether the - * transaction has set clear read queue flag. + * transaction has set the clear read queue flag. 
*/ static bool -__txn_get_read_timestamp(WT_TXN *txn, wt_timestamp_t *read_timestampp) +__txn_get_read_timestamp(WT_TXN_SHARED *txn_shared, wt_timestamp_t *read_timestampp) { - WT_ORDERED_READ(*read_timestampp, txn->read_timestamp); - return (!txn->clear_read_q); + WT_ORDERED_READ(*read_timestampp, txn_shared->pinned_read_timestamp); + return (!txn_shared->clear_read_q); } /* @@ -158,8 +158,8 @@ int __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags) { WT_CONNECTION_IMPL *conn; - WT_TXN *txn; WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *txn_shared; wt_timestamp_t tmp_read_ts, tmp_ts; bool include_oldest, txn_has_write_lock; @@ -185,13 +185,13 @@ __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uin /* Look for the oldest ordinary reader. */ __wt_readlock(session, &txn_global->read_timestamp_rwlock); - TAILQ_FOREACH (txn, &txn_global->read_timestamph, read_timestampq) { + TAILQ_FOREACH (txn_shared, &txn_global->read_timestamph, read_timestampq) { /* * Skip any transactions on the queue that are not active. Copy out value of read timestamp * to prevent possible race where a transaction resets its read timestamp while we traverse * the queue. */ - if (!__txn_get_read_timestamp(txn, &tmp_read_ts)) + if (!__txn_get_read_timestamp(txn_shared, &tmp_read_ts)) continue; /* * A zero timestamp is possible here only when the oldest timestamp is not accounted for. @@ -213,37 +213,15 @@ __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uin } /* - * __txn_get_published_timestamp -- - * Get the current durable timestamp for a given transaction. If there is an explicit durable - * timestamp, this function will return the commit timestamp since this is implied. If there is - * neither a commit nor a durable timestamp, this function will return 0. + * __txn_get_durable_timestamp -- + * Get the durable timestamp from the transaction. 
Additionally return bool to specify whether + * the transaction has set the clear durable queue flag. */ -static inline wt_timestamp_t -__txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn) +static bool +__txn_get_durable_timestamp(WT_TXN_SHARED *txn_shared, wt_timestamp_t *durable_timestampp) { - wt_timestamp_t ts; - - /* - * Any checking of bit flags in this logic is invalid. __wt_txn_release may have already been - * called on this transaction which will set the flags member to 0. So we need to deduce which - * timestamp to use purely by inspecting the timestamp members which we deliberately preserve - * for reader threads such as ourselves. - * - * In the non-prepared case, the first commit will either be less than the commit (in the case - * of multiple commits) in which case we should return the first commit. Or it will be equal to - * the commit (in the case of a single commit) and we can return durable (which is mirrored from - * the commit timestamp). - * - * In the prepared case, the first commit will always be equal to the commit so we'll return - * durable. - */ - if (txn->commit_timestamp != txn->first_commit_timestamp) - ts = txn->first_commit_timestamp; - else - ts = txn->durable_timestamp; - - WT_ASSERT(session, ts != WT_TS_NONE); - return (ts); + WT_ORDERED_READ(*durable_timestampp, txn_shared->pinned_durable_timestamp); + return (!txn_shared->clear_durable_q); } /* @@ -255,8 +233,8 @@ __txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, cons { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; - WT_TXN *txn; WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *txn_shared; wt_timestamp_t ts, tmpts; conn = S2C(session); @@ -280,14 +258,13 @@ __txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, cons * Compare with the least recently durable transaction. 
*/ __wt_readlock(session, &txn_global->durable_timestamp_rwlock); - TAILQ_FOREACH (txn, &txn_global->durable_timestamph, durable_timestampq) { - if (txn->clear_durable_q) - continue; - - tmpts = __txn_get_published_timestamp(session, txn) - 1; - if (tmpts < ts) - ts = tmpts; - break; + TAILQ_FOREACH (txn_shared, &txn_global->durable_timestamph, durable_timestampq) { + if (__txn_get_durable_timestamp(txn_shared, &tmpts)) { + --tmpts; + if (tmpts < ts) + ts = tmpts; + break; + } } __wt_readunlock(session, &txn_global->durable_timestamp_rwlock); @@ -333,8 +310,10 @@ __txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char { WT_CONFIG_ITEM cval; WT_TXN *txn; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); WT_STAT_CONN_INCR(session, session_query_ts); if (!F_ISSET(txn, WT_TXN_RUNNING)) @@ -348,7 +327,7 @@ __txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char else if (WT_STRING_MATCH("prepare", cval.str, cval.len)) *tsp = txn->prepare_timestamp; else if (WT_STRING_MATCH("read", cval.str, cval.len)) - *tsp = txn->read_timestamp; + *tsp = txn_shared->pinned_read_timestamp; else WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); @@ -603,23 +582,26 @@ set: */ static int __txn_assert_after_reads( - WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp) + WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN_SHARED **prev_sharedp) { #ifdef HAVE_DIAGNOSTIC - WT_TXN *prev, *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *prev_shared, *txn_shared; wt_timestamp_t tmp_timestamp; char ts_string[2][WT_TS_INT_STRING_SIZE]; + txn_global = &S2C(session)->txn_global; + txn_shared = WT_SESSION_TXN_SHARED(session); + __wt_readlock(session, &txn_global->read_timestamp_rwlock); - prev = TAILQ_LAST(&txn_global->read_timestamph, 
__wt_txn_rts_qh); - while (prev != NULL) { + prev_shared = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh); + while (prev_shared != NULL) { /* * Skip self and non-active transactions. Copy out value of read timestamp to prevent * possible race where a transaction resets its read timestamp while we traverse the queue. */ - if (!__txn_get_read_timestamp(prev, &tmp_timestamp) || prev == txn) { - prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq); + if (!__txn_get_read_timestamp(prev_shared, &tmp_timestamp) || prev_shared == txn_shared) { + prev_shared = TAILQ_PREV(prev_shared, __wt_txn_rts_qh, read_timestampq); continue; } @@ -636,13 +618,13 @@ __txn_assert_after_reads( __wt_readunlock(session, &txn_global->read_timestamp_rwlock); - if (prevp != NULL) - *prevp = prev; + if (prev_sharedp != NULL) + *prev_sharedp = prev_shared; #else WT_UNUSED(session); WT_UNUSED(op); WT_UNUSED(ts); - WT_UNUSED(prevp); + WT_UNUSED(prev_sharedp); #endif return (0); @@ -658,12 +640,15 @@ __txn_assert_after_reads( int __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; wt_timestamp_t oldest_ts, stable_ts; char ts_string[2][WT_TS_INT_STRING_SIZE]; bool has_oldest_ts, has_stable_ts; + txn = session->txn; + txn_global = &S2C(session)->txn_global; + /* Added this redundant initialization to circumvent build failure. 
*/ oldest_ts = stable_ts = WT_TS_NONE; @@ -764,12 +749,15 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts int __wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; wt_timestamp_t oldest_ts, stable_ts; char ts_string[2][WT_TS_INT_STRING_SIZE]; bool has_oldest_ts, has_stable_ts; + txn = session->txn; + txn_global = &S2C(session)->txn_global; + /* Added this redundant initialization to circumvent build failure. */ oldest_ts = stable_ts = 0; @@ -827,11 +815,16 @@ __wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_ int __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts) { - WT_TXN *prev, *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *prev_shared, *txn_shared; wt_timestamp_t oldest_ts; char ts_string[2][WT_TS_INT_STRING_SIZE]; + txn = session->txn; + txn_global = &S2C(session)->txn_global; + prev_shared = txn_shared = WT_SESSION_TXN_SHARED(session); + WT_RET(__wt_txn_context_prepare_check(session)); if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) @@ -842,7 +835,7 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ "commit timestamp " "should not have been set before the prepare timestamp"); - WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev)); + WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev_shared)); /* * Check whether the prepare timestamp is less than the oldest timestamp. @@ -857,7 +850,7 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ * Check that there are no active readers. That would be a violation of preconditions * for rounding timestamps of prepared transactions. 
*/ - WT_ASSERT(session, prev == NULL); + WT_ASSERT(session, prev_shared == NULL); __wt_verbose(session, WT_VERB_TIMESTAMP, "prepare timestamp %s rounded to oldest " @@ -886,12 +879,17 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ int __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *txn_shared; wt_timestamp_t ts_oldest; char ts_string[2][WT_TS_INT_STRING_SIZE]; bool did_roundup_to_oldest; + txn = session->txn; + txn_global = &S2C(session)->txn_global; + txn_shared = WT_SESSION_TXN_SHARED(session); + WT_RET(__wt_txn_context_prepare_check(session)); /* Read timestamps imply / require snapshot isolation. */ @@ -922,7 +920,7 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts) * oldest timestamp. */ if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) { - txn->read_timestamp = ts_oldest; + txn->read_timestamp = txn_shared->pinned_read_timestamp = ts_oldest; did_roundup_to_oldest = true; } else { __wt_readunlock(session, &txn_global->rwlock); @@ -942,7 +940,7 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts) return (EINVAL); } } else - txn->read_timestamp = read_ts; + txn->read_timestamp = txn_shared->pinned_read_timestamp = read_ts; __wt_txn_publish_read_timestamp(session); __wt_readunlock(session, &txn_global->rwlock); @@ -1002,7 +1000,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_txn_set_durable_timestamp(session, ts)); } - __wt_txn_publish_timestamp(session); + __wt_txn_publish_durable_timestamp(session); /* Look for a read timestamp. 
*/ WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); @@ -1025,21 +1023,23 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) } /* - * __wt_txn_publish_timestamp -- - * Publish a transaction's timestamp to the durable queue. + * __wt_txn_publish_durable_timestamp -- + * Publish a transaction's durable timestamp. */ void -__wt_txn_publish_timestamp(WT_SESSION_IMPL *session) +__wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *qtxn, *txn, *txn_tmp; + WT_TXN *txn; WT_TXN_GLOBAL *txn_global; - wt_timestamp_t ts; + WT_TXN_SHARED *qtxn_shared, *txn_shared, *txn_shared_tmp; + wt_timestamp_t tmpts, ts; uint64_t walked; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; + txn_shared = WT_SESSION_TXN_SHARED(session); - if (F_ISSET(txn, WT_TXN_TS_PUBLISHED)) + if (F_ISSET(txn, WT_TXN_SHARED_TS_DURABLE)) return; if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) @@ -1053,7 +1053,7 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session) */ if (F_ISSET(txn, WT_TXN_PREPARE)) return; - ts = txn->commit_timestamp; + ts = txn->first_commit_timestamp; } else return; @@ -1063,9 +1063,9 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session) * otherwise might not remove ourselves before finding where to insert ourselves (which would * result in a list loop) and we don't want to walk more of the list than needed. */ - if (txn->clear_durable_q) { - TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq); - WT_PUBLISH(txn->clear_durable_q, false); + if (txn_shared->clear_durable_q) { + TAILQ_REMOVE(&txn_global->durable_timestamph, txn_shared, durable_timestampq); + txn_shared->clear_durable_q = false; --txn_global->durable_timestampq_len; } /* @@ -1073,45 +1073,49 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session) * that are not active. We stop when we get to the location where we want to insert. 
*/ if (TAILQ_EMPTY(&txn_global->durable_timestamph)) { - TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq); + TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn_shared, durable_timestampq); WT_STAT_CONN_INCR(session, txn_durable_queue_empty); } else { /* Walk from the start, removing cleared entries. */ walked = 0; - TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph, durable_timestampq, txn_tmp) + TAILQ_FOREACH_SAFE( + qtxn_shared, &txn_global->durable_timestamph, durable_timestampq, txn_shared_tmp) { ++walked; /* * Stop on the first entry that we cannot clear. */ - if (!qtxn->clear_durable_q) + if (!qtxn_shared->clear_durable_q) break; - TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn, durable_timestampq); - WT_PUBLISH(qtxn->clear_durable_q, false); + TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn_shared, durable_timestampq); + qtxn_shared->clear_durable_q = false; --txn_global->durable_timestampq_len; } /* * Now walk backwards from the end to find the correct position for the insert. 
*/ - qtxn = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh); - while (qtxn != NULL && __txn_get_published_timestamp(session, qtxn) > ts) { + qtxn_shared = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh); + while (qtxn_shared != NULL && + (!__txn_get_durable_timestamp(qtxn_shared, &tmpts) || tmpts > ts)) { ++walked; - qtxn = TAILQ_PREV(qtxn, __wt_txn_dts_qh, durable_timestampq); + qtxn_shared = TAILQ_PREV(qtxn_shared, __wt_txn_dts_qh, durable_timestampq); } - if (qtxn == NULL) { - TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq); + if (qtxn_shared == NULL) { + TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn_shared, durable_timestampq); WT_STAT_CONN_INCR(session, txn_durable_queue_head); } else - TAILQ_INSERT_AFTER(&txn_global->durable_timestamph, qtxn, txn, durable_timestampq); + TAILQ_INSERT_AFTER( + &txn_global->durable_timestamph, qtxn_shared, txn_shared, durable_timestampq); WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked); } ++txn_global->durable_timestampq_len; WT_STAT_CONN_INCR(session, txn_durable_queue_inserts); - txn->clear_durable_q = false; - F_SET(txn, WT_TXN_TS_PUBLISHED); + txn_shared->pinned_durable_timestamp = ts; + txn_shared->clear_durable_q = false; __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock); + F_SET(txn, WT_TXN_SHARED_TS_DURABLE); } /* @@ -1122,11 +1126,12 @@ void __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session) { WT_TXN *txn; - uint32_t flags; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); - if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED)) + if (!F_ISSET(txn, WT_TXN_SHARED_TS_DURABLE)) return; /* @@ -1134,15 +1139,9 @@ __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session) * durable timestamp queue whenever the next thread walks the queue. We do not need to remove it * now. 
*/ - txn->clear_durable_q = true; - - /* - * Serialize clearing the flag with setting the queue state. The serialization has been here for - * awhile, but nobody remembers if or why it's necessary. - */ - flags = txn->flags; - LF_CLR(WT_TXN_TS_PUBLISHED); - WT_PUBLISH(txn->flags, flags); + txn_shared->clear_durable_q = true; + WT_WRITE_BARRIER(); + F_CLR(txn, WT_TXN_SHARED_TS_DURABLE); } /* @@ -1152,15 +1151,17 @@ __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session) void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *qtxn, *txn, *txn_tmp; + WT_TXN *txn; WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *qtxn_shared, *txn_shared, *txn_shared_tmp; wt_timestamp_t tmp_timestamp; uint64_t walked; - txn = &session->txn; + txn = session->txn; txn_global = &S2C(session)->txn_global; + txn_shared = WT_SESSION_TXN_SHARED(session); - if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + if (F_ISSET(txn, WT_TXN_SHARED_TS_READ)) return; __wt_writelock(session, &txn_global->read_timestamp_rwlock); @@ -1169,9 +1170,9 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session) * otherwise might not remove ourselves before finding where to insert ourselves (which would * result in a list loop) and we don't want to walk more of the list than needed. */ - if (txn->clear_read_q) { - TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq); - WT_PUBLISH(txn->clear_read_q, false); + if (txn_shared->clear_read_q) { + TAILQ_REMOVE(&txn_global->read_timestamph, txn_shared, read_timestampq); + WT_PUBLISH(txn_shared->clear_read_q, false); --txn_global->read_timestampq_len; } /* @@ -1179,39 +1180,41 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session) * that are not active. We stop when we get to the location where we want to insert. 
*/ if (TAILQ_EMPTY(&txn_global->read_timestamph)) { - TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq); + TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn_shared, read_timestampq); WT_STAT_CONN_INCR(session, txn_read_queue_empty); } else { /* Walk from the start, removing cleared entries. */ walked = 0; - TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph, read_timestampq, txn_tmp) + TAILQ_FOREACH_SAFE( + qtxn_shared, &txn_global->read_timestamph, read_timestampq, txn_shared_tmp) { ++walked; - if (!qtxn->clear_read_q) + if (!qtxn_shared->clear_read_q) break; - TAILQ_REMOVE(&txn_global->read_timestamph, qtxn, read_timestampq); - WT_PUBLISH(qtxn->clear_read_q, false); + TAILQ_REMOVE(&txn_global->read_timestamph, qtxn_shared, read_timestampq); + WT_PUBLISH(qtxn_shared->clear_read_q, false); --txn_global->read_timestampq_len; } /* * Now walk backwards from the end to find the correct position for the insert. */ - qtxn = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh); - while (qtxn != NULL) { - if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) || - tmp_timestamp > txn->read_timestamp) { + qtxn_shared = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh); + while (qtxn_shared != NULL) { + if (!__txn_get_read_timestamp(qtxn_shared, &tmp_timestamp) || + tmp_timestamp > txn_shared->pinned_read_timestamp) { ++walked; - qtxn = TAILQ_PREV(qtxn, __wt_txn_rts_qh, read_timestampq); + qtxn_shared = TAILQ_PREV(qtxn_shared, __wt_txn_rts_qh, read_timestampq); } else break; } - if (qtxn == NULL) { - TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq); + if (qtxn_shared == NULL) { + TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn_shared, read_timestampq); WT_STAT_CONN_INCR(session, txn_read_queue_head); } else - TAILQ_INSERT_AFTER(&txn_global->read_timestamph, qtxn, txn, read_timestampq); + TAILQ_INSERT_AFTER( + &txn_global->read_timestamph, qtxn_shared, txn_shared, read_timestampq); WT_STAT_CONN_INCRV(session, 
txn_read_queue_walked, walked); } /* @@ -1220,8 +1223,8 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session) */ ++txn_global->read_timestampq_len; WT_STAT_CONN_INCR(session, txn_read_queue_inserts); - txn->clear_read_q = false; - F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ); + txn_shared->clear_read_q = false; + F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_SHARED_TS_READ); __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); } @@ -1233,34 +1236,27 @@ void __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session) { WT_TXN *txn; - uint32_t flags; - - txn = &session->txn; - - if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) { - txn->read_timestamp = WT_TS_NONE; - return; - } + WT_TXN_SHARED *txn_shared; - /* Assert the read timestamp is greater than or equal to the pinned timestamp. */ - WT_ASSERT(session, txn->read_timestamp >= S2C(session)->txn_global.pinned_timestamp); + txn = session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); - /* - * Notify other threads that our transaction is inactive and can be cleaned up safely from the - * read timestamp queue whenever the next thread walks the queue. We do not need to remove it - * now. - */ - txn->clear_read_q = true; + if (F_ISSET(txn, WT_TXN_SHARED_TS_READ)) { + /* Assert the read timestamp is greater than or equal to the pinned timestamp. */ + WT_ASSERT(session, txn->read_timestamp == txn_shared->pinned_read_timestamp && + txn->read_timestamp >= S2C(session)->txn_global.pinned_timestamp); - /* - * Serialize clearing the flag with setting the queue state. The serialization has been here for - * awhile, but nobody remembers if or why it's necessary. - */ - flags = txn->flags; - LF_CLR(WT_TXN_PUBLIC_TS_READ); - WT_PUBLISH(txn->flags, flags); + /* + * Notify other threads that our transaction is inactive and can be cleaned up safely from + * the read timestamp queue whenever the next thread walks the queue. We do not need to + * remove it now. 
+ */ + txn_shared->clear_read_q = true; + WT_WRITE_BARRIER(); - txn->read_timestamp = WT_TS_NONE; + F_CLR(txn, WT_TXN_SHARED_TS_READ); + } + txn->read_timestamp = txn_shared->pinned_read_timestamp = WT_TS_NONE; } /* @@ -1271,36 +1267,40 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session) void __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session) { - WT_TXN *txn; WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *txn_shared; - txn = &session->txn; + txn_shared = WT_SESSION_TXN_SHARED(session); txn_global = &S2C(session)->txn_global; - if (!txn->clear_durable_q && !txn->clear_read_q) + /* + * If we've closed the connection, our transaction shared states may already have been freed. In + * that case, there's nothing more to do here. + */ + if (txn_shared == NULL || (!txn_shared->clear_durable_q && !txn_shared->clear_read_q)) return; - if (txn->clear_durable_q) { + if (txn_shared->clear_durable_q) { __wt_writelock(session, &txn_global->durable_timestamp_rwlock); /* * Recheck after acquiring the lock. */ - if (txn->clear_durable_q) { - TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq); + if (txn_shared->clear_durable_q) { + TAILQ_REMOVE(&txn_global->durable_timestamph, txn_shared, durable_timestampq); --txn_global->durable_timestampq_len; - txn->clear_durable_q = false; + txn_shared->clear_durable_q = false; } __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock); } - if (txn->clear_read_q) { + if (txn_shared->clear_read_q) { __wt_writelock(session, &txn_global->read_timestamp_rwlock); /* * Recheck after acquiring the lock. 
*/ - if (txn->clear_read_q) { - TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq); + if (txn_shared->clear_read_q) { + TAILQ_REMOVE(&txn_global->read_timestamph, txn_shared, read_timestampq); --txn_global->read_timestampq_len; - txn->clear_read_q = false; + txn_shared->clear_read_q = false; } __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); } diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c index 52a79777ca4..5ad1cfe65dc 100644 --- a/src/third_party/wiredtiger/test/format/backup.c +++ b/src/third_party/wiredtiger/test/format/backup.c @@ -379,9 +379,9 @@ backup(void *arg) * with named checkpoints. Wait for the checkpoint to complete, otherwise backups might be * starved out. */ - testutil_check(pthread_rwlock_wrlock(&g.backup_lock)); + lock_writelock(session, &g.backup_lock); if (g.workers_finished) { - testutil_check(pthread_rwlock_unlock(&g.backup_lock)); + lock_writeunlock(session, &g.backup_lock); break; } @@ -471,7 +471,7 @@ backup(void *arg) testutil_check(session->truncate(session, "log:", backup_cursor, NULL, NULL)); testutil_check(backup_cursor->close(backup_cursor)); - testutil_check(pthread_rwlock_unlock(&g.backup_lock)); + lock_writeunlock(session, &g.backup_lock); active_files_sort(active_now); active_files_remove_missing(active_prev, active_now); active_prev = active_now; diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c index a1d9256e707..0f41a311e43 100644 --- a/src/third_party/wiredtiger/test/format/bulk.c +++ b/src/third_party/wiredtiger/test/format/bulk.c @@ -59,7 +59,7 @@ bulk_commit_transaction(WT_SESSION *session) testutil_check(session->commit_transaction(session, buf)); /* Update the oldest timestamp, otherwise updates are pinned in memory. 
*/ - timestamp_once(); + timestamp_once(session); } /* diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c index 0542bb8e54b..36e70ae3125 100644 --- a/src/third_party/wiredtiger/test/format/checkpoint.c +++ b/src/third_party/wiredtiger/test/format/checkpoint.c @@ -85,7 +85,7 @@ checkpoint(void *arg) * few names to test multiple named snapshots in * the system. */ - ret = pthread_rwlock_trywrlock(&g.backup_lock); + ret = lock_try_writelock(session, &g.backup_lock); if (ret == 0) { backup_locked = true; testutil_check(__wt_snprintf( @@ -98,7 +98,7 @@ checkpoint(void *arg) /* * 5% drop all named snapshots. */ - ret = pthread_rwlock_trywrlock(&g.backup_lock); + ret = lock_try_writelock(session, &g.backup_lock); if (ret == 0) { backup_locked = true; ckpt_config = "drop=(all)"; @@ -110,7 +110,7 @@ checkpoint(void *arg) testutil_check(session->checkpoint(session, ckpt_config)); if (backup_locked) - testutil_check(pthread_rwlock_unlock(&g.backup_lock)); + lock_writeunlock(session, &g.backup_lock); secs = mmrand(NULL, 5, 40); } diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index ae104bae989..54a09229ce4 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -30,7 +30,7 @@ #include "config.h" static void config(void); -static void config_backup(void); +static void config_backup_incr(void); static void config_backward_compatible(void); static void config_cache(void); static void config_checkpoint(void); @@ -197,7 +197,7 @@ config(void) config_transaction(); /* Simple selection. */ - config_backup(); + config_backup_incr(); config_checkpoint(); config_checksum(); config_compression("btree.compression"); @@ -257,39 +257,55 @@ config(void) } /* - * config_backup -- - * Backup configuration. + * config_backup_incr -- + * Incremental backup configuration. 
*/ static void -config_backup(void) +config_backup_incr(void) { - const char *cstr; + /* Incremental backup requires backup. */ + if (g.c_backups == 0) + return; /* - * Choose a type of incremental backup. + * Incremental backup using log files is incompatible with logging archival. Testing log file + * archival doesn't seem as useful as testing backup, let the backup configuration override. */ - if (!config_is_perm("backup.incremental")) { - cstr = "backup.incremental=off"; - switch (mmrand(NULL, 1, 10)) { - case 1: /* 30% full backup only */ - case 2: - case 3: - break; - case 4: /* 40% block based incremental */ - case 5: - case 6: - case 7: - cstr = "backup.incremental=block"; - break; - case 8: - case 9: - case 10: /* 30% log based incremental */ - if (!g.c_logging_archive) - cstr = "backup.incremental=log"; - break; + if (config_is_perm("backup.incremental")) { + if (g.c_backup_incr_flag == INCREMENTAL_LOG) { + if (g.c_logging_archive && config_is_perm("logging.archive")) + testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive"); + if (g.c_logging_archive) + config_single("logging.archive=0", false); } + return; + } - config_single(cstr, false); + /* + * Choose a type of incremental backup, where the log archival setting can eliminate incremental + * backup based on log files. 
+ */ + switch (mmrand(NULL, 1, 10)) { + case 1: /* 30% full backup only */ + case 2: + case 3: + config_single("backup.incremental=off", false); + break; + case 4: /* 30% log based incremental */ + case 5: + case 6: + if (!g.c_logging_archive || !config_is_perm("logging.archive")) { + if (g.c_logging_archive) + config_single("logging.archive=0", false); + config_single("backup.incremental=log", false); + } + /* FALLTHROUGH */ + case 7: /* 40% block based incremental */ + case 8: + case 9: + case 10: + config_single("backup.incremental=block", false); + break; } } diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 59dc7cfd86c..04da641386b 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -336,6 +336,10 @@ static CONFIG c[] = { {"wiredtiger.config", "configuration string used to wiredtiger_open", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_config_open}, + /* 80% */ + {"wiredtiger.rwlock", "if wiredtiger read/write mutexes should be used", C_BOOL, 80, 0, 0, + &g.c_wt_mutex, NULL}, + {"wiredtiger.leak_memory", "if memory should be leaked on close", C_BOOL, 0, 0, 0, &g.c_leak_memory, NULL}, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index b96b858f0f8..6bc213a65ef 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -60,6 +60,20 @@ #define MAX_MODIFY_ENTRIES 5 /* maximum change vectors */ +/* + * Abstract lock that lets us use either pthread reader-writer locks or WiredTiger's own (likely + * faster) implementation. 
+ */ +typedef struct { + union { + WT_RWLOCK wt; + pthread_rwlock_t pthread; + } l; + enum { LOCK_NONE = 0, LOCK_WT, LOCK_PTHREAD } lock_type; +} RWLOCK; + +#define LOCK_INITIALIZED(lock) ((lock)->lock_type != LOCK_NONE) + typedef struct { WT_CONNECTION *wts_conn; WT_EXTENSION_API *wt_api; @@ -92,8 +106,8 @@ typedef struct { bool logging; /* log operations */ FILE *logfp; /* log file */ - pthread_rwlock_t backup_lock; /* Backup running */ - uint64_t backup_id; /* Block incremental id */ + RWLOCK backup_lock; /* Backup running */ + uint64_t backup_id; /* Block incremental id */ WT_RAND_STATE rnd; /* Global RNG state */ @@ -104,13 +118,17 @@ typedef struct { * We get the last committed timestamp periodically in order to update the oldest timestamp, * that requires locking out transactional ops that set a timestamp. */ - pthread_rwlock_t ts_lock; + RWLOCK ts_lock; uint64_t timestamp; /* Counter for timestamps */ uint64_t truncate_cnt; /* Counter for truncation */ - pthread_rwlock_t death_lock; /* Single-thread failure */ + /* + * Single-thread failure. Always use pthread lock rather than WT lock in case WT library is + * misbehaving. 
+ */ + pthread_rwlock_t death_lock; uint32_t c_abort; /* Config values */ uint32_t c_alter; @@ -204,6 +222,7 @@ typedef struct { uint32_t c_value_min; uint32_t c_verify; uint32_t c_write_pct; + uint32_t c_wt_mutex; #define FIX 1 #define ROW 2 @@ -351,6 +370,8 @@ void key_gen_common(WT_ITEM *, uint64_t, const char *); void key_gen_init(WT_ITEM *); void key_gen_teardown(WT_ITEM *); void key_init(void); +void lock_destroy(WT_SESSION *, RWLOCK *); +void lock_init(WT_SESSION *, RWLOCK *); void operations(u_int, bool); WT_THREAD_RET random_kv(void *); void path_setup(const char *); @@ -364,7 +385,7 @@ int snap_repeat_txn(WT_CURSOR *, TINFO *); void snap_repeat_update(TINFO *, bool); void snap_track(TINFO *, thread_op); WT_THREAD_RET timestamp(void *); -void timestamp_once(void); +void timestamp_once(WT_SESSION *); void track(const char *, uint64_t, TINFO *); void val_gen(WT_RAND_STATE *, WT_ITEM *, uint64_t); void val_gen_init(WT_ITEM *); diff --git a/src/third_party/wiredtiger/test/format/format.i b/src/third_party/wiredtiger/test/format/format.i index 00099e1c4cf..661dd096ae9 100644 --- a/src/third_party/wiredtiger/test/format/format.i +++ b/src/third_party/wiredtiger/test/format/format.i @@ -162,3 +162,51 @@ key_gen_insert(WT_RAND_STATE *rnd, WT_ITEM *key, uint64_t keyno) key_gen_common(key, keyno, suffix[mmrand(rnd, 0, 14)]); } + +/* + * lock_try_writelock + * Try to get exclusive lock. Fail immediately if not available. + */ +static inline int +lock_try_writelock(WT_SESSION *session, RWLOCK *lock) +{ + testutil_assert(LOCK_INITIALIZED(lock)); + + if (lock->lock_type == LOCK_WT) { + return (__wt_try_writelock((WT_SESSION_IMPL *)session, &lock->l.wt)); + } else { + return (pthread_rwlock_trywrlock(&lock->l.pthread)); + } +} + +/* + * lock_writelock -- + * Wait to get exclusive lock. 
+ */ +static inline void +lock_writelock(WT_SESSION *session, RWLOCK *lock) +{ + testutil_assert(LOCK_INITIALIZED(lock)); + + if (lock->lock_type == LOCK_WT) { + __wt_writelock((WT_SESSION_IMPL *)session, &lock->l.wt); + } else { + testutil_check(pthread_rwlock_wrlock(&lock->l.pthread)); + } +} + +/* + * lock_writeunlock -- + * Release an exclusive lock. + */ +static inline void +lock_writeunlock(WT_SESSION *session, RWLOCK *lock) +{ + testutil_assert(LOCK_INITIALIZED(lock)); + + if (lock->lock_type == LOCK_WT) { + __wt_writeunlock((WT_SESSION_IMPL *)session, &lock->l.wt); + } else { + testutil_check(pthread_rwlock_unlock(&lock->l.pthread)); + } +} diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 0d679eb4cc3..6a668fa4f45 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -158,11 +158,13 @@ operations(u_int ops_seconds, bool lastrun) if (!SINGLETHREADED) g.rand_log_stop = true; - /* Logging requires a session. */ - if (g.logging) - testutil_check(conn->open_session(conn, NULL, NULL, &session)); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); logop(session, "%s", "=============== thread ops start"); + /* Initialize locks to single-thread backups, failures, and timestamp updates. */ + lock_init(session, &g.backup_lock); + lock_init(session, &g.ts_lock); + /* * Create the per-thread structures and start the worker threads. Allocate the thread structures * separately to minimize false sharing. 
@@ -295,9 +297,11 @@ operations(u_int ops_seconds, bool lastrun) testutil_check(__wt_thread_join(NULL, &timestamp_tid)); g.workers_finished = false; + lock_destroy(session, &g.backup_lock); + lock_destroy(session, &g.ts_lock); + logop(session, "%s", "=============== thread ops stop"); - if (g.logging) - testutil_check(session->close(session, NULL)); + testutil_check(session->close(session, NULL)); for (i = 0; i < g.c_threads; ++i) { tinfo = tinfo_list[i]; @@ -372,13 +376,13 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) * * Lock out the oldest timestamp update. */ - testutil_check(pthread_rwlock_wrlock(&g.ts_lock)); + lock_writelock(session, &g.ts_lock); ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts)); testutil_check(session->timestamp_transaction(session, buf)); - testutil_check(pthread_rwlock_unlock(&g.ts_lock)); + lock_writeunlock(session, &g.ts_lock); snap_init(tinfo, ts, false); logop(session, "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts); @@ -443,7 +447,7 @@ commit_transaction(TINFO *tinfo, bool prepared) ts = 0; /* -Wconditional-uninitialized */ if (g.c_txn_timestamps) { /* Lock out the oldest timestamp update. */ - testutil_check(pthread_rwlock_wrlock(&g.ts_lock)); + lock_writelock(session, &g.ts_lock); ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(__wt_snprintf(buf, sizeof(buf), "commit_timestamp=%" PRIx64, ts)); @@ -454,7 +458,7 @@ commit_transaction(TINFO *tinfo, bool prepared) testutil_check(session->timestamp_transaction(session, buf)); } - testutil_check(pthread_rwlock_unlock(&g.ts_lock)); + lock_writeunlock(session, &g.ts_lock); } testutil_check(session->commit_transaction(session, NULL)); @@ -509,7 +513,7 @@ prepare_transaction(TINFO *tinfo) * * Lock out the oldest timestamp update.
*/ - testutil_check(pthread_rwlock_wrlock(&g.ts_lock)); + lock_writelock(session, &g.ts_lock); ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(__wt_snprintf(buf, sizeof(buf), "prepare_timestamp=%" PRIx64, ts)); @@ -517,7 +521,7 @@ prepare_transaction(TINFO *tinfo) logop(session, "prepare ts=%" PRIu64, ts); - testutil_check(pthread_rwlock_unlock(&g.ts_lock)); + lock_writeunlock(session, &g.ts_lock); return (ret); } diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 6d897131f28..a2fcf405cf9 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -118,10 +118,8 @@ format_process_env(void) (void)signal(SIGTERM, signal_handler); #endif - /* Initialize locks to single-thread backups, failures, and timestamp updates. */ - testutil_check(pthread_rwlock_init(&g.backup_lock, NULL)); + /* Initialize lock to ensure single threading during failure handling */ testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); - testutil_check(pthread_rwlock_init(&g.ts_lock, NULL)); #if 0 /* Configure the GNU malloc for debugging. */ @@ -328,10 +326,6 @@ main(int argc, char *argv[]) config_print(false); - testutil_check(pthread_rwlock_destroy(&g.backup_lock)); - testutil_check(pthread_rwlock_destroy(&g.death_lock)); - testutil_check(pthread_rwlock_destroy(&g.ts_lock)); - config_clear(); printf("%s: successful run completed\n", progname); diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index e88a9a0aa7e..7aba99c20de 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -233,7 +233,7 @@ fclose_and_clear(FILE **fpp) * Update the timestamp once. 
*/ void -timestamp_once(void) +timestamp_once(WT_SESSION *session) { static const char *oldest_timestamp_str = "oldest_timestamp="; WT_CONNECTION *conn; @@ -246,16 +246,20 @@ timestamp_once(void) /* * Lock out transaction timestamp operations. The lock acts as a barrier ensuring we've checked - * if the workers have finished, we don't want that line reordered. + * if the workers have finished, we don't want that line reordered. We can also be called from + * places, such as bulk load, where we are single-threaded and the locks haven't been + * initialized. */ - testutil_check(pthread_rwlock_wrlock(&g.ts_lock)); + if (LOCK_INITIALIZED(&g.ts_lock)) + lock_writelock(session, &g.ts_lock); ret = conn->query_timestamp(conn, buf + strlen(oldest_timestamp_str), "get=all_durable"); testutil_assert(ret == 0 || ret == WT_NOTFOUND); if (ret == 0) testutil_check(conn->set_timestamp(conn, buf)); - testutil_check(pthread_rwlock_unlock(&g.ts_lock)); + if (LOCK_INITIALIZED(&g.ts_lock)) + lock_writeunlock(session, &g.ts_lock); } /* @@ -265,9 +269,15 @@ timestamp_once(void) WT_THREAD_RET timestamp(void *arg) { + WT_CONNECTION *conn; + WT_SESSION *session; bool done; (void)(arg); + conn = g.wts_conn; + + /* Locks need session */ + testutil_check(conn->open_session(conn, NULL, NULL, &session)); /* Update the oldest timestamp at least once every 15 seconds. */ done = false; @@ -281,10 +291,11 @@ timestamp(void *arg) else random_sleep(&g.rnd, 15); - timestamp_once(); + timestamp_once(session); } while (!done); + testutil_check(session->close(session, NULL)); return (WT_THREAD_RET_VALUE); } @@ -334,3 +345,38 @@ alter(void *arg) testutil_check(session->close(session, NULL)); return (WT_THREAD_RET_VALUE); } + +/* + * lock_init -- + * Initialize abstract lock that can use either pthread or wt reader-writer locks.
+ */ +void +lock_init(WT_SESSION *session, RWLOCK *lock) +{ + testutil_assert(lock->lock_type == LOCK_NONE); + + if (g.c_wt_mutex) { + testutil_check(__wt_rwlock_init((WT_SESSION_IMPL *)session, &lock->l.wt)); + lock->lock_type = LOCK_WT; + } else { + testutil_check(pthread_rwlock_init(&lock->l.pthread, NULL)); + lock->lock_type = LOCK_PTHREAD; + } +} + +/* + * lock_destroy -- + * Destroy abstract lock. + */ +void +lock_destroy(WT_SESSION *session, RWLOCK *lock) +{ + testutil_assert(LOCK_INITIALIZED(lock)); + + if (lock->lock_type == LOCK_WT) { + __wt_rwlock_destroy((WT_SESSION_IMPL *)session, &lock->l.wt); + } else { + testutil_check(pthread_rwlock_destroy(&lock->l.pthread)); + } + lock->lock_type = LOCK_NONE; +} diff --git a/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh b/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh new file mode 100644 index 00000000000..f3528b5d24e --- /dev/null +++ b/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +last_stable=4.2 +last_stable_dir=wiredtiger_${last_stable}/ +last_stable_branch=mongodb-${last_stable} + +function setup_last_stable { + git clone git@github.com:wiredtiger/wiredtiger.git ${last_stable_dir} + cd ${last_stable_dir}/build_posix/ || exit + git checkout $last_stable_branch || exit 1 + bash reconf + ../configure --enable-python --enable-diagnostic + make -j 10 + # Back to multiversion/ in "latest" repo. + cd ../../ || exit +} + +function run_check { + echo + "$@" + "$@" || exit 1 +} + +# Clone and build v4.2 if it doesn't already exist. +if [ ! -d $last_stable_dir ]; then + setup_last_stable +fi + +latest_workgen=../../bench/workgen/runner/multiversion.py +last_stable_workgen=${last_stable_dir}/bench/workgen/runner/multiversion.py + +# Copy the workload into the v4.2 tree. 
+cp $latest_workgen $last_stable_workgen + +run_check $latest_workgen --release 4.4 +run_check $latest_workgen --keep --release 4.4 +run_check $last_stable_workgen --keep --release 4.2 +run_check $latest_workgen --keep --release 4.4 + +echo Success. +exit 0 diff --git a/src/third_party/wiredtiger/test/packing/intpack-test3.c b/src/third_party/wiredtiger/test/packing/intpack-test3.c index 8bd8cc8a8c9..43cb8834997 100644 --- a/src/third_party/wiredtiger/test/packing/intpack-test3.c +++ b/src/third_party/wiredtiger/test/packing/intpack-test3.c @@ -110,8 +110,8 @@ main(void) int64_t i; /* - * Test all values in a range, to ensure pack/unpack of small numbers - * (which most actively use different numbers of bits) works. + * Test all values in a range, to ensure pack/unpack of small numbers (which most actively use + * different numbers of bits) works. */ test_spread(0, 100000, 100000); test_spread(INT16_MAX, 1025, 1025); |