author     Luke Chen <luke.chen@mongodb.com>   2020-04-22 10:45:21 +1000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>   2020-04-22 01:01:19 +0000
commit     c8279c67d309858027cdb4d079ef9fd7122b1690 (patch)
tree       c8b4b404a399e87e3d1743d60c243c882b96fede
parent     b38c6815cee932098722b72430b3237da3fe312a (diff)
download   mongo-c8279c67d309858027cdb4d079ef9fd7122b1690.tar.gz
Import wiredtiger: 7b994a862e899a12eb7c3ac814c9fada7d8d1ab9 from branch mongodb-4.4 (tag: r4.4.0-rc2)
ref: 9bd1ece797..7b994a862e
for: 4.4.0-rc2

WT-4701 Switch test/format to use WiredTiger locking primitives
WT-5766 Separate out internal and shared transaction data
WT-5791 Prepare checkpoint can finish in 0msec and reset prepare min stat
WT-5794 Remove skew_newest option from reconciliation
WT-5833 Fix caching issue for overflow key/value items
WT-5919 Disallow logging archival testing with log-based incremental backup
WT-5946 Eviction server handles can deadlock when opening HS cursors
WT-5968 Make the WT_SESSION_IMPL.txn field an allocated structure
WT-5986 Create script for emulating multiversion tests
WT-6016 Fill source code comments where lines start with parentheticals
WT-6020 __rec_append_orig_value() cleanup
WT-6026 Fix s_all breakage on format.h
-rwxr-xr-x  src/third_party/wiredtiger/bench/workgen/runner/multiversion.py | 71
-rw-r--r--  src/third_party/wiredtiger/bench/wtperf/config.c | 5
-rw-r--r--  src/third_party/wiredtiger/dist/log_data.py | 4
-rw-r--r--  src/third_party/wiredtiger/dist/s_comment.py | 11
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_all.c | 4
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_extractor.c | 8
-rw-r--r--  src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c | 22
-rw-r--r--  src/third_party/wiredtiger/import.data | 2
-rw-r--r--  src/third_party/wiredtiger/src/block/block_open.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_cursor.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_delete.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_discard.c | 19
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ovfl.c | 133
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_page.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_read.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_sync.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_modify.c | 29
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_srch.c | 5
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c | 33
-rw-r--r--  src/third_party/wiredtiger/src/conn/api_calc_modify.c | 11
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c | 19
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_dhandle.c | 19
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_handle.c | 3
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_open.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_stat.c | 14
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_sweep.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_backup.c | 19
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_file.c | 11
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_index.c | 21
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_join.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_std.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_table.c | 32
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_lru.c | 28
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_page.c | 3
-rw-r--r--  src/third_party/wiredtiger/src/history/hs.c | 37
-rw-r--r--  src/third_party/wiredtiger/src/include/api.h | 50
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h | 12
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.i | 4
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.h | 6
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.i | 8
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.i | 5
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h | 20
-rw-r--r--  src/third_party/wiredtiger/src/include/reconcile.h | 5
-rw-r--r--  src/third_party/wiredtiger/src/include/session.h | 22
-rw-r--r--  src/third_party/wiredtiger/src/include/time.i | 2
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h | 118
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.i | 105
-rw-r--r--  src/third_party/wiredtiger/src/include/verify_build.h | 2
-rw-r--r--  src/third_party/wiredtiger/src/include/wt_internal.h | 4
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c | 13
-rw-r--r--  src/third_party/wiredtiger/src/log/log_auto.c | 30
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_work_unit.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ckpt.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_track.c | 10
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fs.c | 12
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_col.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_row.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 104
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_write.c | 124
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_util.c | 2
-rw-r--r--  src/third_party/wiredtiger/src/session/session_api.c | 34
-rw-r--r--  src/third_party/wiredtiger/src/support/hazard.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/support/huffman.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/support/pow.c | 7
-rw-r--r--  src/third_party/wiredtiger/src/support/thread_group.c | 3
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn.c | 163
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ckpt.c | 46
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ext.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_log.c | 24
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_recover.c | 16
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_timestamp.c | 324
-rw-r--r--  src/third_party/wiredtiger/test/format/backup.c | 6
-rw-r--r--  src/third_party/wiredtiger/test/format/bulk.c | 2
-rw-r--r--  src/third_party/wiredtiger/test/format/checkpoint.c | 6
-rw-r--r--  src/third_party/wiredtiger/test/format/config.c | 70
-rw-r--r--  src/third_party/wiredtiger/test/format/config.h | 4
-rw-r--r--  src/third_party/wiredtiger/test/format/format.h | 31
-rw-r--r--  src/third_party/wiredtiger/test/format/format.i | 48
-rw-r--r--  src/third_party/wiredtiger/test/format/ops.c | 26
-rw-r--r--  src/third_party/wiredtiger/test/format/t.c | 8
-rw-r--r--  src/third_party/wiredtiger/test/format/util.c | 56
-rw-r--r--  src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh | 40
-rw-r--r--  src/third_party/wiredtiger/test/packing/intpack-test3.c | 4
87 files changed, 1204 insertions, 1115 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py b/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py
new file mode 100755
index 00000000000..be98187e542
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/multiversion.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+def show(tname, s):
+ print('')
+ print('<><><><> ' + tname + ' <><><><>')
+ c = s.open_cursor(tname, None)
+ for k,v in c:
+ print('key: ' + k)
+ print('value: ' + v)
+ print('<><><><><><><><><><><><>')
+ c.close()
+
+def create_compat_config(args):
+ if args.release == "4.4":
+ return ',compatibility=(release="3.3", require_min="3.2.0")'
+ elif args.release == "4.2":
+ return ',compatibility=(release="3.2", require_max="3.3.0")'
+ else:
+ return ''
+
+context = Context()
+context.parser.add_argument("--release", dest="release", type=str,
+ choices=["4.2", "4.4"], help="The WiredTiger version")
+context.initialize() # parse the arguments.
+conn = context.wiredtiger_open("create,cache_size=1G," + create_compat_config(context.args))
+
+s = conn.open_session()
+tname = 'table:simple'
+s.create(tname, 'key_format=S,value_format=S')
+
+ops = Operation(Operation.OP_INSERT, Table(tname), Key(Key.KEYGEN_APPEND, 10), Value(40))
+thread = Thread(ops)
+workload = Workload(context, thread)
+workload.run(conn)
+show(tname, s)
+
+thread = Thread(ops * 5)
+workload = Workload(context, thread)
+workload.run(conn)
+show(tname, s)
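
[Editor's note, not part of the patch: a quick standalone illustration of the wiredtiger_open configuration string the new runner builds for each --release choice. The function body is copied from the script above; argparse.Namespace merely stands in for the parsed workgen context arguments.]

from argparse import Namespace

def create_compat_config(args):
    if args.release == "4.4":
        return ',compatibility=(release="3.3", require_min="3.2.0")'
    elif args.release == "4.2":
        return ',compatibility=(release="3.2", require_max="3.3.0")'
    return ''

# Print the connection configuration produced for each supported release choice.
for release in ("4.2", "4.4", None):
    print(release, "->", "create,cache_size=1G" + create_compat_config(Namespace(release=release)))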
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
index 70e7ea75e7c..9e589e2a55f 100644
--- a/src/third_party/wiredtiger/bench/wtperf/config.c
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -537,9 +537,8 @@ config_opt_file(WTPERF *wtperf, const char *filename)
;
/*
- * Find the end of the line; if there's no trailing newline, the
- * the line is too long for the buffer or the file was corrupted
- * (there's no terminating newline in the file).
+ * Find the end of the line; if there's no trailing newline, the line is too long for the
+ * buffer or the file was corrupted (there's no terminating newline in the file).
*/
for (rtrim = line; *rtrim && *rtrim != '\n'; rtrim++)
;
diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py
index 18f368eaad0..cec0c07194e 100644
--- a/src/third_party/wiredtiger/dist/log_data.py
+++ b/src/third_party/wiredtiger/dist/log_data.py
@@ -95,6 +95,6 @@ optypes = [
LogOperationType('txn_timestamp', 'txn_timestamp',
[('uint64', 'time_sec'), ('uint64', 'time_nsec'),
('uint64', 'commit_ts'), ('uint64', 'durable_ts'),
- ('uint64', 'first_ts'), ('uint64', 'prepare_ts'),
- ('uint64', 'read_ts')]),
+ ('uint64', 'first_commit_ts'), ('uint64', 'prepare_ts'),
+ ('uint64', 'read_ts'), ('uint64', 'pinned_read_ts')]),
]
diff --git a/src/third_party/wiredtiger/dist/s_comment.py b/src/third_party/wiredtiger/dist/s_comment.py
index f30de0e4794..482fded4fff 100644
--- a/src/third_party/wiredtiger/dist/s_comment.py
+++ b/src/third_party/wiredtiger/dist/s_comment.py
@@ -103,12 +103,13 @@ for line in sys.stdin:
if (len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and
(sline[2].islower() or sline[2] == '_') and sline.endswith('--')):
function_desc = True
- # We're only reformatting block comments where each line begins with a
- # space and an alphabetic character after the asterisk. The only
- # exceptions are function descriptions.
+ # We're only reformatting block comments where each line begins with a space and an
+ # alphabetic character after the asterisk, or a parenthetical. The only exceptions
+ # are function descriptions.
block = block and \
- (len(sline) >= 3 and sline.startswith('*') and
- sline[1] == ' ' and sline[2].isalpha()) or function_desc
+ len(sline) >= 3 and sline.startswith('*') and sline[1] == ' ' and \
+ (sline[2].isalpha() or (len(sline) >= 5 and \
+ (sline[2] == '(' and sline[3].isalpha() and sline[4] != ')'))) or function_desc
# Trim asterisks at the beginning of each line in a multiline comment.
if sline.startswith('*'):
sline = sline[1:]
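
[Editor's note, not part of the patch: a minimal sketch of the new s_comment.py line test in isolation; the helper name and sample lines are illustrative. A stripped block-comment line now qualifies for reformatting when the text after the asterisk and space starts with a letter or with a parenthetical, which is what WT-6016 adds.]

def qualifies(sline):
    # Mirror the shape of the check in the hunk above: "* " followed by a letter,
    # or "* (" followed by a letter that isn't immediately closed.
    if len(sline) < 3 or not sline.startswith('*') or sline[1] != ' ':
        return False
    if sline[2].isalpha():
        return True
    return (len(sline) >= 5 and sline[2] == '(' and
            sline[3].isalpha() and sline[4] != ')')

for sline in ('* an ordinary comment line',
              '* (Since raw compression has been removed...)',
              '* @param x not reformatted'):
    print(qualifies(sline), sline)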
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index 64d3c7da5a4..7360bf92862 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -567,8 +567,8 @@ session_ops_create(WT_SESSION *session)
/*! [Create a table with columns] */
/*
- * Create a table with columns: keys are record numbers, values are
- * (string, signed 32-bit integer, unsigned 16-bit integer).
+ * Create a table with columns: keys are record numbers, values are (string, signed 32-bit
+ * integer, unsigned 16-bit integer).
*/
error_check(session->create(session, "table:mytable",
"key_format=r,value_format=SiH,"
diff --git a/src/third_party/wiredtiger/examples/c/ex_extractor.c b/src/third_party/wiredtiger/examples/c/ex_extractor.c
index ed6c7b671a6..b4c74c35cfd 100644
--- a/src/third_party/wiredtiger/examples/c/ex_extractor.c
+++ b/src/third_party/wiredtiger/examples/c/ex_extractor.c
@@ -73,11 +73,9 @@ my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, const WT_ITEM *key, con
*/
for (year = term_start; year <= term_end; ++year) {
/*
- * Note that the extract callback is called for all operations
- * that update the table, not just inserts. The user sets the
- * key and uses the cursor->insert() method to return the index
- * key(s). WiredTiger will perform the required operation
- * (such as a remove()).
+ * Note that the extract callback is called for all operations that update the table, not
+ * just inserts. The user sets the key and uses the cursor->insert() method to return the
+ * index key(s). WiredTiger will perform the required operation (such as a remove()).
*/
fprintf(
stderr, "EXTRACTOR: index op for year %" PRIu16 ": %s %s\n", year, first_name, last_name);
diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
index 1778fa55b9f..28f1d41bcfc 100644
--- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
+++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c
@@ -52,23 +52,21 @@ typedef struct {
} LZ4_COMPRESSOR;
/*
- * LZ4 decompression requires the exact compressed byte count returned by the
- * LZ4_compress_default and LZ4_compress_destSize functions. WiredTiger doesn't
- * track that value, store it in the destination buffer.
+ * LZ4 decompression requires the exact compressed byte count returned by the LZ4_compress_default
+ * and LZ4_compress_destSize functions. WiredTiger doesn't track that value, store it in the
+ * destination buffer.
*
- * Additionally, LZ4_compress_destSize may compress into the middle of a record,
- * and after decompression we return the length to the last record successfully
- * decompressed, not the number of bytes decompressed; store that value in the
- * destination buffer as well.
+ * Additionally, LZ4_compress_destSize may compress into the middle of a record, and after
+ * decompression we return the length to the last record successfully decompressed, not the number
+ * of bytes decompressed; store that value in the destination buffer as well.
*
- * (Since raw compression has been removed from WiredTiger, the lz4 compression
- * code no longer calls LZ4_compress_destSize. Some support remains to support
- * existing compressed objects.)
+ * (Since raw compression has been removed from WiredTiger, the lz4 compression code no longer calls
+ * LZ4_compress_destSize. Some support remains to support existing compressed objects.)
*
* Use fixed-size, 4B values (WiredTiger never writes buffers larger than 4GB).
*
- * The unused field is available for a mode flag if one is needed in the future,
- * we guarantee it's 0.
+ * The unused field is available for a mode flag if one is needed in the future, we guarantee it's
+ * 0.
*/
typedef struct {
uint32_t compressed_len; /* True compressed length */
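
[Editor's note, not part of the patch: the comment above amounts to a small length-prefixed framing scheme. A rough Python sketch of that pattern follows, with zlib standing in for LZ4 and a made-up four-field header that only loosely mirrors the real struct; the point is storing the true compressed length ahead of the payload so the decompressor can be handed the exact byte count it requires.]

import struct
import zlib

def compress_with_header(src: bytes) -> bytes:
    payload = zlib.compress(src)
    # compressed_len, uncompressed_len, useful_len, unused (guaranteed zero) -- 4B each.
    header = struct.pack("<4I", len(payload), len(src), len(src), 0)
    return header + payload

def decompress_with_header(dst: bytes) -> bytes:
    compressed_len, _uncompressed, _useful, _unused = struct.unpack_from("<4I", dst)
    payload = dst[16:16 + compressed_len]  # hand over the exact compressed byte count
    return zlib.decompress(payload)

blob = compress_with_header(b"some row-store value" * 10)
assert decompress_with_header(blob) == b"some row-store value" * 10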
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index bf8fe77a34a..f367175962d 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "9bd1ece7971714f947b47e589b0af5d7ee97a29d"
+ "commit": "7b994a862e899a12eb7c3ac814c9fada7d8d1ab9"
}
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index 1f33e1092c7..638796e4459 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -334,12 +334,7 @@ __desc_read(WT_SESSION_IMPL *session, uint32_t allocsize, WT_BLOCK *block)
* to be returning across the API boundary.
*/
if (block->size < allocsize) {
- /*
- * We use the "ignore history store tombstone" flag as of verify so we need to check that
- * we're not performing a verify.
- */
- if (F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS) &&
- !F_ISSET(S2BT(session), WT_BTREE_VERIFY))
+ if (F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
ret = ENOENT;
else {
ret = WT_ERROR;
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index e02436c836b..9087d643bbb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -273,11 +273,9 @@ restart_read:
}
/*
- * If we're at the same slot as the last reference and there's
- * no matching insert list item, re-use the return information
- * (so encoded items with large repeat counts aren't repeatedly
- * decoded). Otherwise, unpack the cell and build the return
- * information.
+ * If we're at the same slot as the last reference and there's no matching insert list item,
+ * re-use the return information (so encoded items with large repeat counts aren't
+ * repeatedly decoded). Otherwise, unpack the cell and build the return information.
*/
if (cbt->cip_saved != cip) {
cell = WT_COL_PTR(page, cip);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index ffa3cf409b1..6d187bd3057 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -413,11 +413,9 @@ restart_read:
}
/*
- * If we're at the same slot as the last reference and there's
- * no matching insert list item, re-use the return information
- * (so encoded items with large repeat counts aren't repeatedly
- * decoded). Otherwise, unpack the cell and build the return
- * information.
+ * If we're at the same slot as the last reference and there's no matching insert list item,
+ * re-use the return information (so encoded items with large repeat counts aren't
+ * repeatedly decoded). Otherwise, unpack the cell and build the return information.
*/
if (cbt->cip_saved != cip) {
cell = WT_COL_PTR(page, cip);
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 6b0ad0e936b..78437202d3d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -89,7 +89,7 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool search_operation)
* functions, but in the case of a search, we will see different results based on the cursor's
* initial location. See WT-5134 for the details.
*/
- if (search_operation && session->txn.isolation == WT_ISO_READ_COMMITTED)
+ if (search_operation && session->txn->isolation == WT_ISO_READ_COMMITTED)
return (false);
/*
@@ -1471,11 +1471,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
* time and the failure is unlikely to be detected. Require explicit transactions for modify
* operations.
*/
- if (session->txn.isolation != WT_ISO_SNAPSHOT)
+ if (session->txn->isolation != WT_ISO_SNAPSHOT)
WT_ERR_MSG(session, ENOTSUP,
"not supported in read-committed or read-uncommitted "
"transactions");
- if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT))
+ if (F_ISSET(session->txn, WT_TXN_AUTOCOMMIT))
WT_ERR_MSG(session, ENOTSUP, "not supported in implicit transactions");
if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) || !F_ISSET(cursor, WT_CURSTD_VALUE_INT))
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 69651d9b5ca..e7974765964 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -306,9 +306,8 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Give the page a modify structure.
*
- * Mark tree dirty, unless the handle is read-only.
- * (We'd like to free the deleted pages, but if the handle is read-only,
- * we're not able to do so.)
+ * Mark tree dirty, unless the handle is read-only. (We'd like to free the deleted pages, but if
+ * the handle is read-only, we're not able to do so.)
*/
WT_RET(__wt_page_modify_init(session, page));
if (!F_ISSET(btree, WT_BTREE_READONLY))
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 2878391fd53..6c695f5418f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -190,9 +190,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* Free the insert array.
*
- * Row-store tables have one additional slot in the insert array
- * (the insert array has an extra slot to hold keys that sort
- * before keys found on the original page).
+ * Row-store tables have one additional slot in the insert array (the insert array has an
+ * extra slot to hold keys that sort before keys found on the original page).
*/
if (mod->mod_row_insert != NULL)
__free_skip_array(session, mod->mod_row_insert, page->entries + 1, update_ignore);
@@ -203,10 +202,9 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
break;
}
- /* Free the overflow on-page, reuse and transaction-cache skiplists. */
+ /* Free the overflow on-page and reuse skiplists. */
__wt_ovfl_reuse_free(session, page);
__wt_ovfl_discard_free(session, page);
- __wt_ovfl_discard_remove(session, page);
__wt_free(session, page->modify->ovfl_track);
__wt_spin_destroy(session, &page->modify->page_lock);
@@ -261,12 +259,11 @@ __wt_free_ref(WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pa
}
/*
- * Optionally free row-store WT_REF key allocation. Historic versions of
- * this code looked in a passed-in page argument, but that is dangerous,
- * some of our error-path callers create WT_REF structures without ever
- * setting WT_REF.home or having a parent page to which the WT_REF will
- * be linked. Those WT_REF structures invariably have instantiated keys,
- * (they obviously cannot be on-page keys), and we must free the memory.
+ * Optionally free row-store WT_REF key allocation. Historic versions of this code looked in a
+ * passed-in page argument, but that is dangerous, some of our error-path callers create WT_REF
+ * structures without ever setting WT_REF.home or having a parent page to which the WT_REF will
+ * be linked. Those WT_REF structures invariably have instantiated keys, (they obviously cannot
+ * be on-page keys), and we must free the memory.
*/
switch (page_type) {
case WT_PAGE_ROW_INT:
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index 0ea80819048..72523b695de 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -47,8 +47,6 @@ __wt_ovfl_read(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
WT_DECL_RET;
- WT_OVFL_TRACK *track;
- size_t i;
*decoded = false;
@@ -60,22 +58,15 @@ __wt_ovfl_read(
return (__ovfl_read(session, unpack->data, unpack->size, store));
/*
- * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow value, but there was still
- * a reader in the system that might need it, the on-page cell type will have been reset to
- * WT_CELL_VALUE_OVFL_RM and we will be passed a page so we can check the on-page cell.
- *
- * Acquire the overflow lock, and retest the on-page cell's value inside the lock.
+ * WT_CELL_VALUE_OVFL_RM cells: if reconciliation deletes an overflow value, the on-page cell
+ * type is reset to WT_CELL_VALUE_OVFL_RM. Any values required by an existing reader will be
+ * copied into the HS file, which means this value should never be read. It's possible to race
+ * with checkpoints doing that work, lock before testing the removed flag.
*/
__wt_readlock(session, &S2BT(session)->ovfl_lock);
if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) {
- track = page->modify->ovfl_track;
- for (i = 0; i < track->remove_next; ++i)
- if (track->remove[i].cell == unpack->cell) {
- store->data = track->remove[i].data;
- store->size = track->remove[i].size;
- break;
- }
- WT_ASSERT(session, i < track->remove_next);
+ WT_ASSERT(session, __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts));
+ ret = __wt_buf_setstr(session, store, "WT_CELL_VALUE_OVFL_RM");
*decoded = true;
} else
ret = __ovfl_read(session, unpack->data, unpack->size, store);
@@ -85,109 +76,35 @@ __wt_ovfl_read(
}
/*
- * __wt_ovfl_discard_remove --
- * Free the on-page overflow value cache.
- */
-void
-__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_OVFL_TRACK *track;
- uint32_t i;
-
- if (page->modify != NULL && (track = page->modify->ovfl_track) != NULL) {
- for (i = 0; i < track->remove_next; ++i)
- __wt_free(session, track->remove[i].data);
- __wt_free(session, page->modify->ovfl_track->remove);
- track->remove_allocated = 0;
- track->remove_next = 0;
- }
-}
-
-/*
- * __ovfl_cache --
- * Cache an overflow value.
- */
-static int
-__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
-{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_OVFL_TRACK *track;
-
- /* Read the overflow value. */
- WT_RET(__wt_scr_alloc(session, 1024, &tmp));
- WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp));
-
- /* Allocating tracking structures as necessary. */
- if (page->modify->ovfl_track == NULL)
- WT_ERR(__wt_ovfl_track_init(session, page));
- track = page->modify->ovfl_track;
-
- /* Copy the overflow item into place. */
- WT_ERR(
- __wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove));
- track->remove[track->remove_next].cell = unpack->cell;
- WT_ERR(__wt_memdup(session, tmp->data, tmp->size, &track->remove[track->remove_next].data));
- track->remove[track->remove_next].size = tmp->size;
- ++track->remove_next;
-
-err:
- __wt_scr_free(session, &tmp);
- return (ret);
-}
-
-/*
* __wt_ovfl_remove --
* Remove an overflow value.
*/
int
-__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting)
+__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
{
/*
* This function solves two problems in reconciliation.
*
- * The first problem is snapshot readers needing on-page overflow values
- * that have been removed. The scenario is as follows:
- *
- * - reconciling a leaf page that references an overflow item
- * - the item is updated and the update committed
- * - a checkpoint runs, freeing the backing overflow blocks
- * - a snapshot transaction wants the original version of the item
- *
- * In summary, we may need the original version of an overflow item for
- * a snapshot transaction after the item was deleted from a page that's
- * subsequently been checkpointed, where the checkpoint must know about
- * the freed blocks. We don't have any way to delay a free of the
- * underlying blocks until a particular set of transactions exit (and
- * this shouldn't be a common scenario), so cache the overflow value in
- * memory.
- *
- * This gets hard because the snapshot transaction reader might:
- * - search the WT_UPDATE list and not find an useful entry
+ * The first problem is snapshot readers needing on-page overflow values that have been removed.
+ * If the overflow value is required by a reader, it will be copied into the HS file before the
+ * backing blocks are removed. However, this gets hard because the snapshot transaction reader
+ * might:
+ * - search the update list and not find a useful entry
* - read the overflow value's address from the on-page cell
* - go to sleep
- * - checkpoint runs, caches the overflow value, frees the blocks
+ * - checkpoint runs, frees the backing blocks
* - another thread allocates and overwrites the blocks
- * - the reader wakes up and reads the wrong value
+ * - the reader wakes up and uses the on-page cell to read the blocks
*
- * Use a read/write lock and the on-page cell to fix the problem: hold
- * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
- * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
- * item.
+ * Use a read/write lock and the on-page cell to fix the problem: get a write lock when changing
+ * the cell type from WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, get a read lock when reading
+ * an overflow item.
*
- * The read/write lock is per btree, but it could be per page or even
- * per overflow item. We don't do any of that because overflow values
- * are supposed to be rare and we shouldn't see contention for the lock.
+ * The read/write lock is per btree (but could be per page or even per overflow item). We don't
+ * bother because overflow values are supposed to be rare and contention isn't expected.
*
- * We only have to do this for checkpoints: in any eviction mode, there
- * can't be threads sitting in our update lists.
- */
- if (!evicting)
- WT_RET(__ovfl_cache(session, page, unpack));
-
- /*
- * The second problem is to only remove the underlying blocks once, solved by the
- * WT_CELL_VALUE_OVFL_RM flag.
+ * The second problem is to only remove the underlying blocks once, also solved by checking the
+ * flag before doing any work.
*
* Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the underlying overflow value's
* blocks to be freed when reconciliation completes.
@@ -213,11 +130,11 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
__wt_cell_unpack(session, page, cell, unpack);
/*
- * Finally remove overflow key/value objects, called when reconciliation finishes after
- * successfully writing a page.
+ * Remove overflow key/value objects, called when reconciliation finishes after successfully
+ * reconciling a page.
*
- * Keys must have already been instantiated and value objects must have already been cached (if
- * they might potentially still be read by any running transaction).
+ * Keys must have already been instantiated and value objects must have already been written to
+ * the HS file (if they might potentially still be read by any running transaction).
*
* Acquire the overflow lock to avoid racing with a thread reading the backing overflow blocks.
*/
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index f6e49e27557..ac588bf901d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -336,9 +336,9 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t
repeat_off = 0;
/*
- * Walk the page, building references: the page contains unsorted value
- * items. The value items are on-page (WT_CELL_VALUE), overflow items
- * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL).
+ * Walk the page, building references: the page contains unsorted value items. The value items
+ * are on-page (WT_CELL_VALUE), overflow items (WT_CELL_VALUE_OVFL) or deleted items
+ * (WT_CELL_DEL).
*/
indx = 0;
cip = page->pg_var;
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 554c4e047d6..5c8c0ea871a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -257,7 +257,7 @@ read:
*/
if (!LF_ISSET(WT_READ_IGNORE_CACHE_SIZE))
WT_RET(__wt_cache_eviction_check(
- session, true, !F_ISSET(&session->txn, WT_TXN_HAS_ID), NULL));
+ session, true, !F_ISSET(session->txn, WT_TXN_HAS_ID), NULL));
WT_RET(__page_read(session, ref, flags));
/* We just read a page, don't evict it before we have a chance to use it. */
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 9cad990fe84..851a407f165 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -30,7 +30,7 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_REF *ref)
u_int i;
mod = ref->page->modify;
- txn = &session->txn;
+ txn = session->txn;
/*
* We can skip some dirty pages during a checkpoint. The requirements:
@@ -362,7 +362,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
conn = S2C(session);
btree = S2BT(session);
prev = walk = NULL;
- txn = &session->txn;
+ txn = session->txn;
tried_eviction = false;
time_start = time_stop = 0;
is_hs = false;
@@ -381,7 +381,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
internal_bytes = leaf_bytes = 0;
internal_pages = leaf_pages = 0;
- saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
+ saved_pinned_id = WT_SESSION_TXN_SHARED(session)->pinned_id;
timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
if (timer)
time_start = __wt_clock(session);
@@ -567,7 +567,7 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* pages. That happens prior to the final metadata checkpoint.
*/
if (F_ISSET(walk, WT_REF_FLAG_LEAF) && page->read_gen == WT_READGEN_WONT_NEED &&
- !tried_eviction && F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)) {
+ !tried_eviction && F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT)) {
ret = __wt_page_release_evict(session, walk, 0);
walk = NULL;
WT_ERR_ERROR_OK(ret, EBUSY, false);
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
index 687013be783..b2062354f7c 100644
--- a/src/third_party/wiredtiger/src/btree/col_modify.c
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -79,23 +79,18 @@ __wt_col_modify(WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_U
mod = page->modify;
/*
- * If modifying a record not previously modified, but which is in the
- * same update slot as a previously modified record, cursor.ins will
- * not be set because there's no list of update records for this recno,
- * but cursor.ins_head will be set to point to the correct update slot.
- * Acquire the necessary insert information, then create a new update
- * entry and link it into the existing list. We get here if a page has
- * a single cell representing multiple records (the records have the
- * same value), and then a record in the cell is updated or removed,
- * creating the update list for the cell, and then a cursor iterates
- * into that same cell to update/remove a different record. We find the
- * correct slot in the update array, but we don't find an update list
- * (because it doesn't exist), and don't have the information we need
- * to do the insert. Normally, we wouldn't care (we could fail and do
- * a search for the record which would configure everything for the
- * insert), but range truncation does this pattern for every record in
- * the cell, and the performance is terrible. For that reason, catch it
- * here.
+ * If modifying a record not previously modified, but which is in the same update slot as a
+ * previously modified record, cursor.ins will not be set because there's no list of update
+ * records for this recno, but cursor.ins_head will be set to point to the correct update slot.
+ * Acquire the necessary insert information, then create a new update entry and link it into the
+ * existing list. We get here if a page has a single cell representing multiple records (the
+ * records have the same value), and then a record in the cell is updated or removed, creating
+ * the update list for the cell, and then a cursor iterates into that same cell to update/remove
+ * a different record. We find the correct slot in the update array, but we don't find an update
+ * list (because it doesn't exist), and don't have the information we need to do the insert.
+ * Normally, we wouldn't care (we could fail and do a search for the record which would
+ * configure everything for the insert), but range truncation does this pattern for every record
+ * in the cell, and the performance is terrible. For that reason, catch it here.
*/
if (cbt->ins == NULL && cbt->ins_head != NULL) {
cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index 13a570f8501..a6d56c9499d 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -156,9 +156,8 @@ descend:
/*
* Reference the slot used for next step down the tree.
*
- * Base is the smallest index greater than recno and may be the
- * (last + 1) index. The slot for descent is the one before
- * base.
+ * Base is the smallest index greater than recno and may be the (last + 1) index. The slot
+ * for descent is the one before base.
*/
if (recno != descent->ref_recno) {
/*
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 6a1f4af0619..98ae6f66daf 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -315,14 +315,12 @@ restart:
}
/*
- * Binary search of an internal page. There are three versions
- * (keys with no application-specified collation order, in long
- * and short versions, and keys with an application-specified
- * collation order), because doing the tests and error handling
- * inside the loop costs about 5%.
+ * Binary search of an internal page. There are three versions (keys with no
+ * application-specified collation order, in long and short versions, and keys with an
+ * application-specified collation order), because doing the tests and error handling inside
+ * the loop costs about 5%.
*
- * Reference the comment above about the 0th key: we continue to
- * special-case it.
+ * Reference the comment above about the 0th key: we continue to special-case it.
*/
base = 1;
limit = pindex->entries - 1;
@@ -542,20 +540,17 @@ leaf_match:
/*
* We didn't find an exact match in the WT_ROW array.
*
- * Base is the smallest index greater than key and may be the 0th index
- * or the (last + 1) index. Set the slot to be the largest index less
- * than the key if that's possible (if base is the 0th index it means
- * the application is inserting a key before any key found on the page).
+ * Base is the smallest index greater than key and may be the 0th index or the (last + 1) index.
+ * Set the slot to be the largest index less than the key if that's possible (if base is the 0th
+ * index it means the application is inserting a key before any key found on the page).
*
- * It's still possible there is an exact match, but it's on an insert
- * list. Figure out which insert chain to search and then set up the
- * return information assuming we'll find nothing in the insert list
- * (we'll correct as needed inside the search routine, depending on
- * what we find).
+ * It's still possible there is an exact match, but it's on an insert list. Figure out which
+ * insert chain to search and then set up the return information assuming we'll find nothing in
+ * the insert list (we'll correct as needed inside the search routine, depending on what we
+ * find).
*
- * If inserting a key smaller than any key found in the WT_ROW array,
- * use the extra slot of the insert array, otherwise the insert array
- * maps one-to-one to the WT_ROW array.
+ * If inserting a key smaller than any key found in the WT_ROW array, use the extra slot of the
+ * insert array, otherwise the insert array maps one-to-one to the WT_ROW array.
*/
if (base == 0) {
cbt->compare = 1;
diff --git a/src/third_party/wiredtiger/src/conn/api_calc_modify.c b/src/third_party/wiredtiger/src/conn/api_calc_modify.c
index 86912dfbd79..56391910c89 100644
--- a/src/third_party/wiredtiger/src/conn/api_calc_modify.c
+++ b/src/third_party/wiredtiger/src/conn/api_calc_modify.c
@@ -146,12 +146,11 @@ __wt_calc_modify(WT_SESSION_IMPL *wt_session, const WT_ITEM *oldv, const WT_ITEM
goto end;
/*
- * Walk through the post-image, maintaining start / end markers
- * separated by a gap in the pre-image. If the current point in the
- * post-image matches either marker, try to extend the match to find a
- * (large) range of matching bytes. If the end of the range is reached
- * in the post-image without finding a good match, double the size of
- * the gap, update the markers and keep trying.
+ * Walk through the post-image, maintaining start / end markers separated by a gap in the
+ * pre-image. If the current point in the post-image matches either marker, try to extend the
+ * match to find a (large) range of matching bytes. If the end of the range is reached in the
+ * post-image without finding a good match, double the size of the gap, update the markers and
+ * keep trying.
*/
hstart = hend = 0;
i = gap = 0;
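
[Editor's note, not part of the patch: the heuristic described in that comment is involved. As a deliberately simplified stand-in -- common prefix/suffix trimming rather than the gap-doubling walk, and a single modification rather than a list -- the overall goal of describing a post-image as an edit against a pre-image looks roughly like this.]

def calc_single_modify(oldv: bytes, newv: bytes):
    # Trim the common prefix.
    prefix = 0
    max_prefix = min(len(oldv), len(newv))
    while prefix < max_prefix and oldv[prefix] == newv[prefix]:
        prefix += 1
    # Trim the common suffix, without overlapping the prefix.
    suffix = 0
    max_suffix = min(len(oldv), len(newv)) - prefix
    while suffix < max_suffix and oldv[len(oldv) - 1 - suffix] == newv[len(newv) - 1 - suffix]:
        suffix += 1
    # Replace oldv[prefix : len(oldv)-suffix] with the differing middle of newv.
    return prefix, len(oldv) - suffix - prefix, newv[prefix:len(newv) - suffix]

offset, nbytes, data = calc_single_modify(b"hello brave world", b"hello new world")
print(offset, nbytes, data)  # 6 5 b'new'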
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index f0f8819007c..ffdfe3bc398 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1033,7 +1033,7 @@ err:
* never referenced that file.
*/
for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
- if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && F_ISSET(&s->txn, WT_TXN_RUNNING)) {
+ if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) && F_ISSET(s->txn, WT_TXN_RUNNING)) {
wt_session = &s->iface;
WT_TRET(wt_session->rollback_transaction(wt_session, NULL));
}
@@ -1067,11 +1067,9 @@ err:
* After the async and LSM threads have exited, we won't open more files for the application.
* However, the sweep server is still running and it can close file handles at the same time the
* final checkpoint is reviewing open data handles (forcing checkpoint to reopen handles). Shut
- * down the sweep server and then flag the system should not open anything new.
+ * down the sweep server.
*/
WT_TRET(__wt_sweep_destroy(session));
- F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
- WT_FULL_BARRIER();
/*
* Shut down the checkpoint and capacity server threads: we don't want to throttle writes and
@@ -1920,7 +1918,7 @@ __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursors)
"read-committed" :
(s->isolation == WT_ISO_READ_UNCOMMITTED ? "read-uncommitted" : "snapshot")));
WT_ERR(__wt_msg(session, " Transaction:"));
- WT_ERR(__wt_verbose_dump_txn_one(session, &s->txn, 0, NULL));
+ WT_ERR(__wt_verbose_dump_txn_one(session, s, 0, NULL));
} else {
WT_ERR(__wt_msg(session, " Number of positioned cursors: %u", s->ncursors));
TAILQ_FOREACH (cursor, &s->cursors, q) {
@@ -2645,13 +2643,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__conn_write_base_config(session, cfg));
/*
- * Check on the turtle and metadata files, creating them if necessary
- * (which avoids application threads racing to create the metadata file
- * later). Once the metadata file exists, get a reference to it in
- * the connection's session.
+ * Check on the turtle and metadata files, creating them if necessary (which avoids application
+ * threads racing to create the metadata file later). Once the metadata file exists, get a
+ * reference to it in the connection's session.
*
- * THE TURTLE FILE MUST BE THE LAST FILE CREATED WHEN INITIALIZING THE
- * DATABASE HOME, IT'S WHAT WE USE TO DECIDE IF WE'RE CREATING OR NOT.
+ * THE TURTLE FILE MUST BE THE LAST FILE CREATED WHEN INITIALIZING THE DATABASE HOME, IT'S WHAT
+ * WE USE TO DECIDE IF WE'RE CREATING OR NOT.
*/
WT_ERR(__wt_turtle_init(session));
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 1c391200cb1..e6da8446753 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -367,13 +367,12 @@ __wt_conn_dhandle_close(WT_SESSION_IMPL *session, bool final, bool mark_dead)
}
/*
- * If marking the handle dead, do so after closing the underlying btree.
- * (Don't do it before that, the block manager asserts there are never
- * two references to a block manager object, and re-opening the handle
- * can succeed once we mark this handle dead.)
+ * If marking the handle dead, do so after closing the underlying btree. (Don't do it before
+ * that, the block manager asserts there are never two references to a block manager object, and
+ * re-opening the handle can succeed once we mark this handle dead.)
*
- * Check discard too, code we call to clear the cache expects the data
- * handle dead flag to be set when discarding modified pages.
+ * Check discard too, code we call to clear the cache expects the data handle dead flag to be
+ * set when discarding modified pages.
*/
if (marked_dead || discard)
F_SET(dhandle, WT_DHANDLE_DEAD);
@@ -426,8 +425,7 @@ __wt_conn_dhandle_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t fla
WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY));
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS) ||
- !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS));
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS));
/* Turn off eviction. */
if (dhandle->type == WT_DHANDLE_TYPE_BTREE)
@@ -527,9 +525,8 @@ __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle,
return (0);
/*
- * We need to pull the handle into the session handle cache and make
- * sure it's referenced to stop other internal code dropping the handle
- * (e.g in LSM when cleaning up obsolete chunks).
+ * We need to pull the handle into the session handle cache and make sure it's referenced to
+ * stop other internal code dropping the handle (e.g in LSM when cleaning up obsolete chunks).
*/
if ((ret = __wt_session_get_dhandle(session, dhandle->name, dhandle->checkpoint, NULL, 0)) != 0)
return (ret == EBUSY ? 0 : ret);
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
index 28864d2f4ec..db2b085e8c5 100644
--- a/src/third_party/wiredtiger/src/conn/conn_handle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -83,6 +83,9 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
TAILQ_INIT(&conn->blockhash[i]); /* Block handle hash lists */
TAILQ_INIT(&conn->blockqh); /* Block manager list */
+ conn->ckpt_prep_min = UINT64_MAX;
+ conn->ckpt_time_min = UINT64_MAX;
+
return (0);
}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index f59ce5d25d8..3c28ac121ad 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -97,6 +97,10 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* The eviction server is shut down last. */
WT_TRET(__wt_evict_destroy(session));
+ /* There should be no more file opens after this point. */
+ F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
+ WT_FULL_BARRIER();
+
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
@@ -202,11 +206,9 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_logmgr_create(session, cfg));
/*
- * Run recovery.
- * NOTE: This call will start (and stop) eviction if recovery is
- * required. Recovery must run before the history store table is created
- * (because recovery will update the metadata), and before eviction is
- * started for real.
+ * Run recovery. NOTE: This call will start (and stop) eviction if recovery is required.
+ * Recovery must run before the history store table is created (because recovery will update the
+ * metadata), and before eviction is started for real.
*/
WT_RET(__wt_txn_recover(session));
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
index 05d6f111b2c..455f10ea905 100644
--- a/src/third_party/wiredtiger/src/conn/conn_stat.c
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -135,15 +135,13 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp)
FLD_SET(conn->stat_flags, WT_STAT_ON_CLOSE);
/*
- * We don't allow the log path to be reconfigured for security reasons.
- * (Applications passing input strings directly to reconfigure would
- * expose themselves to a potential security problem, the utility of
- * reconfiguring a statistics log path isn't worth the security risk.)
+ * We don't allow the log path to be reconfigured for security reasons. (Applications passing
+ * input strings directly to reconfigure would expose themselves to a potential security
+ * problem, the utility of reconfiguring a statistics log path isn't worth the security risk.)
*
- * See above for the details, but during reconfiguration we're loading
- * the path value from the saved configuration information, and it's
- * required during reconfiguration because we potentially stopped and
- * are restarting, the server.
+ * See above for the details, but during reconfiguration we're loading the path value from the
+ * saved configuration information, and it's required during reconfiguration because we
+ * potentially stopped and are restarting, the server.
*/
WT_RET(__wt_config_gets(session, cfg, "statistics_log.path", &cval));
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 8ab0a51a401..eeb7ffa514c 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -66,17 +66,14 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
/*
* Acquire an exclusive lock on the handle and mark it dead.
*
- * The close would require I/O if an update cannot be written
- * (updates in a no-longer-referenced file might not yet be
- * globally visible if sessions have disjoint sets of files
- * open). In that case, skip it: we'll retry the close the
- * next time, after the transaction state has progressed.
+ * The close would require I/O if an update cannot be written (updates in a no-longer-referenced
+ * file might not yet be globally visible if sessions have disjoint sets of files open). In that
+ * case, skip it: we'll retry the close the next time, after the transaction state has
+ * progressed.
*
- * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want
- * opens to block on us and then retry rather than returning an
- * EBUSY error to the application. This is done holding the
- * handle list lock so that connection-level handle searches
- * never need to retry.
+ * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want opens to block on us and then retry
+ * rather than returning an EBUSY error to the application. This is done holding the handle list
+ * lock so that connection-level handle searches never need to retry.
*/
WT_RET(__wt_try_writelock(session, &dhandle->rwlock));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
index e1864f70350..05dc7e2ff9b 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_backup.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -678,20 +678,15 @@ __backup_start(
}
if (!target_list) {
/*
- * It's important to first gather the log files to be copied
- * (which internally starts a new log file), followed by
- * choosing a checkpoint to reference in the WiredTiger.backup
- * file.
+ * It's important to first gather the log files to be copied (which internally starts a new
+ * log file), followed by choosing a checkpoint to reference in the WiredTiger.backup file.
*
- * Applications may have logic that takes a checkpoint, followed
- * by performing a write that should only appear in the new
- * checkpoint. This ordering prevents choosing the prior
- * checkpoint, but including the write in the log files
- * returned.
+ * Applications may have logic that takes a checkpoint, followed by performing a write that
+ * should only appear in the new checkpoint. This ordering prevents choosing the prior
+ * checkpoint, but including the write in the log files returned.
*
- * It is also possible, and considered legal, to choose the new
- * checkpoint, but not include the log file that contains the
- * log entry for taking the new checkpoint.
+ * It is also possible, and considered legal, to choose the new checkpoint, but not include
+ * the log file that contains the log entry for taking the new checkpoint.
*/
WT_ERR(__backup_log_append(session, cb, true));
WT_ERR(__backup_all(session));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index acb513ebcc6..7dfb3bca218 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -457,12 +457,11 @@ err:
CURSOR_UPDATE_API_END(session, ret);
/*
- * The application might do a WT_CURSOR.get_value call when we return,
- * so we need a value and the underlying functions didn't set one up.
- * For various reasons, those functions may not have done a search and
- * any previous value in the cursor might race with WT_CURSOR.reserve
- * (and in cases like LSM, the reserve never encountered the original
- * key). For simplicity, repeat the search here.
+ * The application might do a WT_CURSOR.get_value call when we return, so we need a value and
+ * the underlying functions didn't set one up. For various reasons, those functions may not have
+ * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in
+ * cases like LSM, the reserve never encountered the original key). For simplicity, repeat the
+ * search here.
*/
return (ret == 0 ? cursor->search(cursor) : ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
index 24776ef3ca5..0f1fab36bf8 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_index.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -226,11 +226,9 @@ __curindex_search(WT_CURSOR *cursor)
JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);
/*
- * We are searching using the application-specified key, which
- * (usually) doesn't contain the primary key, so it is just a prefix of
- * any matching index key. Do a search_near, step to the next entry if
- * we land on one that is too small, then check that the prefix
- * matches.
+ * We are searching using the application-specified key, which (usually) doesn't contain the
+ * primary key, so it is just a prefix of any matching index key. Do a search_near, step to the
+ * next entry if we land on one that is too small, then check that the prefix matches.
*/
__wt_cursor_set_raw_key(child, &cursor->key);
WT_ERR(child->search_near(child, &cmp));
@@ -297,15 +295,12 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact)
JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);
/*
- * We are searching using the application-specified key, which
- * (usually) doesn't contain the primary key, so it is just a prefix of
- * any matching index key. That said, if there is an exact match, we
- * want to find the first matching index entry and set exact equal to
- * zero.
+ * We are searching using the application-specified key, which (usually) doesn't contain the
+ * primary key, so it is just a prefix of any matching index key. That said, if there is an
+ * exact match, we want to find the first matching index entry and set exact equal to zero.
*
- * Do a search_near, and if we find an entry that is too small, step to
- * the next one. In the unlikely event of a search past the end of the
- * tree, go back to the last key.
+ * Do a search_near, and if we find an entry that is too small, step to the next one. In the
+ * unlikely event of a search past the end of the tree, go back to the last key.
*/
__wt_cursor_set_raw_key(child, &cursor->key);
WT_ERR(child->search_near(child, &cmp));
diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c
index 36e5ca17b02..bb2497f3d19 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_join.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_join.c
@@ -926,7 +926,7 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, bool iterab
* doing any needed check during the iteration.
*/
if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
- if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+ if (session->txn->isolation == WT_ISO_READ_UNCOMMITTED)
WT_ERR_MSG(session, EINVAL,
"join cursors with Bloom filters cannot be "
"used with read-uncommitted isolation");
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 38ad4f3b31d..a65bb55a8ba 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -949,11 +949,11 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
* read-uncommitted transaction, or outside of an explicit transaction. Disallow here as well,
* for consistency.
*/
- if (session->txn.isolation != WT_ISO_SNAPSHOT)
+ if (session->txn->isolation != WT_ISO_SNAPSHOT)
WT_ERR_MSG(session, ENOTSUP,
"not supported in read-committed or read-uncommitted "
"transactions");
- if (F_ISSET(&session->txn, WT_TXN_AUTOCOMMIT))
+ if (F_ISSET(session->txn, WT_TXN_AUTOCOMMIT))
WT_ERR_MSG(session, ENOTSUP, "not supported in implicit transactions");
WT_ERR(__cursor_checkkey(cursor));
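
A hedged usage sketch of the restriction enforced above: WT_CURSOR.modify is only accepted in an explicit, snapshot-isolation transaction. The byte offset and patch size below are illustrative assumptions.

#include <string.h>
#include <wiredtiger.h>

/* Patch part of an existing value in place rather than rewriting it. */
static int
modify_in_snapshot_txn(WT_SESSION *session, WT_CURSOR *cursor, const char *key, const char *patch)
{
    WT_MODIFY entry;
    int ret;

    /* Implicit (autocommit) and read-committed/uncommitted transactions are rejected. */
    if ((ret = session->begin_transaction(session, "isolation=snapshot")) != 0)
        return (ret);

    cursor->set_key(cursor, key);
    entry.data.data = patch;
    entry.data.size = strlen(patch);
    entry.offset = 10;            /* where the patch is applied in the value */
    entry.size = entry.data.size; /* number of bytes replaced */
    if ((ret = cursor->modify(cursor, &entry, 1)) == 0)
        return (session->commit_transaction(session, NULL));
    (void)session->rollback_transaction(session, NULL);
    return (ret);
}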
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index 8af441c7d02..4fd78188c39 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -128,8 +128,7 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, WT_CURSOR *cur,
WT_RET(__wt_schema_project_merge(
session, ctable->cg_cursors, idx->key_plan, idx->key_format, &cur->key));
/*
- * The index key is now set and the value is empty
- * (it starts clear and is never set).
+ * The index key is now set and the value is empty (it starts clear and is never set).
*/
F_SET(cur, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
WT_RET(f(cur));
@@ -704,12 +703,11 @@ err:
CURSOR_UPDATE_API_END(session, ret);
/*
- * The application might do a WT_CURSOR.get_value call when we return,
- * so we need a value and the underlying functions didn't set one up.
- * For various reasons, those functions may not have done a search and
- * any previous value in the cursor might race with WT_CURSOR.reserve
- * (and in cases like LSM, the reserve never encountered the original
- * key). For simplicity, repeat the search here.
+ * The application might do a WT_CURSOR.get_value call when we return, so we need a value and
+ * the underlying functions didn't set one up. For various reasons, those functions may not have
+ * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in
+ * cases like LSM, the reserve never encountered the original key). For simplicity, repeat the
+ * search here.
*/
return (ret == 0 ? cursor->search(cursor) : ret);
}
@@ -1039,20 +1037,18 @@ __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
WT_ERR(__curtable_open_colgroups(ctable, cfg));
/*
- * We'll need to squirrel away a copy of the cursor configuration for
- * if/when we open indices.
+ * We'll need to squirrel away a copy of the cursor configuration for if/when we open indices.
*
- * cfg[0] is the baseline configuration for the cursor open and we can
- * acquire another copy from the configuration structures, so it would
- * be reasonable not to copy it here: but I'd rather be safe than sorry.
+ * cfg[0] is the baseline configuration for the cursor open and we can acquire another copy from
+ * the configuration structures, so it would be reasonable not to copy it here: but I'd rather
+ * be safe than sorry.
*
* cfg[1] is the application configuration.
*
- * Underlying indices are always opened without dump or readonly; that
- * information is appended to cfg[1] so later "fast" configuration calls
- * (checking only cfg[0] and cfg[1]) work. I don't expect to see more
- * than two configuration strings here, but it's written to compact into
- * two configuration strings, a copy of cfg[0] and the rest in cfg[1].
+ * Underlying indices are always opened without dump or readonly; that information is appended
+ * to cfg[1] so later "fast" configuration calls (checking only cfg[0] and cfg[1]) work. I don't
+ * expect to see more than two configuration strings here, but it's written to compact into two
+ * configuration strings, a copy of cfg[0] and the rest in cfg[1].
*/
WT_ERR(__wt_calloc_def(session, 3, &ctable->cfg));
WT_ERR(__wt_strdup(session, cfg[0], &ctable->cfg[0]));
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index d134d7e504f..455e8c15bef 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -272,11 +272,24 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ uint32_t session_flags;
bool did_work, was_intr;
+ bool is_owner;
conn = S2C(session);
cache = conn->cache;
+ /*
+     * Cache a history store cursor to avoid deadlock: if an eviction thread marks a file
+ * busy and then opens a different file (in this case, the HS file), it can deadlock with a
+ * thread waiting for the first file to drain from the eviction queue. See WT-5946 for details.
+ */
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+ WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner));
+ WT_RET(__wt_hs_cursor_close(session, session_flags, is_owner));
+ }
+
if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
/*
* Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We
@@ -466,6 +479,13 @@ __wt_evict_create(WT_SESSION_IMPL *session)
conn = S2C(session);
+ /*
+ * In case recovery has allocated some transaction IDs, bump to the current state. This will
+ * prevent eviction threads from pinning anything as they start up and read metadata in order to
+ * open cursors.
+ */
+ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+
WT_ASSERT(session, conn->evict_threads_min > 0);
/* Set first, the thread might run before we finish up. */
F_SET(conn, WT_CONN_EVICTION_RUN);
@@ -2265,7 +2285,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
WT_DECL_RET;
WT_TRACK_OP_DECL;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
uint64_t elapsed, time_start, time_stop;
uint64_t initial_progress, max_progress;
bool app_thread;
@@ -2276,7 +2296,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
cache = conn->cache;
time_start = time_stop = 0;
txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
/*
* It is not safe to proceed if the eviction server threads aren't setup yet.
@@ -2303,7 +2323,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
--cache->evict_aggressive_score;
WT_STAT_CONN_INCR(session, txn_fail_cache);
if (app_thread)
- WT_TRET(__wt_msg(session, "%s", session->txn.rollback_reason));
+ WT_TRET(__wt_msg(session, "%s", session->txn->rollback_reason));
}
WT_ERR(ret);
}
@@ -2316,7 +2336,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
* below 100%, limit the work to 5 evictions and return. If that's not the case, we can do
* more.
*/
- if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
+ if (!busy && txn_shared->pinned_id != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id)
busy = true;
max_progress = busy ? 5 : 20;
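
Restated as a hedged, standalone sketch (hypothetical types and field names mirroring the diff): a session pinning transactions while other transactions are active is treated as busy and gets a smaller eviction budget before returning.

#include <stdbool.h>
#include <stdint.h>

#define TXN_NONE 0 /* stands in for WT_TXN_NONE */

/* Per-session shared state and global state, reduced to the fields the check uses. */
struct txn_shared { uint64_t pinned_id; };
struct txn_global { uint64_t current, oldest_id; };

static uint64_t
eviction_budget(bool busy, const struct txn_shared *ts, const struct txn_global *tg)
{
    /*
     * A session that pins the oldest ID while others are running can hold cache content in
     * place, so cap it at a small amount of eviction work; otherwise allow more progress.
     */
    if (!busy && ts->pinned_id != TXN_NONE && tg->current != tg->oldest_id)
        busy = true;
    return (busy ? 5 : 20);
}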
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 00718e20f70..ec93cf88a75 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -608,8 +608,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
*
* Don't set any other flags for internal pages: there are no update lists to be saved and
* restored, changes can't be written into the history store table, nor can we re-create
- * internal
- * pages in memory.
+ * internal pages in memory.
*
* For leaf pages:
*
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c
index 0f63d510be0..fd90d168b6c 100644
--- a/src/third_party/wiredtiger/src/history/hs.c
+++ b/src/third_party/wiredtiger/src/history/hs.c
@@ -846,23 +846,16 @@ err:
}
/*
- * __hs_save_read_timestamp --
- * Save the currently running transaction's read timestamp into a variable.
- */
-static void
-__hs_save_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *saved_timestamp)
-{
- *saved_timestamp = session->txn.read_timestamp;
-}
-
-/*
* __hs_restore_read_timestamp --
- * Reset the currently running transaction's read timestamp with a previously saved one.
+ * Reset the currently running transaction's read timestamp with the original read timestamp.
*/
static void
-__hs_restore_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t saved_timestamp)
+__hs_restore_read_timestamp(WT_SESSION_IMPL *session)
{
- session->txn.read_timestamp = saved_timestamp;
+ WT_TXN_SHARED *txn_shared;
+
+ txn_shared = WT_SESSION_TXN_SHARED(session);
+ session->txn->read_timestamp = txn_shared->pinned_read_timestamp;
}
/*
@@ -886,7 +879,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA
WT_TXN *txn;
WT_UPDATE *mod_upd, *upd;
wt_timestamp_t durable_timestamp, durable_timestamp_tmp, hs_start_ts, hs_start_ts_tmp;
- wt_timestamp_t hs_stop_ts, hs_stop_ts_tmp, read_timestamp, saved_timestamp;
+ wt_timestamp_t hs_stop_ts, hs_stop_ts_tmp, read_timestamp;
size_t notused, size;
uint64_t hs_counter, hs_counter_tmp, upd_type_full;
uint32_t hs_btree_id, session_flags;
@@ -900,14 +893,20 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA
mod_upd = upd = NULL;
orig_hs_value_buf = NULL;
__wt_modify_vector_init(session, &modifies);
- txn = &session->txn;
- __hs_save_read_timestamp(session, &saved_timestamp);
+ txn = session->txn;
notused = size = 0;
hs_btree_id = S2BT(session)->id;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
WT_NOT_READ(modify, false);
is_owner = false;
+ /*
+ * We temporarily move the read timestamp forwards to read modify records in the history store.
+ * Outside of that window, it should always be equal to the original read timestamp.
+ */
+ WT_ASSERT(
+ session, txn->read_timestamp == WT_SESSION_TXN_SHARED(session)->pinned_read_timestamp);
+
/* Row-store key is as passed to us, create the column-store key as needed. */
WT_ASSERT(
session, (key == NULL && recno != WT_RECNO_OOB) || (key != NULL && recno == WT_RECNO_OOB));
@@ -983,7 +982,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA
* timestamp should be equivalent to the stop timestamp of the record that we're
* currently on.
*/
- session->txn.read_timestamp = hs_stop_ts_tmp;
+ session->txn->read_timestamp = hs_stop_ts_tmp;
/*
* Find the base update to apply the reverse deltas. If our cursor next fails to find an
@@ -1031,7 +1030,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, WT_UPDA
mod_upd = NULL;
}
/* After we're done looping over modifies, reset the read timestamp. */
- __hs_restore_read_timestamp(session, saved_timestamp);
+ __hs_restore_read_timestamp(session);
WT_STAT_CONN_INCR(session, cache_hs_read_squash);
}
@@ -1061,7 +1060,7 @@ err:
* Restore the read timestamp if we encountered an error while processing a modify. There's no
* harm in doing this multiple times.
*/
- __hs_restore_read_timestamp(session, saved_timestamp);
+ __hs_restore_read_timestamp(session);
WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
__wt_free_update_list(session, &mod_upd);
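
A hedged sketch of the idea behind dropping __hs_save_read_timestamp: the original read timestamp now lives in the shared per-session slot (pinned_read_timestamp), so restoring it no longer needs a caller-managed local copy. The structures and field names below are simplified stand-ins, not the WiredTiger definitions.

#include <stdint.h>

typedef uint64_t timestamp_t;

struct txn        { timestamp_t read_timestamp; };
struct txn_shared { timestamp_t pinned_read_timestamp; }; /* authoritative original */

/* Temporarily move the read timestamp forward to see newer history-store modify records. */
static void
advance_read_timestamp(struct txn *txn, timestamp_t hs_stop_ts)
{
    txn->read_timestamp = hs_stop_ts;
}

/* Restore from the shared copy; safe to call any number of times, including on error paths. */
static void
restore_read_timestamp(struct txn *txn, const struct txn_shared *shared)
{
    txn->read_timestamp = shared->pinned_read_timestamp;
}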
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 81118e421d2..e4455d62b03 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -86,39 +86,39 @@
while (0)
/* An API call wrapped in a transaction if necessary. */
-#define TXN_API_CALL(s, h, n, bt, config, cfg) \
- do { \
- bool __autotxn = false, __update = false; \
- API_CALL(s, h, n, bt, config, cfg); \
- __wt_txn_timestamp_flags(s); \
- __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \
- if (__autotxn) \
- F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \
- __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \
- if (__update) \
- F_SET(&(s)->txn, WT_TXN_UPDATE);
+#define TXN_API_CALL(s, h, n, bt, config, cfg) \
+ do { \
+ bool __autotxn = false, __update = false; \
+ API_CALL(s, h, n, bt, config, cfg); \
+ __wt_txn_timestamp_flags(s); \
+ __autotxn = !F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET((s)->txn, WT_TXN_AUTOCOMMIT); \
+ __update = !F_ISSET((s)->txn, WT_TXN_UPDATE); \
+ if (__update) \
+ F_SET((s)->txn, WT_TXN_UPDATE);
/* An API call wrapped in a transaction if necessary. */
-#define TXN_API_CALL_NOCONF(s, h, n, dh) \
- do { \
- bool __autotxn = false, __update = false; \
- API_CALL_NOCONF(s, h, n, dh); \
- __wt_txn_timestamp_flags(s); \
- __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \
- if (__autotxn) \
- F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT); \
- __update = !F_ISSET(&(s)->txn, WT_TXN_UPDATE); \
- if (__update) \
- F_SET(&(s)->txn, WT_TXN_UPDATE);
+#define TXN_API_CALL_NOCONF(s, h, n, dh) \
+ do { \
+ bool __autotxn = false, __update = false; \
+ API_CALL_NOCONF(s, h, n, dh); \
+ __wt_txn_timestamp_flags(s); \
+ __autotxn = !F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET((s)->txn, WT_TXN_AUTOCOMMIT); \
+ __update = !F_ISSET((s)->txn, WT_TXN_UPDATE); \
+ if (__update) \
+ F_SET((s)->txn, WT_TXN_UPDATE);
/* End a transactional API call, optional retry on deadlock. */
#define TXN_API_END_RETRY(s, ret, retry) \
API_END(s, ret); \
if (__update) \
- F_CLR(&(s)->txn, WT_TXN_UPDATE); \
+ F_CLR((s)->txn, WT_TXN_UPDATE); \
if (__autotxn) { \
- if (F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT)) \
- F_CLR(&(s)->txn, WT_TXN_AUTOCOMMIT); \
+ if (F_ISSET((s)->txn, WT_TXN_AUTOCOMMIT)) \
+ F_CLR((s)->txn, WT_TXN_AUTOCOMMIT); \
else if ((ret) == 0) \
(ret) = __wt_txn_commit((s), NULL); \
else { \
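
The only functional change in these macros is that (s)->txn is now already a pointer, so the address-of operator goes away. A hedged, generic illustration of this style of flag macro; the definitions here are assumptions matching common usage, not quoted from the tree.

#include <stdint.h>
#include <stdio.h>

/* Flag helpers that accept anything with a "flags" member: an embedded struct needs
 * "&s->txn", an allocated one is passed directly as "s->txn". */
#define F_SET(p, mask)   ((p)->flags |= (mask))
#define F_CLR(p, mask)   ((p)->flags &= ~(mask))
#define F_ISSET(p, mask) ((p)->flags & (mask))

#define TXN_AUTOCOMMIT 0x1u
#define TXN_RUNNING    0x2u

struct txn { uint32_t flags; };
struct session { struct txn *txn; }; /* txn is now allocated, hence no '&' */

int
main(void)
{
    struct txn t = {0};
    struct session s = {&t};

    if (!F_ISSET(s.txn, TXN_AUTOCOMMIT | TXN_RUNNING))
        F_SET(s.txn, TXN_AUTOCOMMIT);
    printf("flags=0x%x\n", s.txn->flags);
    F_CLR(s.txn, TXN_AUTOCOMMIT);
    return (0);
}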
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index bffb93036d6..6985cce0508 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -447,15 +447,6 @@ struct __wt_page_modify {
WT_CELL **discard;
size_t discard_entries;
size_t discard_allocated;
-
- /* Cached overflow value cell/update address pairs. */
- struct {
- WT_CELL *cell;
- uint8_t *data;
- size_t size;
- } * remove;
- size_t remove_allocated;
- uint32_t remove_next;
} * ovfl_track;
#define WT_PAGE_LOCK(s, p) __wt_spin_lock((s), &(p)->modify->page_lock)
@@ -485,8 +476,7 @@ struct __wt_page_modify {
#define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
uint8_t rec_result; /* Reconciliation state */
-#define WT_PAGE_RS_HS 0x1
-#define WT_PAGE_RS_RESTORED 0x2
+#define WT_PAGE_RS_RESTORED 0x1
uint8_t restore_state; /* Created by restoring updates */
};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index b0ff54c70b2..7cbfddbd381 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -555,8 +555,8 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/* Check if this is the largest transaction ID to update the page. */
- if (WT_TXNID_LT(page->modify->update_txn, session->txn.id))
- page->modify->update_txn = session->txn.id;
+ if (WT_TXNID_LT(page->modify->update_txn, session->txn->id))
+ page->modify->update_txn = session->txn->id;
}
/*
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index 698cea9447c..64f1efe1201 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -170,9 +170,9 @@ struct __wt_cache {
uint32_t evict_aggressive_score;
/*
- * Score of how often LRU queues are empty on refill. This score varies
- * between 0 (if the queue hasn't been empty for a long time) and 100
- * (if the queue has been empty the last 10 times we filled up.
+ * Score of how often LRU queues are empty on refill. This score varies between 0 (if the queue
+ * hasn't been empty for a long time) and 100 (if the queue has been empty the last 10 times we
+     * filled up).
*/
uint32_t evict_empty_score;
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index b96f079f5bd..d8d11943c94 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -374,7 +374,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool readonly, bo
{
WT_BTREE *btree;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
double pct_full;
if (didworkp != NULL)
@@ -387,9 +387,9 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool readonly, bo
* sure there is free space in the cache.
*/
txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
- busy = busy || txn_state->id != WT_TXN_NONE || session->nhazard > 0 ||
- (txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
+ busy = busy || txn_shared->id != WT_TXN_NONE || session->nhazard > 0 ||
+ (txn_shared->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id);
/*
* LSM sets the "ignore cache size" flag when holding the LSM tree lock, in that case, or when
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
index 4d8b83b4d34..14de00f80c3 100644
--- a/src/third_party/wiredtiger/src/include/cursor.i
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -219,9 +219,8 @@ __cursor_reset(WT_CURSOR_BTREE *cbt)
cbt->page_deleted_count = 0;
/*
- * Release any page references we're holding. This can trigger eviction
- * (e.g., forced eviction of big pages), so it's important to do after
- * releasing our snapshot above.
+ * Release any page references we're holding. This can trigger eviction (e.g., forced eviction
+ * of big pages), so it's important to do after releasing our snapshot above.
*
* Clear the reference regardless, so we don't try the release twice.
*/
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 56363846dfe..0888eeee453 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -928,14 +928,15 @@ extern int __wt_logop_row_truncate_unpack(WT_SESSION_IMPL *session, const uint8_
const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec,
- uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts,
- uint64_t prepare_ts, uint64_t read_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts,
+ uint64_t first_commit_ts, uint64_t prepare_ts, uint64_t read_ts, uint64_t pinned_read_ts)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_txn_timestamp_print(WT_SESSION_IMPL *session, const uint8_t **pp,
const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp,
const uint8_t *end, uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp,
- uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ uint64_t *durable_tsp, uint64_t *first_commit_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp,
+ uint64_t *pinned_read_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
@@ -1126,8 +1127,8 @@ extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CEL
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack,
WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack,
- bool evicting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr,
size_t addr_size, const void *value, size_t value_size)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -1530,8 +1531,8 @@ extern int __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursor
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn, int error_code,
- const char *error_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_SESSION_IMPL *txn_session,
+ int error_code, const char *error_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_verbose_dump_update(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
@@ -1701,7 +1702,6 @@ extern void __wt_optrack_record_funcid(
WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp);
extern void __wt_os_stdio(WT_SESSION_IMPL *session);
extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
@@ -1759,8 +1759,8 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
+extern void __wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session);
-extern void __wt_txn_publish_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_release(WT_SESSION_IMPL *session);
extern void __wt_txn_release_resources(WT_SESSION_IMPL *session);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 2eefed10cf7..43581c7cc1f 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -25,10 +25,7 @@ struct __wt_reconcile {
uint64_t orig_btree_checkpoint_gen;
uint64_t orig_txn_checkpoint_gen;
- /*
- * Track the oldest running transaction and whether to skew history store to the newest update.
- */
- bool hs_skew_newest;
+ /* Track the oldest running transaction. */
uint64_t last_running;
/* Track the page's min/maximum transactions. */
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index 98be0b299ce..bd877622ca1 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -129,7 +129,7 @@ struct __wt_session_impl {
WT_ITEM err; /* Error buffer */
WT_TXN_ISOLATION isolation;
- WT_TXN txn; /* Transaction state */
+ WT_TXN *txn; /* Transaction state */
#define WT_SESSION_BG_SYNC_MSEC 1200000
WT_LSN bg_sync_lsn; /* Background sync operation LSN. */
@@ -145,11 +145,10 @@ struct __wt_session_impl {
/*
* Operations acting on handles.
*
- * The preferred pattern is to gather all of the required handles at
- * the beginning of an operation, then drop any other locks, perform
- * the operation, then release the handles. This cannot be easily
- * merged with the list of checkpoint handles because some operations
- * (such as compact) do checkpoints internally.
+ * The preferred pattern is to gather all of the required handles at the beginning of an
+ * operation, then drop any other locks, perform the operation, then release the handles. This
+ * cannot be easily merged with the list of checkpoint handles because some operations (such as
+ * compact) do checkpoints internally.
*/
WT_DATA_HANDLE **op_handle; /* Handle list */
u_int op_handle_next; /* Next empty slot */
@@ -190,8 +189,9 @@ struct __wt_session_impl {
#define WT_SESSION_QUIET_CORRUPT_FILE 0x02000000u
#define WT_SESSION_READ_WONT_NEED 0x04000000u
#define WT_SESSION_RESOLVING_TXN 0x08000000u
-#define WT_SESSION_SCHEMA_TXN 0x10000000u
-#define WT_SESSION_SERVER_ASYNC 0x20000000u
+#define WT_SESSION_ROLLBACK_TO_STABLE 0x10000000u
+#define WT_SESSION_SCHEMA_TXN 0x20000000u
+#define WT_SESSION_SERVER_ASYNC 0x40000000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
@@ -271,9 +271,3 @@ struct __wt_session_impl {
WT_SESSION_STATS stats;
};
-
-/*
- * Rollback to stable should ignore tombstones in the history store since it needs to scan the
- * entire table sequentially.
- */
-#define WT_SESSION_ROLLBACK_TO_STABLE_FLAGS (WT_SESSION_IGNORE_HS_TOMBSTONE)
diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i
index 759b8338370..cff5e0850ea 100644
--- a/src/third_party/wiredtiger/src/include/time.i
+++ b/src/third_party/wiredtiger/src/include/time.i
@@ -163,7 +163,7 @@ __wt_op_timer_start(WT_SESSION_IMPL *session)
uint64_t timeout_us;
/* Timer can be configured per-transaction, and defaults to per-connection. */
- if ((timeout_us = session->txn.operation_timeout_us) == 0)
+ if (session->txn == NULL || (timeout_us = session->txn->operation_timeout_us) == 0)
timeout_us = S2C(session)->operation_timeout_us;
if (timeout_us == 0)
session->operation_start_us = session->operation_timeout_us = 0;
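
A hedged restatement of the fallback order above, as a standalone sketch: the per-transaction timeout wins when a transaction structure exists and sets one, otherwise the per-connection value applies, and zero means the timer is disabled. Names are illustrative.

#include <stddef.h>
#include <stdint.h>

struct txn  { uint64_t operation_timeout_us; };
struct conn { uint64_t operation_timeout_us; };

static uint64_t
effective_timeout_us(const struct txn *txn, const struct conn *conn)
{
    uint64_t timeout_us;

    /* The transaction may not be allocated yet; fall back to the connection default. */
    if (txn == NULL || (timeout_us = txn->operation_timeout_us) == 0)
        timeout_us = conn->operation_timeout_us;
    return (timeout_us); /* 0 means "no operation timer" */
}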
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index aedc94a96a2..fd54e279171 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -46,7 +46,7 @@ typedef enum {
#define WT_TXNID_LT(t1, t2) ((t1) < (t2))
-#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+#define WT_SESSION_TXN_SHARED(s) (&S2C(s)->txn_global.txn_shared_list[(s)->id])
#define WT_SESSION_IS_CHECKPOINT(s) ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id)
@@ -72,39 +72,59 @@ typedef enum {
/*
* Perform an operation at the specified isolation level.
*
- * This is fiddly: we can't cope with operations that begin transactions
- * (leaving an ID allocated), and operations must not move our published
- * snap_min forwards (or updates we need could be freed while this operation is
- * in progress). Check for those cases: the bugs they cause are hard to debug.
+ * This is fiddly: we can't cope with operations that begin transactions (leaving an ID allocated),
+ * and operations must not move our published snap_min forwards (or updates we need could be freed
+ * while this operation is in progress). Check for those cases: the bugs they cause are hard to
+ * debug.
*/
-#define WT_WITH_TXN_ISOLATION(s, iso, op) \
- do { \
- WT_TXN_ISOLATION saved_iso = (s)->isolation; \
- WT_TXN_ISOLATION saved_txn_iso = (s)->txn.isolation; \
- WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(s); \
- WT_TXN_STATE saved_state = *txn_state; \
- (s)->txn.forced_iso++; \
- (s)->isolation = (s)->txn.isolation = (iso); \
- op; \
- (s)->isolation = saved_iso; \
- (s)->txn.isolation = saved_txn_iso; \
- WT_ASSERT((s), (s)->txn.forced_iso > 0); \
- (s)->txn.forced_iso--; \
- WT_ASSERT((s), txn_state->id == saved_state.id && \
- (txn_state->metadata_pinned == saved_state.metadata_pinned || \
- saved_state.metadata_pinned == WT_TXN_NONE) && \
- (txn_state->pinned_id == saved_state.pinned_id || \
- saved_state.pinned_id == WT_TXN_NONE)); \
- txn_state->metadata_pinned = saved_state.metadata_pinned; \
- txn_state->pinned_id = saved_state.pinned_id; \
+#define WT_WITH_TXN_ISOLATION(s, iso, op) \
+ do { \
+ WT_TXN_ISOLATION saved_iso = (s)->isolation; \
+ WT_TXN_ISOLATION saved_txn_iso = (s)->txn->isolation; \
+ WT_TXN_SHARED *txn_shared = WT_SESSION_TXN_SHARED(s); \
+ WT_TXN_SHARED saved_txn_shared = *txn_shared; \
+ (s)->txn->forced_iso++; \
+ (s)->isolation = (s)->txn->isolation = (iso); \
+ op; \
+ (s)->isolation = saved_iso; \
+ (s)->txn->isolation = saved_txn_iso; \
+ WT_ASSERT((s), (s)->txn->forced_iso > 0); \
+ (s)->txn->forced_iso--; \
+ WT_ASSERT((s), txn_shared->id == saved_txn_shared.id && \
+ (txn_shared->metadata_pinned == saved_txn_shared.metadata_pinned || \
+ saved_txn_shared.metadata_pinned == WT_TXN_NONE) && \
+ (txn_shared->pinned_id == saved_txn_shared.pinned_id || \
+ saved_txn_shared.pinned_id == WT_TXN_NONE)); \
+ txn_shared->metadata_pinned = saved_txn_shared.metadata_pinned; \
+ txn_shared->pinned_id = saved_txn_shared.pinned_id; \
} while (0)
-struct __wt_txn_state {
+struct __wt_txn_shared {
WT_CACHE_LINE_PAD_BEGIN
volatile uint64_t id;
volatile uint64_t pinned_id;
volatile uint64_t metadata_pinned;
- volatile bool is_allocating;
+
+ /*
+ * The first commit or durable timestamp used for this transaction. Determines its position in
+ * the durable queue and prevents the all_durable timestamp moving past this point.
+ */
+ wt_timestamp_t pinned_durable_timestamp;
+
+ /*
+ * Set to the first read timestamp used in the transaction. As part of our history store
+ * mechanism, we can move the read timestamp forward so we need to keep track of the original
+ * read timestamp to know what history should be pinned in front of oldest.
+ */
+ wt_timestamp_t pinned_read_timestamp;
+
+ TAILQ_ENTRY(__wt_txn_shared) read_timestampq;
+ TAILQ_ENTRY(__wt_txn_shared) durable_timestampq;
+
+    volatile uint8_t is_allocating;
+    uint8_t clear_durable_q; /* Set if need to clear from the durable queue */
+    uint8_t clear_read_q;    /* Set if need to clear from the read queue */
WT_CACHE_LINE_PAD_END
};
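
For orientation, a hedged sketch of the lookup behind the renamed WT_SESSION_TXN_SHARED macro: the per-session shared slots live in a connection-wide array indexed by session ID. Types here are simplified stand-ins.

#include <stdint.h>

struct txn_shared { volatile uint64_t id, pinned_id, metadata_pinned; };

struct conn    { struct txn_shared *txn_shared_list; /* one slot per session */ };
struct session { uint32_t id; struct conn *conn; };

/* Equivalent of WT_SESSION_TXN_SHARED(s): index the shared list by session ID. */
static struct txn_shared *
session_txn_shared(struct session *s)
{
    return (&s->conn->txn_shared_list[s->id]);
}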
@@ -144,12 +164,12 @@ struct __wt_txn_global {
/* List of transactions sorted by durable timestamp. */
WT_RWLOCK durable_timestamp_rwlock;
- TAILQ_HEAD(__wt_txn_dts_qh, __wt_txn) durable_timestamph;
+ TAILQ_HEAD(__wt_txn_dts_qh, __wt_txn_shared) durable_timestamph;
uint32_t durable_timestampq_len;
/* List of transactions sorted by read timestamp. */
WT_RWLOCK read_timestamp_rwlock;
- TAILQ_HEAD(__wt_txn_rts_qh, __wt_txn) read_timestamph;
+ TAILQ_HEAD(__wt_txn_rts_qh, __wt_txn_shared) read_timestamph;
uint32_t read_timestampq_len;
/*
@@ -163,14 +183,14 @@ struct __wt_txn_global {
*/
volatile bool checkpoint_running; /* Checkpoint running */
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
- WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */
+ WT_TXN_SHARED checkpoint_txn_shared; /* Checkpoint's txn shared state */
wt_timestamp_t checkpoint_timestamp; /* Checkpoint's timestamp */
volatile uint64_t debug_ops; /* Debug mode op counter */
uint64_t debug_rollback; /* Debug mode rollback */
volatile uint64_t metadata_pinned; /* Oldest ID for metadata */
- WT_TXN_STATE *states; /* Per-session transaction states */
+ WT_TXN_SHARED *txn_shared_list; /* Per-session shared transaction states */
};
typedef enum __wt_txn_isolation {
@@ -288,12 +308,6 @@ struct __wt_txn {
/* Read updates committed as of this timestamp. */
wt_timestamp_t read_timestamp;
- TAILQ_ENTRY(__wt_txn) durable_timestampq;
- TAILQ_ENTRY(__wt_txn) read_timestampq;
- /* Set if need to clear from the durable queue */
- bool clear_durable_q;
- bool clear_read_q; /* Set if need to clear from the read queue */
-
/* Array of modifications by this transaction. */
WT_TXN_OP *mod;
size_t mod_alloc;
@@ -322,7 +336,7 @@ struct __wt_txn {
* WT_TXN_HAS_TS_DURABLE --
* The transaction has an explicitly set durable timestamp (that is, it
* hasn't been mirrored from its commit timestamp value).
- * WT_TXN_TS_PUBLISHED --
+ * WT_TXN_SHARED_TS_DURABLE --
* The transaction has been published to the durable queue. Setting this
* flag lets us know that, on release, we need to mark the transaction for
* clearing.
@@ -339,20 +353,26 @@ struct __wt_txn {
#define WT_TXN_HAS_TS_READ 0x000080u
#define WT_TXN_IGNORE_PREPARE 0x000100u
#define WT_TXN_PREPARE 0x000200u
-#define WT_TXN_PUBLIC_TS_READ 0x000400u
-#define WT_TXN_READONLY 0x000800u
-#define WT_TXN_RUNNING 0x001000u
-#define WT_TXN_SYNC_SET 0x002000u
-#define WT_TXN_TS_COMMIT_ALWAYS 0x004000u
-#define WT_TXN_TS_COMMIT_KEYS 0x008000u
-#define WT_TXN_TS_COMMIT_NEVER 0x010000u
-#define WT_TXN_TS_DURABLE_ALWAYS 0x020000u
-#define WT_TXN_TS_DURABLE_KEYS 0x040000u
-#define WT_TXN_TS_DURABLE_NEVER 0x080000u
-#define WT_TXN_TS_PUBLISHED 0x100000u
+#define WT_TXN_READONLY 0x000400u
+#define WT_TXN_RUNNING 0x000800u
+#define WT_TXN_SHARED_TS_DURABLE 0x001000u
+#define WT_TXN_SHARED_TS_READ 0x002000u
+#define WT_TXN_SYNC_SET 0x004000u
+#define WT_TXN_TS_COMMIT_ALWAYS 0x008000u
+#define WT_TXN_TS_COMMIT_KEYS 0x010000u
+#define WT_TXN_TS_COMMIT_NEVER 0x020000u
+#define WT_TXN_TS_DURABLE_ALWAYS 0x040000u
+#define WT_TXN_TS_DURABLE_KEYS 0x080000u
+#define WT_TXN_TS_DURABLE_NEVER 0x100000u
#define WT_TXN_TS_ROUND_PREPARED 0x200000u
#define WT_TXN_TS_ROUND_READ 0x400000u
#define WT_TXN_UPDATE 0x800000u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t flags;
+
+    /*
+     * The transaction's snapshot (an array of transaction IDs) immediately follows the WT_TXN
+     * structure. We use a C99 flexible array member which has the semantics we want.
+     */
+ uint64_t __snapshot[];
};
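
Since WT-5968 allocates the transaction structure (and its snapshot storage) in one shot, the flexible array member is what makes that single allocation work. A hedged, generic sketch of how such an allocation typically looks; the names and sizing are illustrative, not the WiredTiger allocation code.

#include <stdint.h>
#include <stdlib.h>

struct txn {
    uint32_t flags;
    uint32_t snapshot_count;
    uint64_t snapshot[]; /* C99 flexible array member: storage follows the struct */
};

/* One allocation covers the fixed fields plus room for "slots" snapshot entries. */
static struct txn *
txn_alloc(uint32_t slots)
{
    struct txn *txn = calloc(1, sizeof(*txn) + (size_t)slots * sizeof(txn->snapshot[0]));

    if (txn != NULL)
        txn->snapshot_count = slots;
    return (txn);
}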
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 9d154b892ca..574eece2e5f 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -13,7 +13,7 @@
static inline int
__wt_txn_context_prepare_check(WT_SESSION_IMPL *session)
{
- if (F_ISSET(&session->txn, WT_TXN_PREPARE))
+ if (F_ISSET(session->txn, WT_TXN_PREPARE))
WT_RET_MSG(session, EINVAL, "not permitted in a prepared transaction");
return (0);
}
@@ -25,9 +25,9 @@ __wt_txn_context_prepare_check(WT_SESSION_IMPL *session)
static inline int
__wt_txn_context_check(WT_SESSION_IMPL *session, bool requires_txn)
{
- if (requires_txn && !F_ISSET(&session->txn, WT_TXN_RUNNING))
+ if (requires_txn && !F_ISSET(session->txn, WT_TXN_RUNNING))
WT_RET_MSG(session, EINVAL, "only permitted in a running transaction");
- if (!requires_txn && F_ISSET(&session->txn, WT_TXN_RUNNING))
+ if (!requires_txn && F_ISSET(session->txn, WT_TXN_RUNNING))
WT_RET_MSG(session, EINVAL, "not permitted in a running transaction");
return (0);
}
@@ -41,7 +41,7 @@ __wt_txn_err_set(WT_SESSION_IMPL *session, int ret)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
/* Ignore standard errors that don't fail the transaction. */
if (ret == WT_NOTFOUND || ret == WT_DUPLICATE_KEY || ret == WT_PREPARE_CONFLICT)
@@ -78,17 +78,17 @@ __wt_txn_timestamp_flags(WT_SESSION_IMPL *session)
if (btree == NULL)
return;
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS))
- F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS);
+ F_SET(session->txn, WT_TXN_TS_COMMIT_ALWAYS);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_KEYS))
- F_SET(&session->txn, WT_TXN_TS_COMMIT_KEYS);
+ F_SET(session->txn, WT_TXN_TS_COMMIT_KEYS);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER))
- F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER);
+ F_SET(session->txn, WT_TXN_TS_COMMIT_NEVER);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_ALWAYS))
- F_SET(&session->txn, WT_TXN_TS_DURABLE_ALWAYS);
+ F_SET(session->txn, WT_TXN_TS_DURABLE_ALWAYS);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_KEYS))
- F_SET(&session->txn, WT_TXN_TS_DURABLE_KEYS);
+ F_SET(session->txn, WT_TXN_TS_DURABLE_KEYS);
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_DURABLE_TS_NEVER))
- F_SET(&session->txn, WT_TXN_TS_DURABLE_NEVER);
+ F_SET(session->txn, WT_TXN_TS_DURABLE_NEVER);
}
/*
@@ -101,7 +101,7 @@ __wt_txn_op_set_recno(WT_SESSION_IMPL *session, uint64_t recno)
WT_TXN *txn;
WT_TXN_OP *op;
- txn = &session->txn;
+ txn = session->txn;
WT_ASSERT(session, txn->mod_count > 0 && recno != WT_RECNO_OOB);
op = txn->mod + txn->mod_count - 1;
@@ -132,7 +132,7 @@ __wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key)
WT_TXN *txn;
WT_TXN_OP *op;
- txn = &session->txn;
+ txn = session->txn;
WT_ASSERT(session, txn->mod_count > 0 && key->data != NULL);
@@ -163,7 +163,7 @@ __txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
/*
* In case of a prepared transaction, the order of modification of the prepare timestamp to
* commit timestamp in the update chain will not affect the data visibility, a reader will
@@ -190,7 +190,7 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
*opp = NULL;
- txn = &session->txn;
+ txn = session->txn;
/*
* We're about to perform an update. Make sure we have allocated a transaction ID.
@@ -219,7 +219,7 @@ __wt_txn_unmodify(WT_SESSION_IMPL *session)
WT_TXN *txn;
WT_TXN_OP *op;
- txn = &session->txn;
+ txn = session->txn;
if (F_ISSET(txn, WT_TXN_HAS_ID)) {
WT_ASSERT(session, txn->mod_count > 0);
--txn->mod_count;
@@ -241,7 +241,7 @@ __wt_txn_op_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool comm
wt_timestamp_t ts;
uint8_t prepare_state, previous_state;
- txn = &session->txn;
+ txn = session->txn;
/*
* Lock the ref to ensure we don't race with eviction freeing the page deleted update list or
@@ -285,7 +285,7 @@ __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref
WT_UPDATE **updp;
uint8_t previous_state;
- txn = &session->txn;
+ txn = session->txn;
/*
* Lock the ref to ensure we don't race with eviction freeing the page deleted update list or
@@ -314,7 +314,7 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
WT_UPDATE *upd;
wt_timestamp_t *timestamp;
- txn = &session->txn;
+ txn = session->txn;
/*
* Updates in the metadata never get timestamps (either now or at commit): metadata cannot be
@@ -366,7 +366,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
WT_TXN *txn;
WT_TXN_OP *op;
- txn = &session->txn;
+ txn = session->txn;
if (F_ISSET(txn, WT_TXN_READONLY)) {
if (F_ISSET(txn, WT_TXN_IGNORE_PREPARE))
@@ -393,7 +393,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
/* History store bypasses transactions, transaction modify should never be called on it. */
WT_ASSERT(session, !WT_IS_HS(S2BT(session)));
- upd->txnid = session->txn.id;
+ upd->txnid = session->txn->id;
__wt_txn_op_set_timestamp(session, op);
return (0);
@@ -410,7 +410,7 @@ __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
WT_TXN *txn;
WT_TXN_OP *op;
- txn = &session->txn;
+ txn = session->txn;
WT_RET(__txn_next_op(session, &op));
op->type = WT_TXN_OP_REF_DELETE;
@@ -472,7 +472,7 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
* If there is no active checkpoint or this handle is up to date with the active checkpoint then
* it's safe to ignore the checkpoint ID in the visibility check.
*/
- checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
+ checkpoint_pinned = txn_global->checkpoint_txn_shared.pinned_id;
if (checkpoint_pinned == WT_TXN_NONE || WT_TXNID_LT(oldest_id, checkpoint_pinned))
return (oldest_id);
@@ -593,7 +593,7 @@ __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
WT_TXN *txn;
bool found;
- txn = &session->txn;
+ txn = session->txn;
/* Changes with no associated transaction are always visible. */
if (id == WT_TXN_NONE)
@@ -642,13 +642,13 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
if (!__txn_visible_id(session, id))
return (false);
/* Transactions read their writes, regardless of timestamps. */
- if (F_ISSET(&session->txn, WT_TXN_HAS_ID) && id == session->txn.id)
+ if (F_ISSET(session->txn, WT_TXN_HAS_ID) && id == session->txn->id)
return (true);
/* Timestamp check. */
@@ -694,7 +694,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
/* Ignore the prepared update, if transaction configuration says so. */
if (prepare_state == WT_PREPARE_INPROGRESS)
return (
- F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ? WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE);
+ F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE) ? WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE);
return (WT_VISIBLE_TRUE);
}
@@ -876,7 +876,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
txn->isolation = session->isolation;
txn->txn_logsync = S2C(session)->txn_logsync;
@@ -916,7 +916,7 @@ __wt_txn_autocommit_check(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
if (F_ISSET(txn, WT_TXN_AUTOCOMMIT)) {
F_CLR(txn, WT_TXN_AUTOCOMMIT);
return (__wt_txn_begin(session, NULL));
@@ -933,10 +933,10 @@ static inline int
__wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
/*
* Check the published snap_min because read-uncommitted never sets WT_TXN_HAS_SNAPSHOT. We
@@ -945,7 +945,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
* necessary.
*/
if (F_ISSET(txn, WT_TXN_RUNNING) && !F_ISSET(txn, WT_TXN_HAS_ID) &&
- txn_state->pinned_id == WT_TXN_NONE)
+ txn_shared->pinned_id == WT_TXN_NONE)
WT_RET(__wt_cache_eviction_check(session, false, true, NULL));
return (0);
@@ -959,11 +959,11 @@ static inline uint64_t
__wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
{
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
uint64_t id;
txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
/*
* Allocating transaction IDs involves several steps.
@@ -985,12 +985,12 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
* well defined, we must use an atomic increment here.
*/
if (publish) {
- WT_PUBLISH(txn_state->is_allocating, true);
- WT_PUBLISH(txn_state->id, txn_global->current);
+ WT_PUBLISH(txn_shared->is_allocating, true);
+ WT_PUBLISH(txn_shared->id, txn_global->current);
id = __wt_atomic_addv64(&txn_global->current, 1) - 1;
- session->txn.id = id;
- WT_PUBLISH(txn_state->id, id);
- WT_PUBLISH(txn_state->is_allocating, false);
+ session->txn->id = id;
+ WT_PUBLISH(txn_shared->id, id);
+ WT_PUBLISH(txn_shared->is_allocating, false);
} else
id = __wt_atomic_addv64(&txn_global->current, 1) - 1;
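
A hedged sketch of the ordering the publish path relies on, using C11 atomics as a stand-in for WT_PUBLISH and __wt_atomic_addv64: advertise that an allocation is in progress and publish the current value before the fetch-and-add, then publish the final ID, so readers scanning the shared slots never miss an ID that is about to exist. Note atomic_fetch_add returns the previous value, so no "- 1" adjustment is needed here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct txn_shared_slot {
    atomic_bool is_allocating;
    _Atomic uint64_t id;
};

/* Allocate a transaction ID and publish it through the shared slot. */
static uint64_t
txn_id_alloc(struct txn_shared_slot *slot, _Atomic uint64_t *global_current)
{
    uint64_t id;

    /* Readers that see is_allocating know the slot's id may be about to change. */
    atomic_store(&slot->is_allocating, true);
    atomic_store(&slot->id, atomic_load(global_current));

    id = atomic_fetch_add(global_current, 1); /* the ID we actually get */

    atomic_store(&slot->id, id);
    atomic_store(&slot->is_allocating, false);
    return (id);
}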
@@ -1006,7 +1006,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
@@ -1038,20 +1038,21 @@ __wt_txn_search_check(WT_SESSION_IMPL *session)
WT_BTREE *btree;
WT_TXN *txn;
- txn = &session->txn;
btree = S2BT(session);
+ txn = session->txn;
+
/*
* If the user says a table should always use a read timestamp, verify this transaction has one.
* Same if it should never have a read timestamp.
*/
if (!F_ISSET(S2C(session), WT_CONN_RECOVERING) &&
FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) &&
- !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ !F_ISSET(txn, WT_TXN_SHARED_TS_READ))
WT_RET_MSG(session, EINVAL,
"read_timestamp required and "
"none set on this transaction");
if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) &&
- F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ F_ISSET(txn, WT_TXN_SHARED_TS_READ))
WT_RET_MSG(session, EINVAL,
"no read_timestamp required and "
"timestamp set on this transaction");
@@ -1072,7 +1073,7 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE
bool ignore_prepare_set, rollback;
rollback = false;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
if (txn->isolation != WT_ISO_SNAPSHOT)
@@ -1130,7 +1131,7 @@ __wt_txn_read_last(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
/*
* Release the snap_min ID we put in the global table.
@@ -1152,11 +1153,11 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
/*
* We are about to read data, which means we need to protect against
@@ -1176,10 +1177,10 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* positioned on a value, it can't be freed.
*/
if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
- if (txn_state->pinned_id == WT_TXN_NONE)
- txn_state->pinned_id = txn_global->last_running;
- if (txn_state->metadata_pinned == WT_TXN_NONE)
- txn_state->metadata_pinned = txn_state->pinned_id;
+ if (txn_shared->pinned_id == WT_TXN_NONE)
+ txn_shared->pinned_id = txn_global->last_running;
+ if (txn_shared->metadata_pinned == WT_TXN_NONE)
+ txn_shared->metadata_pinned = txn_shared->pinned_id;
} else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
__wt_txn_get_snapshot(session);
}
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
index 23f1d16df95..5f131b7ab0c 100644
--- a/src/third_party/wiredtiger/src/include/verify_build.h
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -65,7 +65,7 @@ __wt_verify_build(void)
WT_STATIC_ASSERT( \
sizeof(s) > WT_CACHE_LINE_ALIGNMENT || sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0)
WT_PADDING_CHECK(WT_LOGSLOT);
- WT_PADDING_CHECK(WT_TXN_STATE);
+ WT_PADDING_CHECK(WT_TXN_SHARED);
/*
* The btree code encodes key/value pairs in size_t's, and requires at least 8B size_t's.
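
The renamed WT_PADDING_CHECK still enforces cache-line alignment of the shared per-session slots. A hedged, generic equivalent using C11 _Static_assert; the padding layout and 64-byte line size are assumptions for illustration.

#include <stdint.h>

#define CACHE_LINE_ALIGNMENT 64

/* Pad the shared state so two sessions never false-share a cache line. */
struct txn_shared {
    volatile uint64_t id;
    volatile uint64_t pinned_id;
    volatile uint64_t metadata_pinned;
    uint8_t padding[CACHE_LINE_ALIGNMENT - 3 * sizeof(uint64_t)];
};

/* Equivalent of WT_PADDING_CHECK: either bigger than a line or an exact multiple of one. */
_Static_assert(sizeof(struct txn_shared) > CACHE_LINE_ALIGNMENT ||
        sizeof(struct txn_shared) % CACHE_LINE_ALIGNMENT == 0,
    "txn_shared must not straddle cache lines");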
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 31b8b740ed9..204e6fd0eb9 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -325,8 +325,8 @@ struct __wt_txn_op;
typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_printlog_args;
typedef struct __wt_txn_printlog_args WT_TXN_PRINTLOG_ARGS;
-struct __wt_txn_state;
-typedef struct __wt_txn_state WT_TXN_STATE;
+struct __wt_txn_shared;
+typedef struct __wt_txn_shared WT_TXN_SHARED;
struct __wt_update;
typedef struct __wt_update WT_UPDATE;
union __wt_lsn;
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 963b1998289..87e4bda2a8a 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -1191,10 +1191,8 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
*/
if (create_log) {
/*
- * Increment the missed pre-allocated file counter only
- * if a hot backup is not in progress. We are deliberately
- * not using pre-allocated log files during backup
- * (see comment above).
+ * Increment the missed pre-allocated file counter only if a hot backup is not in progress.
+ * We are deliberately not using pre-allocated log files during backup (see comment above).
*/
if (!conn->hot_backup)
log->prep_missed++;
@@ -1430,10 +1428,9 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, bool salvag
/*
* Truncate the log file to the given LSN.
*
- * It's possible the underlying file system doesn't support truncate
- * (there are existing examples), which is fine, but we don't want to
- * repeatedly do the setup work just to find that out every time. Check
- * before doing work, and if there's a not-supported error, turn off
+ * It's possible the underlying file system doesn't support truncate (there are existing
+ * examples), which is fine, but we don't want to repeatedly do the setup work just to find that
+ * out every time. Check before doing work, and if there's a not-supported error, turn off
* future truncates.
*/
WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh));
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
index 615f4238aa3..af5eea8a1a6 100644
--- a/src/third_party/wiredtiger/src/log/log_auto.c
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -711,22 +711,23 @@ __wt_logop_prev_lsn_print(
int
__wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_t time_sec,
- uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts,
- uint64_t prepare_ts, uint64_t read_ts)
+ uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_commit_ts,
+ uint64_t prepare_ts, uint64_t read_ts, uint64_t pinned_read_ts)
{
- const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ);
+ const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQQ);
size_t size;
uint32_t optype, recsize;
optype = WT_LOGOP_TXN_TIMESTAMP;
WT_RET(__wt_struct_size(session, &size, fmt, optype, 0, time_sec, time_nsec, commit_ts,
- durable_ts, first_ts, prepare_ts, read_ts));
+ durable_ts, first_commit_ts, prepare_ts, read_ts, pinned_read_ts));
__wt_struct_size_adjust(session, &size);
WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
recsize = (uint32_t)size;
WT_RET(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, size, fmt, optype,
- recsize, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts));
+ recsize, time_sec, time_nsec, commit_ts, durable_ts, first_commit_ts, prepare_ts, read_ts,
+ pinned_read_ts));
logrec->size += (uint32_t)size;
return (0);
@@ -735,14 +736,15 @@ __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_
int
__wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp,
- uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp)
+ uint64_t *first_commit_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp, uint64_t *pinned_read_tsp)
{
WT_DECL_RET;
- const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ);
+ const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQQ);
uint32_t optype, size;
if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &optype, &size,
- time_secp, time_nsecp, commit_tsp, durable_tsp, first_tsp, prepare_tsp, read_tsp)) != 0)
+ time_secp, time_nsecp, commit_tsp, durable_tsp, first_commit_tsp, prepare_tsp, read_tsp,
+ pinned_read_tsp)) != 0)
WT_RET_MSG(session, ret, "logop_txn_timestamp: unpack failure");
WT_ASSERT(session, optype == WT_LOGOP_TXN_TIMESTAMP);
@@ -758,21 +760,25 @@ __wt_logop_txn_timestamp_print(
uint64_t time_nsec;
uint64_t commit_ts;
uint64_t durable_ts;
- uint64_t first_ts;
+ uint64_t first_commit_ts;
uint64_t prepare_ts;
uint64_t read_ts;
+ uint64_t pinned_read_ts;
WT_RET(__wt_logop_txn_timestamp_unpack(session, pp, end, &time_sec, &time_nsec, &commit_ts,
- &durable_ts, &first_ts, &prepare_ts, &read_ts));
+ &durable_ts, &first_commit_ts, &prepare_ts, &read_ts, &pinned_read_ts));
WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"txn_timestamp\",\n"));
WT_RET(__wt_fprintf(session, args->fs, " \"time_sec\": %" PRIu64 ",\n", time_sec));
WT_RET(__wt_fprintf(session, args->fs, " \"time_nsec\": %" PRIu64 ",\n", time_nsec));
WT_RET(__wt_fprintf(session, args->fs, " \"commit_ts\": %" PRIu64 ",\n", commit_ts));
WT_RET(__wt_fprintf(session, args->fs, " \"durable_ts\": %" PRIu64 ",\n", durable_ts));
- WT_RET(__wt_fprintf(session, args->fs, " \"first_ts\": %" PRIu64 ",\n", first_ts));
+ WT_RET(__wt_fprintf(
+ session, args->fs, " \"first_commit_ts\": %" PRIu64 ",\n", first_commit_ts));
WT_RET(__wt_fprintf(session, args->fs, " \"prepare_ts\": %" PRIu64 ",\n", prepare_ts));
- WT_RET(__wt_fprintf(session, args->fs, " \"read_ts\": %" PRIu64 "", read_ts));
+ WT_RET(__wt_fprintf(session, args->fs, " \"read_ts\": %" PRIu64 ",\n", read_ts));
+ WT_RET(
+ __wt_fprintf(session, args->fs, " \"pinned_read_ts\": %" PRIu64 "", pinned_read_ts));
return (0);
}
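
For readers decoding these records by hand, a hedged summary of the packing change: WT_UNCHECKED_STRING(IIQQQQQQQQ) now describes two 32-bit fields followed by eight 64-bit timestamps, with the new pinned_read_ts last. The struct below is only a visual aid for the field order, not a wire-format definition.

#include <stdint.h>

/* Field order behind the "IIQQQQQQQQ" format: I = uint32_t, Q = uint64_t. */
struct txn_timestamp_logop {
    uint32_t optype;          /* I  WT_LOGOP_TXN_TIMESTAMP */
    uint32_t recsize;         /* I  record size */
    uint64_t time_sec;        /* Q */
    uint64_t time_nsec;       /* Q */
    uint64_t commit_ts;       /* Q */
    uint64_t durable_ts;      /* Q */
    uint64_t first_commit_ts; /* Q  was "first_ts" */
    uint64_t prepare_ts;      /* Q */
    uint64_t read_ts;         /* Q */
    uint64_t pinned_read_ts;  /* Q  new in this change */
};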
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 18a26bebff0..6052d20025f 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -104,10 +104,10 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
} else {
primary = clsm->chunks[clsm->nchunks - 1]->cursor;
primary_chunk = clsm->primary_chunk;
- WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID));
+ WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_ID));
have_primary = (primary != NULL && primary_chunk != NULL &&
(primary_chunk->switch_txn == WT_TXN_NONE ||
- WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
+ WT_TXNID_LT(session->txn->id, primary_chunk->switch_txn)));
}
/*
@@ -160,7 +160,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
lsm_tree = clsm->lsm_tree;
session = (WT_SESSION_IMPL *)clsm->iface.session;
- txn = &session->txn;
+ txn = session->txn;
/* Merge cursors never update. */
if (F_ISSET(clsm, WT_CLSM_MERGE))
@@ -209,7 +209,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
clsm->nupdates = 1;
if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
- pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
+ pinned_id = WT_SESSION_TXN_SHARED(session)->pinned_id;
for (i = clsm->nchunks - 2; clsm->nupdates < clsm->nchunks; clsm->nupdates++, i--) {
switch_txn = clsm->chunks[i]->switch_txn;
if (WT_TXNID_LT(switch_txn, pinned_id))
@@ -429,7 +429,7 @@ __clsm_open_cursors(WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_
c = &clsm->iface;
cursor = NULL;
session = (WT_SESSION_IMPL *)c->session;
- txn = &session->txn;
+ txn = session->txn;
chunk = NULL;
locked = false;
lsm_tree = clsm->lsm_tree;
@@ -832,7 +832,7 @@ __clsm_position_chunk(WT_CURSOR_LSM *clsm, WT_CURSOR *c, bool forward, int *cmpp
* and stepping forward / back. In that case, keep going until we see a key in the expected
* range.
*/
- if (session->txn.isolation != WT_ISO_READ_UNCOMMITTED)
+ if (session->txn->isolation != WT_ISO_READ_UNCOMMITTED)
return (0);
WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, cursor, *cmpp));
@@ -1386,9 +1386,9 @@ __clsm_put(WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, const WT_ITEM *key, co
lsm_tree = clsm->lsm_tree;
- WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID) && clsm->primary_chunk != NULL &&
+ WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_ID) && clsm->primary_chunk != NULL &&
(clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
- WT_TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
+ WT_TXNID_LE(session->txn->id, clsm->primary_chunk->switch_txn)));
/*
* Clear the existing cursor position. Don't clear the primary cursor: we're about to use it
@@ -1618,12 +1618,11 @@ err:
CURSOR_UPDATE_API_END(session, ret);
/*
- * The application might do a WT_CURSOR.get_value call when we return,
- * so we need a value and the underlying functions didn't set one up.
- * For various reasons, those functions may not have done a search and
- * any previous value in the cursor might race with WT_CURSOR.reserve
- * (and in cases like LSM, the reserve never encountered the original
- * key). For simplicity, repeat the search here.
+ * The application might do a WT_CURSOR.get_value call when we return, so we need a value and
+ * the underlying functions didn't set one up. For various reasons, those functions may not have
+ * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in
+ * cases like LSM, the reserve never encountered the original key). For simplicity, repeat the
+ * search here.
*/
return (ret == 0 ? cursor->search(cursor) : ret);
}
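To make the effect of that final re-search concrete, here is a rough application-side sketch. It assumes an already-open WT_CONNECTION named conn, a table created with string key/value formats, and an error_check() wrapper that aborts on a non-zero return; none of those come from this patch. The point is only that get_value() is legal immediately after reserve() because the implementation repeats the search before returning:

    WT_CURSOR *cursor;
    WT_SESSION *session;
    const char *value;

    error_check(conn->open_session(conn, NULL, NULL, &session));
    error_check(session->open_cursor(session, "table:example", NULL, NULL, &cursor));

    error_check(session->begin_transaction(session, NULL));
    cursor->set_key(cursor, "some-key");
    error_check(cursor->reserve(cursor));            /* Lock the record without changing it. */
    error_check(cursor->get_value(cursor, &value));  /* Valid: the reserve path re-searched. */
    error_check(session->commit_transaction(session, NULL));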
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index ad71a1ee3dd..ed4a5b43c4b 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -403,10 +403,10 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LS
* Set read-uncommitted: we have already checked that all of the updates in this chunk are
* globally visible, use the cheapest possible check in reconciliation.
*/
- saved_isolation = session->txn.isolation;
- session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
+ saved_isolation = session->txn->isolation;
+ session->txn->isolation = WT_ISO_READ_UNCOMMITTED;
ret = __wt_sync_file(session, WT_SYNC_WRITE_LEAVES);
- session->txn.isolation = saved_isolation;
+ session->txn->isolation = saved_isolation;
WT_ERR(ret);
__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri);
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
index c9f985a10b7..66415b2cd62 100644
--- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -328,11 +328,10 @@ __wt_meta_block_metadata(WT_SESSION_IMPL *session, const char *config, WT_CKPT *
filecfg[1] = config;
/*
- * Find out if this file is encrypted. If encrypting, encrypt and encode.
- * The metadata has to be encrypted because it contains private data
- * (for example, column names). We pass the block manager text that
- * describes the metadata (the encryption information), and the
- * possibly encrypted metadata encoded as a hexadecimal string.
+ * Find out if this file is encrypted. If encrypting, encrypt and encode. The metadata has to be
+ * encrypted because it contains private data (for example, column names). We pass the block
+ * manager text that describes the metadata (the encryption information), and the possibly
+ * encrypted metadata encoded as a hexadecimal string.
*/
WT_ERR(__wt_btree_config_encryptor(session, filecfg, &kencryptor));
if (kencryptor == NULL) {
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index bc51dcb15e4..5b2710e8aba 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -111,7 +111,7 @@ int
__wt_meta_track_on(WT_SESSION_IMPL *session)
{
if (session->meta_track_nest++ == 0) {
- if (!F_ISSET(&session->txn, WT_TXN_RUNNING)) {
+ if (!F_ISSET(session->txn, WT_TXN_RUNNING)) {
#ifdef WT_ENABLE_SCHEMA_TXN
WT_RET(__wt_txn_begin(session, NULL));
__wt_errx(session, "TRACK: Using internal schema txn");
@@ -282,11 +282,11 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
* If this operation is part of a running transaction, that should be included in the
* checkpoint.
*/
- ckpt_session->txn.id = session->txn.id;
+ ckpt_session->txn->id = session->txn->id;
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_METADATA));
WT_WITH_DHANDLE(ckpt_session, WT_SESSION_META_DHANDLE(session),
WT_WITH_METADATA_LOCK(ckpt_session, ret = __wt_checkpoint(ckpt_session, NULL)));
- ckpt_session->txn.id = WT_TXN_NONE;
+ ckpt_session->txn->id = WT_TXN_NONE;
if (ret == 0)
WT_WITH_DHANDLE(
session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL));
@@ -316,7 +316,7 @@ err:
* We should have committed above unless we're unrolling, there was an error or the
* operation was a noop.
*/
- WT_ASSERT(session, unroll || saved_ret != 0 || session->txn.mod_count == 0);
+ WT_ASSERT(session, unroll || saved_ret != 0 || session->txn->mod_count == 0);
#ifdef WT_ENABLE_SCHEMA_TXN
__wt_err(session, saved_ret, "TRACK: Abort internal schema txn");
WT_TRET(__wt_txn_rollback(session, NULL));
@@ -521,7 +521,7 @@ __wt_meta_track_init(WT_SESSION_IMPL *session)
* Sessions default to read-committed isolation, we rely on that for the correctness of
* metadata checkpoints.
*/
- WT_ASSERT(session, conn->meta_ckpt_session->txn.isolation == WT_ISO_READ_COMMITTED);
+ WT_ASSERT(session, conn->meta_ckpt_session->txn->isolation == WT_ISO_READ_COMMITTED);
}
return (0);
diff --git a/src/third_party/wiredtiger/src/os_win/os_fs.c b/src/third_party/wiredtiger/src/os_win/os_fs.c
index 597dfaf81dd..276aae62796 100644
--- a/src/third_party/wiredtiger/src/os_win/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_win/os_fs.c
@@ -109,13 +109,11 @@ __win_fs_rename(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char
WT_ERR(__wt_to_utf16_string(session, to, &to_wide));
/*
- * We want an atomic rename, but that's not guaranteed by MoveFileExW
- * (or by any MSDN API). Don't set the MOVEFILE_COPY_ALLOWED flag to
- * prevent the system from falling back to a copy and delete process.
- * Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small
- * as possible, just in case. WiredTiger renames are done in a single
- * directory and we expect that to be an atomic metadata update on any
- * modern filesystem.
+ * We want an atomic rename, but that's not guaranteed by MoveFileExW (or by any MSDN API).
+ * Don't set the MOVEFILE_COPY_ALLOWED flag, to prevent the system from falling back to a copy
+ * and delete process. Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small as
+ * possible, just in case. WiredTiger renames are done in a single directory and we expect that
+ * to be an atomic metadata update on any modern filesystem.
*/
WT_WINCALL_RETRY(MoveFileExW(from_wide->data, to_wide->data,
MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH),
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index 5218aa52451..ffa4c94f1b2 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -968,7 +968,7 @@ compare:
* they're no longer useful.
*/
if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
+ WT_ERR(__wt_ovfl_remove(session, page, vpack));
}
/* Walk any append list. */
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
index ed8c6e3f80c..d65768aba49 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_row.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -879,7 +879,7 @@ __wt_rec_row_leaf(
} else {
/* The first time we find an overflow record, discard the underlying blocks. */
if (F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW) && vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
+ WT_ERR(__wt_ovfl_remove(session, page, vpack));
switch (upd->type) {
case WT_UPDATE_MODIFY:
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index b4dd84d58ee..e25b02a3104 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -63,67 +63,65 @@ __rec_append_orig_value(
WT_ASSERT(session, upd != NULL && unpack != NULL && unpack->type != WT_CELL_DEL);
- for (;; upd = upd->next) {
- /* Done if at least one self-contained update is globally visible. */
- if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd))
- return (0);
+ append = tombstone = NULL;
+ total_size = 0;
+ /* Review the current update list, checking conditions that mean no work is needed. */
+ for (;; upd = upd->next) {
/*
- * If the update is restored from the history store for the rollback to stable operation we
- * don't need the on-disk value anymore and we're done.
+ * Done if the update was restored from the history store for the rollback to stable
+ * operation.
*/
if (F_ISSET(upd, WT_UPDATE_RESTORED_FOR_ROLLBACK))
return (0);
- /* On page value already on chain */
- if (unpack != NULL && unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid)
+ /* Done if the on page value already appears on the update list. */
+ if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid)
+ return (0);
+
+ /*
+ * Done if at least one self-contained update is globally visible. It's tempting to pull
+ * this test out of the loop and only test the oldest self-contained update for global
+ * visibility (as visibility tests are expensive). However, when running at lower isolation
+ * levels, or when an application intentionally commits out of timestamp order, it's
+ * possible for an update on the chain to be globally visible and followed by an (earlier)
+ * update that is not yet globally visible.
+ */
+ if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd))
return (0);
- /* Leave reference at the last item in the chain. */
+ /* Leave reference pointing to the last item in the update list. */
if (upd->next == NULL)
break;
}
- /*
- * We need the original on-page value for some reader: get a copy and append it to the end of
- * the update list with a transaction ID that guarantees its visibility.
- *
- * If we don't have a value cell, it's an insert/append list key/value pair which simply doesn't
- * exist for some reader; place a deleted record at the end of the update list.
- *
- * If the an update is out of order so it masks the value in the cell, don't append.
- */
- append = tombstone = NULL; /* -Wconditional-uninitialized */
- total_size = size = 0; /* -Wconditional-uninitialized */
+ /* Done if the stop time pair of the onpage cell is globally visible. */
+ if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) &&
+ __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts))
+ return (0);
+
+ /* We need the original on-page value for some reader: get a copy. */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
+ WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD));
+ total_size += size;
+ append->txnid = unpack->start_txn;
+ append->start_ts = unpack->start_ts;
+ append->durable_ts = unpack->durable_start_ts;
/*
- * We need to append a TOMBSTONE before the onpage value if the onpage value has a valid
- * stop pair.
- *
- * Imagine a case we insert and delete a value respectively at timestamp 0 and 10, and later
- * insert it again at 20. We need the TOMBSTONE to tell us there is no value between 10 and
- * 20.
+ * Additionally, we need to append a tombstone before the onpage value we're about to append to
+ * the list, if the onpage value has a valid stop pair. Imagine a case where we insert and
+ * delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need
+ * the tombstone to tell us there is no value between 10 and 20.
*/
if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) {
- /* No need to append anything if the stop time pair is globally visible. */
- if (__wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts))
- return (0);
WT_ERR(__wt_update_alloc(session, NULL, &tombstone, &size, WT_UPDATE_TOMBSTONE));
+ total_size += size;
tombstone->txnid = unpack->stop_txn;
tombstone->start_ts = unpack->stop_ts;
tombstone->durable_ts = unpack->durable_stop_ts;
- total_size += size;
- }
-
- WT_ERR(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
- WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD));
- append->txnid = unpack->start_txn;
- append->start_ts = unpack->start_ts;
- append->durable_ts = unpack->durable_start_ts;
- total_size += size;
- if (tombstone != NULL) {
tombstone->next = append;
append = tombstone;
}
@@ -133,13 +131,12 @@ __rec_append_orig_value(
__wt_cache_page_inmem_incr(session, page, total_size);
+ if (0) {
err:
- __wt_scr_free(session, &tmp);
- /* Free append when tombstone allocation fails */
- if (ret != 0) {
__wt_free(session, append);
__wt_free(session, tombstone);
}
+ __wt_scr_free(session, &tmp);
return (ret);
}
@@ -156,8 +153,8 @@ __rec_need_save_upd(
/*
* Save updates for any reconciliation that doesn't involve history store (in-memory database
- * and fixed length column store), except when the maximum timestamp and txnid are globally
- * visible.
+ * and fixed length column store), except when the selected stop time pair or the selected start
+ * time pair is globally visible.
*/
if (!F_ISSET(r, WT_REC_HS) && !F_ISSET(r, WT_REC_IN_MEMORY) && r->page->type != WT_PAGE_COL_FIX)
return (false);
@@ -296,7 +293,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* checkpoint in a concurrent session.
*/
WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || upd == NULL ||
- upd->txnid == WT_TXN_NONE || upd->txnid != S2C(session)->txn_global.checkpoint_state.id ||
+ upd->txnid == WT_TXN_NONE ||
+ upd->txnid != S2C(session)->txn_global.checkpoint_txn_shared.id ||
WT_SESSION_IS_CHECKPOINT(session));
/* If all of the updates were aborted, quit. */
@@ -464,11 +462,17 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
/*
* Returning an update means the original on-page value might be lost, and that's a problem if
- * there's a reader that needs it. This call makes a copy of the on-page value. We do that any
- * time there are saved updates and during reconciliation of a backing overflow record that will
- * be physically removed once it's no longer needed.
+ * there's a reader that needs it, so make a copy of the on-page value. We do that any time
+ * there are saved updates (we may need the original on-page value to terminate the update chain,
+ * for example, in the case of an update that modifies the original value). Additionally, make a
+ * copy of the on-page value if the value is an overflow item and anything other than the
+ * on-page cell is being written: the value's backing overflow blocks aren't part of the page and
+ * are physically removed when checkpoint writes this page, that is, the checkpoint doesn't
+ * include the overflow blocks, so they're removed and future readers of this page won't be able
+ * to find them.
*/
- if (vpack != NULL && vpack->type != WT_CELL_DEL && upd_select->upd != NULL && upd_saved)
+ if (upd_select->upd != NULL && vpack != NULL && vpack->type != WT_CELL_DEL &&
+ (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW)))
WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack));
err:
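To illustrate the timeline described in the __rec_append_orig_value comments above (a key written at timestamp 0, deleted at 10, written again at 20), here is a small self-contained sketch of why the appended tombstone matters for readers between 10 and 20. The simplified upd_t type and read_at() helper are illustrative only, not WiredTiger structures:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct upd {
        uint64_t start_ts;   /* Commit timestamp of this entry. */
        bool tombstone;      /* True if the entry marks a deletion. */
        const char *value;
        struct upd *next;    /* Next (older) entry in the list. */
    } upd_t;

    /* Return the newest entry visible at read_ts, walking newest to oldest. */
    static const char *
    read_at(const upd_t *chain, uint64_t read_ts)
    {
        for (const upd_t *u = chain; u != NULL; u = u->next)
            if (u->start_ts <= read_ts)
                return (u->tombstone ? NULL : u->value);
        return (NULL);
    }

    int
    main(void)
    {
        upd_t orig = {0, false, "v0", NULL};      /* Appended copy of the original on-page value. */
        upd_t tomb = {10, true, NULL, &orig};     /* Tombstone built from the cell's stop time pair. */
        upd_t newer = {20, false, "v20", &tomb};  /* The newer in-memory update. */
        const char *v;

        printf("read@5:  %s\n", read_at(&newer, 5));           /* v0 */
        v = read_at(&newer, 15);
        printf("read@15: %s\n", v == NULL ? "(deleted)" : v);  /* deleted: the tombstone applies */
        printf("read@25: %s\n", read_at(&newer, 25));          /* v20 */
        return (0);
    }

Without the tombstone, a reader at timestamp 15 would fall through to the appended original value and incorrectly see "v0".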
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index e7cdc847da8..ff4fe361abe 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -42,12 +42,12 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage
/*
* Sanity check flags.
*
- * If we try to do eviction using transaction visibility, we had better
- * have a snapshot. This doesn't apply to checkpoints: there are
- * (rare) cases where we write data at read-uncommitted isolation.
+ * If we try to do eviction using transaction visibility, we had better have a snapshot. This
+ * doesn't apply to checkpoints: there are (rare) cases where we write data at read-uncommitted
+ * isolation.
*/
WT_ASSERT(session, !LF_ISSET(WT_REC_EVICT) || LF_ISSET(WT_REC_VISIBLE_ALL) ||
- F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
+ F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
/* It's an error to be called with a clean page. */
WT_ASSERT(session, __wt_page_is_modified(page));
@@ -225,11 +225,10 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
__rec_cleanup(session, r);
/*
- * When threads perform eviction, don't cache block manager structures
- * (even across calls), we can have a significant number of threads
- * doing eviction at the same time with large items. Ignore checkpoints,
- * once the checkpoint completes, all unnecessary session resources will
- * be discarded.
+ * When threads perform eviction, don't cache block manager structures (even across calls): we
+ * can have a significant number of threads doing eviction at the same time with large items.
+ * Ignore checkpoints; once the checkpoint completes, all unnecessary session resources will be
+ * discarded.
*/
if (!WT_SESSION_IS_CHECKPOINT(session)) {
/*
@@ -242,14 +241,6 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u
WT_TRET(__rec_destroy_session(session));
}
-
- /*
- * We track removed overflow objects in case there's a reader in transit when they're removed.
- * Any form of eviction locks out readers, we can discard them all.
- */
- if (LF_ISSET(WT_REC_EVICT))
- __wt_ovfl_discard_remove(session, page);
-
WT_RET(ret);
/*
@@ -308,15 +299,6 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
*/
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT) ||
(F_ISSET(r, WT_REC_HS | WT_REC_IN_MEMORY) || page->type == WT_PAGE_COL_FIX));
-
- /*
- * We have written the page, but something prevents it from being evicted. If we wrote the
- * newest versions of updates, the on-disk page may contain records that are newer than what
- * checkpoint would write. Make sure that checkpoint visits the page and (if necessary)
- * fixes things up.
- */
- if (r->hs_skew_newest)
- mod->first_dirty_txn = WT_TXN_FIRST;
} else {
/*
* Track the page's maximum transaction ID (used to decide if we can evict a clean page and
@@ -518,52 +500,25 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
* checkpoints into account.
*/
if (WT_IS_METADATA(session->dhandle)) {
- WT_ORDERED_READ(ckpt_txn, txn_global->checkpoint_state.id);
+ WT_ORDERED_READ(ckpt_txn, txn_global->checkpoint_txn_shared.id);
if (ckpt_txn != WT_TXN_NONE && WT_TXNID_LT(ckpt_txn, r->last_running))
r->last_running = ckpt_txn;
}
- /*
- * Decide whether to skew on-page values towards newer or older versions. This is a heuristic
- * attempting to minimize the number of pages that need to be rewritten by future checkpoints.
- *
- * We usually prefer to skew to newer versions, the logic being that by the time the next
- * checkpoint runs, it is likely that all the updates we choose will be stable. However, if
- * checkpointing with a timestamp (indicated by a stable_timestamp being set), and there is a
- * checkpoint already running, or this page was read with history store history, or the stable
- * timestamp hasn't changed since last time this page was successfully, skew oldest instead.
- */
- if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) &&
- __wt_random(&session->rnd) % 3 == 0)
- r->hs_skew_newest = false;
- else
- r->hs_skew_newest = LF_ISSET(WT_REC_HS) && LF_ISSET(WT_REC_VISIBLE_ALL);
-
- if (r->hs_skew_newest && !__wt_btree_immediately_durable(session) &&
- txn_global->has_stable_timestamp &&
- ((btree->checkpoint_gen != __wt_gen(session, WT_GEN_CHECKPOINT) &&
- txn_global->stable_is_pinned) ||
- FLD_ISSET(page->modify->restore_state, WT_PAGE_RS_HS) ||
- page->modify->last_stable_timestamp == txn_global->stable_timestamp))
- r->hs_skew_newest = false;
-
/* When operating on the history store table, we should never try history store eviction. */
WT_ASSERT(session, !F_ISSET(btree, WT_BTREE_HS) || !LF_ISSET(WT_REC_HS));
/*
- * History store table eviction is configured when eviction gets aggressive,
- * adjust the flags for cases we don't support.
+ * History store table eviction is configured when eviction gets aggressive, adjust the flags
+ * for cases we don't support.
*
- * We don't yet support fixed-length column-store combined with the
- * history store table. It's not hard to do, but the underlying function
- * that reviews which updates can be written to the evicted page and
- * which updates need to be written to the history store table needs access
- * to the original value from the page being evicted, and there's no
- * code path for that in the case of fixed-length column-store objects.
- * (Row-store and variable-width column-store objects provide a
- * reference to the unpacked on-page cell for this purpose, but there
- * isn't an on-page cell for fixed-length column-store objects.) For
- * now, turn it off.
+ * We don't yet support fixed-length column-store combined with the history store table. It's
+ * not hard to do, but the underlying function that reviews which updates can be written to the
+ * evicted page and which updates need to be written to the history store table needs access to
+ * the original value from the page being evicted, and there's no code path for that in the case
+ * of fixed-length column-store objects. (Row-store and variable-width column-store objects
+ * provide a reference to the unpacked on-page cell for this purpose, but there isn't an on-page
+ * cell for fixed-length column-store objects.) For now, turn it off.
*/
if (page->type == WT_PAGE_COL_FIX)
LF_CLR(WT_REC_HS);
@@ -755,23 +710,20 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
switch (page->type) {
case WT_PAGE_COL_FIX:
/*
- * Column-store pages can grow if there are missing records
- * (that is, we lost a chunk of the range, and have to write
- * deleted records). Fixed-length objects are a problem, if
- * there's a big missing range, we could theoretically have to
- * write large numbers of missing objects.
+ * Column-store pages can grow if there are missing records (that is, we lost a chunk of the
+ * range, and have to write deleted records). Fixed-length objects are a problem: if there's
+ * a big missing range, we could theoretically have to write large numbers of missing
+ * objects.
*/
page_size = (uint32_t)WT_ALIGN(
WT_FIX_ENTRIES_TO_BYTES(btree, r->salvage->take + r->salvage->missing), btree->allocsize);
break;
case WT_PAGE_COL_VAR:
/*
- * Column-store pages can grow if there are missing records
- * (that is, we lost a chunk of the range, and have to write
- * deleted records). Variable-length objects aren't usually a
- * problem because we can write any number of deleted records
- * in a single page entry because of the RLE, we just need to
- * ensure that additional entry fits.
+ * Column-store pages can grow if there are missing records (that is, we lost a chunk of the
+ * range, and have to write deleted records). Variable-length objects aren't usually a
+ * problem because we can write any number of deleted records in a single page entry thanks to
+ * the RLE; we just need to ensure that the additional entry fits.
*/
break;
case WT_PAGE_ROW_LEAF:
@@ -946,15 +898,14 @@ __wt_rec_split_init(
}
/*
- * Ensure the disk image buffer is large enough for the max object, as
- * corrected by the underlying block manager.
+ * Ensure the disk image buffer is large enough for the max object, as corrected by the
+ * underlying block manager.
*
- * Since we want to support split_size values larger than the page size
- * (to allow for adjustments based on the compression), this buffer
- * should be the greater of split_size and page_size, then aligned to
- * the next allocation size boundary. The latter shouldn't be an issue,
- * but it's a possible scenario if, for example, the compression engine
- * is expected to give us 5x compression and gives us nothing at all.
+ * Since we want to support split_size values larger than the page size (to allow for
+ * adjustments based on the compression), this buffer should be the greater of split_size and
+ * page_size, then aligned to the next allocation size boundary. The latter shouldn't be an
+ * issue, but it's a possible scenario if, for example, the compression engine is expected to
+ * give us 5x compression and gives us nothing at all.
*/
corrected_page_size = r->page_size;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
@@ -1626,12 +1577,11 @@ __rec_split_write_reuse(
return (false);
/*
- * Quit if evicting with no previously written block to compare against.
- * (In other words, if there's eviction pressure and the page was never
- * written by a checkpoint, calculating a checksum is worthless.)
+ * Quit if evicting with no previously written block to compare against. (In other words, if
+ * there's eviction pressure and the page was never written by a checkpoint, calculating a
+ * checksum is worthless.)
*
- * Quit if evicting and a previous check failed, once there's a miss no
- * future block will match.
+ * Quit if evicting and a previous check failed; once there's a miss, no future block will match.
*/
if (F_ISSET(r, WT_REC_EVICT)) {
if (mod->rec_result != WT_PM_REC_MULTIBLOCK || mod->mod_multi_entries < r->multi_next)
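As a quick arithmetic sketch of the sizing rule in __wt_rec_split_init's comment above, the helper below spells out the max/align steps directly rather than using WiredTiger's macros; it is illustrative only:

    #include <stdint.h>

    /* Greater of split_size and page_size, rounded up to the next allocation-size boundary. */
    static uint32_t
    disk_image_buffer_size(uint32_t page_size, uint32_t split_size, uint32_t allocsize)
    {
        uint32_t size;

        size = split_size > page_size ? split_size : page_size;
        return ((size + allocsize - 1) / allocsize * allocsize);
    }

For example, with a 32KB page size, a 30KB split size and a 4KB allocation size the buffer stays at 32KB, which is what keeps it large enough when the compression engine delivers nothing at all.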
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
index c6e750bea9e..16365ba94c5 100644
--- a/src/third_party/wiredtiger/src/schema/schema_util.c
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -84,7 +84,7 @@ __wt_schema_internal_session(WT_SESSION_IMPL *session, WT_SESSION_IMPL **int_ses
* flags from the original.
*/
*int_sessionp = session;
- if (F_ISSET(&session->txn, WT_TXN_RUNNING)) {
+ if (F_ISSET(session->txn, WT_TXN_RUNNING)) {
/* We should not have a schema txn running now. */
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_TXN));
WT_RET(
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 627a204c9b2..24acc8da2d9 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -132,8 +132,8 @@ __wt_session_copy_values(WT_SESSION_IMPL *session)
* We have to do this with a transaction ID pinned unless the cursor is reading from a
* checkpoint.
*/
- WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session);
- WT_ASSERT(session, txn_state->pinned_id != WT_TXN_NONE ||
+ WT_TXN_SHARED *txn_shared = WT_SESSION_TXN_SHARED(session);
+ WT_ASSERT(session, txn_shared->pinned_id != WT_TXN_NONE ||
(WT_PREFIX_MATCH(cursor->uri, "file:") &&
F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN)));
#endif
@@ -184,14 +184,12 @@ static void
__session_clear(WT_SESSION_IMPL *session)
{
/*
- * There's no serialization support around the review of the hazard
- * array, which means threads checking for hazard pointers first check
- * the active field (which may be 0) and then use the hazard pointer
- * (which cannot be NULL).
+ * There's no serialization support around the review of the hazard array, which means threads
+ * checking for hazard pointers first check the active field (which may be 0) and then use the
+ * hazard pointer (which cannot be NULL).
*
- * Additionally, the session structure can include information that
- * persists past the session's end-of-life, stored as part of page
- * splits.
+ * Additionally, the session structure can include information that persists past the session's
+ * end-of-life, stored as part of page splits.
*
* For these reasons, be careful when clearing the session structure.
*/
@@ -274,13 +272,13 @@ __session_close(WT_SESSION *wt_session, const char *config)
F_CLR(session, WT_SESSION_CACHE_CURSORS);
/* Rollback any active transaction. */
- if (F_ISSET(&session->txn, WT_TXN_RUNNING))
+ if (F_ISSET(session->txn, WT_TXN_RUNNING))
WT_TRET(__session_rollback_transaction(wt_session, NULL));
/*
* Also release any pinned transaction ID from a non-transactional operation.
*/
- if (conn->txn_global.states != NULL)
+ if (conn->txn_global.txn_shared_list != NULL)
__wt_txn_release_snapshot(session);
/* Close all open cursors. */
@@ -1644,7 +1642,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
WT_TXN *txn;
session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
+ txn = session->txn;
SESSION_API_CALL_PREPARE_ALLOWED(session, commit_transaction, config, cfg);
WT_STAT_CONN_INCR(session, txn_commit);
@@ -1748,7 +1746,7 @@ __session_rollback_transaction(WT_SESSION *wt_session, const char *config)
SESSION_API_CALL_PREPARE_ALLOWED(session, rollback_transaction, config, cfg);
WT_STAT_CONN_INCR(session, txn_rollback);
- txn = &session->txn;
+ txn = session->txn;
if (F_ISSET(txn, WT_TXN_PREPARE)) {
WT_STAT_CONN_INCR(session, txn_prepare_rollback);
WT_STAT_CONN_DECR(session, txn_prepare_active);
@@ -1816,19 +1814,19 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
uint64_t pinned;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL_PREPARE_NOT_ALLOWED_NOCONF(session, transaction_pinned_range);
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
/* Assign pinned to the lesser of id or snap_min */
- if (txn_state->id != WT_TXN_NONE && WT_TXNID_LT(txn_state->id, txn_state->pinned_id))
- pinned = txn_state->id;
+ if (txn_shared->id != WT_TXN_NONE && WT_TXNID_LT(txn_shared->id, txn_shared->pinned_id))
+ pinned = txn_shared->id;
else
- pinned = txn_state->pinned_id;
+ pinned = txn_shared->pinned_id;
if (pinned == WT_TXN_NONE)
*prange = 0;
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
index 34101eb2242..45580b7d0cc 100644
--- a/src/third_party/wiredtiger/src/support/hazard.c
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -156,14 +156,13 @@ __wt_hazard_set_func(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
}
/*
- * The page isn't available, it's being considered for eviction
- * (or being evicted, for all we know). If the eviction server
- * sees our hazard pointer before evicting the page, it will
- * return the page to use, no harm done, if it doesn't, it will
- * go ahead and complete the eviction.
+ * The page isn't available, it's being considered for eviction (or being evicted, for all we
+ * know). If the eviction server sees our hazard pointer before evicting the page, it will
+ * return the page to use, no harm done; if it doesn't, it will go ahead and complete the
+ * eviction.
*
- * We don't bother publishing this update: the worst case is we
- * prevent some random page from being evicted.
+ * We don't bother publishing this update: the worst case is we prevent some random page from
+ * being evicted.
*/
hp->ref = NULL;
*busyp = true;
@@ -244,15 +243,13 @@ __wt_hazard_close(WT_SESSION_IMPL *session)
#endif
/*
- * Clear any hazard pointers because it's not a correctness problem
- * (any hazard pointer we find can't be real because the session is
- * being closed when we're called). We do this work because session
- * close isn't that common that it's an expensive check, and we don't
- * want to let a hazard pointer lie around, keeping a page from being
- * evicted.
+ * Clear any hazard pointers because it's not a correctness problem (any hazard pointer we find
+ * can't be real because the session is being closed when we're called). We do this work because
+ * session close isn't common enough for this to be an expensive check, and we don't want to let a
+ * hazard pointer lie around, keeping a page from being evicted.
*
- * We don't panic: this shouldn't be a correctness issue (at least, I
- * can't think of a reason it would be).
+ * We don't panic: this shouldn't be a correctness issue (at least, I can't think of a reason it
+ * would be).
*/
for (hp = session->hazard; hp < session->hazard + session->hazard_inuse; ++hp)
if (hp->ref != NULL) {
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
index 768024617ae..2bd9b6daea6 100644
--- a/src/third_party/wiredtiger/src/support/huffman.c
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -310,10 +310,9 @@ __wt_huffman_open(
WT_RET(__wt_calloc_one(session, &huffman));
/*
- * The frequency table is 4B pairs of symbol and frequency. The symbol
- * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
- * (a frequency of 0 means the value is never expected to appear in the
- * input). Validate the symbols are within range.
+ * The frequency table is 4B pairs of symbol and frequency. The symbol is either 1 or 2 bytes
+ * and the frequency ranges from 1 to UINT32_MAX (a frequency of 0 means the value is never
+ * expected to appear in the input). Validate the symbols are within range.
*/
if (numbytes != 1 && numbytes != 2)
WT_ERR_MSG(session, EINVAL,
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
index e6f5e9dbbb0..f245502bc9f 100644
--- a/src/third_party/wiredtiger/src/support/pow.c
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -101,11 +101,10 @@ bool
__wt_ispo2(uint32_t v)
{
/*
- * Only numbers that are powers of two will satisfy the relationship
- * (v & (v - 1) == 0).
+ * Only numbers that are powers of two will satisfy the relationship (v & (v - 1) == 0).
*
- * However n must be positive, this returns 0 as a power of 2; to fix
- * that, use: (! (v & (v - 1)) && v)
+ * However, v must be positive: as written, this treats 0 as a power of 2; to fix that, use
+ * (!(v & (v - 1)) && v).
*/
return ((v & (v - 1)) == 0);
}
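A tiny sketch of the zero-safe variant the comment points at (illustrative only; __wt_ispo2 itself deliberately keeps the simpler form):

    #include <stdbool.h>
    #include <stdint.h>

    static inline bool
    ispo2_nonzero(uint32_t v)
    {
        /* Reject 0 explicitly; for v > 0, only powers of two satisfy (v & (v - 1)) == 0. */
        return (v != 0 && (v & (v - 1)) == 0);
    }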
diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c
index b2da78afa8e..aca2a3d9aa7 100644
--- a/src/third_party/wiredtiger/src/support/thread_group.c
+++ b/src/third_party/wiredtiger/src/support/thread_group.c
@@ -176,8 +176,7 @@ __thread_group_resize(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, uint32_t
for (i = group->max; i < new_max; i++) {
WT_ERR(__wt_calloc_one(session, &thread));
/*
- * Threads get their own session and hs table cursor
- * (if the hs table is open).
+ * Threads get their own session and hs table cursor (if the hs table is open).
*/
session_flags = LF_ISSET(WT_THREAD_CAN_WAIT) ? WT_SESSION_CAN_WAIT : 0;
WT_ERR(
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 39b9db2ef69..58607c7cf2c 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -71,20 +71,20 @@ __txn_remove_from_global_table(WT_SESSION_IMPL *session)
#ifdef HAVE_DIAGNOSTIC
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running));
- WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE);
+ WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_shared->id != WT_TXN_NONE);
#else
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
#endif
- WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ WT_PUBLISH(txn_shared->id, WT_TXN_NONE);
}
/*
@@ -96,7 +96,7 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
{
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
if (n > 1)
__snapsort(txn->snapshot, n);
@@ -118,22 +118,22 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
- WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE ||
- session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
- !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE));
+ WT_ASSERT(session, txn_shared->pinned_id == WT_TXN_NONE ||
+ session->txn->isolation == WT_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_shared->pinned_id, WT_TS_NONE));
- txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
+ txn_shared->metadata_pinned = txn_shared->pinned_id = WT_TXN_NONE;
F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
/* Clear a checkpoint's pinned ID. */
if (WT_SESSION_IS_CHECKPOINT(session)) {
- txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+ txn_global->checkpoint_txn_shared.pinned_id = WT_TXN_NONE;
txn_global->checkpoint_timestamp = 0;
}
@@ -150,14 +150,14 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s, *txn_state;
+ WT_TXN_SHARED *s, *txn_shared;
uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id;
uint32_t i, n, session_cnt;
conn = S2C(session);
- txn = &session->txn;
+ txn = session->txn;
txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
n = 0;
/* Fast path if we already have the current snapshot. */
@@ -179,14 +179,14 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
* changes the checkpoint has written to the metadata. We don't have to keep the checkpoint's
* changes pinned so don't include it in the published pinned ID.
*/
- if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
+ if ((id = txn_global->checkpoint_txn_shared.id) != WT_TXN_NONE) {
txn->snapshot[n++] = id;
- txn_state->metadata_pinned = id;
+ txn_shared->metadata_pinned = id;
}
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
- txn_state->pinned_id = current_id;
+ txn_shared->pinned_id = current_id;
/* Check that the oldest ID has not moved in the meantime. */
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
goto done;
@@ -194,7 +194,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) {
/*
* Build our snapshot of any concurrent transaction IDs.
*
@@ -209,7 +209,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
* this case, we ignore this transaction because it would
* not be visible to the current snapshot.
*/
- while (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) &&
+ while (s != txn_shared && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) &&
WT_TXNID_LT(id, current_id)) {
/*
* If the transaction is still allocating its ID, then we spin here until it gets its
@@ -240,7 +240,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
*/
WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- txn_state->pinned_id = pinned_id;
+ txn_shared->pinned_id = pinned_id;
done:
__wt_readunlock(session, &txn_global->rwlock);
@@ -258,7 +258,7 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
+ WT_TXN_SHARED *s;
uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
uint32_t i, session_cnt;
@@ -269,12 +269,12 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last
/* The oldest ID cannot change while we are holding the scan lock. */
prev_oldest_id = txn_global->oldest_id;
last_running = oldest_id = txn_global->current;
- if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
+ if ((metadata_pinned = txn_global->checkpoint_txn_shared.id) == WT_TXN_NONE)
metadata_pinned = oldest_id;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) {
/* Update the last running transaction ID. */
while ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) &&
WT_TXNID_LT(id, last_running)) {
@@ -422,7 +422,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
"old snapshot %" PRIu64 " pinned in session %" PRIu32
" [%s]"
" with snap_min %" PRIu64,
- oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min);
+ oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn->snap_min);
}
}
@@ -442,7 +442,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
wt_timestamp_t read_ts;
- txn = &session->txn;
+ txn = session->txn;
WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
if (cval.len != 0)
@@ -516,7 +516,7 @@ __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config)
WT_DECL_RET;
WT_TXN *txn;
- txn = &session->txn;
+ txn = session->txn;
ret = __wt_config_getones(session, config, "isolation", &cval);
if (ret == 0 && cval.len != 0) {
@@ -540,7 +540,7 @@ __wt_txn_release(WT_SESSION_IMPL *session)
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
WT_ASSERT(session, txn->mod_count == 0);
@@ -548,8 +548,8 @@ __wt_txn_release(WT_SESSION_IMPL *session)
/* Clear the transaction's ID from the global table. */
if (WT_SESSION_IS_CHECKPOINT(session)) {
- WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
- txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;
+ WT_ASSERT(session, WT_SESSION_TXN_SHARED(session)->id == WT_TXN_NONE);
+ txn->id = txn_global->checkpoint_txn_shared.id = WT_TXN_NONE;
/*
* Be extra careful to cleanup everything for checkpoints: once the global checkpoint ID is
@@ -563,7 +563,7 @@ __wt_txn_release(WT_SESSION_IMPL *session)
if (!F_ISSET(txn, WT_TXN_PREPARE))
__txn_remove_from_global_table(session);
else
- WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
+ WT_ASSERT(session, WT_SESSION_TXN_SHARED(session)->id == WT_TXN_NONE);
txn->id = WT_TXN_NONE;
}
@@ -613,7 +613,7 @@ __txn_search_prepared_op(
*updp = NULL;
- txn = &session->txn;
+ txn = session->txn;
cursor = *cursorp;
if (cursor == NULL || ((WT_CURSOR_BTREE *)cursor)->btree->id != op->btree->id) {
@@ -669,7 +669,7 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
WT_TXN *txn;
WT_UPDATE *upd;
- txn = &session->txn;
+ txn = session->txn;
WT_RET(__txn_search_prepared_op(session, op, cursorp, &upd));
@@ -735,7 +735,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
u_int i;
bool op_zero_ts, upd_zero_ts;
- txn = &session->txn;
+ txn = session->txn;
cursor = NULL;
durable_op_timestamp = prev_op_timestamp = WT_TS_NONE;
@@ -817,7 +817,7 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
upd_zero_ts = prev_op_timestamp == WT_TS_NONE;
if (op_zero_ts != upd_zero_ts) {
WT_ERR(__wt_verbose_dump_update(session, upd));
- WT_ERR(__wt_verbose_dump_txn_one(session, &session->txn, EINVAL,
+ WT_ERR(__wt_verbose_dump_txn_one(session, session, EINVAL,
"per-key timestamps used inconsistently, dumping relevant information"));
}
/*
@@ -893,7 +893,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
u_int i;
bool locked, prepare, readonly, update_durable_ts;
- txn = &session->txn;
+ txn = session->txn;
conn = S2C(session);
cursor = NULL;
txn_global = &conn->txn_global;
@@ -1163,7 +1163,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
int64_t txn_prepared_updates_count;
u_int i;
- txn = &session->txn;
+ txn = session->txn;
txn_prepared_updates_count = 0;
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
@@ -1261,7 +1261,7 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
WT_STAT_CONN_INCR(session, txn_prepared_updates_count);
/* Set transaction state to prepare. */
- F_SET(&session->txn, WT_TXN_PREPARE);
+ F_SET(session->txn, WT_TXN_PREPARE);
/* Release our snapshot in case it is keeping data pinned. */
__wt_txn_release_snapshot(session);
@@ -1294,7 +1294,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
WT_UNUSED(cfg);
cursor = NULL;
- txn = &session->txn;
+ txn = session->txn;
prepare = F_ISSET(txn, WT_TXN_PREPARE);
readonly = txn->mod_count == 0;
@@ -1389,7 +1389,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
int
__wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason)
{
- session->txn.rollback_reason = reason;
+ session->txn->rollback_reason = reason;
return (WT_ROLLBACK);
}
@@ -1402,18 +1402,15 @@ __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
{
WT_TXN *txn;
- txn = &session_ret->txn;
+ /* Allocate the WT_TXN structure, including a variable length array of snapshot information. */
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_TXN) + sizeof(txn->snapshot[0]) * S2C(session)->session_size, &session_ret->txn));
+ txn = session_ret->txn;
+ txn->snapshot = txn->__snapshot;
txn->id = WT_TXN_NONE;
- WT_RET(__wt_calloc_def(session, S2C(session_ret)->session_size, &txn->snapshot));
-
-#ifdef HAVE_DIAGNOSTIC
- if (S2C(session_ret)->txn_global.states != NULL) {
- WT_TXN_STATE *txn_state;
- txn_state = WT_SESSION_TXN_STATE(session_ret);
- WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
- }
-#endif
+ WT_ASSERT(session, S2C(session_ret)->txn_global.txn_shared_list == NULL ||
+ WT_SESSION_TXN_SHARED(session_ret)->pinned_id == WT_TXN_NONE);
/*
* Take care to clean these out in case we are reusing the transaction for eviction.
@@ -1443,7 +1440,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
conn = S2C(session);
txn_global = &conn->txn_global;
stats = conn->stats;
- checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
+ checkpoint_pinned = txn_global->checkpoint_txn_shared.pinned_id;
WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id);
@@ -1471,11 +1468,13 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
checkpoint_pinned == WT_TXN_NONE ? 0 : txn_global->current - checkpoint_pinned);
WT_STAT_SET(session, stats, txn_checkpoint_prep_max, conn->ckpt_prep_max);
- WT_STAT_SET(session, stats, txn_checkpoint_prep_min, conn->ckpt_prep_min);
+ if (conn->ckpt_prep_min != UINT64_MAX)
+ WT_STAT_SET(session, stats, txn_checkpoint_prep_min, conn->ckpt_prep_min);
WT_STAT_SET(session, stats, txn_checkpoint_prep_recent, conn->ckpt_prep_recent);
WT_STAT_SET(session, stats, txn_checkpoint_prep_total, conn->ckpt_prep_total);
WT_STAT_SET(session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
- WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
+ if (conn->ckpt_time_min != UINT64_MAX)
+ WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
WT_STAT_SET(session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
WT_STAT_SET(session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
WT_STAT_SET(session, stats, txn_durable_queue_len, txn_global->durable_timestampq_len);
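The two guarded WT_STAT_SET calls above only make sense if the minimums now start out at UINT64_MAX rather than 0; assuming that initialization (it isn't visible in this hunk), the sentinel pattern looks roughly like this sketch:

    #include <stdint.h>

    static uint64_t ckpt_prep_min = UINT64_MAX;  /* Sentinel: nothing measured yet. */

    /* Record a prepare duration; with the sentinel, a 0msec prepare is a real minimum, not "unset". */
    static void
    record_prep_msec(uint64_t msec)
    {
        if (msec < ckpt_prep_min)
            ckpt_prep_min = msec;
    }

    /* Report the statistic only once a real value has been recorded. */
    static uint64_t
    report_prep_min(void)
    {
        return (ckpt_prep_min == UINT64_MAX ? 0 : ckpt_prep_min);
    }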
@@ -1491,7 +1490,8 @@ __wt_txn_release_resources(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- txn = &session->txn;
+ if ((txn = session->txn) == NULL)
+ return;
WT_ASSERT(session, txn->mod_count == 0);
__wt_free(session, txn->mod);
@@ -1507,7 +1507,7 @@ void
__wt_txn_destroy(WT_SESSION_IMPL *session)
{
__wt_txn_release_resources(session);
- __wt_free(session, session->txn.snapshot);
+ __wt_free(session, session->txn);
}
/*
@@ -1519,7 +1519,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CONNECTION_IMPL *conn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
+ WT_TXN_SHARED *s;
u_int i;
WT_UNUSED(cfg);
@@ -1539,9 +1539,9 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RWLOCK_INIT_TRACKED(session, &txn_global->read_timestamp_rwlock, read_timestamp);
TAILQ_INIT(&txn_global->read_timestamph);
- WT_RET(__wt_calloc_def(session, conn->session_size, &txn_global->states));
+ WT_RET(__wt_calloc_def(session, conn->session_size, &txn_global->txn_shared_list));
- for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
+ for (i = 0, s = txn_global->txn_shared_list; i < conn->session_size; i++, s++)
s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
return (0);
@@ -1568,7 +1568,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
__wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock);
__wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock);
__wt_rwlock_destroy(session, &txn_global->visibility_rwlock);
- __wt_free(session, txn_global->states);
+ __wt_free(session, txn_global->txn_shared_list);
}
/*
@@ -1669,11 +1669,11 @@ int
__wt_txn_is_blocking(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
uint64_t global_oldest;
- txn = &session->txn;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
global_oldest = S2C(session)->txn_global.oldest_id;
/* We can't roll back prepared transactions. */
@@ -1691,7 +1691,7 @@ __wt_txn_is_blocking(WT_SESSION_IMPL *session)
/*
* Check if either the transaction's ID or its pinned ID is equal to the oldest transaction ID.
*/
- return (txn_state->id == global_oldest || txn_state->pinned_id == global_oldest ?
+ return (txn_shared->id == global_oldest || txn_shared->pinned_id == global_oldest ?
__wt_txn_rollback_required(
session, "oldest pinned transaction ID rolled back for eviction") :
0);
@@ -1703,12 +1703,17 @@ __wt_txn_is_blocking(WT_SESSION_IMPL *session)
*/
int
__wt_verbose_dump_txn_one(
- WT_SESSION_IMPL *session, WT_TXN *txn, int error_code, const char *error_string)
+ WT_SESSION_IMPL *session, WT_SESSION_IMPL *txn_session, int error_code, const char *error_string)
{
+ WT_TXN *txn;
+ WT_TXN_SHARED *txn_shared;
char buf[512];
- char ts_string[5][WT_TS_INT_STRING_SIZE];
+ char ts_string[7][WT_TS_INT_STRING_SIZE];
const char *iso_tag;
+ txn = txn_session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(txn_session);
+
WT_NOT_READ(iso_tag, "INVALID");
switch (txn->isolation) {
case WT_ISO_READ_COMMITTED:
@@ -1734,6 +1739,8 @@ __wt_verbose_dump_txn_one(
", first_commit_timestamp: %s"
", prepare_timestamp: %s"
", read_timestamp: %s"
+ ", pinned_durable_timestamp: %s"
+ ", pinned_read_timestamp: %s"
", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]"
", full checkpoint: %s"
", rollback reason: %s"
@@ -1743,8 +1750,10 @@ __wt_verbose_dump_txn_one(
__wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]),
__wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]),
__wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]),
- __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), txn->ckpt_lsn.l.file,
- txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false",
+ __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]),
+ __wt_timestamp_to_string(txn_shared->pinned_durable_timestamp, ts_string[5]),
+ __wt_timestamp_to_string(txn_shared->pinned_read_timestamp, ts_string[6]),
+ txn->ckpt_lsn.l.file, txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false",
txn->rollback_reason == NULL ? "" : txn->rollback_reason, txn->flags, iso_tag));
/*
@@ -1769,7 +1778,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *sess;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
+ WT_TXN_SHARED *s;
uint64_t id;
uint32_t i, session_cnt;
char ts_string[WT_TS_INT_STRING_SIZE];
@@ -1808,9 +1817,9 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
__wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? "yes" : "no"));
WT_RET(
__wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT)));
- WT_RET(
- __wt_msg(session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_state.pinned_id));
- WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id));
+ WT_RET(__wt_msg(
+ session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_txn_shared.pinned_id));
+ WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txn_shared.id));
WT_ORDERED_READ(session_cnt, conn->session_cnt);
WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
@@ -1821,7 +1830,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
* handles is not thread safe, so some information may change while traversing if other threads
* are active at the same time, which is OK since this is diagnostic code.
*/
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ for (i = 0, s = txn_global->txn_shared_list; i < session_cnt; i++, s++) {
/* Skip sessions with no active transaction */
if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
continue;
@@ -1829,7 +1838,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session)
WT_RET(__wt_msg(session,
"ID: %" PRIu64 ", pinned ID: %" PRIu64 ", metadata pinned ID: %" PRIu64 ", name: %s", id,
s->pinned_id, s->metadata_pinned, sess->name == NULL ? "EMPTY" : sess->name));
- WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn, 0, NULL));
+ WT_RET(__wt_verbose_dump_txn_one(session, sess, 0, NULL));
}
return (0);
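The single-allocation change in __wt_txn_init above relies on WT_TXN now ending in a variable-length snapshot array. As a generic sketch of that pattern (the txn_t names below are illustrative, not the WiredTiger definitions):

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct {
        uint64_t id;
        uint64_t *snapshot;         /* Defaults to the trailing storage below. */
        uint64_t snapshot_array[];  /* Flexible array member, sized per session count. */
    } txn_t;

    static txn_t *
    txn_alloc(uint32_t session_size)
    {
        txn_t *txn;

        /* One calloc covers the structure and its variable-length snapshot storage. */
        if ((txn = calloc(1, sizeof(txn_t) + sizeof(txn->snapshot_array[0]) * session_size)) == NULL)
            return (NULL);
        txn->snapshot = txn->snapshot_array;
        return (txn);
    }

Allocating the structure and the array together means a single free releases both, which is consistent with __wt_txn_destroy now freeing session->txn rather than a separate snapshot buffer.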
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index c72f107bf08..af97d01a0fb 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -269,18 +269,16 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
return (0);
/*
- * We may have raced between starting the checkpoint transaction and
- * some operation completing on the handle that updated the metadata
- * (e.g., closing a bulk load cursor). All such operations either have
- * exclusive access to the handle or hold the schema lock. We are now
- * holding the schema lock and have an open btree handle, so if we
- * can't update the metadata, then there has been some state change
- * invisible to the checkpoint transaction.
+ * We may have raced between starting the checkpoint transaction and some operation completing
+ * on the handle that updated the metadata (e.g., closing a bulk load cursor). All such
+ * operations either have exclusive access to the handle or hold the schema lock. We are now
+ * holding the schema lock and have an open btree handle, so if we can't update the metadata,
+ * then there has been some state change invisible to the checkpoint transaction.
*/
if (!WT_IS_METADATA(session->dhandle)) {
WT_CURSOR *meta_cursor;
- WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR));
+ WT_ASSERT(session, !F_ISSET(session->txn, WT_TXN_ERROR));
WT_RET(__wt_metadata_cursor(session, &meta_cursor));
meta_cursor->set_key(meta_cursor, session->dhandle->name);
ret = __wt_curfile_insert_check(meta_cursor);
@@ -465,7 +463,7 @@ __checkpoint_stats(WT_SESSION_IMPL *session)
if (msec > conn->ckpt_time_max)
conn->ckpt_time_max = msec;
- if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min)
+ if (msec < conn->ckpt_time_min)
conn->ckpt_time_min = msec;
conn->ckpt_time_recent = msec;
conn->ckpt_time_total += msec;
@@ -475,7 +473,7 @@ __checkpoint_stats(WT_SESSION_IMPL *session)
if (msec > conn->ckpt_prep_max)
conn->ckpt_prep_max = msec;
- if (conn->ckpt_prep_min == 0 || msec < conn->ckpt_prep_min)
+ if (msec < conn->ckpt_prep_min)
conn->ckpt_prep_min = msec;
conn->ckpt_prep_recent = msec;
conn->ckpt_prep_total += msec;
@@ -531,15 +529,15 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
WT_DECL_RET;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN_SHARED *txn_shared;
const char *txn_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL};
bool use_timestamp;
conn = S2C(session);
- txn = &session->txn;
+ txn = session->txn;
txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_shared = WT_SESSION_TXN_SHARED(session);
WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
use_timestamp = (cval.val != 0);
@@ -585,21 +583,21 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
* time and only write to the metadata.
*/
__wt_writelock(session, &txn_global->rwlock);
- txn_global->checkpoint_state = *txn_state;
- txn_global->checkpoint_state.pinned_id = txn->snap_min;
+ txn_global->checkpoint_txn_shared = *txn_shared;
+ txn_global->checkpoint_txn_shared.pinned_id = txn->snap_min;
/*
* Sanity check that the oldest ID hasn't moved on before we have cleared our entry.
*/
- WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
- WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
+ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_shared->id) &&
+ WT_TXNID_LE(txn_global->oldest_id, txn_shared->pinned_id));
/*
* Clear our entry from the global transaction session table. Any operation that needs to know
* about the ID for this checkpoint will consider the checkpoint ID in the global structure.
* Most operations can safely ignore the checkpoint ID (see the visible all check for details).
*/
- txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE;
+ txn_shared->id = txn_shared->pinned_id = txn_shared->metadata_pinned = WT_TXN_NONE;
/*
* Set the checkpoint transaction's timestamp, if requested.
@@ -608,7 +606,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
* the stable timestamp.
*/
WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ |
- WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ));
+ WT_TXN_SHARED_TS_DURABLE | WT_TXN_SHARED_TS_READ));
if (use_timestamp) {
/*
@@ -625,7 +623,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
} else {
if (!F_ISSET(conn, WT_CONN_RECOVERING))
txn_global->meta_ckpt_timestamp = WT_TS_NONE;
- txn->read_timestamp = WT_TS_NONE;
+ txn->read_timestamp = txn_shared->pinned_read_timestamp = WT_TS_NONE;
}
__wt_writeunlock(session, &txn_global->rwlock);
@@ -756,7 +754,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
cache = conn->cache;
hs_dhandle = NULL;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &conn->txn_global;
saved_isolation = session->isolation;
full = idle = logging = tracking = use_timestamp = false;
@@ -961,7 +959,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* Now that the metadata is stable, re-open the metadata file for regular eviction by clearing
* the checkpoint_pinned flag.
*/
- txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+ txn_global->checkpoint_txn_shared.pinned_id = WT_TXN_NONE;
if (full) {
__checkpoint_stats(session);
@@ -1632,7 +1630,7 @@ fake:
* that case, we need to sync the file here or we could roll forward the metadata in recovery
* and open a checkpoint that isn't yet durable.
*/
- if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING))
+ if (WT_IS_METADATA(dhandle) || !F_ISSET(session->txn, WT_TXN_RUNNING))
WT_ERR(__wt_checkpoint_sync(session, NULL));
WT_ERR(__wt_meta_ckptlist_set(session, dhandle->name, btree->ckpt, &ckptlsn));
@@ -1704,7 +1702,7 @@ __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
bool with_timestamp;
btree = S2BT(session);
- txn = &session->txn;
+ txn = session->txn;
/* Are we using a read timestamp for this checkpoint transaction? */
with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ);
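The two stat hunks above drop the "min == 0" guard when tracking the fastest checkpoint and checkpoint-prepare times. That only works if the minimum counters start at a large sentinel rather than at zero; presumably they are initialized to UINT64_MAX elsewhere in this import (not shown in these hunks). A minimal standalone sketch of the resulting pattern, with illustrative names rather than WiredTiger's:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct ckpt_stats {
    uint64_t time_min, time_max; /* milliseconds */
};

static void
stats_init(struct ckpt_stats *s)
{
    s->time_min = UINT64_MAX; /* Sentinel: a 0msec run can no longer "reset" the minimum. */
    s->time_max = 0;
}

static void
stats_update(struct ckpt_stats *s, uint64_t msec)
{
    if (msec > s->time_max)
        s->time_max = msec;
    if (msec < s->time_min) /* No "== 0" special case needed with the sentinel initialization. */
        s->time_min = msec;
}

int
main(void)
{
    struct ckpt_stats s;

    stats_init(&s);
    stats_update(&s, 0); /* A prepare that finishes in 0msec now records a legitimate 0 minimum. */
    stats_update(&s, 12);
    printf("min=%" PRIu64 " max=%" PRIu64 "\n", s.time_min, s.time_max);
    return (0);
}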
diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c
index b0ad3acfd9a..645d410efad 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ext.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ext.c
@@ -21,7 +21,7 @@ __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
session = (WT_SESSION_IMPL *)wt_session;
/* Ignore failures: the only case is running out of transaction IDs. */
WT_IGNORE_RET(__wt_txn_id_check(session));
- return (session->txn.id);
+ return (session->txn->id);
}
/*
@@ -37,7 +37,7 @@ __wt_ext_transaction_isolation_level(WT_EXTENSION_API *wt_api, WT_SESSION *wt_se
(void)wt_api; /* Unused parameters */
session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
+ txn = session->txn;
if (txn->isolation == WT_ISO_READ_COMMITTED)
return (WT_TXN_ISO_READ_COMMITTED);
@@ -59,7 +59,7 @@ __wt_ext_transaction_notify(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT
(void)wt_api; /* Unused parameters */
session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
+ txn = session->txn;
/*
* XXX For now, a single slot for notifications: I'm not bothering with more than one because
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index 195c4e336e5..b5f5dab0077 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -206,7 +206,7 @@ __txn_logrec_init(WT_SESSION_IMPL *session)
uint32_t rectype;
const char *fmt;
- txn = &session->txn;
+ txn = session->txn;
rectype = WT_LOGREC_COMMIT;
fmt = WT_UNCHECKED_STRING(Iq);
@@ -255,7 +255,7 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
uint32_t fileid;
conn = S2C(session);
- txn = &session->txn;
+ txn = session->txn;
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
F_ISSET(session, WT_SESSION_NO_LOGGING) ||
@@ -314,7 +314,7 @@ __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_UNUSED(cfg);
- txn = &session->txn;
+ txn = session->txn;
/*
* If there are no log records there is nothing to do.
*/
@@ -394,10 +394,12 @@ __wt_txn_ts_log(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_ITEM *logrec;
WT_TXN *txn;
- wt_timestamp_t commit, durable, first, prepare, read;
+ WT_TXN_SHARED *txn_shared;
+ wt_timestamp_t commit, durable, first_commit, pinned_read, prepare, read;
conn = S2C(session);
- txn = &session->txn;
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
F_ISSET(session, WT_SESSION_NO_LOGGING) ||
@@ -417,21 +419,23 @@ __wt_txn_ts_log(WT_SESSION_IMPL *session)
WT_RET(__txn_logrec_init(session));
logrec = txn->logrec;
- commit = durable = first = prepare = read = WT_TS_NONE;
+ commit = durable = first_commit = pinned_read = prepare = read = WT_TS_NONE;
if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
commit = txn->commit_timestamp;
- first = txn->first_commit_timestamp;
+ first_commit = txn->first_commit_timestamp;
}
if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
durable = txn->durable_timestamp;
if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
prepare = txn->prepare_timestamp;
- if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) {
read = txn->read_timestamp;
+ pinned_read = txn_shared->pinned_read_timestamp;
+ }
__wt_epoch(session, &t);
return (__wt_logop_txn_timestamp_pack(session, logrec, (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec,
- commit, durable, first, prepare, read));
+ commit, durable, first_commit, prepare, read, pinned_read));
}
/*
@@ -455,7 +459,7 @@ __wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_
conn = S2C(session);
txn_global = &conn->txn_global;
- txn = &session->txn;
+ txn = session->txn;
ckpt_lsn = &txn->ckpt_lsn;
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 920c9d67371..ed493f7765f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -112,7 +112,7 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *
WT_DECL_RET;
WT_ITEM key, start_key, stop_key, value;
WT_SESSION_IMPL *session;
- wt_timestamp_t commit, durable, first, prepare, read;
+ wt_timestamp_t commit, durable, first_commit, pinned_read, prepare, read;
uint64_t recno, start_recno, stop_recno, t_nsec, t_sec;
uint32_t fileid, mode, optype, opsize;
@@ -141,9 +141,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *
WT_ERR_NOTFOUND_OK(ret, false);
else {
/*
- * Build/insert a complete value during recovery rather
- * than using cursor modify to create a partial update
- * (for no particular reason than simplicity).
+ * Build/insert a complete value during recovery rather than using cursor modify to
+ * create a partial update (for no reason other than simplicity).
*/
WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
@@ -203,9 +202,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *
WT_ERR_NOTFOUND_OK(ret, false);
else {
/*
- * Build/insert a complete value during recovery rather
- * than using cursor modify to create a partial update
- * (for no particular reason than simplicity).
+ * Build/insert a complete value during recovery rather than using cursor modify to
+ * create a partial update (for no reason other than simplicity).
*/
WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
@@ -268,8 +266,8 @@ __txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *
* Timestamp records are informational only. We have to unpack it to properly move forward
* in the log record to the next operation, but otherwise ignore.
*/
- WT_ERR(__wt_logop_txn_timestamp_unpack(
- session, pp, end, &t_sec, &t_nsec, &commit, &durable, &first, &prepare, &read));
+ WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec, &t_nsec, &commit, &durable,
+ &first_commit, &prepare, &read, &pinned_read));
break;
default:
WT_ERR(__wt_illegal_value(session, optype));
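The timestamp log record above now round-trips six timestamps (commit, durable, first_commit, prepare, read and the new pinned_read), and recovery unpacks them only so it can step past the record, as the comment notes. The sketch below is illustrative only: it uses a fixed-width struct and memcpy rather than WiredTiger's generated variable-length packing, purely to show the pack/unpack symmetry the two calls rely on; the names are not WiredTiger's.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Fixed-width stand-in for the six-timestamp record; field order follows the calls above. */
struct ts_record {
    uint64_t t_sec, t_nsec;
    uint64_t commit, durable, first_commit, prepare, read_ts, pinned_read;
};

static size_t
ts_pack(uint8_t *buf, const struct ts_record *r)
{
    memcpy(buf, r, sizeof(*r));
    return (sizeof(*r));
}

static size_t
ts_unpack(const uint8_t *buf, struct ts_record *r)
{
    memcpy(r, buf, sizeof(*r));
    return (sizeof(*r)); /* Recovery only needs the size, to advance to the next operation. */
}

int
main(void)
{
    struct ts_record in = {1, 2, 10, 11, 10, 0, 9, 9}, out;
    uint8_t buf[sizeof(struct ts_record)];

    (void)ts_pack(buf, &in);
    (void)ts_unpack(buf, &out);
    assert(out.pinned_read == in.pinned_read);
    return (0);
}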
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index e7ea00e8e69..b238d2dfd3a 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -1131,9 +1131,14 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp
WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true,
F_MASK(session, WT_SESSION_NO_LOGGING), &session));
- F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS);
+ /*
+ * Rollback to stable should ignore tombstones in the history store since it needs to scan the
+ * entire table sequentially.
+ */
+ F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE);
ret = __rollback_to_stable(session, cfg);
- F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS);
+ F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE | WT_SESSION_IGNORE_HS_TOMBSTONE);
+ WT_RET(ret);
/*
* If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index a10745aa411..a000e86cc87 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -141,13 +141,13 @@ __wt_txn_parse_timestamp(
/*
* __txn_get_read_timestamp --
* Get the read timestamp from the transaction. Additionally return bool to specify whether the
- * transaction has set clear read queue flag.
+ * transaction has set the clear read queue flag.
*/
static bool
-__txn_get_read_timestamp(WT_TXN *txn, wt_timestamp_t *read_timestampp)
+__txn_get_read_timestamp(WT_TXN_SHARED *txn_shared, wt_timestamp_t *read_timestampp)
{
- WT_ORDERED_READ(*read_timestampp, txn->read_timestamp);
- return (!txn->clear_read_q);
+ WT_ORDERED_READ(*read_timestampp, txn_shared->pinned_read_timestamp);
+ return (!txn_shared->clear_read_q);
}
/*
@@ -158,8 +158,8 @@ int
__wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags)
{
WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *txn_shared;
wt_timestamp_t tmp_read_ts, tmp_ts;
bool include_oldest, txn_has_write_lock;
@@ -185,13 +185,13 @@ __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uin
/* Look for the oldest ordinary reader. */
__wt_readlock(session, &txn_global->read_timestamp_rwlock);
- TAILQ_FOREACH (txn, &txn_global->read_timestamph, read_timestampq) {
+ TAILQ_FOREACH (txn_shared, &txn_global->read_timestamph, read_timestampq) {
/*
* Skip any transactions on the queue that are not active. Copy out value of read timestamp
* to prevent possible race where a transaction resets its read timestamp while we traverse
* the queue.
*/
- if (!__txn_get_read_timestamp(txn, &tmp_read_ts))
+ if (!__txn_get_read_timestamp(txn_shared, &tmp_read_ts))
continue;
/*
* A zero timestamp is possible here only when the oldest timestamp is not accounted for.
@@ -213,37 +213,15 @@ __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uin
}
/*
- * __txn_get_published_timestamp --
- * Get the current durable timestamp for a given transaction. If there is an explicit durable
- * timestamp, this function will return the commit timestamp since this is implied. If there is
- * neither a commit nor a durable timestamp, this function will return 0.
+ * __txn_get_durable_timestamp --
+ * Get the durable timestamp from the transaction. Additionally return bool to specify whether
+ * the transaction has set the clear durable queue flag.
*/
-static inline wt_timestamp_t
-__txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn)
+static bool
+__txn_get_durable_timestamp(WT_TXN_SHARED *txn_shared, wt_timestamp_t *durable_timestampp)
{
- wt_timestamp_t ts;
-
- /*
- * Any checking of bit flags in this logic is invalid. __wt_txn_release may have already been
- * called on this transaction which will set the flags member to 0. So we need to deduce which
- * timestamp to use purely by inspecting the timestamp members which we deliberately preserve
- * for reader threads such as ourselves.
- *
- * In the non-prepared case, the first commit will either be less than the commit (in the case
- * of multiple commits) in which case we should return the first commit. Or it will be equal to
- * the commit (in the case of a single commit) and we can return durable (which is mirrored from
- * the commit timestamp).
- *
- * In the prepared case, the first commit will always be equal to the commit so we'll return
- * durable.
- */
- if (txn->commit_timestamp != txn->first_commit_timestamp)
- ts = txn->first_commit_timestamp;
- else
- ts = txn->durable_timestamp;
-
- WT_ASSERT(session, ts != WT_TS_NONE);
- return (ts);
+ WT_ORDERED_READ(*durable_timestampp, txn_shared->pinned_durable_timestamp);
+ return (!txn_shared->clear_durable_q);
}
/*
@@ -255,8 +233,8 @@ __txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, cons
{
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *txn_shared;
wt_timestamp_t ts, tmpts;
conn = S2C(session);
@@ -280,14 +258,13 @@ __txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, cons
* Compare with the least recently durable transaction.
*/
__wt_readlock(session, &txn_global->durable_timestamp_rwlock);
- TAILQ_FOREACH (txn, &txn_global->durable_timestamph, durable_timestampq) {
- if (txn->clear_durable_q)
- continue;
-
- tmpts = __txn_get_published_timestamp(session, txn) - 1;
- if (tmpts < ts)
- ts = tmpts;
- break;
+ TAILQ_FOREACH (txn_shared, &txn_global->durable_timestamph, durable_timestampq) {
+ if (__txn_get_durable_timestamp(txn_shared, &tmpts)) {
+ --tmpts;
+ if (tmpts < ts)
+ ts = tmpts;
+ break;
+ }
}
__wt_readunlock(session, &txn_global->durable_timestamp_rwlock);
@@ -333,8 +310,10 @@ __txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char
{
WT_CONFIG_ITEM cval;
WT_TXN *txn;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
WT_STAT_CONN_INCR(session, session_query_ts);
if (!F_ISSET(txn, WT_TXN_RUNNING))
@@ -348,7 +327,7 @@ __txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char
else if (WT_STRING_MATCH("prepare", cval.str, cval.len))
*tsp = txn->prepare_timestamp;
else if (WT_STRING_MATCH("read", cval.str, cval.len))
- *tsp = txn->read_timestamp;
+ *tsp = txn_shared->pinned_read_timestamp;
else
WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str);
@@ -603,23 +582,26 @@ set:
*/
static int
__txn_assert_after_reads(
- WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp)
+ WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN_SHARED **prev_sharedp)
{
#ifdef HAVE_DIAGNOSTIC
- WT_TXN *prev, *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *prev_shared, *txn_shared;
wt_timestamp_t tmp_timestamp;
char ts_string[2][WT_TS_INT_STRING_SIZE];
+ txn_global = &S2C(session)->txn_global;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
+
__wt_readlock(session, &txn_global->read_timestamp_rwlock);
- prev = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
- while (prev != NULL) {
+ prev_shared = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
+ while (prev_shared != NULL) {
/*
* Skip self and non-active transactions. Copy out value of read timestamp to prevent
* possible race where a transaction resets its read timestamp while we traverse the queue.
*/
- if (!__txn_get_read_timestamp(prev, &tmp_timestamp) || prev == txn) {
- prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq);
+ if (!__txn_get_read_timestamp(prev_shared, &tmp_timestamp) || prev_shared == txn_shared) {
+ prev_shared = TAILQ_PREV(prev_shared, __wt_txn_rts_qh, read_timestampq);
continue;
}
@@ -636,13 +618,13 @@ __txn_assert_after_reads(
__wt_readunlock(session, &txn_global->read_timestamp_rwlock);
- if (prevp != NULL)
- *prevp = prev;
+ if (prev_sharedp != NULL)
+ *prev_sharedp = prev_shared;
#else
WT_UNUSED(session);
WT_UNUSED(op);
WT_UNUSED(ts);
- WT_UNUSED(prevp);
+ WT_UNUSED(prev_sharedp);
#endif
return (0);
@@ -658,12 +640,15 @@ __txn_assert_after_reads(
int
__wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
wt_timestamp_t oldest_ts, stable_ts;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool has_oldest_ts, has_stable_ts;
+ txn = session->txn;
+ txn_global = &S2C(session)->txn_global;
+
/* Added this redundant initialization to circumvent build failure. */
oldest_ts = stable_ts = WT_TS_NONE;
@@ -764,12 +749,15 @@ __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts
int
__wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
wt_timestamp_t oldest_ts, stable_ts;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool has_oldest_ts, has_stable_ts;
+ txn = session->txn;
+ txn_global = &S2C(session)->txn_global;
+
/* Added this redundant initialization to circumvent build failure. */
oldest_ts = stable_ts = 0;
@@ -827,11 +815,16 @@ __wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_
int
__wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts)
{
- WT_TXN *prev, *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *prev_shared, *txn_shared;
wt_timestamp_t oldest_ts;
char ts_string[2][WT_TS_INT_STRING_SIZE];
+ txn = session->txn;
+ txn_global = &S2C(session)->txn_global;
+ prev_shared = txn_shared = WT_SESSION_TXN_SHARED(session);
+
WT_RET(__wt_txn_context_prepare_check(session));
if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
@@ -842,7 +835,7 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_
"commit timestamp "
"should not have been set before the prepare timestamp");
- WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev));
+ WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev_shared));
/*
* Check whether the prepare timestamp is less than the oldest timestamp.
@@ -857,7 +850,7 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_
* Check that there are no active readers. That would be a violation of preconditions
* for rounding timestamps of prepared transactions.
*/
- WT_ASSERT(session, prev == NULL);
+ WT_ASSERT(session, prev_shared == NULL);
__wt_verbose(session, WT_VERB_TIMESTAMP,
"prepare timestamp %s rounded to oldest "
@@ -886,12 +879,17 @@ __wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_
int
__wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *txn_shared;
wt_timestamp_t ts_oldest;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool did_roundup_to_oldest;
+ txn = session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
+
WT_RET(__wt_txn_context_prepare_check(session));
/* Read timestamps imply / require snapshot isolation. */
@@ -922,7 +920,7 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts)
* oldest timestamp.
*/
if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) {
- txn->read_timestamp = ts_oldest;
+ txn->read_timestamp = txn_shared->pinned_read_timestamp = ts_oldest;
did_roundup_to_oldest = true;
} else {
__wt_readunlock(session, &txn_global->rwlock);
@@ -942,7 +940,7 @@ __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts)
return (EINVAL);
}
} else
- txn->read_timestamp = read_ts;
+ txn->read_timestamp = txn_shared->pinned_read_timestamp = read_ts;
__wt_txn_publish_read_timestamp(session);
__wt_readunlock(session, &txn_global->rwlock);
@@ -1002,7 +1000,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_txn_set_durable_timestamp(session, ts));
}
- __wt_txn_publish_timestamp(session);
+ __wt_txn_publish_durable_timestamp(session);
/* Look for a read timestamp. */
WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
@@ -1025,21 +1023,23 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
- * __wt_txn_publish_timestamp --
- * Publish a transaction's timestamp to the durable queue.
+ * __wt_txn_publish_durable_timestamp --
+ * Publish a transaction's durable timestamp.
*/
void
-__wt_txn_publish_timestamp(WT_SESSION_IMPL *session)
+__wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *qtxn, *txn, *txn_tmp;
+ WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t ts;
+ WT_TXN_SHARED *qtxn_shared, *txn_shared, *txn_shared_tmp;
+ wt_timestamp_t tmpts, ts;
uint64_t walked;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
- if (F_ISSET(txn, WT_TXN_TS_PUBLISHED))
+ if (F_ISSET(txn, WT_TXN_SHARED_TS_DURABLE))
return;
if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
@@ -1053,7 +1053,7 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session)
*/
if (F_ISSET(txn, WT_TXN_PREPARE))
return;
- ts = txn->commit_timestamp;
+ ts = txn->first_commit_timestamp;
} else
return;
@@ -1063,9 +1063,9 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session)
* otherwise might not remove ourselves before finding where to insert ourselves (which would
* result in a list loop) and we don't want to walk more of the list than needed.
*/
- if (txn->clear_durable_q) {
- TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq);
- WT_PUBLISH(txn->clear_durable_q, false);
+ if (txn_shared->clear_durable_q) {
+ TAILQ_REMOVE(&txn_global->durable_timestamph, txn_shared, durable_timestampq);
+ txn_shared->clear_durable_q = false;
--txn_global->durable_timestampq_len;
}
/*
@@ -1073,45 +1073,49 @@ __wt_txn_publish_timestamp(WT_SESSION_IMPL *session)
* that are not active. We stop when we get to the location where we want to insert.
*/
if (TAILQ_EMPTY(&txn_global->durable_timestamph)) {
- TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq);
+ TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn_shared, durable_timestampq);
WT_STAT_CONN_INCR(session, txn_durable_queue_empty);
} else {
/* Walk from the start, removing cleared entries. */
walked = 0;
- TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph, durable_timestampq, txn_tmp)
+ TAILQ_FOREACH_SAFE(
+ qtxn_shared, &txn_global->durable_timestamph, durable_timestampq, txn_shared_tmp)
{
++walked;
/*
* Stop on the first entry that we cannot clear.
*/
- if (!qtxn->clear_durable_q)
+ if (!qtxn_shared->clear_durable_q)
break;
- TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn, durable_timestampq);
- WT_PUBLISH(qtxn->clear_durable_q, false);
+ TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn_shared, durable_timestampq);
+ qtxn_shared->clear_durable_q = false;
--txn_global->durable_timestampq_len;
}
/*
* Now walk backwards from the end to find the correct position for the insert.
*/
- qtxn = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh);
- while (qtxn != NULL && __txn_get_published_timestamp(session, qtxn) > ts) {
+ qtxn_shared = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh);
+ while (qtxn_shared != NULL &&
+ (!__txn_get_durable_timestamp(qtxn_shared, &tmpts) || tmpts > ts)) {
++walked;
- qtxn = TAILQ_PREV(qtxn, __wt_txn_dts_qh, durable_timestampq);
+ qtxn_shared = TAILQ_PREV(qtxn_shared, __wt_txn_dts_qh, durable_timestampq);
}
- if (qtxn == NULL) {
- TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq);
+ if (qtxn_shared == NULL) {
+ TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn_shared, durable_timestampq);
WT_STAT_CONN_INCR(session, txn_durable_queue_head);
} else
- TAILQ_INSERT_AFTER(&txn_global->durable_timestamph, qtxn, txn, durable_timestampq);
+ TAILQ_INSERT_AFTER(
+ &txn_global->durable_timestamph, qtxn_shared, txn_shared, durable_timestampq);
WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked);
}
++txn_global->durable_timestampq_len;
WT_STAT_CONN_INCR(session, txn_durable_queue_inserts);
- txn->clear_durable_q = false;
- F_SET(txn, WT_TXN_TS_PUBLISHED);
+ txn_shared->pinned_durable_timestamp = ts;
+ txn_shared->clear_durable_q = false;
__wt_writeunlock(session, &txn_global->durable_timestamp_rwlock);
+ F_SET(txn, WT_TXN_SHARED_TS_DURABLE);
}
/*
@@ -1122,11 +1126,12 @@ void
__wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- uint32_t flags;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
- if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED))
+ if (!F_ISSET(txn, WT_TXN_SHARED_TS_DURABLE))
return;
/*
@@ -1134,15 +1139,9 @@ __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session)
* durable timestamp queue whenever the next thread walks the queue. We do not need to remove it
* now.
*/
- txn->clear_durable_q = true;
-
- /*
- * Serialize clearing the flag with setting the queue state. The serialization has been here for
- * awhile, but nobody remembers if or why it's necessary.
- */
- flags = txn->flags;
- LF_CLR(WT_TXN_TS_PUBLISHED);
- WT_PUBLISH(txn->flags, flags);
+ txn_shared->clear_durable_q = true;
+ WT_WRITE_BARRIER();
+ F_CLR(txn, WT_TXN_SHARED_TS_DURABLE);
}
/*
@@ -1152,15 +1151,17 @@ __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session)
void
__wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *qtxn, *txn, *txn_tmp;
+ WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *qtxn_shared, *txn_shared, *txn_shared_tmp;
wt_timestamp_t tmp_timestamp;
uint64_t walked;
- txn = &session->txn;
+ txn = session->txn;
txn_global = &S2C(session)->txn_global;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
- if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ if (F_ISSET(txn, WT_TXN_SHARED_TS_READ))
return;
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
@@ -1169,9 +1170,9 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session)
* otherwise might not remove ourselves before finding where to insert ourselves (which would
* result in a list loop) and we don't want to walk more of the list than needed.
*/
- if (txn->clear_read_q) {
- TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
- WT_PUBLISH(txn->clear_read_q, false);
+ if (txn_shared->clear_read_q) {
+ TAILQ_REMOVE(&txn_global->read_timestamph, txn_shared, read_timestampq);
+ WT_PUBLISH(txn_shared->clear_read_q, false);
--txn_global->read_timestampq_len;
}
/*
@@ -1179,39 +1180,41 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session)
* that are not active. We stop when we get to the location where we want to insert.
*/
if (TAILQ_EMPTY(&txn_global->read_timestamph)) {
- TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq);
+ TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn_shared, read_timestampq);
WT_STAT_CONN_INCR(session, txn_read_queue_empty);
} else {
/* Walk from the start, removing cleared entries. */
walked = 0;
- TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph, read_timestampq, txn_tmp)
+ TAILQ_FOREACH_SAFE(
+ qtxn_shared, &txn_global->read_timestamph, read_timestampq, txn_shared_tmp)
{
++walked;
- if (!qtxn->clear_read_q)
+ if (!qtxn_shared->clear_read_q)
break;
- TAILQ_REMOVE(&txn_global->read_timestamph, qtxn, read_timestampq);
- WT_PUBLISH(qtxn->clear_read_q, false);
+ TAILQ_REMOVE(&txn_global->read_timestamph, qtxn_shared, read_timestampq);
+ WT_PUBLISH(qtxn_shared->clear_read_q, false);
--txn_global->read_timestampq_len;
}
/*
* Now walk backwards from the end to find the correct position for the insert.
*/
- qtxn = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
- while (qtxn != NULL) {
- if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) ||
- tmp_timestamp > txn->read_timestamp) {
+ qtxn_shared = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
+ while (qtxn_shared != NULL) {
+ if (!__txn_get_read_timestamp(qtxn_shared, &tmp_timestamp) ||
+ tmp_timestamp > txn_shared->pinned_read_timestamp) {
++walked;
- qtxn = TAILQ_PREV(qtxn, __wt_txn_rts_qh, read_timestampq);
+ qtxn_shared = TAILQ_PREV(qtxn_shared, __wt_txn_rts_qh, read_timestampq);
} else
break;
}
- if (qtxn == NULL) {
- TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq);
+ if (qtxn_shared == NULL) {
+ TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn_shared, read_timestampq);
WT_STAT_CONN_INCR(session, txn_read_queue_head);
} else
- TAILQ_INSERT_AFTER(&txn_global->read_timestamph, qtxn, txn, read_timestampq);
+ TAILQ_INSERT_AFTER(
+ &txn_global->read_timestamph, qtxn_shared, txn_shared, read_timestampq);
WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked);
}
/*
@@ -1220,8 +1223,8 @@ __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session)
*/
++txn_global->read_timestampq_len;
WT_STAT_CONN_INCR(session, txn_read_queue_inserts);
- txn->clear_read_q = false;
- F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
+ txn_shared->clear_read_q = false;
+ F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_SHARED_TS_READ);
__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
}
@@ -1233,34 +1236,27 @@ void
__wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
- uint32_t flags;
-
- txn = &session->txn;
-
- if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) {
- txn->read_timestamp = WT_TS_NONE;
- return;
- }
+ WT_TXN_SHARED *txn_shared;
- /* Assert the read timestamp is greater than or equal to the pinned timestamp. */
- WT_ASSERT(session, txn->read_timestamp >= S2C(session)->txn_global.pinned_timestamp);
+ txn = session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
- /*
- * Notify other threads that our transaction is inactive and can be cleaned up safely from the
- * read timestamp queue whenever the next thread walks the queue. We do not need to remove it
- * now.
- */
- txn->clear_read_q = true;
+ if (F_ISSET(txn, WT_TXN_SHARED_TS_READ)) {
+ /* Assert the read timestamp is greater than or equal to the pinned timestamp. */
+ WT_ASSERT(session, txn->read_timestamp == txn_shared->pinned_read_timestamp &&
+ txn->read_timestamp >= S2C(session)->txn_global.pinned_timestamp);
- /*
- * Serialize clearing the flag with setting the queue state. The serialization has been here for
- * awhile, but nobody remembers if or why it's necessary.
- */
- flags = txn->flags;
- LF_CLR(WT_TXN_PUBLIC_TS_READ);
- WT_PUBLISH(txn->flags, flags);
+ /*
+ * Notify other threads that our transaction is inactive and can be cleaned up safely from
+ * the read timestamp queue whenever the next thread walks the queue. We do not need to
+ * remove it now.
+ */
+ txn_shared->clear_read_q = true;
+ WT_WRITE_BARRIER();
- txn->read_timestamp = WT_TS_NONE;
+ F_CLR(txn, WT_TXN_SHARED_TS_READ);
+ }
+ txn->read_timestamp = txn_shared->pinned_read_timestamp = WT_TS_NONE;
}
/*
@@ -1271,36 +1267,40 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
void
__wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ WT_TXN_SHARED *txn_shared;
- txn = &session->txn;
+ txn_shared = WT_SESSION_TXN_SHARED(session);
txn_global = &S2C(session)->txn_global;
- if (!txn->clear_durable_q && !txn->clear_read_q)
+ /*
+ * If we've closed the connection, our transaction shared states may already have been freed. In
+ * that case, there's nothing more to do here.
+ */
+ if (txn_shared == NULL || (!txn_shared->clear_durable_q && !txn_shared->clear_read_q))
return;
- if (txn->clear_durable_q) {
+ if (txn_shared->clear_durable_q) {
__wt_writelock(session, &txn_global->durable_timestamp_rwlock);
/*
* Recheck after acquiring the lock.
*/
- if (txn->clear_durable_q) {
- TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq);
+ if (txn_shared->clear_durable_q) {
+ TAILQ_REMOVE(&txn_global->durable_timestamph, txn_shared, durable_timestampq);
--txn_global->durable_timestampq_len;
- txn->clear_durable_q = false;
+ txn_shared->clear_durable_q = false;
}
__wt_writeunlock(session, &txn_global->durable_timestamp_rwlock);
}
- if (txn->clear_read_q) {
+ if (txn_shared->clear_read_q) {
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
/*
* Recheck after acquiring the lock.
*/
- if (txn->clear_read_q) {
- TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
+ if (txn_shared->clear_read_q) {
+ TAILQ_REMOVE(&txn_global->read_timestamph, txn_shared, read_timestampq);
--txn_global->read_timestampq_len;
- txn->clear_read_q = false;
+ txn_shared->clear_read_q = false;
}
__wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
}
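The txn_timestamp.c changes above move the published read and durable timestamps out of WT_TXN into the per-session WT_TXN_SHARED slot that sits on the global queues: a transaction publishes its pinned timestamp, and when it is done it sets a clear_*_q flag behind a write barrier so the next queue walker can unlink the entry lazily. Walkers copy the timestamp out first and then check the flag, skipping entries that have been cleared. Below is a standalone sketch of that publish/lazy-clear ordering using C11 atomics rather than WiredTiger's WT_ORDERED_READ/WT_WRITE_BARRIER macros; the names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct shared_slot {
    _Atomic uint64_t pinned_ts; /* Published timestamp. */
    atomic_bool clear_q;        /* Set when the owner is done; walkers unlink the entry lazily. */
};

/* Owner: publish a timestamp before (re)activating the slot. */
static void
slot_publish(struct shared_slot *s, uint64_t ts)
{
    atomic_store_explicit(&s->pinned_ts, ts, memory_order_release);
    atomic_store_explicit(&s->clear_q, false, memory_order_release);
}

/* Owner: mark the slot inactive; no immediate removal is required. */
static void
slot_clear(struct shared_slot *s)
{
    atomic_store_explicit(&s->clear_q, true, memory_order_release);
}

/* Walker: copy the timestamp out, then report whether the entry is still active. */
static bool
slot_get(struct shared_slot *s, uint64_t *tsp)
{
    *tsp = atomic_load_explicit(&s->pinned_ts, memory_order_acquire);
    return (!atomic_load_explicit(&s->clear_q, memory_order_acquire));
}

int
main(void)
{
    struct shared_slot s;
    uint64_t ts;

    slot_publish(&s, 12);
    (void)slot_get(&s, &ts); /* Active entry, ts == 12. */
    slot_clear(&s);
    (void)slot_get(&s, &ts); /* Returns false: walkers skip (and may unlink) the entry. */
    return (0);
}

This mirrors how the all_durable query above walks the durable queue: it takes the first entry whose clear flag is unset and uses that entry's pinned durable timestamp minus one (an entry pinned at 12 bounds all_durable at 11).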
diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c
index 52a79777ca4..5ad1cfe65dc 100644
--- a/src/third_party/wiredtiger/test/format/backup.c
+++ b/src/third_party/wiredtiger/test/format/backup.c
@@ -379,9 +379,9 @@ backup(void *arg)
* with named checkpoints. Wait for the checkpoint to complete, otherwise backups might be
* starved out.
*/
- testutil_check(pthread_rwlock_wrlock(&g.backup_lock));
+ lock_writelock(session, &g.backup_lock);
if (g.workers_finished) {
- testutil_check(pthread_rwlock_unlock(&g.backup_lock));
+ lock_writeunlock(session, &g.backup_lock);
break;
}
@@ -471,7 +471,7 @@ backup(void *arg)
testutil_check(session->truncate(session, "log:", backup_cursor, NULL, NULL));
testutil_check(backup_cursor->close(backup_cursor));
- testutil_check(pthread_rwlock_unlock(&g.backup_lock));
+ lock_writeunlock(session, &g.backup_lock);
active_files_sort(active_now);
active_files_remove_missing(active_prev, active_now);
active_prev = active_now;
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index a1d9256e707..0f41a311e43 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -59,7 +59,7 @@ bulk_commit_transaction(WT_SESSION *session)
testutil_check(session->commit_transaction(session, buf));
/* Update the oldest timestamp, otherwise updates are pinned in memory. */
- timestamp_once();
+ timestamp_once(session);
}
/*
diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c
index 0542bb8e54b..36e70ae3125 100644
--- a/src/third_party/wiredtiger/test/format/checkpoint.c
+++ b/src/third_party/wiredtiger/test/format/checkpoint.c
@@ -85,7 +85,7 @@ checkpoint(void *arg)
* few names to test multiple named snapshots in
* the system.
*/
- ret = pthread_rwlock_trywrlock(&g.backup_lock);
+ ret = lock_try_writelock(session, &g.backup_lock);
if (ret == 0) {
backup_locked = true;
testutil_check(__wt_snprintf(
@@ -98,7 +98,7 @@ checkpoint(void *arg)
/*
* 5% drop all named snapshots.
*/
- ret = pthread_rwlock_trywrlock(&g.backup_lock);
+ ret = lock_try_writelock(session, &g.backup_lock);
if (ret == 0) {
backup_locked = true;
ckpt_config = "drop=(all)";
@@ -110,7 +110,7 @@ checkpoint(void *arg)
testutil_check(session->checkpoint(session, ckpt_config));
if (backup_locked)
- testutil_check(pthread_rwlock_unlock(&g.backup_lock));
+ lock_writeunlock(session, &g.backup_lock);
secs = mmrand(NULL, 5, 40);
}
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index ae104bae989..54a09229ce4 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -30,7 +30,7 @@
#include "config.h"
static void config(void);
-static void config_backup(void);
+static void config_backup_incr(void);
static void config_backward_compatible(void);
static void config_cache(void);
static void config_checkpoint(void);
@@ -197,7 +197,7 @@ config(void)
config_transaction();
/* Simple selection. */
- config_backup();
+ config_backup_incr();
config_checkpoint();
config_checksum();
config_compression("btree.compression");
@@ -257,39 +257,55 @@ config(void)
}
/*
- * config_backup --
- * Backup configuration.
+ * config_backup_incr --
+ * Incremental backup configuration.
*/
static void
-config_backup(void)
+config_backup_incr(void)
{
- const char *cstr;
+ /* Incremental backup requires backup. */
+ if (g.c_backups == 0)
+ return;
/*
- * Choose a type of incremental backup.
+ * Incremental backup using log files is incompatible with logging archival. Testing log file
+ * archival doesn't seem as useful as testing backup, so let the backup configuration override it.
*/
- if (!config_is_perm("backup.incremental")) {
- cstr = "backup.incremental=off";
- switch (mmrand(NULL, 1, 10)) {
- case 1: /* 30% full backup only */
- case 2:
- case 3:
- break;
- case 4: /* 40% block based incremental */
- case 5:
- case 6:
- case 7:
- cstr = "backup.incremental=block";
- break;
- case 8:
- case 9:
- case 10: /* 30% log based incremental */
- if (!g.c_logging_archive)
- cstr = "backup.incremental=log";
- break;
+ if (config_is_perm("backup.incremental")) {
+ if (g.c_backup_incr_flag == INCREMENTAL_LOG) {
+ if (g.c_logging_archive && config_is_perm("logging.archive"))
+ testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive");
+ if (g.c_logging_archive)
+ config_single("logging.archive=0", false);
}
+ return;
+ }
- config_single(cstr, false);
+ /*
+ * Choose a type of incremental backup; note that the log archival setting can rule out
+ * log-based incremental backup.
+ */
+ switch (mmrand(NULL, 1, 10)) {
+ case 1: /* 30% full backup only */
+ case 2:
+ case 3:
+ config_single("backup.incremental=off", false);
+ break;
+ case 4: /* 30% log based incremental */
+ case 5:
+ case 6:
+ if (!g.c_logging_archive || !config_is_perm("logging.archive")) {
+ if (g.c_logging_archive)
+ config_single("logging.archive=0", false);
+ config_single("backup.incremental=log", false);
+ }
+ /* FALLTHROUGH */
+ case 7: /* 40% block based incremental */
+ case 8:
+ case 9:
+ case 10:
+ config_single("backup.incremental=block", false);
+ break;
}
}
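The rewritten config_backup_incr picks the incremental backup mode with a ten-way roll: 30% off, 30% log based (block instead when a permanent logging.archive setting rules log-based backup out) and 40% block based. A tiny standalone sketch of the same bucketed-percentage idiom, using standard C rand() instead of the harness's mmrand and config globals:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int
main(void)
{
    int roll;

    srand((unsigned)time(NULL));
    roll = rand() % 10 + 1; /* 1..10: each bucket is worth 10%. */

    if (roll <= 3)
        puts("backup.incremental=off"); /* 30% */
    else if (roll <= 6)
        puts("backup.incremental=log"); /* 30%, subject to the logging.archive check above */
    else
        puts("backup.incremental=block"); /* 40% */
    return (0);
}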
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 59dc7cfd86c..04da641386b 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -336,6 +336,10 @@ static CONFIG c[] = {
{"wiredtiger.config", "configuration string used to wiredtiger_open", C_IGNORE | C_STRING, 0, 0,
0, NULL, &g.c_config_open},
+ /* 80% */
+ {"wiredtiger.rwlock", "if wiredtiger read/write mutexes should be used", C_BOOL, 80, 0, 0,
+ &g.c_wt_mutex, NULL},
+
{"wiredtiger.leak_memory", "if memory should be leaked on close", C_BOOL, 0, 0, 0,
&g.c_leak_memory, NULL},
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index b96b858f0f8..6bc213a65ef 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -60,6 +60,20 @@
#define MAX_MODIFY_ENTRIES 5 /* maximum change vectors */
+/*
+ * Abstract lock that lets us use either pthread reader-writer locks or WiredTiger's own (likely
+ * faster) implementation.
+ */
+typedef struct {
+ union {
+ WT_RWLOCK wt;
+ pthread_rwlock_t pthread;
+ } l;
+ enum { LOCK_NONE = 0, LOCK_WT, LOCK_PTHREAD } lock_type;
+} RWLOCK;
+
+#define LOCK_INITIALIZED(lock) ((lock)->lock_type != LOCK_NONE)
+
typedef struct {
WT_CONNECTION *wts_conn;
WT_EXTENSION_API *wt_api;
@@ -92,8 +106,8 @@ typedef struct {
bool logging; /* log operations */
FILE *logfp; /* log file */
- pthread_rwlock_t backup_lock; /* Backup running */
- uint64_t backup_id; /* Block incremental id */
+ RWLOCK backup_lock; /* Backup running */
+ uint64_t backup_id; /* Block incremental id */
WT_RAND_STATE rnd; /* Global RNG state */
@@ -104,13 +118,17 @@ typedef struct {
* We get the last committed timestamp periodically in order to update the oldest timestamp,
* that requires locking out transactional ops that set a timestamp.
*/
- pthread_rwlock_t ts_lock;
+ RWLOCK ts_lock;
uint64_t timestamp; /* Counter for timestamps */
uint64_t truncate_cnt; /* Counter for truncation */
- pthread_rwlock_t death_lock; /* Single-thread failure */
+ /*
+ * Single-thread failure. Always use a pthread lock rather than a WT lock in case the WT library
+ * is misbehaving.
+ */
+ pthread_rwlock_t death_lock;
uint32_t c_abort; /* Config values */
uint32_t c_alter;
@@ -204,6 +222,7 @@ typedef struct {
uint32_t c_value_min;
uint32_t c_verify;
uint32_t c_write_pct;
+ uint32_t c_wt_mutex;
#define FIX 1
#define ROW 2
@@ -351,6 +370,8 @@ void key_gen_common(WT_ITEM *, uint64_t, const char *);
void key_gen_init(WT_ITEM *);
void key_gen_teardown(WT_ITEM *);
void key_init(void);
+void lock_destroy(WT_SESSION *, RWLOCK *);
+void lock_init(WT_SESSION *, RWLOCK *);
void operations(u_int, bool);
WT_THREAD_RET random_kv(void *);
void path_setup(const char *);
@@ -364,7 +385,7 @@ int snap_repeat_txn(WT_CURSOR *, TINFO *);
void snap_repeat_update(TINFO *, bool);
void snap_track(TINFO *, thread_op);
WT_THREAD_RET timestamp(void *);
-void timestamp_once(void);
+void timestamp_once(WT_SESSION *);
void track(const char *, uint64_t, TINFO *);
void val_gen(WT_RAND_STATE *, WT_ITEM *, uint64_t);
void val_gen_init(WT_ITEM *);
diff --git a/src/third_party/wiredtiger/test/format/format.i b/src/third_party/wiredtiger/test/format/format.i
index 00099e1c4cf..661dd096ae9 100644
--- a/src/third_party/wiredtiger/test/format/format.i
+++ b/src/third_party/wiredtiger/test/format/format.i
@@ -162,3 +162,51 @@ key_gen_insert(WT_RAND_STATE *rnd, WT_ITEM *key, uint64_t keyno)
key_gen_common(key, keyno, suffix[mmrand(rnd, 0, 14)]);
}
+
+/*
+ * lock_try_writelock --
+ * Try to get exclusive lock. Fail immediately if not available.
+ */
+static inline int
+lock_try_writelock(WT_SESSION *session, RWLOCK *lock)
+{
+ testutil_assert(LOCK_INITIALIZED(lock));
+
+ if (lock->lock_type == LOCK_WT) {
+ return (__wt_try_writelock((WT_SESSION_IMPL *)session, &lock->l.wt));
+ } else {
+ return (pthread_rwlock_trywrlock(&lock->l.pthread));
+ }
+}
+
+/*
+ * lock_writelock --
+ * Wait to get exclusive lock.
+ */
+static inline void
+lock_writelock(WT_SESSION *session, RWLOCK *lock)
+{
+ testutil_assert(LOCK_INITIALIZED(lock));
+
+ if (lock->lock_type == LOCK_WT) {
+ __wt_writelock((WT_SESSION_IMPL *)session, &lock->l.wt);
+ } else {
+ testutil_check(pthread_rwlock_wrlock(&lock->l.pthread));
+ }
+}
+
+/*
+ * lock_writeunlock --
+ * Release an exclusive lock.
+ */
+static inline void
+lock_writeunlock(WT_SESSION *session, RWLOCK *lock)
+{
+ testutil_assert(LOCK_INITIALIZED(lock));
+
+ if (lock->lock_type == LOCK_WT) {
+ __wt_writeunlock((WT_SESSION_IMPL *)session, &lock->l.wt);
+ } else {
+ testutil_check(pthread_rwlock_unlock(&lock->l.pthread));
+ }
+}
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 0d679eb4cc3..6a668fa4f45 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -158,11 +158,13 @@ operations(u_int ops_seconds, bool lastrun)
if (!SINGLETHREADED)
g.rand_log_stop = true;
- /* Logging requires a session. */
- if (g.logging)
- testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
logop(session, "%s", "=============== thread ops start");
+ /* Initialize locks to single-thread backups and timestamp updates. */
+ lock_init(session, &g.backup_lock);
+ lock_init(session, &g.ts_lock);
+
/*
* Create the per-thread structures and start the worker threads. Allocate the thread structures
* separately to minimize false sharing.
@@ -295,9 +297,11 @@ operations(u_int ops_seconds, bool lastrun)
testutil_check(__wt_thread_join(NULL, &timestamp_tid));
g.workers_finished = false;
+ lock_destroy(session, &g.backup_lock);
+ lock_destroy(session, &g.ts_lock);
+
logop(session, "%s", "=============== thread ops stop");
- if (g.logging)
- testutil_check(session->close(session, NULL));
+ testutil_check(session->close(session, NULL));
for (i = 0; i < g.c_threads; ++i) {
tinfo = tinfo_list[i];
@@ -372,13 +376,13 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
*
* Lock out the oldest timestamp update.
*/
- testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
+ lock_writelock(session, &g.ts_lock);
ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts));
testutil_check(session->timestamp_transaction(session, buf));
- testutil_check(pthread_rwlock_unlock(&g.ts_lock));
+ lock_writeunlock(session, &g.ts_lock);
snap_init(tinfo, ts, false);
logop(session, "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts);
@@ -443,7 +447,7 @@ commit_transaction(TINFO *tinfo, bool prepared)
ts = 0; /* -Wconditional-uninitialized */
if (g.c_txn_timestamps) {
/* Lock out the oldest timestamp update. */
- testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
+ lock_writelock(session, &g.ts_lock);
ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(__wt_snprintf(buf, sizeof(buf), "commit_timestamp=%" PRIx64, ts));
@@ -454,7 +458,7 @@ commit_transaction(TINFO *tinfo, bool prepared)
testutil_check(session->timestamp_transaction(session, buf));
}
- testutil_check(pthread_rwlock_unlock(&g.ts_lock));
+ lock_writeunlock(session, &g.ts_lock);
}
testutil_check(session->commit_transaction(session, NULL));
@@ -509,7 +513,7 @@ prepare_transaction(TINFO *tinfo)
*
* Lock out the oldest timestamp update.
*/
- testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
+ lock_writelock(session, &g.ts_lock);
ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(__wt_snprintf(buf, sizeof(buf), "prepare_timestamp=%" PRIx64, ts));
@@ -517,7 +521,7 @@ prepare_transaction(TINFO *tinfo)
logop(session, "prepare ts=%" PRIu64, ts);
- testutil_check(pthread_rwlock_unlock(&g.ts_lock));
+ lock_writeunlock(session, &g.ts_lock);
return (ret);
}
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 6d897131f28..a2fcf405cf9 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -118,10 +118,8 @@ format_process_env(void)
(void)signal(SIGTERM, signal_handler);
#endif
- /* Initialize locks to single-thread backups, failures, and timestamp updates. */
- testutil_check(pthread_rwlock_init(&g.backup_lock, NULL));
+ /* Initialize the lock to ensure single threading during failure handling. */
testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
- testutil_check(pthread_rwlock_init(&g.ts_lock, NULL));
#if 0
/* Configure the GNU malloc for debugging. */
@@ -328,10 +326,6 @@ main(int argc, char *argv[])
config_print(false);
- testutil_check(pthread_rwlock_destroy(&g.backup_lock));
- testutil_check(pthread_rwlock_destroy(&g.death_lock));
- testutil_check(pthread_rwlock_destroy(&g.ts_lock));
-
config_clear();
printf("%s: successful run completed\n", progname);
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index e88a9a0aa7e..7aba99c20de 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -233,7 +233,7 @@ fclose_and_clear(FILE **fpp)
* Update the timestamp once.
*/
void
-timestamp_once(void)
+timestamp_once(WT_SESSION *session)
{
static const char *oldest_timestamp_str = "oldest_timestamp=";
WT_CONNECTION *conn;
@@ -246,16 +246,20 @@ timestamp_once(void)
/*
* Lock out transaction timestamp operations. The lock acts as a barrier ensuring we've checked
- * if the workers have finished, we don't want that line reordered.
+ * if the workers have finished; we don't want that line reordered. We can also be called from
+ * places, such as bulk load, where we are single-threaded and the locks haven't been
+ * initialized.
*/
- testutil_check(pthread_rwlock_wrlock(&g.ts_lock));
+ if (LOCK_INITIALIZED(&g.ts_lock))
+ lock_writelock(session, &g.ts_lock);
ret = conn->query_timestamp(conn, buf + strlen(oldest_timestamp_str), "get=all_durable");
testutil_assert(ret == 0 || ret == WT_NOTFOUND);
if (ret == 0)
testutil_check(conn->set_timestamp(conn, buf));
- testutil_check(pthread_rwlock_unlock(&g.ts_lock));
+ if (LOCK_INITIALIZED(&g.ts_lock))
+ lock_writeunlock(session, &g.ts_lock);
}
/*
@@ -265,9 +269,15 @@ timestamp_once(void)
WT_THREAD_RET
timestamp(void *arg)
{
+ WT_CONNECTION *conn;
+ WT_SESSION *session;
bool done;
(void)(arg);
+ conn = g.wts_conn;
+
+ /* The locks need a session. */
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
/* Update the oldest timestamp at least once every 15 seconds. */
done = false;
@@ -281,10 +291,11 @@ timestamp(void *arg)
else
random_sleep(&g.rnd, 15);
- timestamp_once();
+ timestamp_once(session);
} while (!done);
+ testutil_check(session->close(session, NULL));
return (WT_THREAD_RET_VALUE);
}
@@ -334,3 +345,38 @@ alter(void *arg)
testutil_check(session->close(session, NULL));
return (WT_THREAD_RET_VALUE);
}
+
+/*
+ * lock_init --
+ * Initialize an abstract lock that can use either pthread or WiredTiger reader-writer locks.
+ */
+void
+lock_init(WT_SESSION *session, RWLOCK *lock)
+{
+ testutil_assert(lock->lock_type == LOCK_NONE);
+
+ if (g.c_wt_mutex) {
+ testutil_check(__wt_rwlock_init((WT_SESSION_IMPL *)session, &lock->l.wt));
+ lock->lock_type = LOCK_WT;
+ } else {
+ testutil_check(pthread_rwlock_init(&lock->l.pthread, NULL));
+ lock->lock_type = LOCK_PTHREAD;
+ }
+}
+
+/*
+ * lock_destroy --
+ * Destroy abstract lock.
+ */
+void
+lock_destroy(WT_SESSION *session, RWLOCK *lock)
+{
+ testutil_assert(LOCK_INITIALIZED(lock));
+
+ if (lock->lock_type == LOCK_WT) {
+ __wt_rwlock_destroy((WT_SESSION_IMPL *)session, &lock->l.wt);
+ } else {
+ testutil_check(pthread_rwlock_destroy(&lock->l.pthread));
+ }
+ lock->lock_type = LOCK_NONE;
+}
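The RWLOCK wrapper (format.h), the inline lock functions (format.i) and lock_init/lock_destroy above together let test/format switch between pthread and WiredTiger read/write locks based on the new wiredtiger.rwlock configuration. A usage sketch follows; it assumes a test/format context with the format headers included and an open WT_SESSION, and the demo function itself is illustrative, not part of the patch.

static void
demo_locking(WT_SESSION *session)
{
    RWLOCK lock;

    memset(&lock, 0, sizeof(lock)); /* lock_init asserts the lock starts out as LOCK_NONE. */
    lock_init(session, &lock);      /* Chooses a WT or pthread lock based on g.c_wt_mutex. */

    if (lock_try_writelock(session, &lock) == 0) /* Non-blocking attempt, as in checkpoint(). */
        lock_writeunlock(session, &lock);

    lock_writelock(session, &lock); /* Blocking acquire, as in backup() and the ops.c paths. */
    lock_writeunlock(session, &lock);

    lock_destroy(session, &lock);
}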
diff --git a/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh b/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh
new file mode 100644
index 00000000000..f3528b5d24e
--- /dev/null
+++ b/src/third_party/wiredtiger/test/multiversion/wt_multiversion.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+last_stable=4.2
+last_stable_dir=wiredtiger_${last_stable}/
+last_stable_branch=mongodb-${last_stable}
+
+function setup_last_stable {
+ git clone git@github.com:wiredtiger/wiredtiger.git ${last_stable_dir}
+ cd ${last_stable_dir}/build_posix/ || exit
+ git checkout $last_stable_branch || exit 1
+ bash reconf
+ ../configure --enable-python --enable-diagnostic
+ make -j 10
+ # Back to multiversion/ in "latest" repo.
+ cd ../../ || exit
+}
+
+function run_check {
+ echo + "$@"
+ "$@" || exit 1
+}
+
+# Clone and build v4.2 if it doesn't already exist.
+if [ ! -d $last_stable_dir ]; then
+ setup_last_stable
+fi
+
+latest_workgen=../../bench/workgen/runner/multiversion.py
+last_stable_workgen=${last_stable_dir}/bench/workgen/runner/multiversion.py
+
+# Copy the workload into the v4.2 tree.
+cp $latest_workgen $last_stable_workgen
+
+run_check $latest_workgen --release 4.4
+run_check $latest_workgen --keep --release 4.4
+run_check $last_stable_workgen --keep --release 4.2
+run_check $latest_workgen --keep --release 4.4
+
+echo Success.
+exit 0
diff --git a/src/third_party/wiredtiger/test/packing/intpack-test3.c b/src/third_party/wiredtiger/test/packing/intpack-test3.c
index 8bd8cc8a8c9..43cb8834997 100644
--- a/src/third_party/wiredtiger/test/packing/intpack-test3.c
+++ b/src/third_party/wiredtiger/test/packing/intpack-test3.c
@@ -110,8 +110,8 @@ main(void)
int64_t i;
/*
- * Test all values in a range, to ensure pack/unpack of small numbers
- * (which most actively use different numbers of bits) works.
+ * Test all values in a range, to ensure pack/unpack of small numbers (which most actively use
+ * different numbers of bits) works.
*/
test_spread(0, 100000, 100000);
test_spread(INT16_MAX, 1025, 1025);