diff options
-rwxr-xr-x | src/third_party/wiredtiger/bench/workgen/runner/evict-btree-lookaside.py | 168 | ||||
-rw-r--r-- | src/third_party/wiredtiger/dist/api_data.py | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/examples/c/ex_all.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_sync.c | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/config/config_def.c | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/wiredtiger.in | 22 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_timestamp.c | 3 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c | 2 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/evergreen.yml | 62 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/format/format.h | 12 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/format/format.sh | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/format/ops.c | 164 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/format/t.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/suite/test_timestamp14.py | 83 |
15 files changed, 342 insertions, 199 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/evict-btree-lookaside.py b/src/third_party/wiredtiger/bench/workgen/runner/evict-btree-lookaside.py new file mode 100755 index 00000000000..333da4b178c --- /dev/null +++ b/src/third_party/wiredtiger/bench/workgen/runner/evict-btree-lookaside.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2019 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +# This benchmark is designed to stress disk access to the LAS/History Store file. +# This is achieved through: +# - Long running transactions consisting of read and update operations. +# - Low cache size (~20%) for a reasonably sized WT table with large documents. +# - Pareto distribution for operations in long running transactions. This will cause +# skewed access for a selective set of keys in WT table. +# - Relatively large number of read operation threads to stress cache. + +# Benchmark is based on a wtperf config: wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf +# There are number of changes made to original configs such as: +# - Reduced the number of documents inserts during initial warm-up phase. +# - Increased the sizes of key and value. +# - Reduced the WT cache size. +# - Added transaction based operations and repurposed some of the read threads as long-running +# transcation threads. + +################################################################################################### +''' +# wtperf options file: evict btree configuration +conn_config="cache_size=40G,checkpoint=(wait=60,log_size=2GB),eviction=(threads_min=12, +threads_max=12),log=(enabled=true),session_max=600,eviction_target=60,statistics=(fast), +statistics_log=(wait=1,json)" + +# 1B records * (key=12 + value=138) is about 150G total data size +key_sz=12 +value_sz=138 +log_like_table=true +table_config="type=file" +icount=1000000000 +report_interval=5 +run_time=3600 +# Scans every 10 minutes for all the scan specific tables. +# .4B records * (key=12 + value=138) is about 60G total data size for scan +# Running on a machine with 64G physical memory, this exhausts both the +# WT cache and the system cache. +scan_interval=600 +scan_pct=100 +scan_table_count=20 +scan_icount=400000000 +populate_threads=5 +table_count=100 +threads=((count=400,reads=1),(count=20,inserts=1,throttle=500),(count=10,updates=1,throttle=500)) +# Add throughput/latency monitoring +max_latency=50000 +sample_interval=5 +''' +################################################################################################### + +from runner import * +from wiredtiger import * +from workgen import * + +context = Context() +homedir = "WT_TEST" +conn_config = "cache_size=1G,checkpoint=(wait=60,log_size=2GB),\ + eviction=(threads_min=12,threads_max=12),log=(enabled=true),session_max=600,\ + eviction_target=60,statistics=(fast),statistics_log=(wait=1,json)"# explicitly added +conn = wiredtiger_open(homedir, "create," + conn_config) +s = conn.open_session("") + +wtperf_table_config = "key_format=S,value_format=S," +\ + "exclusive=true,allocation_size=4kb," +\ + "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100," +compress_table_config = "" +table_config = "type=file" +tables = [] +table_count = 1 +for i in range(0, table_count): + tname = "table:test" + str(i) + table = Table(tname) + s.create(tname, wtperf_table_config +\ + compress_table_config + table_config) + table.options.key_size = 200 + table.options.value_size = 5000 + tables.append(table) + +populate_threads = 40 +icount = 500000 +# If there are multiple tables to be filled during populate, +# the icount is split between them all. +pop_ops = Operation(Operation.OP_INSERT, tables[0]) +pop_ops = op_multi_table(pop_ops, tables) +nops_per_thread = icount // (populate_threads * table_count) +pop_thread = Thread(pop_ops * nops_per_thread) +pop_workload = Workload(context, populate_threads * pop_thread) +pop_workload.run(conn) + +# Log like file, requires that logging be enabled in the connection config. +log_name = "table:log" +s.create(log_name, wtperf_table_config + "key_format=S,value_format=S," +\ + compress_table_config + table_config + ",log=(enabled=true)") +log_table = Table(log_name) + +ops = Operation(Operation.OP_SEARCH, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +thread0 = Thread(ops) + +ops = Operation(Operation.OP_INSERT, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +thread1 = Thread(ops) +# These operations include log_like operations, which will increase the number +# of insert/update operations by a factor of 2.0. This may cause the +# actual operations performed to be above the throttle. +thread1.options.throttle=500 +thread1.options.throttle_burst=1.0 + +ops = Operation(Operation.OP_UPDATE, tables[0]) +ops = op_multi_table(ops, tables, False) +ops = op_log_like(ops, log_table, 0) +thread2 = Thread(ops) +# These operations include log_like operations, which will increase the number +# of insert/update operations by a factor of 2.0. This may cause the +# actual operations performed to be above the throttle. +thread2.options.throttle=500 +thread2.options.throttle_burst=1.0 + +# Long running transactions. There is a 0.1 second sleep after a series of search and update +# operations. The sleep op is repeated 10000 times and this will make these transcations to at +# least run for ~17 minutes. +search_op = Operation(Operation.OP_SEARCH, tables[0], Key(Key.KEYGEN_PARETO, 0, ParetoOptions(1))) +update_op = Operation(Operation.OP_UPDATE, tables[0], Key(Key.KEYGEN_PARETO, 0, ParetoOptions(1))) +ops = txn(((search_op + update_op) * 1000 + sleep(0.1)) * 10000) +thread3 = Thread(ops) + +ops = Operation(Operation.OP_SLEEP, "0.1") + \ + Operation(Operation.OP_LOG_FLUSH, "") +logging_thread = Thread(ops) + +workload = Workload(context, 350 * thread0 + 10 * thread1 +\ + 50 * thread2 + 100 * thread3 + logging_thread) +workload.options.report_interval=5 +workload.options.run_time=300 +workload.options.max_latency=50000 +workload.run(conn) + +latency_filename = homedir + "/latency.out" +latency.workload_latency(workload, latency_filename) +conn.close() diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index d6280b7f0ec..3a46d6e7ff0 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -1610,8 +1610,7 @@ methods = { 'WT_CONNECTION.query_timestamp' : Method([ Config('get', 'all_durable', r''' - specify which timestamp to query: \c all_committed returns the largest - timestamp such that all timestamps up to that value have committed, + specify which timestamp to query: \c all_durable returns the largest timestamp such that all timestamps up to that value have been made durable, \c last_checkpoint returns the timestamp of the most recent stable checkpoint, \c oldest returns the @@ -1622,7 +1621,7 @@ methods = { timestamp of the most recent stable checkpoint taken prior to a shutdown and \c stable returns the most recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. See @ref transaction_timestamps''', - choices=['all_committed','all_durable','last_checkpoint', + choices=['all_durable','last_checkpoint', 'oldest','oldest_reader','pinned','recovery','stable']), ]), diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index 53c2ac9d95d..3345cd7c708 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -912,7 +912,7 @@ transaction_ops(WT_SESSION *session_arg) error_check(session->commit_transaction(session, NULL)); - error_check(conn->query_timestamp(conn, timestamp_buf, "get=all_committed")); + error_check(conn->query_timestamp(conn, timestamp_buf, "get=all_durable")); /*! [query timestamp] */ } diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index f2d431f97fa..b2f18ef2ef2 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "61bf0547c6edfc6cd7c633c80f1c76f3e33e5aec", + "commit": "5bd9fdfc0bb7f8fcf791e234b6e436c79dfe1fee", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.4" diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index f7297729967..71a4c11dd44 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -291,9 +291,12 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * if eviction fails (the page may stay in cache clean but with history that cannot be * discarded), that is not wasted effort because checkpoint doesn't need to write the * page again. + * + * Once the transaction has given up it's snapshot it is no longer safe to reconcile + * pages. That happens prior to the final metadata checkpoint. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && - !tried_eviction) { + !tried_eviction && F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)) { ret = __wt_page_release_evict(session, walk, 0); walk = NULL; WT_ERR_BUSY_OK(ret); diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index a24841fe67e..12f510728c7 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -33,9 +33,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = { static const WT_CONFIG_CHECK confchk_WT_CONNECTION_query_timestamp[] = { {"get", "string", NULL, - "choices=[\"all_committed\",\"all_durable\"," - "\"last_checkpoint\",\"oldest\",\"oldest_reader\",\"pinned\"," - "\"recovery\",\"stable\"]", + "choices=[\"all_durable\",\"last_checkpoint\",\"oldest\"," + "\"oldest_reader\",\"pinned\",\"recovery\",\"stable\"]", NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0}}; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a50e617705b..ddcdeffedf1 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2471,18 +2471,16 @@ struct __wt_connection { * hexadecimal encoding of the timestamp being queried. Must be large * enough to hold a NUL terminated, hex-encoded 8B timestamp (17 bytes). * @configstart{WT_CONNECTION.query_timestamp, see dist/api_data.py} - * @config{get, specify which timestamp to query: \c all_committed returns the largest - * timestamp such that all timestamps up to that value have committed\, \c all_durable - * returns the largest timestamp such that all timestamps up to that value have been made - * durable\, \c last_checkpoint returns the timestamp of the most recent stable checkpoint\, - * \c oldest returns the most recent \c oldest_timestamp set with - * WT_CONNECTION::set_timestamp\, \c oldest_reader returns the minimum of the read - * timestamps of all active readers \c pinned returns the minimum of the \c oldest_timestamp - * and the read timestamps of all active readers\, \c recovery returns the timestamp of the - * most recent stable checkpoint taken prior to a shutdown and \c stable returns the most - * recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. See @ref - * transaction_timestamps., a string\, chosen from the following options: \c - * "all_committed"\, \c "all_durable"\, \c "last_checkpoint"\, \c "oldest"\, \c + * @config{get, specify which timestamp to query: \c all_durable returns the largest + * timestamp such that all timestamps up to that value have been made durable\, \c + * last_checkpoint returns the timestamp of the most recent stable checkpoint\, \c oldest + * returns the most recent \c oldest_timestamp set with WT_CONNECTION::set_timestamp\, \c + * oldest_reader returns the minimum of the read timestamps of all active readers \c pinned + * returns the minimum of the \c oldest_timestamp and the read timestamps of all active + * readers\, \c recovery returns the timestamp of the most recent stable checkpoint taken + * prior to a shutdown and \c stable returns the most recent \c stable_timestamp set with + * WT_CONNECTION::set_timestamp. See @ref transaction_timestamps., a string\, chosen from + * the following options: \c "all_durable"\, \c "last_checkpoint"\, \c "oldest"\, \c * "oldest_reader"\, \c "pinned"\, \c "recovery"\, \c "stable"; default \c all_durable.} * @configend * @errors diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 8db13cc7048..c24191efa24 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -250,8 +250,7 @@ __txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, cons WT_STAT_CONN_INCR(session, txn_query_ts); WT_RET(__wt_config_gets(session, cfg, "get", &cval)); - if (WT_STRING_MATCH("all_committed", cval.str, cval.len) || - WT_STRING_MATCH("all_durable", cval.str, cval.len)) { + if (WT_STRING_MATCH("all_durable", cval.str, cval.len)) { if (!txn_global->has_durable_timestamp) return (WT_NOTFOUND); ts = txn_global->durable_timestamp; diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index 1b69427d9f2..c6672dc9830 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -141,7 +141,7 @@ thread_ts_run(void *arg) * that requires locking out transactional ops that set or query a timestamp. */ testutil_check(pthread_rwlock_wrlock(&ts_lock)); - ret = td->conn->query_timestamp(td->conn, ts_string, "get=all_committed"); + ret = td->conn->query_timestamp(td->conn, ts_string, "get=all_durable"); testutil_check(pthread_rwlock_unlock(&ts_lock)); testutil_assert(ret == 0 || ret == WT_NOTFOUND); if (ret == 0) { diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index d728e9f4e96..5f63c4ff3a6 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -371,6 +371,55 @@ tasks: vars: posix_configure_flags: --enable-strict --enable-diagnostic + - name: compile-gcc + tags: ["pull_request", "pull_request_compilers"] + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-4.8 CXX=g++-4.8 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-5 CXX=g++-5 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-6 CXX=g++-6 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-6 CXX=g++-6 CFLAGS="-ggdb -fPIC" CPPFLAGS=-fvisibility=hidden + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-7 CXX=g++-7 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-8 CXX=g++-8 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=gcc-9 CXX=g++-9 CFLAGS="-ggdb -fPIC" + + - name: compile-clang + tags: ["pull_request", "pull_request_compilers"] + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-3.9 CXX=clang++-3.9 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-4.0 CXX=clang++-4.0 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-5.0 CXX=clang++-5.0 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-6.0 CXX=clang++-6.0 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-7 CXX=clang++-7 CFLAGS="-ggdb -fPIC" + - func: "compile wiredtiger" + vars: + configure_env_vars: CC=clang-8 CXX=clang++-8 CFLAGS="-ggdb -fPIC" + - name: make-check-test depends_on: - name: compile @@ -2029,7 +2078,7 @@ buildvariants: posix_configure_flags: --enable-silent-rules --enable-diagnostic --enable-python --enable-zlib --enable-snappy --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make tasks: - - name: ".pull_request !.windows_only" + - name: ".pull_request !.windows_only !.pull_request_compilers" - name: compile-msan - name: make-check-msan-test - name: compile-ubsan @@ -2055,6 +2104,17 @@ buildvariants: - name: format-stress-smoke-test - name: checkpoint-stress-test +- name: ubuntu1804-compilers + display_name: Ubuntu 18.04 Compilers + run_on: + - ubuntu1804-wt-build + expansions: + posix_configure_flags: --enable-silent-rules --enable-diagnostic --enable-strict --enable-lz4 --enable-snappy --enable-zlib --enable-zstd --enable-python + smp_command: -j $(grep -c ^processor /proc/cpuinfo) + make_command: PATH=/opt/mongodbtoolchain/v3/bin:$PATH make + tasks: + - name: ".pull_request_compilers" + - name: ubuntu1804-python3 display_name: Ubuntu 18.04 (Python3) run_on: diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index a1fa5d425b9..899956c8f71 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -106,15 +106,6 @@ typedef struct { uint64_t truncate_cnt; /* Counter for truncation */ - /* - * We have a list of records that are appended, but not yet "resolved", that is, we haven't yet - * incremented the g.rows value to reflect the new records. - */ - uint64_t *append; /* Appended records */ - size_t append_max; /* Maximum unresolved records */ - size_t append_cnt; /* Current unresolved records */ - pthread_rwlock_t append_lock; /* Single-thread resolution */ - pthread_rwlock_t death_lock; /* Single-thread failure */ char *uri; /* Object name */ @@ -315,6 +306,9 @@ typedef struct { uint64_t commit_ts; /* commit timestamp */ SNAP_OPS *snap, *snap_first, snap_list[512]; + uint64_t insert_list[256]; /* column-store inserted records */ + u_int insert_list_cnt; + WT_ITEM *tbuf, _tbuf; /* temporary buffer */ #define TINFO_RUNNING 1 /* Running */ diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh index 37607d6b8df..f9baf495a83 100755 --- a/src/third_party/wiredtiger/test/format/format.sh +++ b/src/third_party/wiredtiger/test/format/format.sh @@ -361,14 +361,14 @@ format() if [[ $smoke_test -ne 0 ]]; then args=${smoke_list[$smoke_next]} smoke_next=$(($smoke_next + 1)) - echo "$name: starting smoke-test job in $dir" + echo "$name: starting smoke-test job in $dir ($(date))" else args=$format_args # If abort/recovery testing is configured, do it 5% of the time. [[ $abort_test -ne 0 ]] && [[ $(($count_jobs % 20)) -eq 0 ]] && args="$args abort=1" - echo "$name: starting job in $dir" + echo "$name: starting job in $dir ($(date))" fi cmd="$format_binary -c "$config" -h "$dir" -1 $args quiet=1" diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 669cb7a484e..d6fce308164 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -29,6 +29,7 @@ #include "format.h" static int col_insert(TINFO *, WT_CURSOR *); +static void col_insert_resolve(TINFO *); static int col_modify(TINFO *, WT_CURSOR *, bool); static int col_remove(TINFO *, WT_CURSOR *, bool); static int col_reserve(TINFO *, WT_CURSOR *, bool); @@ -43,7 +44,6 @@ static int row_remove(TINFO *, WT_CURSOR *, bool); static int row_reserve(TINFO *, WT_CURSOR *, bool); static int row_truncate(TINFO *, WT_CURSOR *); static int row_update(TINFO *, WT_CURSOR *, bool); -static void table_append_init(void); static char modify_repl[256]; @@ -167,9 +167,6 @@ wts_ops(bool lastrun) quit_fourths = fourths + 15 * 4 * 60; } - /* Initialize the table extension code. */ - table_append_init(); - /* * We support replay of threaded runs, but don't log random numbers after threaded operations * start, there's no point. @@ -789,7 +786,7 @@ ops(void *arg) * We can only append so many new records, once we reach that limit, update a record * instead of inserting. */ - if (g.append_cnt >= g.append_max) + if (tinfo->insert_list_cnt >= WT_ELEMENTS(tinfo->insert_list)) goto update_instead_of_chosen_op; ret = col_insert(tinfo, cursor); @@ -944,6 +941,10 @@ update_instead_of_chosen_op: break; } + /* If we have pending inserts, try and update the total rows. */ + if (tinfo->insert_list_cnt > 0) + col_insert_resolve(tinfo); + /* * The cursor is positioned if we did any operation other than insert, do a small number of * next/prev cursor operations in a random direction. @@ -1582,100 +1583,6 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } /* - * table_append_init -- - * Re-initialize the appended records list. - */ -static void -table_append_init(void) -{ - /* Append up to 10 records per thread before waiting on resolution. */ - g.append_max = (size_t)g.c_threads * 10; - g.append_cnt = 0; - - free(g.append); - g.append = dcalloc(g.append_max, sizeof(uint64_t)); -} - -/* - * table_append -- - * Resolve the appended records. - */ -static void -table_append(uint64_t keyno) -{ - uint64_t *ep, *p; - int done; - - ep = g.append + g.append_max; - - /* - * We don't want to ignore records we append, which requires we update the "last row" as we - * insert new records. Threads allocating record numbers can race with other threads, so the - * thread allocating record N may return after the thread allocating N + 1. We can't update a - * record before it's been inserted, and so we can't leave gaps when the count of records in the - * table is incremented. - * - * The solution is the append table, which contains an unsorted list of appended records. Every - * time we finish appending a record, process the table, trying to update the total records in - * the object. - * - * First, enter the new key into the append list. - * - * It's technically possible to race: we allocated space for 10 records per thread, but the - * check for the maximum number of records being appended doesn't lock. If a thread allocated a - * new record and went to sleep (so the append table fills up), then N threads of control used - * the same g.append_cnt value to decide there was an available slot in the append table and - * both allocated new records, we could run out of space in the table. It's unfortunately not - * even unlikely in the case of a large number of threads all inserting as fast as they can and - * a single thread going to sleep for an unexpectedly long time. If it happens, sleep and retry - * until earlier records are resolved and we find a slot. - */ - for (done = 0;;) { - testutil_check(pthread_rwlock_wrlock(&g.append_lock)); - - /* - * If this is the thread we've been waiting for, and its record won't fit, we'd loop - * infinitely. If there are many append operations and a thread goes to sleep for a little - * too long, it can happen. - */ - if (keyno == g.rows + 1) { - g.rows = keyno; - done = 1; - - /* - * Clean out the table, incrementing the total count of records until we don't find the - * next key. - */ - for (;;) { - for (p = g.append; p < ep; ++p) - if (*p == g.rows + 1) { - g.rows = *p; - *p = 0; - --g.append_cnt; - break; - } - if (p == ep) - break; - } - } else - /* Enter the key into the table. */ - for (p = g.append; p < ep; ++p) - if (*p == 0) { - *p = keyno; - ++g.append_cnt; - done = 1; - break; - } - - testutil_check(pthread_rwlock_unlock(&g.append_lock)); - - if (done) - break; - __wt_sleep(1, 0); - } -} - -/* * row_insert -- * Insert a row in a row-store file. */ @@ -1707,6 +1614,63 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } /* + * col_insert_resolve -- + * Resolve newly inserted records. + */ +static void +col_insert_resolve(TINFO *tinfo) +{ + uint64_t v, *p; + u_int i; + + /* + * We don't want to ignore column-store records we insert, which requires we update the "last + * row" so other threads consider them. Threads allocating record numbers can race with other + * threads, so the thread allocating record N may return after the thread allocating N + 1. We + * can't update a record before it's been inserted, and so we can't leave gaps when the count of + * records in the table is incremented. + * + * The solution is a per-thread array which contains an unsorted list of inserted records. If + * there are pending inserts, we review the table after every operation, trying to update the + * total rows. This is wasteful, but we want to give other threads immediate access to the row, + * ideally they'll collide with our insert before we resolve. + * + * Process the existing records and advance the last row count until we can't go further. + */ + do { + WT_ORDERED_READ(v, g.rows); + for (i = 0, p = tinfo->insert_list; i < WT_ELEMENTS(tinfo->insert_list); ++i) { + if (*p == v + 1) { + testutil_assert(__wt_atomic_casv64(&g.rows, v, v + 1)); + *p = 0; + --tinfo->insert_list_cnt; + break; + } + testutil_assert(*p == 0 || *p > v); + } + } while (tinfo->insert_list_cnt > 0 && i < WT_ELEMENTS(tinfo->insert_list)); +} + +/* + * col_insert_add -- + * Add newly inserted records. + */ +static void +col_insert_add(TINFO *tinfo) +{ + u_int i; + + /* Add the inserted record to the array. */ + for (i = 0; i < WT_ELEMENTS(tinfo->insert_list); ++i) + if (tinfo->insert_list[i] == 0) { + tinfo->insert_list[i] = tinfo->keyno; + ++tinfo->insert_list_cnt; + break; + } + testutil_assert(i < WT_ELEMENTS(tinfo->insert_list)); +} + +/* * col_insert -- * Insert an element in a column-store file. */ @@ -1726,7 +1690,7 @@ col_insert(TINFO *tinfo, WT_CURSOR *cursor) testutil_check(cursor->get_key(cursor, &tinfo->keyno)); - table_append(tinfo->keyno); /* Extend the object. */ + col_insert_add(tinfo); /* Extend the object. */ if (g.type == FIX) logop(cursor->session, "%-10s%" PRIu64 " {0x%02" PRIx8 "}", "insert", tinfo->keyno, diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 5665b50ab4c..85bef1a694c 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -190,7 +190,6 @@ main(int argc, char *argv[]) * Initialize locks to single-thread named checkpoints and backups, last last-record updates, * and failures. */ - testutil_check(pthread_rwlock_init(&g.append_lock, NULL)); testutil_check(pthread_rwlock_init(&g.backup_lock, NULL)); testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); testutil_check(pthread_rwlock_init(&g.ts_lock, NULL)); @@ -270,7 +269,6 @@ main(int argc, char *argv[]) config_print(false); - testutil_check(pthread_rwlock_destroy(&g.append_lock)); testutil_check(pthread_rwlock_destroy(&g.backup_lock)); testutil_check(pthread_rwlock_destroy(&g.death_lock)); testutil_check(pthread_rwlock_destroy(&g.ts_lock)); diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp14.py b/src/third_party/wiredtiger/test/suite/test_timestamp14.py index 4343160f92f..1fc5cdc7b22 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp14.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp14.py @@ -43,36 +43,32 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename session_config = 'isolation=snapshot' - def test_all_committed(self): + def test_all_durable_old(self): # This test was originally for testing the all_committed timestamp. # In the absence of prepared transactions, all_durable is identical to - # all_committed so let's enforce the same values for both. - all_committed_uri = self.uri + '_all_committed' + # all_committed so let's enforce the all_durable values instead. + all_durable_uri = self.uri + '_all_durable' session1 = self.setUpSessionOpen(self.conn) session2 = self.setUpSessionOpen(self.conn) - session1.create(all_committed_uri, 'key_format=i,value_format=i') - session2.create(all_committed_uri, 'key_format=i,value_format=i') + session1.create(all_durable_uri, 'key_format=i,value_format=i') + session2.create(all_durable_uri, 'key_format=i,value_format=i') # Scenario 0: No commit timestamp has ever been specified therefore - # There is no all_committed timestamp and we will get an error + # There is no all_durable timestamp and we will get an error # Querying for it. session1.begin_transaction() - cur1 = session1.open_cursor(all_committed_uri) + cur1 = session1.open_cursor(all_durable_uri) cur1[1]=1 session1.commit_transaction() self.assertRaisesException(wiredtiger.WiredTigerError, - lambda: self.conn.query_timestamp('get=all_committed')) - self.assertRaisesException(wiredtiger.WiredTigerError, lambda: self.conn.query_timestamp('get=all_durable')) # Scenario 1: A single transaction with a commit timestamp, will - # result in the all_committed timestamp being set. + # result in the all_durable timestamp being set. session1.begin_transaction() cur1[1]=1 session1.commit_transaction('commit_timestamp=1') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "1") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "1") # Scenario 2: A transaction begins and specifies that it intends @@ -82,36 +78,30 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): session1.timestamp_transaction('commit_timestamp=2') session2.begin_transaction() - cur2 = session2.open_cursor(all_committed_uri) + cur2 = session2.open_cursor(all_durable_uri) cur2[2] = 2 session2.commit_transaction('commit_timestamp=3') - # As the original transaction is still running the all_committed + # As the original transaction is still running the all_durable # timestamp is being held at 1. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "1") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "1") cur1[1] = 2 session1.commit_transaction() - # Now that the original transaction has finished the all_committed + # Now that the original transaction has finished the all_durable # timestamp has moved to 3, skipping 2 as there is a commit with # a greater timestamp already existing. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "3") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "3") # Senario 3: Commit with a commit timestamp of 5 and then begin a - # transaction intending to commit at 4, the all_committed timestamp + # transaction intending to commit at 4, the all_durable timestamp # should move back to 3. Until the transaction at 4 completes. session1.begin_transaction() cur1[1] = 3 session1.commit_transaction('commit_timestamp=5') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "5") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "5") session1.begin_transaction() @@ -120,8 +110,6 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): session1.timestamp_transaction('commit_timestamp=4') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "3") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "3") session1.commit_transaction() @@ -129,20 +117,16 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): # Now that the transaction at timestamp 4 has completed the # all committed timestamp is back at 5. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "5") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "5") # Scenario 4: Holding a transaction open without a commit timestamp - # Will not affect the all_committed timestamp. + # Will not affect the all_durable timestamp. session1.begin_transaction() session2.begin_transaction() cur2[2] = 2 session2.commit_transaction('commit_timestamp=6') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "6") - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), "6") cur1[1] = 2 session1.commit_transaction() @@ -270,26 +254,19 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): cur1[1] = 1 session1.commit_transaction('commit_timestamp=3') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '3') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '3') # We have a running transaction with a lower commit_timestamp than we've - # seen before. So all_durable (like all_committed) should return (lowest - # commit timestamp - 1). + # seen before. So all_durable should return (lowest commit timestamp - 1). session1.begin_transaction() cur1[1] = 2 session1.timestamp_transaction('commit_timestamp=2') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '1') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '1') session1.commit_transaction() # After committing, go back to the value we saw previously. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '3') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '3') # For prepared transactions, we take into account the durable timestamp @@ -302,16 +279,12 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): # don't want that to be visible in the all_durable calculation. session1.timestamp_transaction('commit_timestamp=7') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '3') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '3') # Now take into account the durable timestamp. session1.timestamp_transaction('durable_timestamp=8') session1.commit_transaction() self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '8') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '8') # All durable moves back when we have a running prepared transaction @@ -324,21 +297,15 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): # don't want that to be visible in the all_durable calculation. session1.timestamp_transaction('commit_timestamp=4') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '8') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '8') # Now take into account the durable timestamp. session1.timestamp_transaction('durable_timestamp=5') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '4') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '4') session1.commit_transaction() self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '8') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '8') # Now test a scenario with multiple commit timestamps for a single txn. @@ -346,8 +313,6 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): cur1[1] = 5 session1.timestamp_transaction('commit_timestamp=6') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '5') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '5') # Make more changes and set a new commit timestamp. @@ -356,15 +321,11 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): cur1[1] = 6 session1.timestamp_transaction('commit_timestamp=7') self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '5') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '5') # Once committed, we go back to 8. session1.commit_transaction() self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), '8') - self.assertTimestampsEqual( self.conn.query_timestamp('get=all_durable'), '8') def test_all(self): @@ -386,9 +347,9 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): session1.begin_transaction() cur1[1]=2 session1.commit_transaction('commit_timestamp=4') - # Confirm all_committed is now 4. + # Confirm all_durable is now 4. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "4") + self.conn.query_timestamp('get=all_durable'), "4") # Create a read session. session1.begin_transaction('read_timestamp=2') @@ -401,9 +362,9 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): session2.begin_transaction() cur2[2] = 2 session2.commit_transaction('commit_timestamp=7') - # All_committed should now be 7. + # All_durable should now be 7. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "7") + self.conn.query_timestamp('get=all_durable'), "7") # Move oldest to 5. self.conn.set_timestamp('oldest_timestamp=5') @@ -414,20 +375,20 @@ class test_timestamp14(wttest.WiredTigerTestCase, suite_subprocess): self.conn.query_timestamp('get=oldest_reader')) # Begin a write transaction pointing at timestamp 6, - # this is below our current all_committed so it should move back + # this is below our current all_durable so it should move back # to the oldest timestamp. session2.begin_transaction() session2.timestamp_transaction('commit_timestamp=6') cur2[2] = 3 - # Confirm all_committed is now equal to oldest. + # Confirm all_durable is now equal to oldest. self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), + self.conn.query_timestamp('get=all_durable'), self.conn.query_timestamp('get=oldest')) session2.commit_transaction() self.assertTimestampsEqual( - self.conn.query_timestamp('get=all_committed'), "7") + self.conn.query_timestamp('get=all_durable'), "7") # End our read transaction. session1.commit_transaction() |