diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-02-03 13:58:53 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-03 03:20:38 +0000 |
commit | 7aa1b65641938719accd595bda3e45e97dc5f475 (patch) | |
tree | 06442d7ad52cf475dca61aa2ca96ed908f450388 /src | |
parent | ea687c2a4bcd9937d02658043b5a2529943ef950 (diff) | |
download | mongo-7aa1b65641938719accd595bda3e45e97dc5f475.tar.gz |
Import wiredtiger: 332dddfe0e48eb1c263455d3db9219ec5f7cdc30 from branch mongodb-4.4r4.4.4-rc0
ref: a52cd5a47a..332dddfe0e
for: 4.4.4
WT-6430 Move WT_CONN_SERVER flags into their own field
WT-6504 Don't fallback to onpage value as base value if we see the onpage value in the history store
WT-6567 Write "rollback to stable" subpage for Architecture Guide
WT-6772 Add support for prepared updates in datastore for test_hs09
WT-6901 Write "cursor" subpage for Architecture Guide
WT-7069 Enable column store configuration to history store
WT-7089 Don't skip checkpointing objects that have obsolete pages
WT-7091 Restrict usage of LSM to only operate in conjunction with compatible incremental backup mechanism
WT-7117 RTS to skip modifies that are more than on-disk base update while restoring an update
WT-7121 Include log-structured allocation python tests in WT
WT-7126 Coverity analysis defect 116991: Explicit null dereferenced
WT-7127 Coverity analysis defect 116992: Unchecked return value
WT-7128 Coverity analysis defect 116993: Resource leak
WT-7131 Tiered cursors should return error if configured with zero tiers
Diffstat (limited to 'src')
48 files changed, 1655 insertions, 230 deletions
diff --git a/src/third_party/wiredtiger/dist/docs_data.py b/src/third_party/wiredtiger/dist/docs_data.py index a1301c87057..9f2bb32486c 100644 --- a/src/third_party/wiredtiger/dist/docs_data.py +++ b/src/third_party/wiredtiger/dist/docs_data.py @@ -65,6 +65,9 @@ arch_doc_pages = [ ArchDocPage('arch-row', ['WT_BTREE'], ['src/include/btree.h']), + ArchDocPage('arch-rts', + [''], + ['src/txn/']), ArchDocPage('arch-schema', ['WT_COLGROUP', 'WT_INDEX', 'WT_LSM_TREE', 'WT_TABLE'], ['src/include/intpack_inline.h', 'src/include/packing_inline.h', diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 5beea1cddc1..fca1ccc9810 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -78,6 +78,7 @@ func_ok() -e '/int __wt_stat_dsrc_desc$/d' \ -e '/int __wt_stat_join_desc$/d' \ -e '/int __wt_stat_session_desc/d' \ + -e '/int __wt_txn_read_upd_list$/d' \ -e '/int __wt_txn_rollback_required$/d' \ -e '/int __wt_win_directory_list_free$/d' \ -e '/int bdb_compare_reverse$/d' \ diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 997337c7896..3ccc3e0b57e 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -856,8 +856,10 @@ conn_dsrc_stats = [ ########################################## # Transaction statistics ########################################## + TxnStat('txn_checkpoint_obsolete_applied', 'transaction checkpoints due to obsolete pages'), TxnStat('txn_read_race_prepare_update', 'race to read prepared update retry'), TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'), + TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'), TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'), TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'), TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index eb0dde936e8..3bbabe4e470 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "a52cd5a47a7e9af9e2c341e66f0ffdd9bc977930" + "commit": "332dddfe0e48eb1c263455d3db9219ec5f7cdc30" } diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 05394d0ae98..63d187b7442 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -57,7 +57,7 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; @@ -157,7 +157,7 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { ++*skippedp; @@ -232,7 +232,7 @@ restart_read: cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type != WT_UPDATE_INVALID) { if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE && @@ -365,7 +365,7 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { ++*skippedp; continue; diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index bb2c3a9e05c..7fe24232317 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -197,7 +197,7 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart) cbt->iface.value.data = &cbt->v; } else { restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { cbt->v = 0; cbt->iface.value.data = &cbt->v; @@ -297,7 +297,7 @@ new_page: __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); restart_read: - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { ++*skippedp; continue; @@ -372,7 +372,7 @@ restart_read: cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); __wt_upd_value_clear(cbt->upd_value); if (cbt->ins != NULL) - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type != WT_UPDATE_INVALID) { if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) { if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE && @@ -514,7 +514,7 @@ restart_read_insert: if ((ins = cbt->ins) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd)); if (cbt->upd_value->type == WT_UPDATE_INVALID) { ++*skippedp; continue; diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index fc7e542a12e..e4f538d80df 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -230,7 +230,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, bool *vali * update that's been deleted is not a valid key/value pair). */ if (cbt->ins != NULL) { - WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd, NULL)); + WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd)); if (cbt->upd_value->type != WT_UPDATE_INVALID) { if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) return (0); diff --git a/src/third_party/wiredtiger/src/conn/conn_capacity.c b/src/third_party/wiredtiger/src/conn/conn_capacity.c index 2213f696e4f..2d0085dfb7b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_capacity.c +++ b/src/third_party/wiredtiger/src/conn/conn_capacity.c @@ -72,7 +72,7 @@ __capacity_config(WT_SESSION_IMPL *session, const char *cfg[]) static bool __capacity_server_run_chk(WT_SESSION_IMPL *session) { - return (F_ISSET(S2C(session), WT_CONN_SERVER_CAPACITY)); + return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CAPACITY)); } /* @@ -129,7 +129,7 @@ __capacity_server_start(WT_CONNECTION_IMPL *conn) { WT_SESSION_IMPL *session; - F_SET(conn, WT_CONN_SERVER_CAPACITY); + FLD_SET(conn->server_flags, WT_CONN_SERVER_CAPACITY); /* * The capacity server gets its own session. @@ -196,7 +196,7 @@ __wt_capacity_server_destroy(WT_SESSION_IMPL *session) conn = S2C(session); - F_CLR(conn, WT_CONN_SERVER_CAPACITY); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_CAPACITY); if (conn->capacity_tid_set) { __wt_cond_signal(session, conn->capacity_cond); WT_TRET(__wt_thread_join(session, &conn->capacity_tid)); diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index b71ac6c28f4..27a5121ab2e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -63,7 +63,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) static bool __ckpt_server_run_chk(WT_SESSION_IMPL *session) { - return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT)); + return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_CHECKPOINT)); } /* @@ -134,7 +134,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) if (conn->ckpt_session != NULL) return (0); - F_SET(conn, WT_CONN_SERVER_CHECKPOINT); + FLD_SET(conn->server_flags, WT_CONN_SERVER_CHECKPOINT); /* * The checkpoint server gets its own session. @@ -201,7 +201,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) conn = S2C(session); - F_CLR(conn, WT_CONN_SERVER_CHECKPOINT); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_CHECKPOINT); if (conn->ckpt_tid_set) { __wt_cond_signal(session, conn->ckpt_cond); WT_TRET(__wt_thread_join(session, &conn->ckpt_tid)); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index acba9ebb12c..0cb718d92f0 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -522,7 +522,7 @@ __wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force) conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) && + if (!force && FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -566,7 +566,7 @@ __log_file_server(void *arg) log = conn->log; locked = false; yield_count = 0; - while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { + while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) { /* * If there is a log file to close, make sure any outstanding write operations have * completed, then fsync and close it. @@ -838,7 +838,7 @@ __log_wrlsn_server(void *arg) log = conn->log; yield = 0; WT_INIT_LSN(&prev); - while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { + while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) { /* * Write out any log record buffers if anything was done since last time. Only call the * function to walk the slots if the system is not idle. On an idle system the alloc_lsn @@ -908,7 +908,7 @@ __log_server(void *arg) * records sitting in the buffer over the time it takes to sync out an earlier file. */ did_work = true; - while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { + while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) { /* * Slots depend on future activity. Force out buffered writes in case we are idle. This * cannot be part of the wrlsn thread because of interaction advancing the write_lsn and a @@ -1036,7 +1036,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - F_SET(conn, WT_CONN_SERVER_LOG); + FLD_SET(conn->server_flags, WT_CONN_SERVER_LOG); /* * Start the log close thread. It is not configurable. If logging is enabled, this thread runs. @@ -1104,7 +1104,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); - F_CLR(conn, WT_CONN_SERVER_LOG); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_LOG); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 8d45b5eb678..0c93433a59e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -77,7 +77,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * The LSM services are not shut down in this path (which is called when wiredtiger_open hits an * error (as well as during normal shutdown). Assert they're not running. */ - WT_ASSERT(session, !F_ISSET(conn, WT_CONN_SERVER_LSM)); + WT_ASSERT(session, !FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM)); /* Shut down the subsystems, ensuring workers see the state change. */ F_SET(conn, WT_CONN_CLOSING); diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 69989939c43..06ae5b14350 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -528,7 +528,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + if (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_STATISTICS)) WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running"); WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp)); @@ -547,7 +547,7 @@ err: static bool __statlog_server_run_chk(WT_SESSION_IMPL *session) { - return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS)); + return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_STATISTICS)); } /* @@ -614,7 +614,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn) if (conn->stat_session != NULL) return (0); - F_SET(conn, WT_CONN_SERVER_STATISTICS); + FLD_SET(conn->server_flags, WT_CONN_SERVER_STATISTICS); /* The statistics log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "statlog-server", true, 0, &conn->stat_session)); @@ -685,7 +685,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close) conn = S2C(session); /* Stop the server thread. */ - F_CLR(conn, WT_CONN_SERVER_STATISTICS); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_STATISTICS); if (conn->stat_tid_set) { __wt_cond_signal(session, conn->stat_cond); WT_TRET(__wt_thread_join(session, &conn->stat_tid)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 0ab2b97e21e..7931f4e8a79 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -249,7 +249,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) static bool __sweep_server_run_chk(WT_SESSION_IMPL *session) { - return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP)); + return (FLD_ISSET(S2C(session)->server_flags, WT_CONN_SERVER_SWEEP)); } /* @@ -375,7 +375,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session) conn = S2C(session); /* Set first, the thread might run before we finish up. */ - F_SET(conn, WT_CONN_SERVER_SWEEP); + FLD_SET(conn->server_flags, WT_CONN_SERVER_SWEEP); /* * Handle sweep does enough I/O it may be called upon to perform slow operations for the block @@ -406,7 +406,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) conn = S2C(session); - F_CLR(conn, WT_CONN_SERVER_SWEEP); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_SWEEP); if (conn->sweep_tid_set) { __wt_cond_signal(session, conn->sweep_cond); WT_TRET(__wt_thread_join(session, &conn->sweep_tid)); diff --git a/src/third_party/wiredtiger/src/docs/arch-cursor.dox b/src/third_party/wiredtiger/src/docs/arch-cursor.dox index 60e47c5a8ad..7eee860af86 100644 --- a/src/third_party/wiredtiger/src/docs/arch-cursor.dox +++ b/src/third_party/wiredtiger/src/docs/arch-cursor.dox @@ -3,13 +3,393 @@ Cursors are used in WiredTiger to get and modify data. A caller of WiredTiger uses WT_SESSION::open_cursor to create a WT_CURSOR. Methods on the WT_CURSOR can then be used to -position, iterate, get, and set data. +position, iterate, get, and set data. In the typical case, a cursor +will be used to access keys and values in a Btree. However, cursors +can also used to access indexed data, WiredTiger statistics, log files, +and metadata. Additionally, cursors are used for managing backups. -Depending on the <code>uri</code> used when creating a cursor, the cursor will -be internally implemented as one of the many cursor structures that include -WT_CURSOR_BTREE, WT_CURSOR_BACKUP, WT_CURSOR_INDEX, WT_CURSOR_LOG, -WT_CURSOR_METADATA, WT_CURSOR_STAT. Each of these structures starts +The various kinds of cursors are created by the WT_SESSION::open_cursor call. +Depending on the <code>uri</code> used when opening a cursor, the cursor will +be implemented internally as one of the many cursor structures that include +\c WT_CURSOR_BTREE, \c WT_CURSOR_BACKUP, \c WT_CURSOR_INDEX, \c WT_CURSOR_LOG, +\c WT_CURSOR_METADATA, \c WT_CURSOR_STAT, \c WT_CURSOR_TABLE. Each of these structures starts with the common \c %WT_CURSOR structure, which contain all of the data and method pointers that make up the public part of the API. +Thus, any one of these "extended cursor structs" can be allocated and +returned as a WT_CURSOR pointer. Since the method pointers are filled with +a specific implementations, for example \c __curtable_reset, a call to +WT_CURSOR::reset will call the specific implementation function, and the first +argument can be cast to be a pointer to the cursor struct used by the implementation. +Thus, in our C code, we have something similar to a class hierarchy, having +an abstract base class (WT_CURSOR) with virtual methods, and a number of +implementation classes. + +The code for each cursor type's methods are generally organized in a single file. +For example, a backup cursor is implemented in \c src/cursor/cur_backup.c . +Similarly, shared or utility cursor methods are defined in \c src/cursor/cur_std.c . + +Several cursor types have the concept of subordinate cursors, or child +cursors. For example, a table cursor is composed of subordinate file cursors, +each representing a column group. A metadata cursor has a subordinate +cursor that is a file cursor on the metadata file. + +Every open cursor appears on a list in \c WT_SESSION->cursors. This list is ordered +in a way that cursors are closed when the session is closed. Thus, if a subordinate +cursor needs to be closed before its parent, it must be listed before the parent. + +@section arch_cursor_raw Data translation + +Cursors that expose Btree data, like file, table and index cursors, return a set of keys and values, +translating encoded data to types that match the schema. For example, if the \c value_format specified for +a table is \c "iSi" (an integer, a null-terminated string, an integer), then data `{0, "abc", 5}` might +be stored in the Btree in a packed format as: +``` +0x80 0x61 0x62 0x63 0x00 0x85 +``` +When retrieved, the values are decoded and stored into typed variables whose addresses are passed to WT_CURSOR::get_value. + +As an aside, `0x80` represents `0` using a variable length encoding. Using 0x80 as zero allows negative integers +to be stored (\c -1 is \c 0x7f) in a way that they will sort before zero and positive integers. Small integer values +can be stored in a single byte. See comments in \c src/include/intpack_inline.h for more information. + +When creating a cursor via WT_SESSION::open_cursor, the *raw* flag can be used. This has the effect +of disabling the translation provided by the schema, transferring a single block of unencoded data +for the key or value. + +@section arch_cursor_file File cursors + +File cursors (also known as Btree cursors) are one of fundamental kind of cursors, allowing +direct accesses to WiredTiger Btrees. The implementation structure for a file +cursor is \c WT_CURSOR_BTREE. The file cursor methods are generally small wrappers around calls +into the Btree layer, where the \c WT_CURSOR_BTREE structure is used. The Btree layer handles +all aspects of cursor positioning, and transfers of raw key and value data. + +@section arch_cursor_table Table and index cursors + +A table cursor is a higher level concept built on file cursors. A WiredTiger table allows data +to be physically split into separate Btrees, this is done via the concept of column groups. Column +groups may be defined that contain a set of named columns (`columns` are synonymous with `fields`). +Each column group's columns are the stored in a single btree and may be looked up by the table's key. +See @ref schema_column_groups for API details. + +Tables may also have indices. These are implemented as Btrees mapping the index key(s) to the +main key(s) of the table. Indices may be added after the data is populated in the main table. +This requires the index to be filled at the time it is added. Index cursors are implemented using +\c WT_CURSOR_INDEX. Methods on this cursor that set and get values know to use the value that appears +in the index as a key into the main table. +See @ref schema_indices for API details. + +Internally, we define the concept of a "simple" table as one that has no named columns. *Columns*, +that is the set of keys and values, may be optionally named when creating a table. +With no named columns, column groups are not possible (as they must reference names), +and all of a table's keys and values reside in a single file. +Also without named columns, there is no possibility that a cursor can use projections. +A "simple" table, by its nature, can be implemented by a single Btree file and needs no special translation. +Thus, if a cursor is opened on a "simple" table, we can return a file cursor on the single file used +to store its data, instead of a table cursor. This optimization means that every cursor method +goes directly to the file cursor implementation, saving CPU time throughout the lifetime of the cursor. + +@section arch_cursor_projections Projections and plans + +Projections are an indication, when a cursor is opened, that the indicated values, and possibly some keys, +should be returned by WT_CURSOR::get_value calls. This is only available for table cursors. +If the table was configured with column groups, a projection has a bearing on which column group +files must be opened in a cursor. When a subset of values is returned, it's possible that some +column groups will not be needed. To implement projections and column groups, cursors use a *plan*. + +A plan is a string that indicates a series of actions that must be taken to retrieve the needed +values from the subordinate cursors in the table cursor. Remember that each column group gets +its own cursor. When a table is created, a default plan is created that asks to copy all +columns from each column group cursor in order. +When a projection is used for a cursor, a more complex plan may be created. +A plan contains numbers and action letters. The numbers are arguments, the +action letters are commands. The actions 'k' and 'v' indicate that the subordinate +cursor indicated by the numeric argument will be used to get keys or values that follow. +The 'n' action gets one or more next key or value columns from the cursor. The 's' skips +one or more columns. Switching to another cursor resets the cursor to point at the first +column in the indicated cursor. This allows a way to find keys and values in arbitrary +order. + +As a contrived example, suppose we have a table with two keys +\c "k1,k2" and four values \c "x1,x2,x3,x4". The column groups are created as follows: +``` + session->create(session, "colgroup:main:c1", "columns=(x1,x2)"); + session->create(session, "colgroup:main:c2", "columns=(x3,x4)"); +``` +The column group \c "main:c1" has keys "k1,k2" and values "x1,x2"; +the column group \c "main:c2" has keys "k1,k2" and values "x3,x4". A table cursor +on the main table will have two sub-cursors, one for each column group. +Now consider the ill-considered projection that is opened as: +``` + session->open_cursor(session, "table:main(x1,x4,x3,k2,k1,x2,x4)", NULL, NULL, &cursor); + +``` +In this case, the plan is this string (with spacing added): +``` +0v n 1v s n 1v n 0k s n 0k n 0v s n 1v s n +``` +To break it down, +- \c "0v" means use sub-cursor 0 and prepare to read values. +- \c "n" means get the next value, that is \c "x1". +- \c "1v" means use sub-cursor 1 and prepare to read values. +- \c "s" means skip a value, that is \c "x3". +- \c "n" means get the next value, that is \c "x4". +- \c ... + +Notice that this plan requires many switches of cursors and several \c "s" (skip) operations. +Each skip involves enough decoding of the data item to determine its length so its data can +be skipped over. + +With the default (complete) projection, getting values is fast. Using the default plan, the needed +columns are pulled out of each subordinate cursor one by one, and get copied to the caller's arguments. +With projections, the simple algorithm following the plan works well if the columns in the projection +are grouped by column group and requested in order. Without that discipline, as in this example, +the performance will not be optimal. + +The implementation of plan creation and execution resides in the @ref arch-schema. + +@section arch_cursor_dump Dump cursors + +Dump cursors are used in two rather different ways. Regular dump cursors retrieve the raw keys and values +and translate bytes either as raw characters or as hex values. This flavor of dump cursor is used +by the `wt dump` or `wt dump -x` utility. JSON dump cursors do more sophisticated translation, returning a string that is a JSON +formatted record, with the name for each key and its data and the name for each value and its corresponding data. +Data is translated to either integral, floating point, or string depending on the format of the column in the schema. +This flavor of dump cursor is used by the `wt dump -j` command as part of creating a JSON dump of a WiredTiger table or file. + +There is a single \c WT_CURSOR_DUMP struct that is used to implement both flavors. +The dump code checks for the \c WT_CURSTD_DUMP_JSON flag and as needed, calls into functions +in `src/cursor/cur_json.c` . The code in that file also implements several external functions that are used +by the `wt` utility when loading JSON-dumped files. In particular, the \c __wt_json_token function +returns individual JSON tokens from an input string. + +The JSON code used by the dump cursor uses some storage that hangs off of `cursor->json_private` which +is typed as \c WT_CURSOR_JSON. When a JSON flavored cursor is created, the list of key column names +and value column names is populated in \c WT_CURSOR_JSON. These names, obtained from the +configuration string that created the table or file, are useful to have in advance, +as they are used once per row to help fill out the JSON output. The functions that get +rows iterate these names and unpack the corresponding column data, +converting them into the appropriate JSON format for the data. + +@section arch_cursor_backup Backup cursors + +A backup cursor is used to manage backups. It is implemented using a \c WT_CURSOR_BACKUP structure. +A backup cursor can be configured to do a full backup or an incremental backup. +First, we'll look at full backups. + +A backup cursor for a full backup returns the set of files that need to be copied to achieve the +backup. The backup cursor, when opened, ensures that it is the only backup cursor running +in the system and returns an error if not. This is managed using the +\c WT_CONNECTION_IMPL->hot_backup_start variable, which can only be accessed when the +connection schema lock is held. A non-zero value means a hot backup is in progress. +Closing the backup cursor sets it back to zero. + +Having an open backup influences actions +elsewhere in WiredTiger, since part of the backup protocol involves the application copying +whole data files. Thus, having an open backup may cause the block manager and log file server +thread to avoid truncating data and log files. A truncation of a file being copied by the +application would be unexpected. Also, open checkpoints are not deleted during the +course of a backup. + +When the backup cursor is initialized, the complete set of files needed to back up is generated +and stored in the cursor. This makes the backup's \c next function easy as it just returns the next +file in the list. + +Incremental backups work much the same way, except that the file list is reduced to files that +have changed since a previous backup referenced in the configuration when the cursor is opened. +The other twist is that for each file returned, the caller does a duplicate operation on the backup +cursor, and the duplicate code actually returns a specialized incremental backup cursor. This kind +of cursor has its own \c next method that causes it to return information about individual pieces +of this file that need to be copied. The code to implement incremental cursors +is in \c src/cursor/cur_backup_incr.c . + +@section arch_cursor_join Join cursors + +Join cursors implement a join mechanism for WiredTiger. The idea is that joins can be configured +by opening a special join cursor on a table, and attaching the table's index cursors to it, to +return rows that match a filter, like: +``` +(row.price > 100 and row.price <= 200) and row.in_stock < 10 +``` +Building up the conditions essentially creates a tree that is used for evaluation. This is stored +in \c WT_CURSOR_JOIN, and is returned as the cursor object. Entries in the tree representing +an index's participation in one clause are stored in \c WT_CURSOR_JOIN_ENTRY objects. \c WT_CURSOR_JOIN and +\c WT_CURSOR_JOIN_ENTRY objects can be composed in a hierarchical manner, representing the shape of the tree representing +the query. These two structures are somewhat static, being created when the join is created. + +The join cursor contains a pointer to a \c WT_CURSOR_JOIN_ITER, which in some sense encodes the "position" +of the cursor. To get the next row that satisfies a join requires that multiple cursors be iterated. +Generally, the "left-most" index cursor is iterated first. Using the example above, the \c price +index would be iterated, skipping over any entries that did not satisfy `(row.price > 100 and row.price <= 200)`. +Then other cursors are checked to see that any other conditions are satisfied. +The join may be configured to use Bloom filters, and when that occurs, +\c WT_CURSOR_JOIN_ENTRIES contain the bloom filter for individual index checks. This allows checks +to occur quickly, at the expense of an initialization that occurs when the cursor begins iteration. + +During a join cursor iteration, multiple \c WT_CURSOR_JOIN_ITER objects may be created. +There is one \c WT_CURSOR_JOIN_ITER object corresponding to each level of nesting (queries can have +arbitrary nesting of \c AND/OR). Each \c WT_CURSOR_JOIN_ITER allows the question to be asked: does +the current position of the main table cursor satisfy the join conditions for this part of the join tree? + +Part of the reason for this dynamic structure is that disjunctions may require some dynamic action. +Consider +``` +(row.price > 100 and row.in_stock < 10) .... or (row.aisle == 12 and row.on_sale == 1) +``` +The last part of the query (being part of an OR clause) will be first executed after a number of items +are returned that satisfy the first part of the query. If the join cursor is closed before then, +it would be a waste to have opened subordinate cursors on the \c aisle and \c on_sale indices, and +potentially computed bloom filters, etc. + +When positioned on an entry that represents a nested join, a new child \c WT_CURSOR_JOIN_ITER is +created that will be bound to the nested \c WT_CURSOR_JOIN. That iterator is then used to generate candidate +primary keys. When its iteration is completed, that iterator is destroyed and the parent iterator +advances to the next entry. Thus, depending on how deeply joins are nested, a similarly deep +stack of iterators is created. + +@section arch_cursor_duplicate Duplicating cursors + +Cursors may be duplicated, this occurs by passing a cursor to be duplicated as part of the +WT_SESSION::open_cursor call. Cursor duplication does not occur in the cursor type code. +Rather, a new cursor of the requested type is created, and the cursor's position is duplicated +via a call to \c __wt_cursor_dup_position. This function gets the key from the original cursor +in *raw* form (not converting it using \c key_format), sets the key in the new cursor, and +does a search to set the position properly. + +@section arch_cursor_dhandle File cursors, Btrees and data handles + +File cursors, Btrees and data handles exist in a WiredTiger system as different ways to +reference the data in a Btree. It is useful to understand the differences between these structures +and how they are used. + +At the bottom is the data handle (also known as *dhandle*). This is an abstraction of an operating +system file handle, with a set of flags and some reference counts. A Btree is a much larger +abstraction, with a memory cache of key value pairs along with functions to read and write data +as needed to and from the data file. A Btree is paired with a data handle to allow the transfer of data. +Both the data handle and the Btree are owned by the connection. That is, they are shared among all sessions. + +File cursors, on the other hand, are owned by a session. When a session opens a cursor on a file for the +first time in that session, a file cursor is created. This occurs even if the file may be opened by cursors +in other sessions already. The session owns the cursor, and the cursor may only be used by that session. +Open cursors do increment reference counts in the data handle, so that the data handle "knows" it is +being used, so that the file may not be dropped, renamed, verified or salvaged. +So when WT_CURSOR::close is called for a file cursor, the cursor's memory may be released (or +retained if cached), and reference counts decremented. Other sessions may retain open cursors on that file, +they are independent. + +@section arch_cursor_caching Cursor caching + +Cursors, upon closing, may be cached in the session. An open of the same URI will return a cached +cursor if one is found matching the URI. Cursor caching is currently only done on file cursors. +Because of the optimization for simple tables described above, cursors on simple tables are also cached. + +To help implement caching, two methods, \c cache and \c reopen have been added to the cursor API. +These are not public. Their function is to perform cursor-type specific operations to change a cursor +from an open state to a cached state (\c cache) and change from a cached state to the open state (\c reopen). + +When a cursor is opened the first time, it is marked as *cacheable* or not. Cursors that specify +certain options, like bulk loading, random, or readonly, are not cacheable. When a cursor is closed, the +cursor is checked if it is cacheable. If so and if cursor caching is enabled in the session, then it will be +cached. Cached cursors live in a hash table that is owned by the session. A hash function on the URI is used +to determine which hash bucket to use. We compute the hash of the URI once, its value is stored in +in the cursor for future use. Thus, caching a cursor (what happens within the type-specific \c cache function +(e.g. \c __curfile_cache) is relatively quick: + +- Free storage that we don't want held (for example, storage used by the cursor's key and value). +- Get a *weak* reference to the data handle (increment \c dhandle->session_ref). +- Release the *strong* reference to the data handle (decrement \c dhandle->session_inuse). +- Determine the hash bucket needed (using the hash value in the cursor). +- Move the cursor from the session's open list to the list in the hash bucket. +- Increment statistics and decrement the connection's open cursor count. +- Set the cache cursor flag. +- Unlock the dhandle. + +The change of reference from a *strong* to a *weak* is significant. When a dhandle's \c session_inuse +(*strong* reference) drops to zero, it means that no cursor is open on the dhandle, and the only +references are from cached cursors. In this state, the dhandle may be marked dead by the dhandle sweep. +When the dhandle is dead, the dhandle's memory will still persist, but each session will eventually +notice, during its cursor cache sweep, and "fully close" the cursor, removing it from the cache list, +releasing its weak reference before freeing the cursor. Each session holding cached cursors must have +some periodic activity that causes it to run its sweep, an occasional call to WT_SESSION::reset will suffice. +After a dhandle has been dead for enough time, it is expected that all of its weak references will drop to zero, +and the dhandle itself can be freed by the dhandle sweep. + +If there is a failure during the \c cache function, then we would want to fully close the cursor. +Rather than having special case code to handle this rare condition, we instead call \c reopen to +temporarily bring the cursor back to an open state, and turn cursor caching off temporarily in +the session while we close the cursor, releasing all its resources and references. + +During a cursor open, if the cursor configuration options allow caching, we hash the uri, and look +at the corresponding hash bucket in the session cursor cache. If we find a matching cursor, we +call \c reopen on the cursor. This is what happens within the call to the type-specific \c reopen function +(e.g. \c __curfile_reopen): + +- Lock the dhandle. +- If the dhandle is no longer open, release it and mark the reopen to fail (but continue). +- Get a "strong" reference to the data handle (increment \c dhandle->session_inuse). +- Release the "weak" reference to the data handle (decrement \c dhandle->session_ref). +- Increment statistics and the connection's open cursor count. +- Move the handle from the hash bucket to the session's open list (the hash value was previously saved in the cursor) +- Clear the cache cursor flag. +- Update convenient pointers within the cursor to parts of the WT_BTREE that may have changed + which the cursor was cached. + +If the reopen fails (probably due to the dhandle no longer being open or being marked dead), +we have ensured that enough of the cursor is opened so that it can be legally closed. +We have studiously avoided having a cursor that is in some state that is half-open and half-closed, +as it is hard to know how to dispose of it. + +@section arch_cursor_sweep Session cursor sweep + +Consider a large system that has many sessions using the same set of tables. When all sessions +have closed a particular table, there will be no need to keep the underlying data handle open. +In fact, if a WT_SESSION::drop call is called, we want to ensure that the data handle has been +closed and the corresponding file is removed. Some systems, like Windows, require that all open +file handles be closed before a file can be successfully removed from the file system. So there +is motivation to periodically mark data handles that have no active references, and have sessions +free cached cursors that have weak references to such marked data handles. The former job +occurs in the connection sweep code. The latter job occurs in the session cursor cache sweep. + +The session cursor cache sweep currently happens in the WT_CURSOR::close call, +and also on calls to WT_SESSION::reset. On one hand, we don't want the overhead +of a sweep to occur too frequently. It is quite possible that both close and reset can be called a lot +and there may be many of thousands of cached cursors in the session. For that reason, we'd like +to do the sweep in small increments, and not on each call. On the other hand, in a larger system, +a session may be part of a pool servicing higher level requests. When a session completes its work, +it may be left idle by the caller of WiredTiger, and such a session may +then be idle for long periods of time. When cursor caching is enabled and sessions are not active, +we want even occasional calls to WT_SESSION::reset to have a strong effect. We want occasional +sweeps to keep up with freeing up references to data handles, so that otherwise unused data handles +may in turn be freed eventually. + +Our solution to this is three-fold. First, every time we want to call the sweep, a countdown +counter is used, so we only consider a sweep every \c WT_SESSION_CURSOR_SWEEP_COUNTDOWN times +(currently `WT_SESSION_CURSOR_SWEEP_COUNTDOWN == 40`). Secondly, we won't sweep if it's already +been done this second in time. Finally, we sweep by walking a small set of buckets, initially 5 out of +typically 512 configured buckets. However, depending on how productive our sweep is, that is, +how many references to closed data handles are freed, we may continue our walk. This should +usually strike a good balance between not having a lot of overhead for sweeps, and keeping up with +the need to free up shared resources. + +@section arch_cursor_debug_copy Debug copy + +When cursors are positioned, their data may point to data in the btree or data allocated in the cursor. +A caller may use a pointer to that data until the next cursor call. After that, the pointer should not be +considered valid. By default, WiredTiger does not enforce this. When opening a cursor, a \c "debug=copy" configuration +flag can be used. This forces any data that is returned by WT_CURSOR::get_key or WT_CURSOR::get_value +to be in malloc'd memory, and explicitly freed on the next API call. Systems that +are instrumented to track memory references can detect the references to freed memory, thus latent +bugs can be detected. + +The implementation is straightforward. The key and value in each cursor is represented as a \c WT_ITEM. +A \c WT_ITEM includes a pointer and size, and it can point to arbitrary memory. However, the \c WT_ITEM +also includes a memory buffer that may or may not be allocated. When the \c WT_ITEM pointer points +to the item's own memory buffer, then it is already in malloc'd memory. When \c "debug=copy" is +configured, it is a simple matter to check if a key and value being returned are already in the item's malloc'd memory. +If not, memory is allocated, the copy is made and the item's pointer is updated. On the beginning +of the next API call using that cursor, the item's malloc'd memory is overwritten and freed. +Thus, in the presence of a memory tracker, uses +of "stray" pointers will be detected. Even without a memory tracker, uses of "stray" pointers into +the freed storage will likely yield the overwritten bytes, and not the previously seen key or value. */ diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox index 21af5d1f62f..52ebc0120d4 100644 --- a/src/third_party/wiredtiger/src/docs/arch-index.dox +++ b/src/third_party/wiredtiger/src/docs/arch-index.dox @@ -184,6 +184,10 @@ WiredTiger has a Python API that is useful for scripting and experimentation. Row Stores are Btrees that have a variable size key and data. +@subpage arch-rts + +Rollback to stable to remove the unstable updates from the database. + @subpage arch-schema A schema defines the format of the application data in WiredTiger. diff --git a/src/third_party/wiredtiger/src/docs/arch-rts.dox b/src/third_party/wiredtiger/src/docs/arch-rts.dox new file mode 100644 index 00000000000..451732f5776 --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/arch-rts.dox @@ -0,0 +1,113 @@ +/*! @arch_page arch-rts Rollback to stable + +Rollback to stable is an operation that retains only the modifications that +are stable according to the stable timestamp and recovered checkpoint snapshot. +The checkpoint transaction snapshot details are saved at the end of every +checkpoint are recovered and used as a recovered checkpoint snapshot. + +@section rts-overview Overview of rollback to stable + +Rollback to stable scans each and every table present in the database except +metadata @ref arch-metadata to remove the modifications from the table +that are more than stable timestamp and recovered checkpoint snapshot. + +In the process of removing newer modifications from the table, all the in-memory +updates are aborted and the on-disk version updates are replaced with an update +from history store otherwise the data store version is removed. + +Rollback to stable is performed in three phases +1. WT startup +2. WT shutdown +3. Application initiated + +To improve the performance of rollback to stable operation, rollback to stable +will perform only on particular tables that need rollback. Rollback to stable +doesn't operate on logged tables as the updates on these are stable when the +transaction gets committed. + +@section rts-stable-update Stable update of rollback to stable + +According to rollback to stable, the stable version of update is an update that +has durable timestamp of less than or equal to the stable timestamp and it's +transaction id must be committed according to the checkpoint snapshot. + +@section rts-preconditions Pre-conditions required for rollback to stable + +To perform rollback to stable, there shouldn't be any transaction activity happening +in the WiredTiger. + +@section rts-table-check Checks performed on a table by rollback to stable + +Rollback to stable consider a table to be processed for rollback based on the following +conditions. +1. Table is modified +2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp. +3. There is no durable timestamp in any checkpoint. +4. Has prepared updates +5. Has updates from transactions greater than checkpoint snapshot (only in restart phase) + +There are some special conditions where the table is skipped for rollback to stable. +1. Empty file +2. Table has timestamp updates but there is no stable timestamp set. +3. Files that don't exist +4. Files that are corrupted. + +If the table has no timestamp updates, then the history store table is +scanned to remove any historical versions related to this table as these +older versions no longer required. + +Once all the tables are process for rollback to stable, at the end the history store +is processed to remove any unstable updates more than the stable timestamp. + +@section rts-how How rollback to stable fixes the unstable updates + +Once a table is identified to perform rollback to stable, it reads the pages into +the cache if they don't exist and process it to remove the unstable updates. + +There are two types of rollbacks that rollback to stable performs: +1. Rolling back unstable fast truncate +2. Rolling back unstable updates + +All internal pages are traversed to rollback unstable fast truncate operations and +leaf pages are traversed to remove the unstable updates. + +Once the leaf page is identified to rollback the updates, it is performed in the +following order. +1. Check smallest insert lists on the page +2. Traverse through all the on-disk keys + a. Check update list + b. Check insert list + c. Check the on-disk version if no stable update found in the update list. +3. Traverse through the reconciled pages to abort any history store updates. + +@section rts-abort-update How rollback to stable aborts in-memory updates + +Traverse through all the updates in the update list and abort them until a stable +update is found. + +@section rts-abort-on-disk-update How rollback to stable aborts on-disk update + +If the start time pair is not stable try to find a valid update from the history +store that is stable to replace the on-disk version otherwise remove the on-disk +key. In case if the stop time pair if exists and if its not stable, restore the +on-disk update again into the update list. + +To remove any existing update, rollback to stable adds globally visible tombstone to +the key update list and this key will get removed later during the reconciliation. + +Note: As of now, rollback to stable don't work on removing on-disk columnar updates. + +@section rts-hs-search Rollback to stable history store search + +Rollback to stable searches the history store to find a stable update to replace an +unstable update in the data store. It searches the history store with the given data +store key with a maximum timestamp and traverse back till a stable update found. If +no valid update is found the data store key is removed. + +@section rts-page-skip Skipping reading unnecessary pages into memory + +Rollback to stable doesn't load the pages that don't have any unstable updates to be +removed to improve the performance of rollback to stable by verifying the time aggregated +values with the stable timestamp or recovered checkpoint snapshot during the tree walk. + +*/ diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index be45196ff97..5a37d260c1e 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -17,6 +17,7 @@ Christoph Collet's Coverity Coverity's +CURSTD DB's DBTs DHANDLE @@ -51,6 +52,8 @@ Google's HyperDex HyperLevelDB IEC +IMPL +ITER JDK JIRA JavaScript @@ -114,6 +117,7 @@ Yann Za Zstd aR +abc abstime ack'ed ajn @@ -149,6 +153,7 @@ bufs builtin builtins bzip +cacheable cachesize calc callbk @@ -180,6 +185,7 @@ crashless crc curfile cursortype +curtable customerABC cv dN @@ -197,6 +203,7 @@ dbformat dbm dbt decl +decrement decrementing decrypt decrypted @@ -291,6 +298,7 @@ html htmlinclude huffman hugepage +iSi icount ie iflag @@ -299,6 +307,7 @@ indices init insn intl +intpack inuse io ip @@ -469,6 +478,7 @@ rmw ro rotn rpc +rts runnable runtime rwlock @@ -497,6 +507,7 @@ src ssd startsync startuml +startup statlog stderr stdout diff --git a/src/third_party/wiredtiger/src/history/hs_cursor.c b/src/third_party/wiredtiger/src/history/hs_cursor.c index 333ec0e25bb..6baaf94847c 100644 --- a/src/third_party/wiredtiger/src/history/hs_cursor.c +++ b/src/third_party/wiredtiger/src/history/hs_cursor.c @@ -180,7 +180,7 @@ __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t bt static int __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, - WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw) + WT_ITEM *base_value_buf) { WT_CURSOR *hs_cursor; WT_CURSOR_BTREE *hs_cbt; @@ -333,14 +333,14 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true); if (ret == WT_NOTFOUND) { /* - * Fallback to the onpage value as the base value. + * Fallback to the provided value as the base value. * * Work around of clang analyzer complaining the value is never read as it is reset * again by the following WT_ERR macro. */ WT_NOT_READ(ret, 0); orig_hs_value_buf = hs_value; - hs_value = on_disk_buf; + hs_value = base_value_buf; upd_type = WT_UPDATE_STANDARD; break; } @@ -356,9 +356,9 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp)); if (hs_btree_id != btree_id) { - /* Fallback to the onpage value as the base value. */ + /* Fallback to the provided value as the base value. */ orig_hs_value_buf = hs_value; - hs_value = on_disk_buf; + hs_value = base_value_buf; upd_type = WT_UPDATE_STANDARD; break; } @@ -366,24 +366,22 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); if (cmp != 0) { - /* Fallback to the onpage value as the base value. */ + /* Fallback to the provided value as the base value. */ orig_hs_value_buf = hs_value; - hs_value = on_disk_buf; + hs_value = base_value_buf; upd_type = WT_UPDATE_STANDARD; break; } /* - * If we find a history store record that either corresponds to the on-disk value or is - * newer than it then we should use the on-disk value as the base value and apply our - * modifies on top of it. + * If the stop time pair on the tombstone in the history store is already globally + * visible fall back to the base value. This is possible in scenarios where the latest + * updates are aborted by RTS according to stable timestamp. */ - if (on_disk_tw->start_ts < hs_start_ts_tmp || - (on_disk_tw->start_ts == hs_start_ts_tmp && - on_disk_tw->start_txn <= hs_cbt->upd_value->tw.start_txn)) { - /* Fallback to the onpage value as the base value. */ + if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) { + /* Fallback to the provided value as the base value. */ orig_hs_value_buf = hs_value; - hs_value = on_disk_buf; + hs_value = base_value_buf; upd_type = WT_UPDATE_STANDARD; break; } @@ -451,7 +449,7 @@ err: */ int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno, - WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf, WT_TIME_WINDOW *on_disk_tw) + WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf) { WT_BTREE *btree; WT_DECL_RET; @@ -460,8 +458,8 @@ __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma WT_RET(__wt_hs_cursor_open(session)); WT_WITH_BTREE(session, CUR2BT(session->hs_cursor), - (ret = __hs_find_upd_int(session, btree->id, key, value_format, recno, upd_value, - allow_prepare, on_disk_buf, on_disk_tw))); + (ret = __hs_find_upd_int( + session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf))); WT_TRET(__wt_hs_cursor_close(session)); return (ret); } diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index f11210061b1..79addcfd048 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -252,18 +252,19 @@ struct __wt_btree { * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these * flag values for that reason, there's no way to start at an offset. */ -#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ -#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ -#define WT_BTREE_CLOSED 0x000400u /* Handle closed */ -#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ -#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */ -#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */ -#define WT_BTREE_READONLY 0x008000u /* Handle is readonly */ -#define WT_BTREE_SALVAGE 0x010000u /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x020000u /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x040000u /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x080000u /* Handle is for verify */ +#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ +#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ +#define WT_BTREE_CLOSED 0x000400u /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ +#define WT_BTREE_NO_CHECKPOINT 0x002000u /* Disable checkpoints */ +#define WT_BTREE_NO_LOGGING 0x004000u /* Disable logging */ +#define WT_BTREE_OBSOLETE_PAGES 0x008000u /* Handle has obsolete pages */ +#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */ +#define WT_BTREE_SALVAGE 0x020000u /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x040000u /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x080000u /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x100000u /* Handle is for verify */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index f62570a8041..5af42c1a0e7 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -548,35 +548,42 @@ struct __wt_connection_impl { */ WT_FILE_SYSTEM *file_system; +/* + * Server subsystem flags. + */ +/* AUTOMATIC FLAG VALUE GENERATION START */ +#define WT_CONN_SERVER_CAPACITY 0x01u +#define WT_CONN_SERVER_CHECKPOINT 0x02u +#define WT_CONN_SERVER_LOG 0x04u +#define WT_CONN_SERVER_LSM 0x08u +#define WT_CONN_SERVER_STATISTICS 0x10u +#define WT_CONN_SERVER_SWEEP 0x20u + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + uint32_t server_flags; + /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CONN_CACHE_CURSORS 0x0000001u -#define WT_CONN_CACHE_POOL 0x0000002u -#define WT_CONN_CKPT_SYNC 0x0000004u -#define WT_CONN_CLOSING 0x0000008u -#define WT_CONN_CLOSING_NO_MORE_OPENS 0x0000010u -#define WT_CONN_CLOSING_TIMESTAMP 0x0000020u -#define WT_CONN_COMPATIBILITY 0x0000040u -#define WT_CONN_DATA_CORRUPTION 0x0000080u -#define WT_CONN_EVICTION_RUN 0x0000100u -#define WT_CONN_FILE_CLOSE_SYNC 0x0000200u -#define WT_CONN_HS_OPEN 0x0000400u -#define WT_CONN_INCR_BACKUP 0x0000800u -#define WT_CONN_IN_MEMORY 0x0001000u -#define WT_CONN_LEAK_MEMORY 0x0002000u -#define WT_CONN_LSM_MERGE 0x0004000u -#define WT_CONN_OPTRACK 0x0008000u -#define WT_CONN_PANIC 0x0010000u -#define WT_CONN_READONLY 0x0020000u -#define WT_CONN_RECONFIGURING 0x0040000u -#define WT_CONN_RECOVERING 0x0080000u -#define WT_CONN_SALVAGE 0x0100000u -#define WT_CONN_SERVER_CAPACITY 0x0200000u -#define WT_CONN_SERVER_CHECKPOINT 0x0400000u -#define WT_CONN_SERVER_LOG 0x0800000u -#define WT_CONN_SERVER_LSM 0x1000000u -#define WT_CONN_SERVER_STATISTICS 0x2000000u -#define WT_CONN_SERVER_SWEEP 0x4000000u -#define WT_CONN_WAS_BACKUP 0x8000000u +#define WT_CONN_CACHE_CURSORS 0x000001u +#define WT_CONN_CACHE_POOL 0x000002u +#define WT_CONN_CKPT_SYNC 0x000004u +#define WT_CONN_CLOSING 0x000008u +#define WT_CONN_CLOSING_NO_MORE_OPENS 0x000010u +#define WT_CONN_CLOSING_TIMESTAMP 0x000020u +#define WT_CONN_COMPATIBILITY 0x000040u +#define WT_CONN_DATA_CORRUPTION 0x000080u +#define WT_CONN_EVICTION_RUN 0x000100u +#define WT_CONN_FILE_CLOSE_SYNC 0x000200u +#define WT_CONN_HS_OPEN 0x000400u +#define WT_CONN_INCR_BACKUP 0x000800u +#define WT_CONN_IN_MEMORY 0x001000u +#define WT_CONN_LEAK_MEMORY 0x002000u +#define WT_CONN_LSM_MERGE 0x004000u +#define WT_CONN_OPTRACK 0x008000u +#define WT_CONN_PANIC 0x010000u +#define WT_CONN_READONLY 0x020000u +#define WT_CONN_RECONFIGURING 0x040000u +#define WT_CONN_RECOVERING 0x080000u +#define WT_CONN_SALVAGE 0x100000u +#define WT_CONN_WAS_BACKUP 0x200000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 68e636cdebf..405ae73401b 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -763,8 +763,8 @@ extern int __wt_hs_delete_key_from_ts( WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, - uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf, - WT_TIME_WINDOW *on_disk_tw) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) @@ -2070,7 +2070,10 @@ static inline int __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - WT_UPDATE *upd, WT_UPDATE **prepare_updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline int __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_UPDATE *upd, WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_search_check(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index e1c6cea488a..3635dd41d1b 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -783,8 +783,10 @@ struct __wt_connection_stats { int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_hs_restore_tombstones; + int64_t txn_rts_hs_restore_updates; int64_t txn_rts_sweep_hs_keys; int64_t txn_rts_hs_removed; + int64_t txn_checkpoint_obsolete_applied; int64_t txn_update_conflict; }; @@ -996,8 +998,10 @@ struct __wt_dsrc_stats { int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_hs_restore_tombstones; + int64_t txn_rts_hs_restore_updates; int64_t txn_rts_sweep_hs_keys; int64_t txn_rts_hs_removed; + int64_t txn_checkpoint_obsolete_applied; int64_t txn_update_conflict; }; diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index 9aa86728d59..f4a4c552ddb 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -836,18 +836,21 @@ __wt_upd_alloc_tombstone(WT_SESSION_IMPL *session, WT_UPDATE **updp, size_t *siz } /* - * __wt_txn_read_upd_list -- - * Get the first visible update in a list (or NULL if none are visible). + * __wt_txn_read_upd_list_internal -- + * Internal helper function to get the first visible update in a list (or NULL if none are + * visible). */ static inline int -__wt_txn_read_upd_list( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, WT_UPDATE **prepare_updp) +__wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, + WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp) { WT_VISIBLE_TYPE upd_visible; uint8_t prepare_state, type; if (prepare_updp != NULL) *prepare_updp = NULL; + if (restored_updp != NULL) + *restored_updp = NULL; __wt_upd_value_clear(cbt->upd_value); for (; upd != NULL; upd = upd->next) { @@ -888,6 +891,16 @@ __wt_txn_read_upd_list( F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS)) *prepare_updp = upd; + /* + * Save the restored update to use it as base value update in case if we need to reach + * history store instead of on-disk value. + */ + if (restored_updp != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_HS) && + type == WT_UPDATE_STANDARD) { + WT_ASSERT(session, *restored_updp == NULL); + *restored_updp = upd; + } + if (upd_visible == WT_VISIBLE_PREPARE) { /* Ignore the prepared update, if transaction configuration says so. */ if (F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE)) @@ -916,6 +929,16 @@ __wt_txn_read_upd_list( } /* + * __wt_txn_read_upd_list -- + * Get the first visible update in a list (or NULL if none are visible). + */ +static inline int +__wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + return __wt_txn_read_upd_list_internal(session, cbt, upd, NULL, NULL); +} + +/* * __wt_txn_read -- * Get the first visible update in a chain. This function will first check the update list * supplied as a function argument. If there is no visible update, it will check the onpage @@ -927,14 +950,14 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint WT_UPDATE *upd, WT_CELL_UNPACK_KV *vpack) { WT_TIME_WINDOW tw; - WT_UPDATE *prepare_upd; + WT_UPDATE *prepare_upd, *restored_upd; bool have_stop_tw, retry; - prepare_upd = NULL; + prepare_upd = restored_upd = NULL; retry = true; retry: - WT_RET(__wt_txn_read_upd_list(session, cbt, upd, &prepare_upd)); + WT_RET(__wt_txn_read_upd_list_internal(session, cbt, upd, &prepare_upd, &restored_upd)); if (WT_UPDATE_DATA_VALUE(cbt->upd_value) || (cbt->upd_value->type == WT_UPDATE_MODIFY && cbt->upd_value->skip_buf)) return (0); @@ -947,66 +970,77 @@ retry: } /* - * When we inspected the update list we may have seen a tombstone leaving us with a valid stop - * time window, we don't want to overwrite this stop time window. + * Skip retrieving the on-disk value when there exists a restored update from history store in + * the update list. Having a restored update as part of the update list indicates that the + * existing on-disk value is unstable. */ - have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw); - - /* Check the ondisk value. */ - if (vpack == NULL) { - WT_TIME_WINDOW_INIT(&tw); - WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw)); + if (restored_upd != NULL) { + WT_ASSERT(session, !WT_IS_HS(session->dhandle)); + cbt->upd_value->buf.data = restored_upd->data; + cbt->upd_value->buf.size = restored_upd->size; } else { - WT_TIME_WINDOW_COPY(&tw, &vpack->tw); - cbt->upd_value->buf.data = vpack->data; - cbt->upd_value->buf.size = vpack->size; - } - - /* - * If the stop time point is set, that means that there is a tombstone at that time. If it is - * not prepared and it is visible to our txn it means we've just spotted a tombstone and should - * return "not found", except scanning the history store during rollback to stable and when we - * are told to ignore non-globally visible tombstones. - */ - if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) && - !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) { - cbt->upd_value->buf.data = NULL; - cbt->upd_value->buf.size = 0; - cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts; - cbt->upd_value->tw.stop_ts = tw.stop_ts; - cbt->upd_value->tw.stop_txn = tw.stop_txn; - cbt->upd_value->tw.prepare = tw.prepare; - cbt->upd_value->type = WT_UPDATE_TOMBSTONE; - return (0); - } - - /* Store the stop time pair of the history store record that is returning. */ - if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) { - cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts; - cbt->upd_value->tw.stop_ts = tw.stop_ts; - cbt->upd_value->tw.stop_txn = tw.stop_txn; - cbt->upd_value->tw.prepare = tw.prepare; - } + /* + * When we inspected the update list we may have seen a tombstone leaving us with a valid + * stop time window, we don't want to overwrite this stop time window. + */ + have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw); + + /* Check the ondisk value. */ + if (vpack == NULL) { + WT_TIME_WINDOW_INIT(&tw); + WT_RET(__wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw)); + } else { + WT_TIME_WINDOW_COPY(&tw, &vpack->tw); + cbt->upd_value->buf.data = vpack->data; + cbt->upd_value->buf.size = vpack->size; + } - /* If the start time point is visible then we need to return the ondisk value. */ - if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) { - if (cbt->upd_value->skip_buf) { + /* + * If the stop time point is set, that means that there is a tombstone at that time. If it + * is not prepared and it is visible to our txn it means we've just spotted a tombstone and + * should return "not found", except scanning the history store during rollback to stable + * and when we are told to ignore non-globally visible tombstones. + */ + if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) && + !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; + cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts; + cbt->upd_value->tw.stop_ts = tw.stop_ts; + cbt->upd_value->tw.stop_txn = tw.stop_txn; + cbt->upd_value->tw.prepare = tw.prepare; + cbt->upd_value->type = WT_UPDATE_TOMBSTONE; + return (0); + } + + /* Store the stop time pair of the history store record that is returning. */ + if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) { + cbt->upd_value->tw.durable_stop_ts = tw.durable_stop_ts; + cbt->upd_value->tw.stop_ts = tw.stop_ts; + cbt->upd_value->tw.stop_txn = tw.stop_txn; + cbt->upd_value->tw.prepare = tw.prepare; + } + + /* If the start time point is visible then we need to return the ondisk value. */ + if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) { + if (cbt->upd_value->skip_buf) { + cbt->upd_value->buf.data = NULL; + cbt->upd_value->buf.size = 0; + } + cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts; + cbt->upd_value->tw.start_ts = tw.start_ts; + cbt->upd_value->tw.start_txn = tw.start_txn; + cbt->upd_value->tw.prepare = tw.prepare; + cbt->upd_value->type = WT_UPDATE_STANDARD; + return (0); } - cbt->upd_value->tw.durable_start_ts = tw.durable_start_ts; - cbt->upd_value->tw.start_ts = tw.start_ts; - cbt->upd_value->tw.start_txn = tw.start_txn; - cbt->upd_value->tw.prepare = tw.prepare; - cbt->upd_value->type = WT_UPDATE_STANDARD; - return (0); } /* If there's no visible update in the update chain or ondisk, check the history store file. */ if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) { __wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH); WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false, - &cbt->upd_value->buf, &tw)); + &cbt->upd_value->buf)); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index d59138e1bb0..8b760c6df42 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -5875,12 +5875,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1468 /*! transaction: rollback to stable restored tombstones from history store */ #define WT_STAT_CONN_TXN_RTS_HS_RESTORE_TOMBSTONES 1469 +/*! transaction: rollback to stable restored updates from history store */ +#define WT_STAT_CONN_TXN_RTS_HS_RESTORE_UPDATES 1470 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1470 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1471 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1471 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1472 +/*! transaction: transaction checkpoints due to obsolete pages */ +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1473 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1472 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1474 /*! * @} @@ -6484,12 +6488,16 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_TXN_RTS_KEYS_RESTORED 2201 /*! transaction: rollback to stable restored tombstones from history store */ #define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_TOMBSTONES 2202 +/*! transaction: rollback to stable restored updates from history store */ +#define WT_STAT_DSRC_TXN_RTS_HS_RESTORE_UPDATES 2203 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2203 +#define WT_STAT_DSRC_TXN_RTS_SWEEP_HS_KEYS 2204 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2204 +#define WT_STAT_DSRC_TXN_RTS_HS_REMOVED 2205 +/*! transaction: transaction checkpoints due to obsolete pages */ +#define WT_STAT_DSRC_TXN_CHECKPOINT_OBSOLETE_APPLIED 2206 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2205 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2207 /*! * @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 707ef28c086..1ca3f32f262 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -213,7 +213,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) manager->lsm_worker_cookies[i].session = worker_session; } - F_SET(conn, WT_CONN_SERVER_LSM); + FLD_SET(conn->server_flags, WT_CONN_SERVER_LSM); /* Start the LSM manager thread. */ WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager, @@ -269,7 +269,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) removed = 0; /* Clear the LSM server flag. */ - F_CLR(conn, WT_CONN_SERVER_LSM); + FLD_CLR(conn->server_flags, WT_CONN_SERVER_LSM); WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || manager->lsm_workers == 0); if (manager->lsm_workers > 0) { @@ -351,7 +351,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) conn = S2C(session); dhandle_locked = false; - while (F_ISSET(conn, WT_CONN_SERVER_LSM)) { + while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LSM)) { __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 7ebdf736c2f..9581144489a 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1760,7 +1760,7 @@ __transaction_sync_run_chk(WT_SESSION_IMPL *session) conn = S2C(session); - return (FLD_ISSET(conn->flags, WT_CONN_SERVER_LOG)); + return (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)); } /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 1ee255be706..100fa7b3ed8 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -210,8 +210,10 @@ static const char *const __stats_dsrc_desc[] = { "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", "transaction: rollback to stable restored tombstones from history store", + "transaction: rollback to stable restored updates from history store", "transaction: rollback to stable sweeping history store keys", "transaction: rollback to stable updates removed from history store", + "transaction: transaction checkpoints due to obsolete pages", "transaction: update conflicts", }; @@ -456,8 +458,10 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_hs_restore_tombstones = 0; + stats->txn_rts_hs_restore_updates = 0; stats->txn_rts_sweep_hs_keys = 0; stats->txn_rts_hs_removed = 0; + stats->txn_checkpoint_obsolete_applied = 0; stats->txn_update_conflict = 0; } @@ -689,8 +693,10 @@ __wt_stat_dsrc_aggregate_single(WT_DSRC_STATS *from, WT_DSRC_STATS *to) to->txn_rts_keys_removed += from->txn_rts_keys_removed; to->txn_rts_keys_restored += from->txn_rts_keys_restored; to->txn_rts_hs_restore_tombstones += from->txn_rts_hs_restore_tombstones; + to->txn_rts_hs_restore_updates += from->txn_rts_hs_restore_updates; to->txn_rts_sweep_hs_keys += from->txn_rts_sweep_hs_keys; to->txn_rts_hs_removed += from->txn_rts_hs_removed; + to->txn_checkpoint_obsolete_applied += from->txn_checkpoint_obsolete_applied; to->txn_update_conflict += from->txn_update_conflict; } @@ -928,8 +934,10 @@ __wt_stat_dsrc_aggregate(WT_DSRC_STATS **from, WT_DSRC_STATS *to) to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones); + to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates); to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys); to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed); + to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied); to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict); } @@ -1416,8 +1424,10 @@ static const char *const __stats_connection_desc[] = { "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", "transaction: rollback to stable restored tombstones from history store", + "transaction: rollback to stable restored updates from history store", "transaction: rollback to stable sweeping history store keys", "transaction: rollback to stable updates removed from history store", + "transaction: transaction checkpoints due to obsolete pages", "transaction: update conflicts", }; @@ -1929,8 +1939,10 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_hs_restore_tombstones = 0; + stats->txn_rts_hs_restore_updates = 0; stats->txn_rts_sweep_hs_keys = 0; stats->txn_rts_hs_removed = 0; + stats->txn_checkpoint_obsolete_applied = 0; stats->txn_update_conflict = 0; } @@ -2453,8 +2465,10 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_hs_restore_tombstones += WT_STAT_READ(from, txn_rts_hs_restore_tombstones); + to->txn_rts_hs_restore_updates += WT_STAT_READ(from, txn_rts_hs_restore_updates); to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys); to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed); + to->txn_checkpoint_obsolete_applied += WT_STAT_READ(from, txn_checkpoint_obsolete_applied); to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict); } diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c index 26c750fb496..1694e57dbc3 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c @@ -34,8 +34,7 @@ __curtiered_open_cursors(WT_CURSOR_TIERED *curtiered) dhandle = NULL; tiered = curtiered->tiered; - if (tiered->ntiers == 0) - return (0); + WT_ASSERT(session, tiered->ntiers > 0); /* * If the key is pointing to memory that is pinned by a chunk cursor, take a copy before closing @@ -1017,21 +1016,14 @@ err: * documents avoids biasing towards small chunks. Then return the cursor on the chunk we have * picked. */ -static int +static void __curtiered_random_chunk(WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered, WT_CURSOR **cursor) { - u_int i, ntiers; - - /* - * If the tree is empty we cannot do a random lookup, so return a WT_NOTFOUND. - */ - if ((ntiers = curtiered->tiered->ntiers) == 0) - return (WT_NOTFOUND); + u_int i; /* TODO: make randomness respect tree size. */ - i = __wt_random(&session->rnd) % ntiers; + i = __wt_random(&session->rnd) % curtiered->tiered->ntiers; *cursor = curtiered->cursors[i]; - return (0); } /* @@ -1055,7 +1047,7 @@ __curtiered_next_random(WT_CURSOR *cursor) WT_ERR(__curtiered_enter(curtiered, false)); for (;;) { - WT_ERR(__curtiered_random_chunk(session, curtiered, &c)); + __curtiered_random_chunk(session, curtiered, &c); /* * This call to next_random on the chunk can potentially end in WT_NOTFOUND if the chunk we * picked is empty. We want to retry in that case. diff --git a/src/third_party/wiredtiger/src/tiered/tiered_schema.c b/src/third_party/wiredtiger/src/tiered/tiered_schema.c index dc153b31e43..6e7dd84c0e3 100644 --- a/src/third_party/wiredtiger/src/tiered/tiered_schema.c +++ b/src/third_party/wiredtiger/src/tiered/tiered_schema.c @@ -15,12 +15,16 @@ int __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config) { + WT_CONFIG cparser; + WT_CONFIG_ITEM ckey, cval, tierconf; WT_DECL_RET; + int ntiers; char *meta_value; const char *cfg[] = {WT_CONFIG_BASE(session, tiered_meta), config, NULL}; const char *metadata; metadata = NULL; + ntiers = 0; /* If it can be opened, it already exists. */ if ((ret = __wt_metadata_search(session, uri, &meta_value)) != WT_NOTFOUND) { @@ -30,12 +34,24 @@ __wt_tiered_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, co } WT_RET_NOTFOUND_OK(ret); + /* A tiered cursor must specify at least one underlying table */ + WT_RET(__wt_config_gets(session, cfg, "tiered.tiers", &tierconf)); + __wt_config_subinit(session, &cparser, &tierconf); + + while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) + ++ntiers; + WT_RET_NOTFOUND_OK(ret); + + if (ntiers == 0) + WT_RET_MSG(session, EINVAL, "tiered table must specify at least one tier"); + if (!F_ISSET(S2C(session), WT_CONN_READONLY)) { WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata)); WT_ERR(__wt_metadata_insert(session, uri, metadata)); } err: + __wt_free(session, meta_value); __wt_free(session, metadata); return (ret); } @@ -188,14 +204,14 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[]) /* Point to some items in the copy to save re-parsing. */ WT_RET(__wt_config_gets(session, tiered_cfg, "tiered.tiers", &tierconf)); - /* - * Count the number of tiers. - */ + /* Count the number of tiers. */ __wt_config_subinit(session, &cparser, &tierconf); while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) ++tiered->ntiers; WT_RET_NOTFOUND_OK(ret); + WT_ASSERT(session, tiered->ntiers > 0); + WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_calloc_def(session, tiered->ntiers, &tiered->tiers)); @@ -204,7 +220,7 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_config_next(&cparser, &ckey, &cval)); WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)ckey.len, ckey.str)); WT_ERR(__wt_session_get_dhandle(session, (const char *)buf->data, NULL, cfg, 0)); - __wt_atomic_addi32(&session->dhandle->session_inuse, 1); + (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1); /* Load in reverse order (based on LSM logic). */ tiered->tiers[(tiered->ntiers - 1) - i] = session->dhandle; WT_ERR(__wt_session_release_dhandle(session)); @@ -247,7 +263,7 @@ __wt_tiered_close(WT_SESSION_IMPL *session, WT_TIERED *tiered) __wt_free(session, tiered->value_format); if (tiered->tiers != NULL) { for (i = 0; i < tiered->ntiers; i++) - __wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1); + (void)__wt_atomic_subi32(&tiered->tiers[i]->session_inuse, 1); __wt_free(session, tiered->tiers); } diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index d9a15ed067c..887abaa503d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -1415,14 +1415,17 @@ __checkpoint_lock_dirty_tree( if (now > btree->clean_ckpt_timer) skip_ckpt = false; } - if (skip_ckpt) { + + /* Skip the clean btree until the btree has obsolete pages. */ + if (skip_ckpt && !F_ISSET(btree, WT_BTREE_OBSOLETE_PAGES)) { F_SET(btree, WT_BTREE_SKIP_CKPT); goto skip; } } - /* If we have to process this btree for any reason, reset the timer. */ + /* If we have to process this btree for any reason, reset the timer and obsolete pages flag. */ WT_BTREE_CLEAN_CKPT(session, btree, 0); + F_CLR(btree, WT_BTREE_OBSOLETE_PAGES); /* Get the list of checkpoints for this file. */ WT_ERR(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase)); @@ -1486,6 +1489,35 @@ skip: } /* + * __checkpoint_apply_obsolete -- + * Returns true if the checkpoint is obsolete. + */ +static bool +__checkpoint_apply_obsolete(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CKPT *ckpt) +{ + wt_timestamp_t stop_ts; + + stop_ts = WT_TS_MAX; + if (ckpt->size != 0) { + /* + * If the checkpoint has a valid stop timestamp, mark the btree as having obsolete pages. + * This flag is used to avoid skipping the btree until the obsolete check is performed on + * the checkpoints. + */ + if (ckpt->ta.newest_stop_ts != WT_TS_MAX) { + F_SET(btree, WT_BTREE_OBSOLETE_PAGES); + stop_ts = ckpt->ta.newest_stop_durable_ts; + } + if (__wt_txn_visible_all(session, ckpt->ta.newest_stop_txn, stop_ts)) { + WT_STAT_CONN_DATA_INCR(session, txn_checkpoint_obsolete_applied); + return (true); + } + } + + return (false); +} + +/* * __checkpoint_mark_skip -- * Figure out whether the checkpoint can be skipped for a tree. */ @@ -1523,9 +1555,17 @@ __checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) F_CLR(btree, WT_BTREE_SKIP_CKPT); if (!btree->modified && !force) { deleted = 0; - WT_CKPT_FOREACH (ckptbase, ckpt) + WT_CKPT_FOREACH (ckptbase, ckpt) { + /* + * Don't skip the objects that have obsolete pages to let them to be removed as part of + * checkpoint cleanup. + */ + if (__checkpoint_apply_obsolete(session, btree, ckpt)) + return (0); + if (F_ISSET(ckpt, WT_CKPT_DELETE)) ++deleted; + } /* * Complicated test: if the tree is clean and last two checkpoints have the same name diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index b30cf03be69..9ad6b7abd6d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -251,12 +251,22 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value)); type = (uint8_t)type_full; - if (type == WT_UPDATE_MODIFY) - WT_ERR(__wt_modify_apply_item( - session, S2BT(session)->value_format, &full_value, hs_value->data)); - else { - WT_ASSERT(session, type == WT_UPDATE_STANDARD); - WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size)); + + /* + * Do not include history store updates greater than on-disk data store version to construct + * a full update to restore. Comparing with timestamps here has no problem unlike in search + * flow where the timestamps may be reset during reconciliation. RTS detects an on-disk + * update is unstable based on the written proper timestamp, so comparing against it with + * history store shouldn't have any problem. + */ + if (hs_start_ts <= unpack->tw.start_ts) { + if (type == WT_UPDATE_MODIFY) + WT_ERR(__wt_modify_apply_item( + session, S2BT(session)->value_format, &full_value, hs_value->data)); + else { + WT_ASSERT(session, type == WT_UPDATE_STANDARD); + WT_ERR(__wt_buf_set(session, &full_value, hs_value->data, hs_value->size)); + } } /* @@ -280,9 +290,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW */ if (!replace && hs_stop_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), - "history store update valid with stop timestamp: %s and stable timestamp: %s", + "history store update valid with stop timestamp: %s, stable timestamp: %s and type: " + "%" PRIu8, __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), type); break; } @@ -290,22 +301,23 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW if (hs_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update valid with start timestamp: %s, durable timestamp: %s, stop " - "timestamp: %s and stable timestamp: %s", + "timestamp: %s, stable timestamp: %s and type: %" PRIu8, __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type); + WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts); valid_update_found = true; break; } __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "history store update aborted with start timestamp: %s, durable timestamp: %s, stop " - "timestamp: %s and stable timestamp: %s", + "timestamp: %s, stable timestamp: %s and type: %" PRIu8, __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); + __wt_timestamp_to_string(rollback_timestamp, ts_string[3]), type); /* * Start time point of the current record may be used as stop time point of the previous @@ -329,6 +341,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * list. Otherwise remove the key by adding a tombstone. */ if (valid_update_found) { + WT_ASSERT(session, cbt->upd_value->tw.start_ts < unpack->tw.start_ts); WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL)); upd->txnid = cbt->upd_value->tw.start_txn; @@ -336,7 +349,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW upd->start_ts = cbt->upd_value->tw.start_ts; __wt_verbose(session, WT_VERB_RECOVERY_RTS(session), "update restored from history store (txnid: %" PRIu64 - ", start_ts: %s, durable_ts: %s", + ", start_ts: %s and durable_ts: %s", upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); @@ -345,6 +358,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * the rollback to stable operation. */ F_SET(upd, WT_UPDATE_RESTORED_FROM_HS); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates); /* * We have a tombstone on the original update chain and it is behind the stable diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c index 13a3577745f..1c9cae21bbf 100644 --- a/src/third_party/wiredtiger/src/utilities/util_list.c +++ b/src/third_party/wiredtiger/src/utilities/util_list.c @@ -88,6 +88,12 @@ list_init_block(WT_SESSION *session, const char *key, WT_BLOCK *block) wt_api = session->connection->get_extension_api(session->connection); if ((ret = wt_api->metadata_search(wt_api, session, key, &config)) != 0) WT_ERR(util_err(session, ret, "%s: WT_EXTENSION_API.metadata_search", key)); + /* + * The config variable should be set and not NULL, but Coverity is convinced otherwise. This is + * an infrequent code path. Just add this extra conditional to make it happy. + */ + if (config == NULL) + goto err; if ((ret = wt_api->config_parser_open(wt_api, session, config, strlen(config), &parser)) != 0) WT_ERR(util_err(session, ret, "WT_EXTENSION_API.config_parser_open")); if ((ret = parser->get(parser, "allocation_size", &cval)) == 0) diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 13d91b793e3..f244df90452 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -31,6 +31,7 @@ static void config_backup_incr(void); static void config_backup_incr_granularity(void); +static void config_backup_incr_log_compatibility_check(void); static void config_backward_compatible(void); static void config_cache(void); static void config_checkpoint(void); @@ -267,12 +268,8 @@ config_backup_incr(void) * archival doesn't seem as useful as testing backup, let the backup configuration override. */ if (config_is_perm("backup.incremental")) { - if (g.c_backup_incr_flag == INCREMENTAL_LOG) { - if (g.c_logging_archive && config_is_perm("logging.archive")) - testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive"); - if (g.c_logging_archive) - config_single("logging.archive=0", false); - } + if (g.c_backup_incr_flag == INCREMENTAL_LOG) + config_backup_incr_log_compatibility_check(); if (g.c_backup_incr_flag == INCREMENTAL_BLOCK) config_backup_incr_granularity(); return; @@ -761,6 +758,23 @@ config_in_memory_reset(void) } /* + * config_backup_incr_compatibility_check -- + * Backup incremental log compatibility check. + */ +static void +config_backup_incr_log_compatibility_check(void) +{ + /* + * Incremental backup using log files is incompatible with logging archival. Disable logging + * archival if log incremental backup is set. + */ + if (g.c_logging_archive && config_is_perm("logging.archive")) + testutil_die(EINVAL, "backup.incremental=log is incompatible with logging.archive"); + if (g.c_logging_archive) + config_single("logging.archive=0", false); +} + +/* * config_lsm_reset -- * LSM configuration review. */ @@ -801,6 +815,7 @@ config_lsm_reset(void) case 2: /* 50% */ config_single("backup.incremental=log", false); + config_backup_incr_log_compatibility_check(); break; } } diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint08.py b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py new file mode 100755 index 00000000000..047b36dcd50 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_checkpoint08.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_checkpoint08.py +# Test that the btree checkpoint is not skipped if there are obsolete pages. + +import wiredtiger, wttest +from wiredtiger import stat +from wtdataset import SimpleDataSet + +def timestamp_str(t): + return '%x' % t + +class test_checkpoint08(wttest.WiredTigerTestCase): + conn_config = 'cache_size=50MB,log=(enabled),statistics=(all)' + session_config = 'isolation=snapshot' + + def get_stat(self, uri): + stat_uri = 'statistics:' + uri + stat_cursor = self.session.open_cursor(stat_uri) + val = stat_cursor[stat.dsrc.btree_clean_checkpoint_timer][2] + stat_cursor.close() + return val + + def test_checkpoint08(self): + self.uri1 = 'table:ckpt08.1' + self.file1 = 'file:ckpt08.1.wt' + self.uri2 = 'table:ckpt08.2' + self.file2 = 'file:ckpt08.2.wt' + self.hsfile = 'file:WiredTigerHS.wt' + self.session.create(self.uri1, 'key_format=i,value_format=i') + self.session.create(self.uri2, 'key_format=i,value_format=i') + + # Pin oldest and stable to timestamp 1. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1) + + ',stable_timestamp=' + timestamp_str(1)) + + # Setup: Insert some data and checkpoint it. Then modify only + # the data in the first table and checkpoint. Verify the clean skip + # timer is not set for the modified table and is set for the clean one. + c1 = self.session.open_cursor(self.uri1, None) + c2 = self.session.open_cursor(self.uri2, None) + + self.session.begin_transaction() + c1[1] = 1 + c2[1] = 1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) + + self.session.begin_transaction() + c1[1] = 10 + c2[1] = 10 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) + + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(3)) + self.session.checkpoint(None) + + # Modify the both tables and reverify. + self.session.begin_transaction() + c1[3] = 3 + c2[3] = 3 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(4)) + + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(4)) + self.session.checkpoint(None) + + val = self.get_stat(self.uri1) + self.assertEqual(val, 0) + val = self.get_stat(self.uri2) + self.assertEqual(val, 0) + hsval = self.get_stat(self.hsfile) + self.assertNotEqual(hsval, 0) + + # Modify the both tables and reverify when oldest timestamp moved. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(4)) + self.session.begin_transaction() + c1[4] = 4 + c2[4] = 4 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) + + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(5)) + self.session.checkpoint(None) + + val = self.get_stat(self.uri1) + self.assertEqual(val, 0) + val = self.get_stat(self.uri2) + self.assertEqual(val, 0) + hsval = self.get_stat(self.hsfile) + self.assertEqual(hsval, 0) + + stat_cursor = self.session.open_cursor('statistics:file:WiredTigerHS.wt', None, None) + obsolete_applied = stat_cursor[stat.dsrc.txn_checkpoint_obsolete_applied][2] + self.assertEqual(obsolete_applied, 1) + stat_cursor.close() + + c1.close() + c2.close() + self.session.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_hs05.py b/src/third_party/wiredtiger/test/suite/test_hs05.py index f2d93a40547..745d22d5480 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs05.py +++ b/src/third_party/wiredtiger/test/suite/test_hs05.py @@ -30,6 +30,7 @@ from helper import copy_wiredtiger_home import wiredtiger, wttest from wiredtiger import stat from wtdataset import SimpleDataSet +from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t @@ -44,6 +45,12 @@ class test_hs05(wttest.WiredTigerTestCase): conn_config += 'eviction_updates_target=100,eviction_updates_trigger=100' session_config = 'isolation=snapshot' stable = 1 + key_format_values = [ + ('column', dict(key_format='r')), + ('integer', dict(key_format='i')), + ('string', dict(key_format='S')) + ] + scenarios = make_scenarios(key_format_values) def get_stat(self, stat): stat_cursor = self.session.open_cursor('statistics:') @@ -71,7 +78,7 @@ class test_hs05(wttest.WiredTigerTestCase): # Create a small table. uri = "table:test_hs05" nrows = 100 - ds = SimpleDataSet(self, uri, nrows, key_format="S", value_format='u') + ds = SimpleDataSet(self, uri, nrows, key_format=self.key_format, value_format='u') ds.populate() bigvalue = b"aaaaa" * 100 diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py index 967f1519807..cc38097da7f 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs06.py +++ b/src/third_party/wiredtiger/test/suite/test_hs06.py @@ -47,8 +47,7 @@ class test_hs06(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB,statistics=(fast)' session_config = 'isolation=snapshot' key_format_values = [ - # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061). - # ('column', dict(key_format='r')), + ('column', dict(key_format='r')), ('integer', dict(key_format='i')), ('string', dict(key_format='S')) ] @@ -210,6 +209,11 @@ class test_hs06(wttest.WiredTigerTestCase): self.session.rollback_transaction() def test_hs_prepare_reads(self): + # Prepare reads currently not supported with columnar store. + # Remove this once prepare reads is supported in WT-6061. + if self.key_format == 'r': + return + # Create a small table. uri = "table:test_hs06" create_params = 'key_format={},value_format=S'.format(self.key_format) diff --git a/src/third_party/wiredtiger/test/suite/test_hs07.py b/src/third_party/wiredtiger/test/suite/test_hs07.py index 14ad15c3281..37b451f3b79 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs07.py +++ b/src/third_party/wiredtiger/test/suite/test_hs07.py @@ -30,6 +30,7 @@ import time from helper import copy_wiredtiger_home import wiredtiger, wttest from wtdataset import SimpleDataSet +from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t @@ -42,6 +43,12 @@ class test_hs07(wttest.WiredTigerTestCase): 'eviction_updates_target=80,log=(enabled)') session_config = 'isolation=snapshot' + key_format_values = ( + ('column', dict(key_format='r')), + ('int', dict(key_format='i')) + ) + scenarios = make_scenarios(key_format_values) + def large_updates(self, uri, value, ds, nrows, commit_ts): # Update a large number of records, we'll hang if the history store table isn't working. session = self.session @@ -70,11 +77,11 @@ class test_hs07(wttest.WiredTigerTestCase): # behavior. uri = "table:las07_main" ds = SimpleDataSet( - self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)') + self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)') ds.populate() uri2 = "table:las07_extra" - ds2 = SimpleDataSet(self, uri2, 0, key_format="i", value_format="S") + ds2 = SimpleDataSet(self, uri2, 0, key_format=self.key_format, value_format="S") ds2.populate() # Pin oldest and stable to timestamp 1. diff --git a/src/third_party/wiredtiger/test/suite/test_hs08.py b/src/third_party/wiredtiger/test/suite/test_hs08.py index 9899842d529..9388121cdad 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs08.py +++ b/src/third_party/wiredtiger/test/suite/test_hs08.py @@ -38,6 +38,11 @@ def timestamp_str(t): class test_hs08(wttest.WiredTigerTestCase): conn_config = 'cache_size=100MB,statistics=(all)' session_config = 'isolation=snapshot' + key_format_values = [ + ('column', dict(key_format='r')), + ('integer', dict(key_format='i')), + ] + scenarios = make_scenarios(key_format_values) def get_stat(self, stat): stat_cursor = self.session.open_cursor('statistics:') @@ -47,7 +52,7 @@ class test_hs08(wttest.WiredTigerTestCase): def test_modify_insert_to_hs(self): uri = "table:test_hs08" - create_params = 'value_format=S,key_format=i' + create_params = 'value_format=S,key_format={}'.format(self.key_format) value1 = 'a' * 1000 self.session.create(uri, create_params) diff --git a/src/third_party/wiredtiger/test/suite/test_hs09.py b/src/third_party/wiredtiger/test/suite/test_hs09.py index ac34e3f7b17..4bed2791808 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs09.py +++ b/src/third_party/wiredtiger/test/suite/test_hs09.py @@ -38,12 +38,11 @@ def timestamp_str(t): # second newest committed version to history store. class test_hs09(wttest.WiredTigerTestCase): # Force a small cache. - conn_config = 'cache_size=50MB,statistics=(fast)' + conn_config = 'cache_size=20MB,statistics=(fast)' session_config = 'isolation=snapshot' uri = "table:test_hs09" key_format_values = [ - # The commented columnar tests needs to be enabled once columnar page instantiated is fixed in (WT-6061). - #('column', dict(key_format='r')), + ('column', dict(key_format='r')), ('integer', dict(key_format='i')), ('string', dict(key_format='S')), ] @@ -54,20 +53,31 @@ class test_hs09(wttest.WiredTigerTestCase): return str(i) return i - def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts, expected_hs_stop_ts): + def check_ckpt_hs(self, expected_data_value, expected_hs_value, expected_hs_start_ts, + expected_hs_stop_ts, expect_prepared_in_datastore = False): session = self.conn.open_session(self.session_config) session.checkpoint() - # Check the data file value + # Check the data file value. cursor = session.open_cursor(self.uri, None, 'checkpoint=WiredTigerCheckpoint') + + # If we are expecting prepapred updates in the datastore, start an explicit transaction with + # ignore prepare flag to avoid getting a WT_PREPARE_CONFLICT error. + if expect_prepared_in_datastore: + session.begin_transaction("ignore_prepare=true") + for _, value in cursor: self.assertEqual(value, expected_data_value) + + if expect_prepared_in_datastore: + session.rollback_transaction() + cursor.close() - # Check the history store file value + # Check the history store file value. cursor = session.open_cursor("file:WiredTigerHS.wt", None, 'checkpoint=WiredTigerCheckpoint') for _, _, hs_start_ts, _, hs_stop_ts, _, type, value in cursor: - # No WT_UPDATE_TOMBSTONE in the history store + # No WT_UPDATE_TOMBSTONE in the history store. self.assertNotEqual(type, 5) - # No WT_UPDATE_BIRTHMARK in the history store + # No WT_UPDATE_BIRTHMARK in the history store. self.assertNotEqual(type, 1) # WT_UPDATE_STANDARD if (type == 4): @@ -100,7 +110,7 @@ class test_hs09(wttest.WiredTigerTestCase): cursor[self.create_key(i)] = value2 self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) - # Uncommitted changes + # Uncommitted changes. self.session.begin_transaction() for i in range(1, 11): cursor[self.create_key(i)] = value3 @@ -108,6 +118,11 @@ class test_hs09(wttest.WiredTigerTestCase): self.check_ckpt_hs(value2, value1, 2, 3) def test_prepared_updates_not_written_to_hs(self): + # Prepare reads currently not supported with columnar store. + # Remove this once prepare reads is supported in WT-6061. + if self.key_format == 'r': + return + # Create a small table. create_params = 'key_format={},value_format=S'.format(self.key_format) self.session.create(self.uri, create_params) @@ -130,13 +145,15 @@ class test_hs09(wttest.WiredTigerTestCase): cursor[self.create_key(i)] = value2 self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) - # Prepare some updates + # Prepare some updates. self.session.begin_transaction() for i in range(1, 11): cursor[self.create_key(i)] = value3 self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(4)) - self.check_ckpt_hs(value2, value1, 2, 3) + # We can expect prepared values to show up in data store if the eviction runs between now + # and the time when we open a cursor on the user table. + self.check_ckpt_hs(value2, value1, 2, 3, True) self.session.commit_transaction('commit_timestamp=' + timestamp_str(5) + ',durable_timestamp=' + timestamp_str(5)) diff --git a/src/third_party/wiredtiger/test/suite/test_hs10.py b/src/third_party/wiredtiger/test/suite/test_hs10.py index f41f18bb999..ef92195daf9 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs10.py +++ b/src/third_party/wiredtiger/test/suite/test_hs10.py @@ -38,6 +38,11 @@ def timestamp_str(t): class test_hs10(wttest.WiredTigerTestCase): conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)' session_config = 'isolation=snapshot' + key_format_values = ( + ('column', dict(key_format='r')), + ('int', dict(key_format='i')) + ) + scenarios = make_scenarios(key_format_values) def get_stat(self, stat): stat_cursor = self.session.open_cursor('statistics:') @@ -48,7 +53,7 @@ class test_hs10(wttest.WiredTigerTestCase): def test_modify_insert_to_hs(self): uri = "table:test_hs10" uri2 = "table:test_hs10_otherdata" - create_params = 'value_format=S,key_format=i' + create_params = 'value_format=S,key_format={}'.format(self.key_format) value1 = 'a' * 1000 value2 = 'b' * 1000 self.session.create(uri, create_params) diff --git a/src/third_party/wiredtiger/test/suite/test_hs12.py b/src/third_party/wiredtiger/test/suite/test_hs12.py index ed332dc3349..a4e9199323c 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs12.py +++ b/src/third_party/wiredtiger/test/suite/test_hs12.py @@ -27,6 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. import wiredtiger, wttest, time +from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t @@ -36,10 +37,17 @@ def timestamp_str(t): class test_hs12(wttest.WiredTigerTestCase): conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)' session_config = 'isolation=snapshot' + key_format_values = [ + # The commented columnar tests needs to be enabled once columnar + # Modify type update is fixed in (WT-5550). + # ('column', dict(key_format='r')), + ('integer', dict(key_format='i')), + ] + scenarios = make_scenarios(key_format_values) def test_modify_append_to_string(self): uri = "table:test_reverse_modify01_notimestamp" - create_params = 'value_format=S,key_format=i' + create_params = 'value_format=S,key_format={}'.format(self.key_format) value1 = 'abcedfghijklmnopqrstuvwxyz' * 5 value2 = 'b' * 100 valuebig = 'e' * 1000 diff --git a/src/third_party/wiredtiger/test/suite/test_hs13.py b/src/third_party/wiredtiger/test/suite/test_hs13.py index 7ada5d7a0b6..27c6c6d4ade 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs13.py +++ b/src/third_party/wiredtiger/test/suite/test_hs13.py @@ -27,6 +27,8 @@ # OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest
+from wtscenario import make_scenarios
+
def timestamp_str(t):
return '%x' % t
@@ -35,10 +37,16 @@ def timestamp_str(t): class test_hs13(wttest.WiredTigerTestCase):
conn_config = 'cache_size=2MB,statistics=(all),eviction=(threads_max=1)'
session_config = 'isolation=snapshot'
+ key_format_values = [
+ # The commented columnar tests needs to be enabled once columnar Modify type update is fixed in (WT-5550).
+ # ('column', dict(key_format='r')),
+ ('integer', dict(key_format='i'))
+ ]
+ scenarios = make_scenarios(key_format_values)
def test_reverse_modifies_constructed_after_eviction(self):
uri = "table:test_hs13"
- create_params = 'value_format=S,key_format=i'
+ create_params = 'value_format=S,key_format={}'.format(self.key_format)
value1 = 'a' * 10000
value2 = 'b' * 10000
value3 = 'e' * 10000
diff --git a/src/third_party/wiredtiger/test/suite/test_hs14.py b/src/third_party/wiredtiger/test/suite/test_hs14.py index ebd5f471f2b..4e8f5148da1 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs14.py +++ b/src/third_party/wiredtiger/test/suite/test_hs14.py @@ -27,6 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. import time, wiredtiger, wttest +from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t @@ -37,10 +38,20 @@ def timestamp_str(t): class test_hs14(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB' session_config = 'isolation=snapshot' + key_format_values = [ + ('column', dict(key_format='r')), + ('string', dict(key_format='S')) + ] + scenarios = make_scenarios(key_format_values) + + def create_key(self, i): + if self.key_format == 'S': + return str(i) + return i def test_hs14(self): uri = 'table:test_hs14' - self.session.create(uri, 'key_format=S,value_format=S') + self.session.create(uri, 'key_format={},value_format=S'.format(self.key_format)) self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) cursor = self.session.open_cursor(uri) @@ -52,22 +63,22 @@ class test_hs14(wttest.WiredTigerTestCase): for i in range(1, 10000): self.session.begin_transaction() - cursor[str(i)] = value1 + cursor[self.create_key(i)] = value1 self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) self.session.begin_transaction() - cursor[str(i)] = value2 + cursor[self.create_key(i)] = value2 self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) self.session.begin_transaction() - cursor[str(i)] = value3 + cursor[self.create_key(i)] = value3 self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) self.session.begin_transaction() - cursor[str(i)] = value4 + cursor[self.create_key(i)] = value4 self.session.commit_transaction('commit_timestamp=' + timestamp_str(4)) start = time.time() self.session.begin_transaction('read_timestamp=' + timestamp_str(3)) for i in range(1, 10000): - self.assertEqual(cursor[str(i)], value3) + self.assertEqual(cursor[self.create_key(i)], value3) self.session.rollback_transaction() end = time.time() @@ -76,17 +87,17 @@ class test_hs14(wttest.WiredTigerTestCase): for i in range(1, 10000): self.session.begin_transaction() - cursor.set_key(str(i)) + cursor.set_key(self.create_key(i)) cursor.remove() self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) self.session.begin_transaction() - cursor[str(i)] = value5 + cursor[self.create_key(i)] = value5 self.session.commit_transaction('commit_timestamp=' + timestamp_str(10)) start = time.time() self.session.begin_transaction('read_timestamp=' + timestamp_str(9)) for i in range(1, 10000): - cursor.set_key(str(i)) + cursor.set_key(self.create_key(i)) self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) self.session.rollback_transaction() end = time.time() diff --git a/src/third_party/wiredtiger/test/suite/test_hs20.py b/src/third_party/wiredtiger/test/suite/test_hs20.py new file mode 100644 index 00000000000..2a9f7d02c93 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_hs20.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import time, wiredtiger, wttest + +# test_hs20.py +# Ensure we never reconstruct a reverse modify update in the history store based on the onpage overflow value +def timestamp_str(t): + return '%x' % t + +class test_hs20(wttest.WiredTigerTestCase): + conn_config = 'cache_size=50MB,eviction=(threads_max=1)' + session_config = 'isolation=snapshot' + + def test_hs20(self): + uri = 'table:test_hs20' + # Set a very small maximum leaf value to trigger writing overflow values + self.session.create(uri, 'key_format=S,value_format=S,leaf_value_max=10B') + cursor = self.session.open_cursor(uri) + self.conn.set_timestamp( + 'oldest_timestamp=' + timestamp_str(1) + ',stable_timestamp=' + timestamp_str(1)) + + value1 = 'a' * 500 + value2 = 'b' * 50 + + # Insert a value that is larger than the maximum leaf value. + for i in range(0, 10): + self.session.begin_transaction() + cursor[str(i)] = value1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(2)) + + # Do 2 modifies. + for i in range(0, 10): + self.session.begin_transaction() + cursor.set_key(str(i)) + mods = [wiredtiger.Modify('B', 500, 1)] + self.assertEqual(cursor.modify(mods), 0) + self.session.commit_transaction('commit_timestamp=' + timestamp_str(3)) + + for i in range(0, 10): + self.session.begin_transaction() + cursor.set_key(str(i)) + mods = [wiredtiger.Modify('C', 501, 1)] + self.assertEqual(cursor.modify(mods), 0) + self.session.commit_transaction('commit_timestamp=' + timestamp_str(4)) + + # Insert more data to trigger eviction. + for i in range(10, 100000): + self.session.begin_transaction() + cursor[str(i)] = value2 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) + + # Update the overflow values. + for i in range(0, 10): + self.session.begin_transaction() + cursor[str(i)] = value2 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) + + # Do a checkpoint to move the overflow values to the history store but keep the current in memory disk image. + self.session.checkpoint() + + # Search the first modifies. + for i in range(0, 10): + self.session.begin_transaction('read_timestamp=' + timestamp_str(3)) + self.assertEqual(cursor[str(i)], value1 + "B") + self.session.rollback_transaction() diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py new file mode 100755 index 00000000000..ea88a33a066 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable14.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, threading, time +from helper import copy_wiredtiger_home +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat, wiredtiger_strerror, WiredTigerError, WT_ROLLBACK +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios +from wtthread import checkpoint_thread, op_thread +from time import sleep + +def timestamp_str(t): + return '%x' % t + +def mod_val(value, char, location, nbytes=1): + return value[0:location] + char + value[location+nbytes:] + +def retry_rollback(self, name, txn_session, code): + retry_limit = 100 + retries = 0 + completed = False + saved_exception = None + while not completed and retries < retry_limit: + if retries != 0: + self.pr("Retrying operation for " + name) + if txn_session: + txn_session.rollback_transaction() + sleep(0.1) + if txn_session: + txn_session.begin_transaction('isolation=snapshot') + self.pr("Began new transaction for " + name) + try: + code() + completed = True + except WiredTigerError as e: + rollback_str = wiredtiger_strerror(WT_ROLLBACK) + if rollback_str not in str(e): + raise(e) + retries += 1 + saved_exception = e + if not completed and saved_exception: + raise(saved_exception) + +# test_rollback_to_stable14.py +# Test the rollback to stable operation uses proper base update while restoring modifies from history store. +class test_rollback_to_stable14(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)) + ] + + scenarios = make_scenarios(prepare_values) + + def conn_config(self): + config = 'cache_size=8MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),timing_stress_for_test=[history_store_checkpoint_delay]' + return config + + def simulate_crash_restart(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # + # close the original connection and open to new directory + # NOTE: This really cannot test the difference between the + # write-no-sync (off) version of log_flush and the sync + # version since we're not crashing the system itself. + # + self.close_conn() + self.conn = self.setUpConnectionOpen(newdir) + self.session = self.setUpSessionOpen(self.conn) + + def test_rollback_to_stable(self): + nrows = 1500 + + # Create a table without logging. + self.pr("create/populate table") + uri = "table:rollback_to_stable14" + ds = SimpleDataSet( + self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value_a = "aaaaa" * 100 + + value_modQ = mod_val(value_a, 'Q', 0) + value_modR = mod_val(value_modQ, 'R', 1) + value_modS = mod_val(value_modR, 'S', 2) + value_modT = mod_val(value_modS, 'T', 3) + + # Perform a combination of modifies and updates. + self.pr("large updates and modifies") + self.large_updates(uri, value_a, ds, nrows, 20) + self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30) + self.large_modifies(uri, 'R', ds, 1, 1, nrows, 40) + self.large_modifies(uri, 'S', ds, 2, 1, nrows, 50) + self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60) + + # Verify data is visible and correct. + self.check(value_a, uri, nrows, 20) + self.check(value_modQ, uri, nrows, 30) + self.check(value_modR, uri, nrows, 40) + self.check(value_modS, uri, nrows, 50) + self.check(value_modT, uri, nrows, 60) + + # Pin stable to timestamp 60 if prepare otherwise 50. + if self.prepare: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(60)) + else: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50)) + + # Create a checkpoint thread + done = threading.Event() + ckpt = checkpoint_thread(self.conn, done) + try: + self.pr("start checkpoint") + ckpt.start() + + # Perform several modifies in parallel with checkpoint. + # Rollbacks may occur when checkpoint is running, so retry as needed. + self.pr("modifies") + retry_rollback(self, 'modify ds1, W', None, + lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70)) + retry_rollback(self, 'modify ds1, X', None, + lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80)) + retry_rollback(self, 'modify ds1, Y', None, + lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90)) + retry_rollback(self, 'modify ds1, Z', None, + lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100)) + finally: + done.set() + ckpt.join() + + # Simulate a server crash and restart. + self.pr("restart") + self.simulate_crash_restart(".", "RESTART") + self.pr("restart complete") + + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rts][2] + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2] + hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertEqual(calls, 0) + self.assertEqual(keys_removed, 0) + self.assertEqual(hs_restore_updates, nrows) + self.assertEqual(keys_restored, 0) + self.assertEqual(upd_aborted, 0) + self.assertGreater(pages_visited, 0) + self.assertGreaterEqual(hs_removed, nrows) + self.assertGreaterEqual(hs_sweep, 0) + + # Check that the correct data is seen at and after the stable timestamp. + self.check(value_a, uri, nrows, 20) + self.check(value_modQ, uri, nrows, 30) + self.check(value_modR, uri, nrows, 40) + self.check(value_modS, uri, nrows, 50) + + # The test may output the following message in eviction under cache pressure. Ignore that. + self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction") + + def test_rollback_to_stable_same_ts(self): + nrows = 1500 + + # Create a table without logging. + self.pr("create/populate table") + uri = "table:rollback_to_stable14" + ds = SimpleDataSet( + self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value_a = "aaaaa" * 100 + + value_modQ = mod_val(value_a, 'Q', 0) + value_modR = mod_val(value_modQ, 'R', 1) + value_modS = mod_val(value_modR, 'S', 2) + value_modT = mod_val(value_modS, 'T', 3) + + # Perform a combination of modifies and updates. + self.pr("large updates and modifies") + self.large_updates(uri, value_a, ds, nrows, 20) + self.large_modifies(uri, 'Q', ds, 0, 1, nrows, 30) + # prepare cannot use same timestamp always, so use a different timestamps that are aborted. + if self.prepare: + self.large_modifies(uri, 'R', ds, 1, 1, nrows, 51) + self.large_modifies(uri, 'S', ds, 2, 1, nrows, 55) + self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60) + else: + self.large_modifies(uri, 'R', ds, 1, 1, nrows, 60) + self.large_modifies(uri, 'S', ds, 2, 1, nrows, 60) + self.large_modifies(uri, 'T', ds, 3, 1, nrows, 60) + + # Verify data is visible and correct. + self.check(value_a, uri, nrows, 20) + self.check(value_modQ, uri, nrows, 30) + self.check(value_modT, uri, nrows, 60) + + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50)) + + # Create a checkpoint thread + done = threading.Event() + ckpt = checkpoint_thread(self.conn, done) + try: + self.pr("start checkpoint") + ckpt.start() + + # Perform several modifies in parallel with checkpoint. + # Rollbacks may occur when checkpoint is running, so retry as needed. + self.pr("modifies") + retry_rollback(self, 'modify ds1, W', None, + lambda: self.large_modifies(uri, 'W', ds, 4, 1, nrows, 70)) + retry_rollback(self, 'modify ds1, X', None, + lambda: self.large_modifies(uri, 'X', ds, 5, 1, nrows, 80)) + retry_rollback(self, 'modify ds1, Y', None, + lambda: self.large_modifies(uri, 'Y', ds, 6, 1, nrows, 90)) + retry_rollback(self, 'modify ds1, Z', None, + lambda: self.large_modifies(uri, 'Z', ds, 7, 1, nrows, 100)) + finally: + done.set() + ckpt.join() + + # Simulate a server crash and restart. + self.pr("restart") + self.simulate_crash_restart(".", "RESTART") + self.pr("restart complete") + + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rts][2] + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + hs_restore_updates = stat_cursor[stat.conn.txn_rts_hs_restore_updates][2] + hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertEqual(calls, 0) + self.assertEqual(keys_removed, 0) + self.assertEqual(hs_restore_updates, nrows) + self.assertEqual(keys_restored, 0) + self.assertEqual(upd_aborted, 0) + self.assertGreater(pages_visited, 0) + self.assertGreaterEqual(hs_removed, nrows * 3) + self.assertGreaterEqual(hs_sweep, 0) + + # Check that the correct data is seen at and after the stable timestamp. + self.check(value_a, uri, nrows, 20) + self.check(value_modQ, uri, nrows, 30) + + # The test may output the following message in eviction under cache pressure. Ignore that. + self.ignoreStdoutPatternIfExists("oldest pinned transaction ID rolled back for eviction") + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_tiered01.py b/src/third_party/wiredtiger/test/suite/test_tiered01.py index 2a41c3ff7ef..9a7066fd708 100644 --- a/src/third_party/wiredtiger/test/suite/test_tiered01.py +++ b/src/third_party/wiredtiger/test/suite/test_tiered01.py @@ -71,5 +71,12 @@ class test_tiered01(wttest.WiredTigerTestCase): # self.session.drop(self.uri) + # It is an error to configure a tiered table with no tiers + def test_no_tiers(self): + msg = '/tiered table must specify at least one tier/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.create(self.uri, 'type=tiered,key_format=S,tiered=(tiers=())'), + msg) + if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_tiered02.py b/src/third_party/wiredtiger/test/suite/test_tiered02.py new file mode 100644 index 00000000000..17eb3073c39 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_tiered02.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2021 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wtscenario, wttest +from wtdataset import SimpleDataSet + +# test_tiered02.py +# Test block-log-structured tree configuration options. +class test_tiered02(wttest.WiredTigerTestCase): + K = 1024 + M = 1024 * K + G = 1024 * M + uri = "file:test_tiered02" + + # Occasionally add a lot of records, so that merges (and bloom) happen. + record_count_scenarios = wtscenario.quick_scenarios( + 'nrecs', [10, 10000], [0.9, 0.1]) + + scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500) + + # Test drop of an object. + def test_tiered(self): + args = 'key_format=S,block_allocation=log-structured' + self.verbose(3, + 'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs)) + #ds = SimpleDataSet(self, self.uri, self.nrecs, config=args) + ds = SimpleDataSet(self, self.uri, 10, config=args) + ds.populate() + self.session.checkpoint() + ds = SimpleDataSet(self, self.uri, 10000, config=args) + ds.populate() + + self.reopen_conn() + ds = SimpleDataSet(self, self.uri, 1000, config=args) + ds.populate() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_tiered03.py b/src/third_party/wiredtiger/test/suite/test_tiered03.py new file mode 100644 index 00000000000..624387c21a3 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_tiered03.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2021 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os, re +import wiredtiger, wtscenario, wttest +from wtdataset import SimpleDataSet + +# test_tiered03.py +# Test block-log-structured tree configuration options. +class test_tiered03(wttest.WiredTigerTestCase): + K = 1024 + M = 1024 * K + G = 1024 * M + uri = 'file:test_tiered03' + + # Occasionally add a lot of records, so that merges (and bloom) happen. + record_count_scenarios = wtscenario.quick_scenarios( + 'nrecs', [10, 10000], [0.9, 0.1]) + + scenarios = wtscenario.make_scenarios(record_count_scenarios, prune=100, prunelong=500) + + # Test sharing data between a primary and a secondary + def test_sharing(self): + args = 'block_allocation=log-structured' + self.verbose(3, + 'Test log-structured allocation with config: ' + args + ' count: ' + str(self.nrecs)) + ds = SimpleDataSet(self, self.uri, 10, config=args) + ds.populate() + ds.check() + self.session.checkpoint() + ds.check() + + # Create a secondary database + dir2 = os.path.join(self.home, 'SECONDARY') + os.mkdir(dir2) + conn2 = self.setUpConnectionOpen(dir2) + session2 = conn2.open_session() + + # Reference the tree from the secondary: + metac = self.session.open_cursor('metadata:') + metac2 = session2.open_cursor('metadata:', None, 'readonly=0') + uri2 = self.uri[:5] + '../' + self.uri[5:] + metac2[uri2] = metac[self.uri] + ",readonly=1" + + cursor2 = session2.open_cursor(uri2) + ds.check_cursor(cursor2) + cursor2.close() + + newds = SimpleDataSet(self, self.uri, 10000, config=args) + newds.populate() + newds.check() + self.session.checkpoint() + newds.check() + + # Check we can still read from the last checkpoint + cursor2 = session2.open_cursor(uri2) + ds.check_cursor(cursor2) + cursor2.close() + + # Bump to new checkpoint + origmeta = metac[self.uri] + checkpoint = re.search(r',checkpoint=\(.+?\)\)', origmeta).group(0)[1:] + self.pr('Orig checkpoint: ' + checkpoint) + session2.alter(uri2, checkpoint) + self.pr('New metadata on secondaery: ' + metac2[uri2]) + + # Check that we can see the new data + cursor2 = session2.open_cursor(uri2) + newds.check_cursor(cursor2) + +if __name__ == '__main__': + wttest.run() |