diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-08-21 09:26:14 +1000 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-08-21 09:26:14 +1000 |
commit | 0cfe4dfc2cf371f9e8196cb79414c3432b95b5af (patch) | |
tree | 3ea1dfc60609b4b6c424144f02af5e8045d1fe40 | |
parent | 1cbfc673303260c725ef428eb0f2b6372feb5ec1 (diff) | |
download | mongo-0cfe4dfc2cf371f9e8196cb79414c3432b95b5af.tar.gz |
Import wiredtiger: b055251678e6b4fcc74a1f651432aadbfeecc0e4 from branch mongodb-3.6
ref: 698847557c..b055251678
for: 3.5.12
WT-3328 Enhance docs around when schema operations can get EBUSY
WT-3358 LSM will hang if the manager fails to start
WT-3365 Understand how timestamps interact with LSM chunk switching
WT-3399 Add new checkpoint blocking test case to automated testing
WT-3417 Drain transactions during upgrade/downgrade.
WT-3441 test_timestamp01 doesn't account for a large WT_TIMESTAMP_SIZE
WT-3450 Add verbose option that tracks timestamp state and information
WT-3452 Enhance existing recovery test to exercise timestamp API
WT-3455 Enhance eviction to be aware of stable timestamp
WT-3459 Test WiredTiger with clock shifting
WT-3460 Add support for rollback_to_stable to column store
WT-3465 Optimize performance when timestamp size is 8 bytes
WT-3483 WT_SESSION::checkpoint returning WT_ROLLBACK
WT-3492 ex_all.c not calling transaction_ops
WT-3493 wt_verbose_dump_txn should display timestamp information
WT-3497 Improve logging message when hitting the WT session limits
WT-3498 Incorrect data read after caching overflow items
WT-3499 Checkpoint can miss not yet committed item
WT-3500 New timestamp-abort test is too chatty
WT-3502 Only keep 10 delta updates between full copies
WT-3503 Coverity 1379333: unchecked return value, full-build Friday
WT-3508 timestamp-abort bug in verification phase
WT-3509 __wt_illegal_value doesn't always provide a failure location
WT-3514 WT_SESSION.checkpoint: read timestamp 6373c older than oldest timestamp
WT-3517 WT_SESSION::reset doesn't need to call out EBUSY specially
WT-3521 Unstable updates should not be written by lookaside eviction
64 files changed, 2567 insertions, 901 deletions
diff --git a/src/third_party/wiredtiger/.gitignore b/src/third_party/wiredtiger/.gitignore index 204cd421fd1..e81c037a1ac 100644 --- a/src/third_party/wiredtiger/.gitignore +++ b/src/third_party/wiredtiger/.gitignore @@ -134,6 +134,7 @@ _wiredtiger.pyd **/test/packing/packing-test **/test/readonly/t **/test/recovery/random-abort +**/test/recovery/timestamp-abort **/test/recovery/truncated-log **/test/salvage/t **/test/syscall/test_wt2336_base diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index ee2f14b980b..37f9baedc70 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -402,7 +402,9 @@ connection_runtime_config = [ min='0', max='100000'), ]), Config('compatibility', '', r''' - set compatibility version of database''', + set compatibility version of database. Changing the compatibility + version requires that there are no active operations for the duration + of the call.''', type='category', subconfig=[ Config('release', '', r''' compatibility release version string'''), @@ -560,6 +562,7 @@ connection_runtime_config = [ 'split', 'temporary', 'thread_group', + 'timestamp', 'transaction', 'verify', 'version', @@ -1146,9 +1149,6 @@ methods = { Config('name', '', r''' if set, specify a name for the checkpoint (note that checkpoints including LSM trees may not be named)'''), - Config('read_timestamp', '', r''' - if set, create the checkpoint as of the specified timestamp''', - undoc=True), Config('target', '', r''' if non-empty, checkpoint the list of objects''', type='list'), Config('use_timestamp', 'true', r''' diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 33ede795c69..9755e24f3c7 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -72,6 +72,7 @@ src/conn/conn_dhandle.c src/conn/conn_handle.c src/conn/conn_log.c src/conn/conn_open.c 
+src/conn/conn_reconfig.c src/conn/conn_stat.c src/conn/conn_sweep.c src/cursor/cur_backup.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 05ffb8851a2..8c0448b27c1 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -87,6 +87,7 @@ flags = { 'VERB_SPLIT', 'VERB_TEMPORARY', 'VERB_THREAD_GROUP', + 'VERB_TIMESTAMP', 'VERB_TRANSACTION', 'VERB_VERIFY', 'VERB_VERSION', diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all index be33657e640..1809cef3aa5 100755 --- a/src/third_party/wiredtiger/dist/s_all +++ b/src/third_party/wiredtiger/dist/s_all @@ -88,8 +88,8 @@ COMMANDS=" 2>&1 ./s_docs > ${t_pfx}s_docs 2>&1 ./s_export > ${t_pfx}s_export 2>&1 ./s_funcs > ${t_pfx}s_funcs +2>&1 ./s_function > ${t_pfx}s_function 2>&1 ./s_getopt > ${t_pfx}s_getopt -2>&1 ./s_label > ${t_pfx}s_label 2>&1 ./s_lang > ${t_pfx}s_lang 2>&1 ./s_longlines > ${t_pfx}s_longlines 2>&1 ./s_python > ${t_pfx}s_python diff --git a/src/third_party/wiredtiger/dist/s_copyright.list b/src/third_party/wiredtiger/dist/s_copyright.list index 2ac63bcb159..71ffa446eed 100644 --- a/src/third_party/wiredtiger/dist/s_copyright.list +++ b/src/third_party/wiredtiger/dist/s_copyright.list @@ -12,7 +12,7 @@ skip dist/flags.py skip dist/java_doc.py skip dist/log.py skip dist/log_data.py -skip dist/s_label_loop.py +skip dist/s_function_loop.py skip dist/stat.py skip dist/stat_data.py skip dist/style.py diff --git a/src/third_party/wiredtiger/dist/s_label b/src/third_party/wiredtiger/dist/s_function index c7b63d9d5b3..3259e215d0c 100755 --- a/src/third_party/wiredtiger/dist/s_label +++ b/src/third_party/wiredtiger/dist/s_function @@ -1,6 +1,6 @@ #! /bin/sh -# Check WiredTiger error/return macros. +# Check various WiredTiger function behaviors. t=__wt.$$ trap 'rm -f $t' 0 1 2 3 13 15 @@ -36,7 +36,7 @@ done # Jumps before returns have already been detected above. 
for f in `find bench examples ext src test -name '*.[ci]'`; do file_parse $f | sed "s=^=$f:=" -done | python dist/s_label_loop.py | +done | python dist/s_function_loop.py | egrep '\{@[^@]*(WT_ILLEGAL_VALUE|WT_RET[_A-Z]*)\([^@]*(WT_ERR[_A-Z]*|WT_ILLEGAL_VALUE_ERR)\(.*err:' | sed -e 's/^\([^:]*\): *\([^:]*\):.*/\1:\2: mix of returns and jump to the error label within a loop/' @@ -80,4 +80,37 @@ for f in `find bench examples ext src test -name '*.[ci]'`; do done +# API_END with a return +for f in `find bench examples ext src test -name '*.[ci]'`; do + file_parse $f | + egrep '[^A-Z_]API_END.*return' | + sed 's/:.*//' > $t + test -s $t && { + echo "$f: API_END followed by return." + sed 's/^/function @ line:/' < $t + } +done + +# S2C with a local WT_CONNECTION_IMPL variable. +for f in `find bench examples ext src test -name '*.[ci]'`; do + file_parse $f | + egrep 'conn = S2C.*S2C' | + sed 's/:.*//' > $t + test -s $t && { + echo "$f: S2C with a local WT_CONNECTION_IMPL variable." + sed 's/^/function @ line:/' < $t + } +done + +# S2B with a local WT_BTREE variable. +for f in `find bench examples ext src test -name '*.[ci]'`; do + file_parse $f | + egrep 'btree = S2B.*S2B' | + sed 's/:.*//' > $t + test -s $t && { + echo "$f: S2B with a local WT_BTREE variable." 
+ sed 's/^/function @ line:/' < $t + } +done + exit 0 diff --git a/src/third_party/wiredtiger/dist/s_label_loop.py b/src/third_party/wiredtiger/dist/s_function_loop.py index 5cc222a4250..5cc222a4250 100644 --- a/src/third_party/wiredtiger/dist/s_label_loop.py +++ b/src/third_party/wiredtiger/dist/s_function_loop.py diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index c79dc5129a5..58b8137cad9 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -74,6 +74,7 @@ Checksum Checksums CityHash CloseHandle +Cmvz Collet Comparator Config @@ -264,6 +265,7 @@ NoAddr Noll Nul OOB +OPLOG OPTYPE OUTBUFF OVFL @@ -528,6 +530,7 @@ checkpointer checkpointing checksum checksums +checksys checkvalue children's chk @@ -940,6 +943,7 @@ msvc multi multiblock multicore +multicycle multiprocess multisocket multithreaded @@ -1284,6 +1288,7 @@ vtype vunpack vw vxr +vz waitpid waker wakeup diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index 02237faf4e9..a0c6f87ceda 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -36,20 +36,20 @@ static const char *home; -void add_collator(WT_CONNECTION *conn); -void add_extractor(WT_CONNECTION *conn); -void backup(WT_SESSION *session); -void checkpoint_ops(WT_SESSION *session); -void connection_ops(WT_CONNECTION *conn); -int cursor_ops(WT_SESSION *session); -void cursor_search_near(WT_CURSOR *cursor); -void cursor_statistics(WT_SESSION *session); -void named_snapshot_ops(WT_SESSION *session); -void pack_ops(WT_SESSION *session); -void session_ops(WT_SESSION *session); -void transaction_ops(WT_CONNECTION *conn, WT_SESSION *session); +static void add_collator(WT_CONNECTION *conn); +static void add_extractor(WT_CONNECTION *conn); +static void backup(WT_SESSION *session); +static void checkpoint_ops(WT_SESSION *session); 
+static void connection_ops(WT_CONNECTION *conn); +static int cursor_ops(WT_SESSION *session); +static void cursor_search_near(WT_CURSOR *cursor); +static void cursor_statistics(WT_SESSION *session); +static void named_snapshot_ops(WT_SESSION *session); +static void pack_ops(WT_SESSION *session); +static void session_ops(WT_SESSION *session); +static void transaction_ops(WT_SESSION *session); -int +static int cursor_ops(WT_SESSION *session) { WT_CURSOR *cursor; @@ -66,6 +66,21 @@ cursor_ops(WT_SESSION *session) /*! [Open a cursor on the metadata] */ { + const char *key = "some key", *value = "some value"; + /*! [Reconfigure a cursor] */ + error_check(session->open_cursor( + session, "table:mytable", NULL, "overwrite=false", &cursor)); + + /* Reconfigure the cursor to overwrite the record. */ + error_check(cursor->reconfigure(cursor, "overwrite=true")); + + cursor->set_key(cursor, key); + cursor->set_value(cursor, value); + error_check(cursor->insert(cursor)); + /*! [Reconfigure a cursor] */ + } + + { WT_CURSOR *duplicate; const char *key = "some key"; /*! [Duplicate a cursor] */ @@ -81,20 +96,6 @@ cursor_ops(WT_SESSION *session) } { - const char *key = "some key", *value = "some value"; - /*! [Reconfigure a cursor] */ - error_check(session->open_cursor( - session, "table:mytable", NULL, "overwrite=false", &cursor)); - cursor->set_key(cursor, key); - cursor->set_value(cursor, value); - - /* Reconfigure the cursor to overwrite the record. */ - error_check(cursor->reconfigure(cursor, "overwrite=true")); - error_check(cursor->insert(cursor)); - /*! [Reconfigure a cursor] */ - } - - { /*! [boolean configuration string example] */ error_check(session->open_cursor( session, "table:mytable", NULL, "overwrite", &cursor)); @@ -105,6 +106,8 @@ cursor_ops(WT_SESSION *session) /*! [boolean configuration string example] */ } + error_check(session->checkpoint(session, "name=midnight")); + { /*! 
[open a named checkpoint] */ error_check(session->open_cursor(session, @@ -120,18 +123,31 @@ cursor_ops(WT_SESSION *session) } { + /*! [Set the cursor's string key] */ + /* Set the cursor's string key. */ + const char *key = "another key"; + cursor->set_key(cursor, key); + /*! [Set the cursor's string key] */ + } + + { /*! [Get the cursor's string key] */ const char *key; /* Get the cursor's string key. */ error_check(cursor->get_key(cursor, &key)); /*! [Get the cursor's string key] */ } + /* Switch to a recno table. */ + error_check(session->create( + session, "table:recno", "key_format=r,value_format=S")); + error_check(session->open_cursor( + session, "table:recno", NULL, NULL, &cursor)); + { - /*! [Set the cursor's string key] */ - /* Set the cursor's string key. */ - const char *key = "another key"; - cursor->set_key(cursor, key); - /*! [Set the cursor's string key] */ + /*! [Set the cursor's record number key] */ + uint64_t recno = 37; /* Set the cursor's record number key. */ + cursor->set_key(cursor, recno); + /*! [Set the cursor's record number key] */ } { @@ -141,11 +157,17 @@ cursor_ops(WT_SESSION *session) /*! [Get the cursor's record number key] */ } + /* Switch to a composite table. */ + error_check(session->create( + session, "table:composite", "key_format=SiH,value_format=S")); + error_check(session->open_cursor( + session, "table:recno", NULL, NULL, &cursor)); + { - /*! [Set the cursor's record number key] */ - uint64_t recno = 37; /* Set the cursor's record number key. */ - cursor->set_key(cursor, recno); - /*! [Set the cursor's record number key] */ + /*! [Set the cursor's composite key] */ + /* Set the cursor's "SiH" format composite key. */ + cursor->set_key(cursor, "first", (int32_t)5, (uint16_t)7); + /*! [Set the cursor's composite key] */ } { @@ -159,10 +181,11 @@ cursor_ops(WT_SESSION *session) } { - /*! [Set the cursor's composite key] */ - /* Set the cursor's "SiH" format composite key. 
*/ - cursor->set_key(cursor, "first", (int32_t)5, (uint16_t)7); - /*! [Set the cursor's composite key] */ + /*! [Set the cursor's string value] */ + /* Set the cursor's string value. */ + const char *value = "another value"; + cursor->set_value(cursor, value); + /*! [Set the cursor's string value] */ } { @@ -173,14 +196,6 @@ cursor_ops(WT_SESSION *session) } { - /*! [Set the cursor's string value] */ - /* Set the cursor's string value. */ - const char *value = "another value"; - cursor->set_value(cursor, value); - /*! [Set the cursor's string value] */ - } - - { /*! [Get the cursor's raw value] */ WT_ITEM value; /* Get the cursor's raw value. */ error_check(cursor->get_value(cursor, &value)); @@ -196,20 +211,26 @@ cursor_ops(WT_SESSION *session) /*! [Set the cursor's raw value] */ } + error_check(cursor->insert(cursor)); + /*! [Return the next record] */ error_check(cursor->next(cursor)); /*! [Return the next record] */ - /*! [Return the previous record] */ - error_check(cursor->prev(cursor)); - /*! [Return the previous record] */ - /*! [Reset the cursor] */ error_check(cursor->reset(cursor)); /*! [Reset the cursor] */ + /*! [Return the previous record] */ + error_check(cursor->prev(cursor)); + /*! [Return the previous record] */ + { WT_CURSOR *other = NULL; + error_check( + session->open_cursor(session, NULL, cursor, NULL, &other)); + + { /*! [Cursor comparison] */ int compare; error_check(cursor->compare(cursor, other, &compare)); @@ -224,7 +245,6 @@ cursor_ops(WT_SESSION *session) } { - WT_CURSOR *other = NULL; /*! [Cursor equality] */ int equal; error_check(cursor->equals(cursor, other, &equal)); @@ -235,17 +255,8 @@ cursor_ops(WT_SESSION *session) } /*! [Cursor equality] */ } - - { - /*! [Search for an exact match] */ - const char *key = "some key"; - cursor->set_key(cursor, key); - error_check(cursor->search(cursor)); - /*! [Search for an exact match] */ } - cursor_search_near(cursor); - { /*! 
[Insert a new record or overwrite an existing record] */ /* Insert a new record or overwrite an existing record. */ @@ -259,9 +270,19 @@ cursor_ops(WT_SESSION *session) } { + /*! [Search for an exact match] */ + const char *key = "some key"; + cursor->set_key(cursor, key); + error_check(cursor->search(cursor)); + /*! [Search for an exact match] */ + } + + cursor_search_near(cursor); + + { /*! [Insert a new record and fail if the record exists] */ /* Insert a new record and fail if the record exists. */ - const char *key = "some key", *value = "some value"; + const char *key = "new key", *value = "some value"; error_check(session->open_cursor( session, "table:mytable", NULL, "overwrite=false", &cursor)); cursor->set_key(cursor, key); @@ -270,35 +291,52 @@ cursor_ops(WT_SESSION *session) /*! [Insert a new record and fail if the record exists] */ } + error_check(session->open_cursor( + session, "table:recno", NULL, "append", &cursor)); + { /*! [Insert a new record and assign a record number] */ /* Insert a new record and assign a record number. */ uint64_t recno; const char *value = "some value"; - error_check(session->open_cursor( - session, "table:mytable", NULL, "append", &cursor)); cursor->set_value(cursor, value); error_check(cursor->insert(cursor)); error_check(cursor->get_key(cursor, &recno)); /*! [Insert a new record and assign a record number] */ } + error_check(session->open_cursor( + session, "table:mytable", NULL, NULL, &cursor)); + { /*! [Reserve a record] */ const char *key = "some key"; - error_check(session->open_cursor( - session, "table:mytable", NULL, NULL, &cursor)); + error_check(session->begin_transaction(session, NULL)); cursor->set_key(cursor, key); error_check(cursor->reserve(cursor)); + error_check(session->commit_transaction(session, NULL)); /*! 
[Reserve a record] */ } + error_check(session->create( + session, "table:blob", "key_format=S,value_format=u")); + error_check(session->open_cursor( + session, "table:blob", NULL, NULL, &cursor)); + { + WT_ITEM value; + value.data = "abcdefghijklmnopqrstuvwxyz" + "abcdefghijklmnopqrstuvwxyz" + "abcdefghijklmnopqrstuvwxyz"; + value.size = strlen(value.data); + cursor->set_key(cursor, "some key"); + cursor->set_value(cursor, &value); + error_check(cursor->insert(cursor)); + } + { /*! [Modify an existing record] */ WT_MODIFY entries[3]; const char *key = "some key"; - error_check(session->open_cursor( - session, "table:mytable", NULL, NULL, &cursor)); /* Position the cursor. */ cursor->set_key(cursor, key); @@ -349,23 +387,23 @@ cursor_ops(WT_SESSION *session) } { - /*! [Remove a record] */ + /*! [Remove a record and fail if DNE] */ const char *key = "some key"; error_check(session->open_cursor( - session, "table:mytable", NULL, NULL, &cursor)); + session, "table:mytable", NULL, "overwrite=false", &cursor)); cursor->set_key(cursor, key); error_check(cursor->remove(cursor)); - /*! [Remove a record] */ + /*! [Remove a record and fail if DNE] */ } { - /*! [Remove a record and fail if DNE] */ + /*! [Remove a record] */ const char *key = "some key"; error_check(session->open_cursor( - session, "table:mytable", NULL, "overwrite=false", &cursor)); + session, "table:mytable", NULL, NULL, &cursor)); cursor->set_key(cursor, key); error_check(cursor->remove(cursor)); - /*! [Remove a record and fail if DNE] */ + /*! [Remove a record] */ } { @@ -400,7 +438,7 @@ cursor_ops(WT_SESSION *session) return (0); } -void +static void cursor_search_near(WT_CURSOR *cursor) { int exact, ret; @@ -445,9 +483,12 @@ cursor_search_near(WT_CURSOR *cursor) /*! [Backward scan less than] */ } -void +static void checkpoint_ops(WT_SESSION *session) { + error_check(session->create(session, "table:table1", NULL)); + error_check(session->create(session, "table:table2", NULL)); + /*! 
[Checkpoint examples] */ /* Checkpoint the database. */ error_check(session->checkpoint(session, NULL)); @@ -506,7 +547,7 @@ checkpoint_ops(WT_SESSION *session) /*! [JSON quoting example] */ } -void +static void cursor_statistics(WT_SESSION *session) { WT_CURSOR *cursor; @@ -538,7 +579,7 @@ cursor_statistics(WT_SESSION *session) /*! [Statistics cursor clear configuration] */ } -void +static void named_snapshot_ops(WT_SESSION *session) { /*! [Snapshot examples] */ @@ -551,11 +592,17 @@ named_snapshot_ops(WT_SESSION *session) /* Drop all named snapshots */ error_check(session->snapshot(session, "drop=(all)")); /*! [Snapshot examples] */ + + error_check(session->rollback_transaction(session, NULL)); } -void +static void session_ops(WT_SESSION *session) { + WT_CONNECTION *conn; + + conn = session->connection; + /*! [Reconfigure a session] */ error_check(session->reconfigure(session, "isolation=snapshot")); /*! [Reconfigure a session] */ @@ -765,20 +812,47 @@ session_ops(WT_SESSION *session) error_check(session->verify(session, "table:mytable", NULL)); /*! [Verify a table] */ - /*! [Drop a table] */ - error_check(session->drop(session, "table:mytable", NULL)); - /*! [Drop a table] */ - } + /* + * We can't call the backup function because it includes absolute paths + * for documentation purposes that don't exist on test systems. That + * said, we have to reference the function to avoid build warnings + * about unused static code. + */ + (void)backup; + + /* Call other functions, where possible. */ + checkpoint_ops(session); + error_check(cursor_ops(session)); + cursor_statistics(session); + named_snapshot_ops(session); + pack_ops(session); + transaction_ops(session); /*! [Close a session] */ error_check(session->close(session, NULL)); /*! [Close a session] */ + + /* + * We close the old session first to close all cursors, open a new one + * for the drop. + */ + error_check(conn->open_session(conn, NULL, NULL, &session)); + + /*! 
[Drop a table] */ + error_check(session->drop(session, "table:mytable", NULL)); + /*! [Drop a table] */ + } } -void -transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) +static void +transaction_ops(WT_SESSION *session_arg) { + WT_CONNECTION *conn; WT_CURSOR *cursor; + WT_SESSION *session; + + session = session_arg; + conn = session->connection; /*! [transaction commit/rollback] */ /* @@ -829,7 +903,7 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) /*! [session isolation configuration] */ /* Open a session configured for read-uncommitted isolation. */ error_check(conn->open_session( - conn, NULL, "isolation=read_uncommitted", &session)); + conn, NULL, "isolation=read-uncommitted", &session)); /*! [session isolation configuration] */ /*! [session isolation re-configuration] */ @@ -837,6 +911,9 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) error_check(session->reconfigure(session, "isolation=snapshot")); /*! [session isolation re-configuration] */ + error_check(session->close(session, NULL)); + session = session_arg; + { /*! [transaction pinned range] */ /* Check the transaction ID range pinned by the session handle. */ @@ -846,17 +923,19 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) /*! [transaction pinned range] */ } + error_check(session->begin_transaction(session, NULL)); + +#ifdef HAVE_TIMESTAMPS + { + /*! [query timestamp] */ + char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; + /*! [transaction timestamp] */ error_check( session->timestamp_transaction(session, "commit_timestamp=2a")); /*! [transaction timestamp] */ - { -#ifndef WT_TIMESTAMP_SIZE -#define WT_TIMESTAMP_SIZE 8 -#endif - /*! 
[query timestamp] */ - char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; + error_check(session->commit_transaction(session, NULL)); error_check(conn->query_timestamp( conn, timestamp_buf, "get=all_committed")); @@ -871,9 +950,14 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) error_check(conn->set_timestamp(conn, "oldest_timestamp=2a")); /*! [set oldest timestamp] */ + /*! [set stable timestamp] */ + error_check(conn->set_timestamp(conn, "stable_timestamp=2a")); + /*! [set stable timestamp] */ + /*! [rollback to stable] */ - error_check(conn->rollback_to_stable(conn, "")); + error_check(conn->rollback_to_stable(conn, NULL)); /*! [rollback to stable] */ +#endif } /*! [Implement WT_COLLATOR] */ @@ -900,7 +984,7 @@ my_compare(WT_COLLATOR *collator, WT_SESSION *session, } /*! [Implement WT_COLLATOR] */ -void +static void add_collator(WT_CONNECTION *conn) { /*! [WT_COLLATOR register] */ @@ -926,7 +1010,7 @@ my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, } /*! [WT_EXTRACTOR] */ -void +static void add_extractor(WT_CONNECTION *conn) { /*! [WT_EXTRACTOR register] */ @@ -937,7 +1021,7 @@ add_extractor(WT_CONNECTION *conn) /*! [WT_EXTRACTOR register] */ } -void +static void connection_ops(WT_CONNECTION *conn) { #ifdef MIGHT_NOT_RUN @@ -1017,7 +1101,7 @@ connection_ops(WT_CONNECTION *conn) /*! [Close a connection] */ } -void +static void pack_ops(WT_SESSION *session) { { @@ -1047,7 +1131,7 @@ pack_ops(WT_SESSION *session) } } -void +static void backup(WT_SESSION *session) { char buf[1024]; @@ -1099,7 +1183,8 @@ main(int argc, char *argv[]) /*! [Open a connection] */ error_check(wiredtiger_open(home, NULL, - "create,cache_size=5GB,log=(enabled,recover=on)", &conn)); + "create,cache_size=5GB,log=(enabled,recover=on),statistics=(all)", + &conn)); /*! 
[Open a connection] */ connection_ops(conn); diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 4d1ae59f448..c0f667140d0 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "698847557ce7b3a938bbc8334d64a9430e4dc786", + "commit": "b055251678e6b4fcc74a1f651432aadbfeecc0e4", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index c20a294c07b..98cc10a6de1 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -342,6 +342,11 @@ __ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) break; /* FALLTHROUGH */ default: + /* + * Don't convert to WT_ILLEGAL_VALUE, it won't compile + * on some gcc compilers because they don't understand + * FALLTHROUGH as part of a macro. 
+ */ return ( __wt_illegal_value(session, "checkpoint array")); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index d58dc78fbed..6e1ab526e52 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1197,8 +1197,7 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) upd = page->modify->mod_row_update[cbt->slot]; for (i = 0; upd != NULL; ++i, upd = upd->next) { - if (upd->type == WT_UPDATE_DELETED || - upd->type == WT_UPDATE_STANDARD) + if (WT_UPDATE_DATA_VALUE(upd)) return (false); if (i >= WT_MAX_MODIFY_UPDATE) return (true); @@ -1219,7 +1218,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) WT_DECL_RET; WT_SESSION_IMPL *session; size_t orig, new; - bool chain_exceeded, overwrite; + bool overwrite; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; @@ -1259,13 +1258,13 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) /* * WT_CURSOR.modify is update-without-overwrite. * - * Use the modify buffer as the update if under the limit, else use the - * complete value. + * Use the modify buffer as the update if the data package saves us some + * memory and the update chain is under the limit, else use the complete + * value. 
*/ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); F_CLR(cursor, WT_CURSTD_OVERWRITE); - chain_exceeded = __cursor_chain_exceeded(cbt); - if (chain_exceeded) + if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt)) ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD); else if ((ret = __wt_modify_pack(session, &modify, entries, nentries)) == 0) diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index b8d11be7b3e..d91ac027738 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -39,7 +39,6 @@ static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); static int __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *); -static int __debug_item(WT_DBG *, const char *, const void *, size_t); static int __debug_page(WT_DBG *, WT_REF *, uint32_t); static int __debug_page_col_fix(WT_DBG *, WT_REF *); static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t); @@ -81,6 +80,41 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v) } /* + * __debug_bytes -- + * Dump a single set of bytes. + */ +static int +__debug_bytes(WT_DBG *ds, const void *data_arg, size_t size) +{ + size_t i; + u_char ch; + const uint8_t *data; + + for (data = data_arg, i = 0; i < size; ++i, ++data) { + ch = data[0]; + if (__wt_isprint(ch)) + WT_RET(ds->f(ds, "%c", (int)ch)); + else + WT_RET(__debug_hex_byte(ds, data[0])); + } + return (0); +} + +/* + * __debug_item -- + * Dump a single data/size pair, with an optional tag. + */ +static int +__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) +{ + WT_RET(ds->f(ds, + "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? 
"" : " ")); + WT_RET(__debug_bytes(ds, data_arg, size)); + WT_RET(ds->f(ds, "}\n")); + return (0); +} + +/* * __dmsg_event -- * Send a debug message to the event handler. */ @@ -993,23 +1027,26 @@ static int __debug_modified(WT_DBG *ds, WT_UPDATE *upd) { const size_t *p; - int nentries; + size_t nentries, data_size, offset, size; const uint8_t *data; - void *modify; - - modify = upd->data; - p = modify; - nentries = (int)*p++; - data = (uint8_t *)modify + + p = (size_t *)upd->data; + memcpy(&nentries, p++, sizeof(size_t)); + data = upd->data + sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t)); - WT_RET(ds->f(ds, "%d: ", nentries)); - for (; nentries-- > 0; data += p[0], p += 3) + WT_RET(ds->f(ds, "%" WT_SIZET_FMT ": ", nentries)); + for (; nentries-- > 0; data += data_size) { + memcpy(&data_size, p++, sizeof(size_t)); + memcpy(&offset, p++, sizeof(size_t)); + memcpy(&size, p++, sizeof(size_t)); WT_RET(ds->f(ds, "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT - ", %.*s}%s", p[0], p[1], p[2], - (int)p[2], data, nentries == 0 ? "" : ", ")); + ", ", + data_size, offset, size)); + WT_RET(__debug_bytes(ds, data, data_size)); + WT_RET(ds->f(ds, "}%s", nentries == 0 ? 
"" : ", ")); + } return (0); } @@ -1052,17 +1089,10 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) #ifdef HAVE_TIMESTAMPS if (!__wt_timestamp_iszero( WT_TIMESTAMP_NULL(&upd->timestamp))) { -#if WT_TIMESTAMP_SIZE == 8 - WT_RET(ds->f(ds, - ", stamp %" PRIu64, upd->timestamp.val)); -#else - int i; - - WT_RET(ds->f(ds, ", stamp 0x")); - for (i = 0; i < WT_TIMESTAMP_SIZE; ++i) - WT_RET(ds->f(ds, - "%" PRIx8, upd->timestamp.ts[i])); -#endif + char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1]; + WT_RET(__wt_timestamp_to_hex_string( + ds->session, hex_timestamp, &upd->timestamp)); + WT_RET(ds->f(ds, ", stamp %s", hex_timestamp)); } #endif WT_RET(ds->f(ds, "\n")); @@ -1250,28 +1280,4 @@ __debug_cell_data(WT_DBG *ds, return (ret); } - -/* - * __debug_item -- - * Dump a single data/size pair, with an optional tag. - */ -static int -__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) -{ - size_t i; - u_char ch; - const uint8_t *data; - - WT_RET(ds->f(ds, - "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ")); - for (data = data_arg, i = 0; i < size; ++i, ++data) { - ch = data[0]; - if (__wt_isprint(ch)) - WT_RET(ds->f(ds, "%c", (int)ch)); - else - WT_RET(__debug_hex_byte(ds, data[0])); - } - WT_RET(ds->f(ds, "}\n")); - return (0); -} #endif diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index bc9356e2669..806a9770057 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -249,9 +249,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) /* Free the overflow on-page, reuse and transaction-cache skiplists. 
*/ __wt_ovfl_reuse_free(session, page); - if (mod->ovfl_track != NULL) - __wt_free(session, mod->ovfl_track->remove); __wt_ovfl_discard_free(session, page); + __wt_ovfl_discard_remove(session, page); __wt_free(session, page->modify->ovfl_track); __wt_spin_destroy(session, &page->modify->page_lock); diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index f933245eaef..fab38f3cc8d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -49,7 +49,6 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_OVFL_TRACK *track; - WT_UPDATE *upd; size_t i; *decoded = false; @@ -74,14 +73,13 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { track = page->modify->ovfl_track; - for (upd = NULL, i = 0; i < track->remove_next; ++i) + for (i = 0; i < track->remove_next; ++i) if (track->remove[i].cell == unpack->cell) { - upd = track->remove[i].upd; + store->data = track->remove[i].data; + store->size = track->remove[i].size; break; } WT_ASSERT(session, i < track->remove_next); - store->data = upd->data; - store->size = upd->size; *decoded = true; } else ret = __ovfl_read(session, unpack->data, unpack->size, store); @@ -91,134 +89,56 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, } /* - * __ovfl_cache_col_visible -- - * column-store: check for a globally visible update. + * __wt_ovfl_discard_remove -- + * Free the on-page overflow value cache. */ -static bool -__ovfl_cache_col_visible( - WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) +void +__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page) { - /* - * Column-store is harder than row_store: we're here because there's a - * reader in the system that might read the original version of an - * overflow record, which might match a number of records. 
For example, - * the original overflow value was for records 100-200, we've replaced - * each of those records individually, but there exists a reader that - * might read any one of those records, and all of those records have - * different update entries with different transaction IDs. Since it's - * infeasible to determine if there's a globally visible update for each - * reader for each record, we test the simple case where a single record - * has a single, globally visible update. If that's not the case, cache - * the value. - */ - if (__wt_cell_rle(unpack) == 1 && - WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) - return (true); - return (false); -} - -/* - * __ovfl_cache_row_visible -- - * row-store: check for a globally visible update. - */ -static bool -__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) -{ - /* Check to see if there's a globally visible update. */ - for (; upd != NULL; upd = upd->next) - if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd)) - return (true); - - return (false); + WT_OVFL_TRACK *track; + uint32_t i; + + if (page->modify != NULL && + (track = page->modify->ovfl_track) != NULL) { + for (i = 0; i < track->remove_next; ++i) + __wt_free(session, track->remove[i].data); + __wt_free(session, page->modify->ovfl_track->remove); + track->remove_allocated = 0; + track->remove_next = 0; + } } /* - * __ovfl_cache_append_update -- - * Append an overflow value to the update list. + * __ovfl_cache -- + * Cache an overflow value. */ static int -__ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page, - WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack, WT_UPDATE **updp) +__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_UPDATE *append, *upd; - size_t size; - - *updp = NULL; + WT_OVFL_TRACK *track; /* Read the overflow value. 
*/ WT_RET(__wt_scr_alloc(session, 1024, &tmp)); WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp)); - /* - * Create an update entry with no transaction ID to ensure global - * visibility, append it to the update list. - * - * We don't need locks or barriers in this function: any thread reading - * the update list will see our newly appended record or not, it doesn't - * matter until the on-page cell is set to WT_CELL_VALUE_OVFL_RM. That - * involves atomic operations which will act as our barrier. Regardless, - * we update the page footprint as part of this operation, which acts as - * a barrier as well. - * - * The update transaction ID choice is tricky, to work around an issue - * in variable-length column store. Imagine an overflow value with an - * RLE greater than 1. We append a copy to the end of an update chain, - * but it's possible it's the overflow value for more than one record, - * and appending it to the end of one record's update chain means a - * subsequent enter of a globally visible value to one of the records - * would allow the truncation of the overflow chain that leaves other - * records without a value. If appending such an overflow record, set - * the transaction ID to the first possible transaction ID. That ID is - * old enough to be globally visible, but we can use it as a flag if an - * update record cannot be discarded when truncating an update chain. - */ - WT_ERR(__wt_update_alloc( - session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = page->type == WT_PAGE_COL_VAR && - __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - - __wt_cache_page_inmem_incr(session, page, size); - - *updp = append; - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* - * __ovfl_cache -- - * Cache an overflow value. 
- */ -static int -__ovfl_cache(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) -{ - WT_OVFL_TRACK *track; - WT_UPDATE *upd; - - /* Append a copy of the overflow value to the update list. */ - WT_RET(__ovfl_cache_append_update( - session, page, upd_list, unpack, &upd)); - /* Allocating tracking structures as necessary. */ if (page->modify->ovfl_track == NULL) - WT_RET(__wt_ovfl_track_init(session, page)); + WT_ERR(__wt_ovfl_track_init(session, page)); track = page->modify->ovfl_track; - /* Add the value's information to the update list. */ - WT_RET(__wt_realloc_def(session, + /* Copy the overflow item into place. */ + WT_ERR(__wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove)); track->remove[track->remove_next].cell = unpack->cell; - track->remove[track->remove_next].upd = upd; + WT_ERR(__wt_strndup(session, + tmp->data, tmp->size, &track->remove[track->remove_next].data)); + track->remove[track->remove_next].size = tmp->size; ++track->remove_next; - return (0); +err: __wt_scr_free(session, &tmp); + return (ret); } /* @@ -227,12 +147,14 @@ __ovfl_cache(WT_SESSION_IMPL *session, */ int __wt_ovfl_remove(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) + WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) { - bool visible; - /* - * This function solves a problem in reconciliation. The scenario is: + * This function solves two problems in reconciliation. + * + * The first problem is snapshot readers needing on-page overflow values + * that have been removed. The scenario is as follows: + * * - reconciling a leaf page that references an overflow item * - the item is updated and the update committed * - a checkpoint runs, freeing the backing overflow blocks @@ -263,28 +185,16 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, * per overflow item. 
We don't do any of that because overflow values * are supposed to be rare and we shouldn't see contention for the lock. * - * Check for a globally visible update. If there is a globally visible - * update, we don't need to cache the item because it's not possible for - * a running thread to have moved past it. - */ - switch (page->type) { - case WT_PAGE_COL_VAR: - visible = __ovfl_cache_col_visible(session, upd_list, unpack); - break; - case WT_PAGE_ROW_LEAF: - visible = __ovfl_cache_row_visible(session, upd_list); - break; - WT_ILLEGAL_VALUE(session); - } - - /* - * If there's no globally visible update, there's a reader in the system - * that might try and read the old value, cache it. + * We only have to do this for checkpoints: in any eviction mode, there + * can't be threads sitting in our update lists. */ - if (!visible) - WT_RET(__ovfl_cache(session, page, upd_list, unpack)); + if (checkpoint) + WT_RET(__ovfl_cache(session, page, unpack)); /* + * The second problem is to only remove the underlying blocks once, + * solved by the WT_CELL_VALUE_OVFL_RM flag. + * * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the * underlying overflow value's blocks to be freed when reconciliation * completes. diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 49b12b2d4e9..0c3cb026421 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -96,7 +96,7 @@ __col_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. */ WT_RET(__wt_col_search(session, recno, ref, cbt)); WT_RET(__wt_col_modify( - session, cbt, recno, NULL, updlist, updlist->type, false)); + session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } @@ -121,7 +121,7 @@ __row_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. 
*/ WT_RET(__wt_row_search(session, key, ref, cbt, true)); WT_RET(__wt_row_modify( - session, cbt, key, NULL, updlist, updlist->type, false)); + session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index a0db4457f62..ac90d6693d3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1446,8 +1446,8 @@ __split_multi_inmem( WT_ERR(__wt_col_search(session, recno, ref, &cbt)); /* Apply the modification. */ - WT_ERR(__wt_col_modify( - session, &cbt, recno, NULL, upd, upd->type, true)); + WT_ERR(__wt_col_modify(session, &cbt, + recno, NULL, upd, WT_UPDATE_INVALID, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ @@ -1468,8 +1468,8 @@ __split_multi_inmem( WT_ERR(__wt_row_search(session, key, ref, &cbt, true)); /* Apply the modification. */ - WT_ERR(__wt_row_modify( - session, &cbt, key, NULL, upd, upd->type, true)); + WT_ERR(__wt_row_modify(session, + &cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 3fdafcebfb9..261c0fc1937 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -634,8 +634,7 @@ err: WT_LEAVE_PAGE_INDEX(session); int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) { - return (__tree_walk_internal( - session, refp, NULL, NULL, NULL, flags)); + return (__tree_walk_internal(session, refp, NULL, NULL, NULL, flags)); } /* @@ -661,8 +660,8 @@ __wt_tree_walk_custom_skip( int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { - return (__tree_walk_internal(session, refp, - NULL, skip_func, func_cookie, flags)); + return (__tree_walk_internal( + session, refp, NULL, skip_func, 
func_cookie, flags)); } /* diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 6e610b86376..5e84899999a 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -263,6 +263,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, *updp = NULL; /* + * The code paths leading here are convoluted: assert we never attempt + * to allocate an update structure if only intending to insert one we + * already have. + */ + WT_ASSERT(session, modify_type != WT_UPDATE_INVALID); + + /* * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ @@ -304,14 +311,11 @@ __wt_update_obsolete_check( * Walk the list of updates, looking for obsolete updates at the end. * * Only updates with globally visible, self-contained data can terminate - * update chains, ignore modified and reserved updates. Special case the - * first transaction ID, it flags column-store overflow values which can - * never be discarded. + * update chains. 
*/ for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd) && - upd->txnid != WT_TXN_FIRST) { + __wt_txn_upd_visible_all(session, upd)) { if (first == NULL) first = upd; } else if (upd->txnid != WT_TXN_ABORTED) diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index c53a63ccb25..764006b024d 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -172,8 +172,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -225,7 +225,6 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_checkpoint[] = { { "drop", "list", NULL, NULL, NULL, 0 }, { "force", "boolean", NULL, NULL, NULL, 0 }, { "name", "string", NULL, NULL, NULL, 0 }, - { "read_timestamp", "string", NULL, NULL, NULL, 0 }, { "target", "list", NULL, NULL, NULL, 0 }, { "use_timestamp", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } @@ -802,8 +801,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -897,8 +896,8 @@ static 
const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -987,8 +986,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -1077,8 +1076,8 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," "\"rebalance\",\"reconcile\",\"recovery\",\"recovery_progress\"," "\"salvage\",\"shared_cache\",\"split\",\"temporary\"," - "\"thread_group\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"thread_group\",\"timestamp\",\"transaction\",\"verify\"," + "\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -1173,9 +1172,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_SESSION_begin_transaction, 6 }, { "WT_SESSION.checkpoint", - "drop=,force=false,name=,read_timestamp=,target=," - "use_timestamp=true", - confchk_WT_SESSION_checkpoint, 6 + "drop=,force=false,name=,target=,use_timestamp=true", + confchk_WT_SESSION_checkpoint, 5 }, { "WT_SESSION.close", "", diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c 
b/src/third_party/wiredtiger/src/conn/conn_api.c index df71ddf18f6..b29b6184ce3 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -8,8 +8,6 @@ #include "wt_internal.h" -static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]); - /* * ext_collate -- * Call the collation function (external API version). @@ -190,45 +188,6 @@ __wt_conn_remove_collator(WT_SESSION_IMPL *session) } /* - * __conn_compat_config -- - * Configure compatibility version. - */ -static int -__conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) -{ - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - uint16_t patch; - - conn = S2C(session); - WT_RET(__wt_config_gets(session, cfg, - "compatibility.release", &cval)); - if (cval.len != 0) { - /* - * Accept either a major.minor release string or a - * major.minor.patch release string. We ignore the patch - * value, but allow it in the string. - */ - if (sscanf(cval.str, "%" SCNu16 ".%" SCNu16, - &conn->compat_major, &conn->compat_minor) != 2 && - sscanf(cval.str, "%" SCNu16 ".%" SCNu16 ".%" SCNu16, - &conn->compat_major, &conn->compat_minor, &patch) != 3) - WT_RET_MSG(session, - EINVAL, "illegal compatibility release"); - if (conn->compat_major > WIREDTIGER_VERSION_MAJOR) - WT_RET_MSG(session, EINVAL, "unknown major version"); - if (conn->compat_major == WIREDTIGER_VERSION_MAJOR && - conn->compat_minor > WIREDTIGER_VERSION_MINOR) - WT_RET_MSG(session, - EINVAL, "illegal compatibility version"); - } else { - conn->compat_major = WIREDTIGER_VERSION_MAJOR; - conn->compat_minor = WIREDTIGER_VERSION_MINOR; - } - return (0); -} - -/* * __compressor_confchk -- * Validate the compressor. 
*/ @@ -1143,57 +1102,12 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - const char *p; - bool locked; conn = (WT_CONNECTION_IMPL *)wt_conn; - locked = false; CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); - - /* Serialize reconfiguration. */ - __wt_spin_lock(session, &conn->reconfig_lock); - locked = true; - - /* - * The configuration argument has been checked for validity, update the - * previous connection configuration. - * - * DO NOT merge the configuration before the reconfigure calls. Some - * of the underlying reconfiguration functions do explicit checks with - * the second element of the configuration array, knowing the defaults - * are in slot #1 and the application's modifications are in slot #2. - * - * First, replace the base configuration set up by CONNECTION_API_CALL - * with the current connection configuration, otherwise reconfiguration - * functions will find the base value instead of previously configured - * value. - */ - cfg[0] = conn->cfg; - cfg[1] = config; - - /* Second, reconfigure the system. */ - WT_ERR(__conn_compat_config(session, cfg)); - WT_ERR(__conn_statistics_config(session, cfg)); - WT_ERR(__wt_async_reconfig(session, cfg)); - WT_ERR(__wt_cache_config(session, true, cfg)); - WT_ERR(__wt_checkpoint_server_create(session, cfg)); - WT_ERR(__wt_logmgr_reconfig(session, cfg)); - WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); - WT_ERR(__wt_statlog_create(session, cfg)); - WT_ERR(__wt_sweep_config(session, cfg)); - WT_ERR(__wt_verbose_config(session, cfg)); - WT_ERR(__wt_timing_stress_config(session, cfg)); - - /* Third, merge everything together, creating a new connection state. 
*/ - WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); - __wt_free(session, conn->cfg); - conn->cfg = p; - -err: if (locked) - __wt_spin_unlock(session, &conn->reconfig_lock); - - API_END_RET(session, ret); + ret = __wt_conn_reconfig(session, cfg); +err: API_END_RET(session, ret); } /* @@ -1274,8 +1188,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config) conn = (WT_CONNECTION_IMPL *)wt_conn; - CONNECTION_API_CALL( - conn, session, rollback_to_stable, config, cfg); + CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg); WT_TRET(__wt_txn_rollback_to_stable(session, cfg)); err: API_END_RET(session, ret); } @@ -1788,94 +1701,6 @@ err: /* return (ret); } -/* - * __conn_statistics_config -- - * Set statistics configuration. - */ -static int -__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_CONFIG_ITEM cval, sval; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - uint32_t flags; - int set; - - conn = S2C(session); - - WT_RET(__wt_config_gets(session, cfg, "statistics", &cval)); - - flags = 0; - set = 0; - if ((ret = __wt_config_subgets( - session, &cval, "none", &sval)) == 0 && sval.val != 0) { - flags = 0; - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - LF_SET(WT_STAT_TYPE_FAST); - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "all", &sval)) == 0 && sval.val != 0) { - LF_SET( - WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | - WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); - ++set; - } - WT_RET_NOTFOUND_OK(ret); - - if (set > 1) - WT_RET_MSG(session, EINVAL, - "Only one of all, fast, none configuration values should " - "be specified"); - - /* - * Now that we've parsed general statistics categories, process - * sub-categories. 
- */ - if ((ret = __wt_config_subgets( - session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) - /* - * Configuring cache walk statistics implies fast statistics. - * Keep that knowledge internal for now - it may change in the - * future. - */ - LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) - /* - * Configuring tree walk statistics implies fast statistics. - * Keep that knowledge internal for now - it may change in the - * future. - */ - LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); - WT_RET_NOTFOUND_OK(ret); - - if ((ret = __wt_config_subgets( - session, &cval, "clear", &sval)) == 0 && sval.val != 0) { - if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | - WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) - WT_RET_MSG(session, EINVAL, - "the value \"clear\" can only be specified if " - "statistics are enabled"); - LF_SET(WT_STAT_CLEAR); - } - WT_RET_NOTFOUND_OK(ret); - - /* Configuring statistics clears any existing values. */ - conn->stat_flags = flags; - - return (0); -} - /* Simple structure for name and flag configuration searches. */ typedef struct { const char *name; @@ -1916,6 +1741,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "split", WT_VERB_SPLIT }, { "temporary", WT_VERB_TEMPORARY }, { "thread_group", WT_VERB_THREAD_GROUP }, + { "timestamp", WT_VERB_TIMESTAMP }, { "transaction", WT_VERB_TRANSACTION }, { "verify", WT_VERB_VERIFY }, { "version", WT_VERB_VERSION }, @@ -2344,7 +2170,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* * Set compatibility versions early so that any subsystem sees it. 
*/ - WT_ERR(__conn_compat_config(session, cfg)); + WT_ERR(__wt_conn_compat_config(session, cfg)); /* * If the application didn't configure its own file system, configure @@ -2531,7 +2357,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val != 0; - WT_ERR(__conn_statistics_config(session, cfg)); + WT_ERR(__wt_conn_statistics_config(session, cfg)); WT_ERR(__wt_lsm_manager_config(session, cfg)); WT_ERR(__wt_sweep_config(session, cfg)); diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c new file mode 100644 index 00000000000..e67f2c9a18d --- /dev/null +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 2014-2017 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_conn_compat_config -- + * Configure compatibility version. + */ +int +__wt_conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + uint16_t patch; + bool txn_active; + + conn = S2C(session); + WT_RET(__wt_config_gets(session, cfg, + "compatibility.release", &cval)); + if (cval.len == 0) { + conn->compat_major = WIREDTIGER_VERSION_MAJOR; + conn->compat_minor = WIREDTIGER_VERSION_MINOR; + return (0); + } + + /* + * Accept either a major.minor release string or a + * major.minor.patch release string. We ignore the patch + * value, but allow it in the string. 
+ */ + if (sscanf(cval.str, "%" SCNu16 ".%" SCNu16, + &conn->compat_major, &conn->compat_minor) != 2 && + sscanf(cval.str, "%" SCNu16 ".%" SCNu16 ".%" SCNu16, + &conn->compat_major, &conn->compat_minor, &patch) != 3) + WT_RET_MSG(session, EINVAL, "illegal compatibility release"); + if (conn->compat_major > WIREDTIGER_VERSION_MAJOR) + WT_RET_MSG(session, EINVAL, "unknown major version"); + if (conn->compat_major == WIREDTIGER_VERSION_MAJOR && + conn->compat_minor > WIREDTIGER_VERSION_MINOR) + WT_RET_MSG(session, EINVAL, "illegal compatibility version"); + + /* + * We're doing an upgrade or downgrade, check whether transactions are + * active. + */ + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (txn_active) + WT_RET_MSG(session, ENOTSUP, + "upgrade / downgrade must run single-threaded"); + return (0); +} + +/* + * __wt_conn_statistics_config -- + * Set statistics configuration. + */ +int +__wt_conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONFIG_ITEM cval, sval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint32_t flags; + int set; + + conn = S2C(session); + + WT_RET(__wt_config_gets(session, cfg, "statistics", &cval)); + + flags = 0; + set = 0; + if ((ret = __wt_config_subgets( + session, &cval, "none", &sval)) == 0 && sval.val != 0) { + flags = 0; + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "fast", &sval)) == 0 && sval.val != 0) { + LF_SET(WT_STAT_TYPE_FAST); + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "all", &sval)) == 0 && sval.val != 0) { + LF_SET( + WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + ++set; + } + WT_RET_NOTFOUND_OK(ret); + + if (set > 1) + WT_RET_MSG(session, EINVAL, + "Only one of all, fast, none configuration values should " + "be specified"); + + /* + * Now that we've parsed general statistics categories, process + * sub-categories. 
+ */ + if ((ret = __wt_config_subgets( + session, &cval, "cache_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring cache walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_CACHE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "tree_walk", &sval)) == 0 && sval.val != 0) + /* + * Configuring tree walk statistics implies fast statistics. + * Keep that knowledge internal for now - it may change in the + * future. + */ + LF_SET(WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK); + WT_RET_NOTFOUND_OK(ret); + + if ((ret = __wt_config_subgets( + session, &cval, "clear", &sval)) == 0 && sval.val != 0) { + if (!LF_ISSET(WT_STAT_TYPE_ALL | WT_STAT_TYPE_CACHE_WALK | + WT_STAT_TYPE_FAST | WT_STAT_TYPE_TREE_WALK)) + WT_RET_MSG(session, EINVAL, + "the value \"clear\" can only be specified if " + "statistics are enabled"); + LF_SET(WT_STAT_CLEAR); + } + WT_RET_NOTFOUND_OK(ret); + + /* Configuring statistics clears any existing values. */ + conn->stat_flags = flags; + + return (0); +} + +/* + * __wt_conn_reconfig -- + * Reconfigure a connection (internal version). + */ +int +__wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const char *p; + + conn = S2C(session); + + /* Serialize reconfiguration. */ + __wt_spin_lock(session, &conn->reconfig_lock); + + /* + * The configuration argument has been checked for validity, update the + * previous connection configuration. + * + * DO NOT merge the configuration before the reconfigure calls. Some + * of the underlying reconfiguration functions do explicit checks with + * the second element of the configuration array, knowing the defaults + * are in slot #1 and the application's modifications are in slot #2. 
+ * + * Replace the base configuration set up by CONNECTION_API_CALL with + * the current connection configuration, otherwise reconfiguration + * functions will find the base value instead of previously configured + * value. + */ + cfg[0] = conn->cfg; + + /* + * Reconfigure the system. + * + * The compatibility version check is special: upgrade / downgrade + * cannot be done with transactions active, and checkpoints must not + * span a version change. Hold the checkpoint lock to avoid conflicts + * with WiredTiger's checkpoint thread, and rely on the documentation + * specifying that no new operations can start until the upgrade / + * downgrade completes. + */ + WT_WITH_CHECKPOINT_LOCK(session, + ret = __wt_conn_compat_config(session, cfg)); + WT_ERR(__wt_conn_statistics_config(session, cfg)); + WT_ERR(__wt_async_reconfig(session, cfg)); + WT_ERR(__wt_cache_config(session, true, cfg)); + WT_ERR(__wt_checkpoint_server_create(session, cfg)); + WT_ERR(__wt_logmgr_reconfig(session, cfg)); + WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); + WT_ERR(__wt_statlog_create(session, cfg)); + WT_ERR(__wt_sweep_config(session, cfg)); + WT_ERR(__wt_verbose_config(session, cfg)); + WT_ERR(__wt_timing_stress_config(session, cfg)); + + /* Third, merge everything together, creating a new connection state. 
*/ + WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); + __wt_free(session, conn->cfg); + conn->cfg = p; + +err: __wt_spin_unlock(session, &conn->reconfig_lock); + + return (ret); +} diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c index 10de133be75..087c811747a 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_ds.c +++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c @@ -38,17 +38,16 @@ static int __curds_key_set(WT_CURSOR *cursor) { WT_CURSOR *source; - WT_DECL_RET; source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - WT_ERR(__cursor_needkey(cursor)); + WT_RET(__cursor_needkey(cursor)); source->recno = cursor->recno; source->key.data = cursor->key.data; source->key.size = cursor->key.size; -err: return (ret); + return (0); } /* @@ -59,16 +58,15 @@ static int __curds_value_set(WT_CURSOR *cursor) { WT_CURSOR *source; - WT_DECL_RET; source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source; - WT_ERR(__cursor_needvalue(cursor)); + WT_RET(__cursor_needvalue(cursor)); source->value.data = cursor->value.data; source->value.size = cursor->value.size; -err: return (ret); + return (0); } /* diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 855ad70d6e0..e3ae9dbd9f6 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -499,9 +499,7 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, passed = (cmp < 0); break; - default: - WT_RET(__wt_illegal_value(session, NULL)); - break; + WT_ILLEGAL_VALUE(session); } if (!passed) { diff --git a/src/third_party/wiredtiger/src/docs/Doxyfile b/src/third_party/wiredtiger/src/docs/Doxyfile index 8292df18e47..e95d8babe48 100644 --- a/src/third_party/wiredtiger/src/docs/Doxyfile +++ b/src/third_party/wiredtiger/src/docs/Doxyfile @@ -206,8 +206,8 @@ TAB_SIZE = 8 # You can put \n's in the value part of an alias to insert 
newlines. ALIASES = "notyet{1}=Note: <b>"\1"</b> not yet supported in WiredTiger.\n@todo fix when \1 supported\n\n" \ - "errors=@returns zero on success and a non-zero error code on failure. See @ref error_returns \"Error Returns\" for details." \ - "ebusy_errors=@returns zero on success, EBUSY if there are open cursors on the object and a non-zero error code on failure. See @ref error_returns \"Error Returns\" for details." \ + "errors=@returns zero on success and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ + "ebusy_errors=@returns zero on success, EBUSY if the object is not available for exclusive access, and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ "ex_ref{1}=@ref \1 \"\1\"" \ "ref_single=@ref" \ "subpage_single=@subpage" \ diff --git a/src/third_party/wiredtiger/src/docs/error-handling.dox b/src/third_party/wiredtiger/src/docs/error-handling.dox index eb9ca6bb82a..7b7c0cd03d5 100644 --- a/src/third_party/wiredtiger/src/docs/error-handling.dox +++ b/src/third_party/wiredtiger/src/docs/error-handling.dox @@ -3,11 +3,11 @@ WiredTiger operations return a value of 0 on success and a non-zero value on error. Error codes may be either positive or negative: positive error codes are standard error codes as described for -POSIX-like systems (for example, EINVAL or EBUSY), negative error codes -are WiredTiger-specific (for example, WT_ROLLBACK). +POSIX-like systems (for example, \c EINVAL or \c EBUSY), negative error +codes are WiredTiger-specific (for example, \c WT_ROLLBACK). WiredTiger-specific error codes always appear in the -31,800 to -31,999 -range. +range, inclusive. @m_if{java} Informational return values, like <code>wiredtiger.WT_NOTFOUND</code> @@ -29,11 +29,22 @@ correctly-written WiredTiger application will likely catch errors. 
Note that no further WiredTiger calls are required after \c WiredTigerPanicException is caught (and further calls will themselves immediately fail). +@m_endif + +WiredTiger returns \c EBUSY for operations requiring exclusive access, when +an object is not available for exclusive access. For example, the +WT_SESSION::drop or WT_SESSION::verify methods will fail if the object +has open cursors. Note that internal WiredTiger threads may temporarily +open cursors on objects (for example, threads performing operations like +statistics logging), and operations may temporarily fail and return \c EBUSY +even when there are no application cursors open on the object. -The following is a complete list of possible WiredTiger-specific -return values, all constants defined in the com.wiredtiger.db.wiredtiger class: +@m_if{java} +The following is a complete list of the WiredTiger-specific return +values, all constants defined in the com.wiredtiger.db.wiredtiger class: @m_else -The following is a list of possible WiredTiger-specific errors: +The following is a complete list of the WiredTiger-specific return +values: @m_endif @if IGNORE_BUILT_BY_API_ERR_BEGIN diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 01a9179aedc..f0d810281c2 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -386,8 +386,9 @@ struct __wt_page_modify { /* Cached overflow value cell/update address pairs. 
*/ struct { - WT_CELL *cell; - WT_UPDATE *upd; + WT_CELL *cell; + uint8_t *data; + size_t size; } *remove; size_t remove_allocated; uint32_t remove_next; @@ -895,10 +896,11 @@ struct __wt_update { uint32_t size; /* data length */ -#define WT_UPDATE_DELETED 0 /* deleted */ -#define WT_UPDATE_MODIFIED 1 /* partial-update modify value */ -#define WT_UPDATE_RESERVED 2 /* reserved */ -#define WT_UPDATE_STANDARD 3 /* complete value */ +#define WT_UPDATE_INVALID 0 /* diagnostic check */ +#define WT_UPDATE_DELETED 1 /* deleted */ +#define WT_UPDATE_MODIFIED 2 /* partial-update modify value */ +#define WT_UPDATE_RESERVED 3 /* reserved */ +#define WT_UPDATE_STANDARD 4 /* complete value */ uint8_t type; /* type (one byte to conserve memory) */ /* If the update includes a complete value. */ @@ -936,7 +938,7 @@ struct __wt_update { * Limit update chains to a small value to avoid penalizing reads and * permit truncation. */ -#define WT_MAX_MODIFY_UPDATE 100 +#define WT_MAX_MODIFY_UPDATE 10 /* * WT_INSERT -- diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index afd4c874cf1..9a86dbc1a26 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -150,7 +150,8 @@ extern const char *__wt_cell_type_string(uint8_t type); extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf); extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int 
__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -278,6 +279,9 @@ extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIB extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_connection_close(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_compat_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_conn_stat_init(WT_SESSION_IMPL *session); extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -475,6 +479,7 @@ extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int ( extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool force, 
WT_LSM_CHUNK **chunkp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, bool *ran) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_lsm_chunk_visible_all( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -664,7 +669,7 @@ __wt_assert(WT_SESSION_IMPL *session, #endif WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_panic(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_illegal_value_func( WT_SESSION_IMPL *session, const char *tag, const char *file, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_inmem_unsupported_op(WT_SESSION_IMPL *session, const char *tag) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bad_object_type(WT_SESSION_IMPL 
*session, const char *uri) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -683,7 +688,6 @@ extern int __wt_stash_add(WT_SESSION_IMPL *session, int which, uint64_t generati extern void __wt_stash_discard_all(WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_library_init(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_breakpoint(void); -extern void __wt_attach(WT_SESSION_IMPL *session); extern uint64_t __wt_hash_city64(const void *s, size_t len); extern uint64_t __wt_hash_fnv64(const void *string, size_t len); extern int @@ -809,6 +813,8 @@ extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char * extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_recover(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_timestamp_to_hex_string( WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_verbose_timestamp(WT_SESSION_IMPL *session, const wt_timestamp_t *ts, const char *msg); extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_query_timestamp( WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 243716c2ecb..ccb32900dc4 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ 
b/src/third_party/wiredtiger/src/include/flags.h @@ -113,10 +113,11 @@ #define WT_VERB_SPLIT 0x00800000 #define WT_VERB_TEMPORARY 0x01000000 #define WT_VERB_THREAD_GROUP 0x02000000 -#define WT_VERB_TRANSACTION 0x04000000 -#define WT_VERB_VERIFY 0x08000000 -#define WT_VERB_VERSION 0x10000000 -#define WT_VERB_WRITE 0x20000000 +#define WT_VERB_TIMESTAMP 0x04000000 +#define WT_VERB_TRANSACTION 0x08000000 +#define WT_VERB_VERIFY 0x10000000 +#define WT_VERB_VERSION 0x20000000 +#define WT_VERB_WRITE 0x40000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index df7d6c8d5ca..397f17400de 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -97,6 +97,11 @@ struct __wt_lsm_chunk { * out, or by compact to get the most * recent chunk flushed. */ + WT_DECL_TIMESTAMP(switch_timestamp)/* + * The timestamp used to decide when + * updates need to detect conflicts. 
+ */ + WT_SPINLOCK timestamp_spinlock; uint32_t id; /* ID used to generate URIs */ uint32_t generation; /* Merge generation */ @@ -107,10 +112,11 @@ struct __wt_lsm_chunk { int8_t evicted; /* 1/0: in-memory chunk was evicted */ uint8_t flushing; /* 1/0: chunk flush in progress */ -#define WT_LSM_CHUNK_BLOOM 0x01 -#define WT_LSM_CHUNK_MERGING 0x02 -#define WT_LSM_CHUNK_ONDISK 0x04 -#define WT_LSM_CHUNK_STABLE 0x08 +#define WT_LSM_CHUNK_BLOOM 0x01 +#define WT_LSM_CHUNK_HAS_TIMESTAMP 0x02 +#define WT_LSM_CHUNK_MERGING 0x04 +#define WT_LSM_CHUNK_ONDISK 0x08 +#define WT_LSM_CHUNK_STABLE 0x10 uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index bf7d36e19ca..a6cb56dd852 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -293,6 +293,10 @@ typedef void wt_timestamp_t; __wt_page_swap_func(session, held, want, flags) #endif +/* Called on unexpected code path: locate the failure. */ +#define __wt_illegal_value(session, msg) \ + __wt_illegal_value_func(session, msg, __FILE__, __LINE__) + /* Random number generator state. */ union __wt_rand_state { uint64_t v; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 61ab343151c..e0513a82892 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -69,7 +69,6 @@ struct __wt_named_snapshot { struct __wt_txn_state { WT_CACHE_LINE_PAD_BEGIN - WT_RWLOCK rwlock; volatile uint64_t id; volatile uint64_t pinned_id; volatile uint64_t metadata_pinned; @@ -105,6 +104,9 @@ struct __wt_txn_global { /* Protects the active transaction states. */ WT_RWLOCK rwlock; + /* Protects logging, checkpoints and transaction visibility. */ + WT_RWLOCK visibility_rwlock; + /* List of transactions sorted by commit timestamp. 
*/ WT_RWLOCK commit_timestamp_rwlock; TAILQ_HEAD(__wt_txn_cts_qh, __wt_txn) commit_timestamph; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 30f29e0f5d0..8067b6128c5 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -11,6 +11,8 @@ static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); #ifdef HAVE_TIMESTAMPS #if WT_TIMESTAMP_SIZE == 8 +#define WT_WITH_TIMESTAMP_READLOCK(session, l, e) e + /* * __wt_timestamp_cmp -- * Compare two timestamps. @@ -61,6 +63,12 @@ __wt_timestamp_set_zero(wt_timestamp_t *ts) ts->val = 0; } #else +#define WT_WITH_TIMESTAMP_READLOCK(s, l, e) do { \ + __wt_readlock((s), (l)); \ + e; \ + __wt_readunlock((s), (l)); \ +} while (0) + /* * __wt_timestamp_cmp -- * Compare two timestamps. @@ -90,8 +98,7 @@ __wt_timestamp_iszero(const wt_timestamp_t *ts) { static const wt_timestamp_t zero_timestamp; - return (memcmp(ts->ts, - WT_TIMESTAMP_NULL(&zero_timestamp), WT_TIMESTAMP_SIZE) == 0); + return (memcmp(ts->ts, &zero_timestamp, WT_TIMESTAMP_SIZE) == 0); } /* @@ -182,7 +189,17 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ? WT_TXN_OP_INMEM : WT_TXN_OP_BASIC; #ifdef HAVE_TIMESTAMPS - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + /* + * Mark the update with a timestamp, if we have one. + * + * Updates in the metadata never get timestamps (either now or at + * commit): metadata cannot be read at a point in time, only the most + * recently committed data matches files on disk. 
+ */ + if (WT_IS_METADATA(session->dhandle)) { + if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) + op->type = WT_TXN_OP_BASIC_TS; + } else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { __wt_timestamp_set(&upd->timestamp, &txn->commit_timestamp); if (!F_ISSET(session, WT_SESSION_LOGGING_INMEM)) op->type = WT_TXN_OP_BASIC_TS; @@ -285,9 +302,9 @@ __txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id) /* * __wt_txn_visible_all -- - * Check if a given transaction is "globally visible". This is, if - * all sessions in the system will see the transaction ID including the - * ID that belongs to a running checkpoint. + * Check if a given transaction is "globally visible". This is, if all + * sessions in the system will see the transaction ID including the ID + * that belongs to a running checkpoint. */ static inline bool __wt_txn_visible_all( @@ -302,12 +319,18 @@ __wt_txn_visible_all( int cmp; /* Timestamp check. */ - if (!txn_global->has_pinned_timestamp || timestamp == NULL) + if (timestamp == NULL || __wt_timestamp_iszero(timestamp)) return (true); - __wt_readlock(session, &txn_global->rwlock); - cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + /* + * If no oldest timestamp has been supplied, updates have to stay in + * cache until we are shutting down. + */ + if (!txn_global->has_pinned_timestamp) + return (F_ISSET(S2C(session), WT_CONN_CLOSING)); + + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + cmp = __wt_timestamp_cmp(timestamp, &txn_global->pinned_timestamp)); /* * We can discard updates with timestamps less than or equal to the @@ -581,8 +604,7 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) /* * __wt_txn_id_check -- - * A transaction is going to do an update, start an auto commit - * transaction if required and allocate a transaction ID. + * A transaction is going to do an update, allocate a transaction ID. 
*/ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session) @@ -606,7 +628,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) * more we can do. */ if (txn->id == WT_TXN_ABORTED) - WT_RET_MSG(session, ENOMEM, "Out of transaction IDs"); + WT_RET_MSG(session, WT_ERROR, "out of transaction IDs"); F_SET(txn, WT_TXN_HAS_ID); return (0); @@ -730,11 +752,11 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session) } /* - * __wt_txn_are_any_active -- + * __wt_txn_activity_check -- * Check whether there are any running transactions. */ static inline int -__wt_txn_are_any_active(WT_SESSION_IMPL *session, bool *any_active) +__wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active) { WT_TXN_GLOBAL *txn_global; @@ -747,6 +769,8 @@ __wt_txn_are_any_active(WT_SESSION_IMPL *session, bool *any_active) WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - *any_active = (txn_global->oldest_id != txn_global->current); + *txn_active = (txn_global->oldest_id != txn_global->current || + txn_global->metadata_pinned != txn_global->current); + return (0); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 5d087447c5a..7825962d89f 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -504,6 +504,12 @@ struct __wt_cursor { * (as it partially depends on the underlying file configuration), but * is always a small number of bytes less than 4GB. * + * The WT_CURSOR::modify method stores a change record in cache and + * writes a change record to the log, instead of the usual complete + * value. This can reduce cache and logging requirements, but may result + * in slower reads because the complete value must be assembled during + * retrieval. 
+ * * @param cursor the cursor handle * @param entries an array of modification data structures * @param nentries the number of modification data structures @@ -1537,7 +1543,7 @@ struct __wt_session { * @snippet ex_all.c Reset the session * * @param session the session handle - * @ebusy_errors + * @errors */ int __F(reset)(WT_SESSION *session); @@ -1998,8 +2004,10 @@ struct __wt_connection { * checkpoint; setting this value above 0 configures periodic * checkpoints., an integer between 0 and 100000; default \c 0.} * @config{ ),,} - * @config{compatibility = (, set compatibility version of database., a - * set of related configuration options defined below.} + * @config{compatibility = (, set compatibility version of database. + * Changing the compatibility version requires that there are no active + * operations for the duration of the call., a set of related + * configuration options defined below.} * @config{ release, compatibility release * version string., a string; default empty.} * @config{ ),,} @@ -2143,8 +2151,9 @@ struct __wt_connection { * "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c * "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, - * \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c "verify"\, - * \c "version"\, \c "write"; default empty.} + * \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2258,6 +2267,8 @@ struct __wt_connection { * * @snippet ex_all.c set oldest timestamp * + * @snippet ex_all.c set stable timestamp + * * @param connection the connection handle * @configstart{WT_CONNECTION.set_timestamp, see dist/api_data.py} * @config{commit_timestamp, reset the maximum commit timestamp tracked @@ -2292,8 +2303,8 @@ struct __wt_connection { * WT_CONNECTION::set_timestamp. 
Any updates to checkpoint durable * tables that are more recent than the stable timestamp are removed. * - * This method requires that there are no active cursor operations - * for the duration of the call. + * This method requires that there are no active operations for the + * duration of the call. * * Any updates made to logged tables will not be rolled back. Any * updates made without an associated timestamp will not be rolled @@ -2527,10 +2538,12 @@ struct __wt_connection { * @config{ ),,} * @config{checkpoint_sync, flush files to stable storage when closing or * writing checkpoints., a boolean flag; default \c true.} - * @config{compatibility = (, set compatibility version of database., a set of - * related configuration options defined below.} - * @config{ release, compatibility release version - * string., a string; default empty.} + * @config{compatibility = (, set compatibility version of database. Changing + * the compatibility version requires that there are no active operations for + * the duration of the call., a set of related configuration options defined + * below.} + * @config{ release, compatibility release + * version string., a string; default empty.} * @config{ ),,} * @config{config_base, write the base configuration file if creating the * database. If \c false in the config passed directly to ::wiredtiger_open\, @@ -2766,8 +2779,8 @@ struct __wt_connection { * "lookaside_activity"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c "shared_cache"\, \c - * "split"\, \c "temporary"\, \c "thread_group"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "split"\, \c "temporary"\, \c "thread_group"\, \c "timestamp"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. 
Ignored on non-Windows systems. Options are given as a list\, such * as <code>"write_through=[data]"</code>. Configuring \c write_through requires @@ -3301,7 +3314,6 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp) * Error returns *******************************************/ /*! - * @anchor error_returns * @name Error returns * Most functions and methods in WiredTiger return an integer code indicating * whether the operation succeeded or failed. A return of zero indicates diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 6a1709b03f2..39656c17ee0 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -538,8 +538,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { chunk = lsm_tree->chunk[ngood - 1]; clsm->chunks[ngood - 1]->switch_txn = chunk->switch_txn; - if (__wt_txn_visible_all( - session, chunk->switch_txn, NULL)) + if (__wt_lsm_chunk_visible_all(session, chunk)) break; } } else { @@ -937,10 +936,9 @@ retry: /* goto retry; err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1029,8 +1027,7 @@ __clsm_next_random(WT_CURSOR *cursor) err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } __clsm_leave(clsm); - API_END(session, ret); - return (ret); + API_END_RET(session, ret); } /* @@ -1116,10 +1113,9 @@ retry: /* goto retry; err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1275,10 +1271,9 @@ __clsm_search(WT_CURSOR *cursor) ret = __clsm_lookup(clsm, &cursor->value); err: __clsm_leave(clsm); - API_END(session, ret); if (ret == 0) __clsm_deleted_decode(clsm, &cursor->value); - return (ret); + API_END_RET(session, ret); } /* @@ -1418,7 +1413,6 @@ __clsm_search_near(WT_CURSOR *cursor, int 
*exactp) *exactp = cmp; err: __clsm_leave(clsm); - API_END(session, ret); if (closest != NULL) WT_TRET(closest->reset(closest)); @@ -1428,7 +1422,7 @@ err: __clsm_leave(clsm); } else clsm->current = NULL; - return (ret); + API_END_RET(session, ret); } /* diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 24a0429a184..3949d88cec4 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -208,14 +208,20 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) conn = S2C(session); manager = &conn->lsm_manager; - if (F_ISSET(conn, WT_CONN_READONLY)) { - manager->lsm_workers = 0; - return (0); - } /* - * We need at least a manager, a switch thread and a generic - * worker. + * If readonly or the manager is running, or we've already failed, + * there's no work to do. */ + if (F_ISSET(conn, WT_CONN_READONLY) || + manager->lsm_workers != 0 || + F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) + return (0); + + /* It's possible to race, see if we're the winner. */ + if (!__wt_atomic_cas32(&manager->lsm_workers, 0, 1)) + return (0); + + /* We need at least a manager, a switch thread and a generic worker. */ WT_ASSERT(session, manager->lsm_workers_max > 2); /* @@ -245,6 +251,15 @@ err: for (i = 0; i++) WT_TRET((&worker_session->iface)->close( &worker_session->iface, NULL)); + + /* Make the failure permanent, we won't try again. */ + F_SET(manager, WT_LSM_MANAGER_SHUTDOWN); + + /* + * Reset the workers count (otherwise, LSM destroy will hang + * waiting for threads to exit). 
+ */ + WT_PUBLISH(manager->lsm_workers, 0); } return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 18e1f6d3115..e6eccf96467 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -35,6 +35,7 @@ __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if ((chunk = lsm_tree->chunk[i]) == NULL) continue; + __wt_spin_destroy(session, &chunk->timestamp_spinlock); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); @@ -44,6 +45,7 @@ __lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) chunk = lsm_tree->old_chunks[i]; WT_ASSERT(session, chunk != NULL); + __wt_spin_destroy(session, &chunk->timestamp_spinlock); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); @@ -280,6 +282,8 @@ __wt_lsm_tree_setup_chunk( WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); __wt_epoch(session, &chunk->create_time); + __wt_spin_init(session, + &chunk->timestamp_spinlock, "LSM chunk timestamp"); WT_RET(__wt_lsm_tree_chunk_name( session, lsm_tree, chunk->id, &chunk->uri)); @@ -474,8 +478,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ - if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) - WT_RET(__wt_lsm_manager_start(session)); + WT_RET(__wt_lsm_manager_start(session)); /* Make sure no one beat us to it. 
*/ if ((ret = __lsm_tree_find( diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 2f21e8acdc3..816eafebe99 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -256,6 +256,63 @@ err: } /* + * __wt_lsm_chunk_visible_all -- + * Set up a timestamp and check visibility for a chunk, can be called + * from multiple threads in parallel + */ +bool +__wt_lsm_chunk_visible_all( + WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk) +{ + /* Once a chunk has been flushed its contents must be visible */ + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE)) + return (true); + + if (chunk->switch_txn == WT_TXN_NONE || + !__wt_txn_visible_all(session, chunk->switch_txn, NULL)) + return (false); + +#ifdef HAVE_TIMESTAMPS + { + WT_TXN_GLOBAL *txn_global; + + txn_global = &S2C(session)->txn_global; + + /* + * Once all transactions with updates in the chunk are visible all + * timestamps associated with those updates are assigned so set up a + * timestamp for visibility checking. + */ + if (txn_global->has_commit_timestamp || + txn_global->has_pinned_timestamp) { + if (!F_ISSET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP)) { + __wt_spin_lock(session, &chunk->timestamp_spinlock); + /* Set the timestamp if we won the race */ + if (!F_ISSET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP)) { + __wt_readlock(session, &txn_global->rwlock); + __wt_timestamp_set(&chunk->switch_timestamp, + &txn_global->commit_timestamp); + __wt_readunlock(session, &txn_global->rwlock); + F_SET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP); + } + __wt_spin_unlock(session, &chunk->timestamp_spinlock); + } + if (!__wt_txn_visible_all( + session, chunk->switch_txn, &chunk->switch_timestamp)) + return (false); + } else + /* + * If timestamps aren't in use when the chunk becomes visible + * use the zero timestamp for visibility checks. 
Otherwise + * there could be confusion if timestamps start being used. + */ + F_SET(chunk, WT_LSM_CHUNK_HAS_TIMESTAMP); + } +#endif + return (true); +} + +/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ @@ -295,14 +352,12 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* Stop if a running transaction needs the chunk. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - if (chunk->switch_txn == WT_TXN_NONE || - !__wt_txn_visible_all(session, chunk->switch_txn, NULL)) { + if (!__wt_lsm_chunk_visible_all(session, chunk)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri); return (0); } - if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; diff --git a/src/third_party/wiredtiger/src/os_common/os_abort.c b/src/third_party/wiredtiger/src/os_common/os_abort.c index 905f3160acf..ebef001ce67 100644 --- a/src/third_party/wiredtiger/src/os_common/os_abort.c +++ b/src/third_party/wiredtiger/src/os_common/os_abort.c @@ -16,12 +16,18 @@ void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_ATTRIBUTE((noreturn)) { - __wt_errx(session, "aborting WiredTiger library"); +#ifdef HAVE_ATTACH + u_int i; -#ifdef HAVE_DIAGNOSTIC - __wt_attach(session); -#endif + __wt_errx(session, "process ID %" PRIdMAX + ": waiting for debugger...", (intmax_t)getpid()); + /* Sleep forever, the debugger will interrupt us when it attaches. 
*/ + for (i = 0; i < WT_MILLION; ++i) + __wt_sleep(10, 0); +#else + __wt_errx(session, "aborting WiredTiger library"); +#endif abort(); /* NOTREACHED */ } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index a3cb6a53a09..10c2c0dc937 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -58,8 +58,12 @@ typedef struct { uint64_t orig_btree_checkpoint_gen; uint64_t orig_txn_checkpoint_gen; - /* Track the oldest transaction running when reconciliation starts. */ + /* + * Track the oldest running transaction and the stable timestamp when + * reconciliation starts. + */ uint64_t last_running; + WT_DECL_TIMESTAMP(stable_timestamp) /* Track the page's maximum transaction. */ uint64_t max_txn; @@ -506,6 +510,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(session->block_manager_cleanup(session)); WT_TRET(__rec_destroy_session(session)); + + /* + * We track removed overflow objects in case there's a reader + * in transit when they're removed. Any form of eviction locks + * out readers, we can discard them all. + */ + __wt_ovfl_discard_remove(session, page); } WT_RET(ret); @@ -881,6 +892,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_PAGE *page; WT_RECONCILE *r; + WT_TXN_GLOBAL *txn_global; btree = S2BT(session); page = ref->page; @@ -924,7 +936,13 @@ __rec_init(WT_SESSION_IMPL *session, * transaction running when reconciliation starts is considered * uncommitted. 
*/ - WT_ORDERED_READ(r->last_running, S2C(session)->txn_global.last_running); + txn_global = &S2C(session)->txn_global; + WT_ORDERED_READ(r->last_running, txn_global->last_running); +#ifdef HAVE_TIMESTAMPS + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &r->stable_timestamp, &txn_global->stable_timestamp)); +#endif /* * Lookaside table eviction is configured when eviction gets aggressive, @@ -1194,6 +1212,64 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) } /* + * __rec_append_orig_value -- + * Append the key's original value to its update list. + */ +static int +__rec_append_orig_value(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_UPDATE *append, *upd; + size_t size; + + /* If at least one standard update is globally visible, we're done. */ + for (upd = upd_list; upd != NULL; upd = upd->next) + if (WT_UPDATE_DATA_VALUE(upd) && + __wt_txn_upd_visible_all(session, upd)) + return (0); + + /* + * We need the original on-page value for some reader: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + * + * If we don't have a value cell, it's an insert/append list key/value + * pair which simply doesn't exist for some reader; place a deleted + * record at the end of the update list. + */ + append = NULL; /* -Wconditional-uninitialized */ + size = 0; /* -Wconditional-uninitialized */ + if (unpack == NULL || unpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc(session, + NULL, &append, &size, WT_UPDATE_DELETED)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_update_alloc( + session, tmp, &append, &size, WT_UPDATE_STANDARD)); + } + + /* + * Give the entry no transaction ID to ensure global visibility, append + * it to the update list. 
+ * + * Note the change to the actual reader-accessible update list: from now + * on, the original on-page value appears at the end of the update list, + * even if this reconciliation subsequently fails. + */ + append->txnid = WT_TXN_NONE; + for (upd = upd_list; upd->next != NULL; upd = upd->next) + ; + WT_PUBLISH(upd->next, append); + __wt_cache_page_inmem_incr(session, page, size); + +err: __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __rec_txn_read -- * Return the update in a list that should be written (or NULL if none can * be written). @@ -1203,18 +1279,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { WT_BTREE *btree; - WT_DECL_RET; - WT_DECL_ITEM(tmp); - WT_DECL_TIMESTAMP(min_timestamp) WT_DECL_TIMESTAMP(max_timestamp) WT_PAGE *page; - WT_UPDATE *append, *upd, *upd_list; - size_t size, update_mem; - uint64_t max_txn, min_txn, txnid; - bool append_origv, skipped; + WT_UPDATE *upd, *upd_list; + size_t update_mem; + uint64_t max_txn, txnid; + bool skipped; *updp = NULL; - append = NULL; /* -Wconditional-uninitialized */ btree = S2BT(session); page = r->page; @@ -1235,9 +1307,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, max_txn = WT_TXN_NONE; #ifdef HAVE_TIMESTAMPS __wt_timestamp_set_zero(&max_timestamp); - __wt_timestamp_set_inf(&min_timestamp); #endif - min_txn = UINT64_MAX; if (F_ISSET(r, WT_EVICTING)) { /* Discard obsolete updates. */ @@ -1258,8 +1328,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; - if (WT_TXNID_LT(txnid, min_txn)) - min_txn = txnid; /* * Find the first update we can use. @@ -1285,17 +1353,13 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (*updp == NULL) *updp = upd; + #ifdef HAVE_TIMESTAMPS /* Track min/max timestamps. 
*/ if (__wt_timestamp_cmp( - &max_timestamp, &upd->timestamp) < 0) + &upd->timestamp, &max_timestamp) > 0) __wt_timestamp_set( &max_timestamp, &upd->timestamp); - - if (__wt_timestamp_cmp( - &min_timestamp, &upd->timestamp) > 0) - __wt_timestamp_set( - &min_timestamp, &upd->timestamp); #endif } } else @@ -1325,7 +1389,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } } - /* Reconciliation should never see a reserved update. */ + /* Reconciliation should never see an aborted or reserved update. */ WT_ASSERT(session, *updp == NULL || ((*updp)->txnid != WT_TXN_ABORTED && (*updp)->type != WT_UPDATE_RESERVED)); @@ -1370,18 +1434,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) { -#ifdef HAVE_DIAGNOSTIC /* * The checkpoint transaction is special. Make sure we never * write (metadata) updates from a checkpoint in a concurrent * session. */ - txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid; - WT_ASSERT(session, txnid == WT_TXN_NONE || - txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_ASSERT(session, *updp == NULL || + (*updp)->txnid != + S2C(session)->txn_global.checkpoint_state.id || WT_SESSION_IS_CHECKPOINT(session)); -#endif - return (0); + + goto check_original_value; } /* @@ -1400,7 +1463,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (!F_ISSET(r, WT_EVICTING)) { r->leave_dirty = true; - return (0); + goto check_original_value; } /* @@ -1441,7 +1504,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (skipped) r->update_mem_uncommitted += update_mem; - append_origv = false; +#ifdef HAVE_TIMESTAMPS + /* + * Don't allow lookaside eviction with updates newer than the stable + * timestamp. Also don't recommend lookaside eviction in that case. 
+ */ + if (__wt_timestamp_cmp(&max_timestamp, &r->stable_timestamp) > 0) { + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + + if (!skipped) + r->update_mem_uncommitted += update_mem; + } +#endif + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { /* * The save/restore eviction path. @@ -1456,58 +1532,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* The page can't be marked clean. */ r->leave_dirty = true; - } else { - /* - * The lookaside table eviction path. - * - * If at least one update is globally visible, copy the update - * list and ignore the current on-page value. If no update is - * globally visible, readers require the page's original value. - */ - if (!__wt_txn_visible_all( - session, min_txn, WT_TIMESTAMP_NULL(&min_timestamp))) - append_origv = true; - } - - /* - * We need the original on-page value for some reason: get a copy and - * append it to the end of the update list with a transaction ID that - * guarantees its visibility. - */ - if (append_origv) { - /* - * If we don't have a value cell, it's an insert/append list - * key/value pair which simply doesn't exist for some reader; - * place a deleted record at the end of the update list. - */ - size = 0; /* -Wconditional-uninitialized */ - if (vpack == NULL || vpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc(session, - NULL, &append, &size, WT_UPDATE_DELETED)); - else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - if ((ret = __wt_page_cell_data_ref( - session, page, vpack, tmp)) == 0) - ret = __wt_update_alloc(session, - tmp, &append, &size, WT_UPDATE_STANDARD); - __wt_scr_free(session, &tmp); - WT_RET(ret); - } - - /* - * Give the entry no transaction ID to ensure global visibility, - * append it to the update list. - * - * Note the change to the actual reader-accessible update list: - * from now on, the original on-page value appears at the end - * of the update list, even if this reconciliation subsequently - * fails. 
- */ - append->txnid = WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - __wt_cache_page_inmem_incr(session, page, size); } /* @@ -1521,7 +1545,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * that transaction ID is globally visible, we know we no longer need * the lookaside table records, allowing them to be discarded. */ - return (__rec_update_save(session, r, ins, ripcip, *updp)); + WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); + +check_original_value: + /* + * Returning an update means the original on-page value might be lost, + * and that's a problem if there's a reader that needs it. There are + * two cases: any lookaside table eviction (because the backing disk + * image is rewritten), or any reconciliation of a backing overflow + * record that will be physically removed once it's no longer needed. + */ + if (*updp != NULL && + (F_ISSET(r, WT_EVICT_LOOKASIDE) || + (vpack != NULL && + vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) + WT_RET(__rec_append_orig_value(session, page, *updp, vpack)); + + return (0); } /* @@ -4708,7 +4748,7 @@ __rec_col_var(WT_SESSION_IMPL *session, * file, otherwise we'll leak blocks on the checkpoint. * That's safe because if the backing overflow value is * still needed by any running transaction, we'll cache - * a copy in the reconciliation tracking structures. + * a copy in the update list. * * Regardless, we avoid copying in overflow records: if * there's a WT_INSERT entry that modifies a reference @@ -4793,8 +4833,8 @@ record_loop: /* * The on-page value will never be accessed, * write a placeholder record. 
*/ - data = "@"; - size = 1; + data = "ovfl-unused"; + size = WT_STORE_SIZE(strlen("ovfl-unused")); } else { update_no_copy = false; /* Maybe data copy */ @@ -4928,7 +4968,8 @@ compare: /* */ if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove( + session, page, vpack, !F_ISSET(r, WT_EVICTING))); } /* Walk any append list. */ @@ -5535,8 +5576,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * The on-page value will never be accessed, * write a placeholder record. */ - WT_ERR(__rec_cell_build_val( - session, r, "@", 1, (uint64_t)0)); + WT_ERR(__rec_cell_build_val(session, r, + "ovfl-unused", strlen("ovfl-unused"), + (uint64_t)0)); } else { val->buf.data = val_cell; val->buf.size = __wt_cell_total_len(vpack); @@ -5554,8 +5596,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, */ if (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove( - session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove(session, + page, vpack, !F_ISSET(r, WT_EVICTING))); switch (upd->type) { case WT_UPDATE_DELETED: diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 52d11651191..1a63ed675b5 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1848,9 +1848,9 @@ __open_session(WT_CONNECTION_IMPL *conn, if (!session_ret->active) break; if (i == conn->session_size) - WT_ERR_MSG(session, ENOMEM, - "only configured to support %" PRIu32 " sessions" - " (including %d additional internal sessions)", + WT_ERR_MSG(session, WT_ERROR, + "out of sessions, only configured to support %" PRIu32 + " sessions (including %d additional internal sessions)", conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c index 
94ae27628c2..a6ab328864d 100644 --- a/src/third_party/wiredtiger/src/support/err.c +++ b/src/third_party/wiredtiger/src/support/err.c @@ -533,17 +533,20 @@ __wt_panic(WT_SESSION_IMPL *session) } /* - * __wt_illegal_value -- + * __wt_illegal_value_func -- * A standard error message when we detect an illegal value. */ int -__wt_illegal_value(WT_SESSION_IMPL *session, const char *name) +__wt_illegal_value_func( + WT_SESSION_IMPL *session, const char *tag, const char *file, int line) WT_GCC_FUNC_ATTRIBUTE((cold)) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - __wt_errx(session, "%s%s%s", - name == NULL ? "" : name, name == NULL ? "" : ": ", - "encountered an illegal file format or internal value"); + __wt_errx(session, "%s%s%s: (%s, %d)", + tag == NULL ? "" : tag, + tag == NULL ? "" : ": ", + "encountered an illegal file format or internal value", + file, line); return (__wt_panic(session)); } diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index 6525fe21809..e425b690a5b 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -106,25 +106,4 @@ __wt_breakpoint(void) */ __wt_yield(); } - -/* - * __wt_attach -- - * A routine to wait for the debugging to attach. - */ -void -__wt_attach(WT_SESSION_IMPL *session) -{ -#ifdef HAVE_ATTACH - u_int i; - - __wt_errx(session, "process ID %" PRIdMAX - ": waiting for debugger...", (intmax_t)getpid()); - - /* Sleep forever, the debugger will interrupt us when it attaches. 
*/ - for (i = 0; i < WT_MILLION; ++i) - __wt_sleep(10, 0); -#else - WT_UNUSED(session); -#endif -} #endif diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c index 233bc871e06..240a77591a3 100644 --- a/src/third_party/wiredtiger/src/support/time.c +++ b/src/third_party/wiredtiger/src/support/time.c @@ -35,8 +35,7 @@ __time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) /* * __wt_epoch -- - * Return the time since the Epoch, adjusted so it never appears to go - * backwards. + * Return the time since the Epoch. */ void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) @@ -45,9 +44,14 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) struct timespec tmp; /* - * Read into a local variable so that we're comparing the correct - * value when we check for monotonic increasing time. There are - * many places we read into an unlocked global variable. + * Read into a local variable, then check for monotonically increasing + * time, ensuring single threads never see time move backward. We don't + * prevent multiple threads from seeing time move backwards (even when + * reading time serially, the saved last-read time is per thread, not + * per timer, so multiple threads can race the time). Nor do we prevent + * multiple threads simultaneously reading the time from seeing random + * time or time moving backwards (assigning the time structure to the + * returned memory location implies multicycle writes to memory). 
*/ __wt_epoch_raw(session, &tmp); __time_check_monotonic(session, &tmp); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 191f7e0ba0f..09efb2924bf 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -445,12 +445,11 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_txn_parse_timestamp( session, "read", &txn->read_timestamp, &cval)); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set( - &oldest_timestamp, &txn_global->oldest_timestamp); - __wt_timestamp_set( - &stable_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &oldest_timestamp, &txn_global->oldest_timestamp); + __wt_timestamp_set( + &stable_timestamp, &txn_global->stable_timestamp)); if (__wt_timestamp_cmp( &txn->read_timestamp, &oldest_timestamp) < 0) WT_RET_MSG(session, EINVAL, @@ -568,18 +567,20 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; WT_TXN_OP *op; + u_int i; + bool did_update, locked; #ifdef HAVE_TIMESTAMPS - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; wt_timestamp_t prev_commit_timestamp; bool update_timestamp; #endif - u_int i; - bool did_update; txn = &session->txn; conn = S2C(session); + txn_global = &conn->txn_global; did_update = txn->mod_count != 0; + locked = false; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || !did_update); @@ -665,6 +666,14 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * This is particularly important for checkpoints. 
*/ __wt_txn_release_snapshot(session); + /* + * We hold the visibility lock for reading from the time + * we write our log record until the time we release our + * transaction so that the LSN any checkpoint gets will + * always reflect visible data. + */ + __wt_readlock(session, &txn_global->visibility_rwlock); + locked = true; WT_ERR(__wt_txn_log_commit(session, cfg)); } @@ -687,9 +696,12 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - op->type != WT_TXN_OP_BASIC_TS) + op->type != WT_TXN_OP_BASIC_TS) { + WT_ASSERT(session, + op->fileid != WT_METAFILE_ID); __wt_timestamp_set(&op->u.upd->timestamp, &txn->commit_timestamp); + } #endif break; @@ -724,14 +736,15 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) #endif __wt_txn_release(session); + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); #ifdef HAVE_TIMESTAMPS /* First check if we've already committed something in the future. */ if (update_timestamp) { - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set( - &prev_commit_timestamp, &txn_global->commit_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &prev_commit_timestamp, &txn_global->commit_timestamp)); update_timestamp = __wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0; } @@ -760,6 +773,8 @@ err: /* * !!! * Nothing can fail after this point. 
*/ + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); } @@ -930,6 +945,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init( session, &txn_global->id_lock, "transaction id lock")); WT_RET(__wt_rwlock_init(session, &txn_global->rwlock)); + WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock)); WT_RET(__wt_rwlock_init(session, &txn_global->commit_timestamp_rwlock)); TAILQ_INIT(&txn_global->commit_timestamph); @@ -971,6 +987,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->commit_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock); __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); + __wt_rwlock_destroy(session, &txn_global->visibility_rwlock); __wt_free(session, txn_global->states); } @@ -981,10 +998,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) int __wt_txn_global_shutdown(WT_SESSION_IMPL *session) { - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - - txn_global = &S2C(session)->txn_global; + bool txn_active; /* * We're shutting down. Make sure everything gets freed. @@ -995,10 +1009,8 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session) * transaction ID will catch up with the current ID. */ for (;;) { - WT_TRET(__wt_txn_update_oldest(session, - WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - if (txn_global->oldest_id == txn_global->current && - txn_global->metadata_pinned == txn_global->current) + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (!txn_active) break; WT_STAT_CONN_INCR(session, txn_release_blocked); @@ -1010,10 +1022,10 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session) * Now that all transactions have completed, no timestamps should be * pinned. 
*/ - __wt_timestamp_set_inf(&txn_global->pinned_timestamp); + __wt_timestamp_set_inf(&S2C(session)->txn_global.pinned_timestamp); #endif - return (ret); + return (0); } #if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) @@ -1031,7 +1043,9 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) const char *iso_tag; uint64_t id; uint32_t i, session_cnt; - +#ifdef HAVE_TIMESTAMPS + char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1]; +#endif conn = S2C(session); txn_global = &conn->txn_global; @@ -1042,10 +1056,35 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "last running ID: %" PRIu64, txn_global->last_running)); WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); - WT_RET(__wt_msg(session, - "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); - WT_RET(__wt_msg(session, "checkpoint running? %s", +#ifdef HAVE_TIMESTAMPS + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->commit_timestamp)); + WT_RET(__wt_msg(session, "commit timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->oldest_timestamp)); + WT_RET(__wt_msg(session, "oldest timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->pinned_timestamp)); + WT_RET(__wt_msg(session, "pinned timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn_global->stable_timestamp)); + WT_RET(__wt_msg(session, "stable timestamp: %s", hex_timestamp[0])); + WT_RET(__wt_msg(session, "has_commit_timestamp: %s", + txn_global->has_commit_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_oldest_timestamp: %s", + txn_global->has_oldest_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_pinned_timestamp: %s", + txn_global->has_pinned_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "has_stable_timestamp: %s", + txn_global->has_stable_timestamp ? 
"yes" : "no")); + WT_RET(__wt_msg(session, "oldest_is_pinned: %s", + txn_global->oldest_is_pinned ? "yes" : "no")); + WT_RET(__wt_msg(session, "stable_is_pinned: %s", + txn_global->stable_is_pinned ? "yes" : "no")); +#endif + + WT_RET(__wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? "yes" : "no")); WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT))); @@ -1054,9 +1093,11 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + WT_ORDERED_READ(session_cnt, conn->session_cnt); WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); - WT_RET(__wt_msg(session, "Transaction state of active sessions:")); /* @@ -1083,7 +1124,40 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) iso_tag = "WT_ISO_SNAPSHOT"; break; } - +#ifdef HAVE_TIMESTAMPS + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[0], &txn->commit_timestamp)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[1], &txn->first_commit_timestamp)); + WT_RET(__wt_timestamp_to_hex_string( + session, hex_timestamp[2], &txn->read_timestamp)); + WT_RET(__wt_msg(session, + "ID: %8" PRIu64 + ", mod count: %u" + ", pinned ID: %8" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", commit_timestamp: %s" + ", first_commit_timestamp: %s" + ", read_timestamp: %s" + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + hex_timestamp[0], + hex_timestamp[1], + hex_timestamp[2], + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? 
+ "EMPTY" : conn->sessions[i].name, + iso_tag)); +#else WT_RET(__wt_msg(session, "ID: %6" PRIu64 ", mod count: %u" @@ -1104,6 +1178,7 @@ __wt_verbose_dump_txn(WT_SESSION_IMPL *session) conn->sessions[i].name == NULL ? "EMPTY" : conn->sessions[i].name, iso_tag)); +#endif } WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 532c4819d29..9065966fe8f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -571,43 +571,17 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1], timestamp_config[100]; - const char *query_cfg[] = { WT_CONFIG_BASE(session, - WT_CONNECTION_query_timestamp), "get=stable", NULL }; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL, NULL }; + bool use_timestamp; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); - /* - * Someone giving us a specific timestamp overrides the general - * use_timestamp. - */ - WT_RET(__wt_config_gets(session, cfg, "read_timestamp", &cval)); - if (cval.len > 0) { - WT_RET(__wt_snprintf(timestamp_config, sizeof(timestamp_config), - "read_timestamp=%.*s", (int)cval.len, cval.str)); - txn_cfg[2] = timestamp_config; - } else if (txn_global->has_stable_timestamp) { - WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); - /* - * Get the stable timestamp currently set. Then set that as - * the read timestamp for the transaction. 
- */ - if (cval.val != 0) { - if ((ret = __wt_txn_global_query_timestamp(session, - timestamp_buf, query_cfg)) != 0 && - ret != WT_NOTFOUND) - return (ret); - WT_RET(__wt_snprintf(timestamp_config, - sizeof(timestamp_config), - "read_timestamp=%s", timestamp_buf)); - txn_cfg[2] = timestamp_config; - } - } + WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); + use_timestamp = (cval.val != 0); /* * Start a snapshot transaction for the checkpoint. @@ -667,15 +641,33 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, &txn_global->rwlock); #ifdef HAVE_TIMESTAMPS /* - * Now that the checkpoint transaction is published, clear it from the - * regular lists. + * Set the checkpoint transaction's timestamp, if requested. + * + * We rely on having the global transaction data locked so the oldest + * timestamp can't move past the stable timestamp. */ - __wt_txn_clear_commit_timestamp(session); - __wt_txn_clear_read_timestamp(session); + WT_ASSERT(session, !F_ISSET(txn, + WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | + WT_TXN_PUBLIC_TS_COMMIT | WT_TXN_PUBLIC_TS_READ)); + + if (use_timestamp && txn_global->has_stable_timestamp) { + __wt_timestamp_set( + &txn->read_timestamp, &txn_global->stable_timestamp); + F_SET(txn, WT_TXN_HAS_TS_READ); + } +#else + WT_UNUSED(use_timestamp); +#endif + + __wt_writeunlock(session, &txn_global->rwlock); + +#ifdef HAVE_TIMESTAMPS + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + __wt_verbose_timestamp(session, &txn->read_timestamp, + "Checkpoint requested at stable timestamp"); #endif /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c index 1fe4d6ddf47..103a1d38166 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ext.c +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -72,7 +72,8 @@ __wt_ext_transaction_notify( if (txn->notify == notify) return (0); if 
(txn->notify != NULL) - return (ENOMEM); + WT_RET_MSG( + session, WT_ERROR, "transaction notify already scheduled"); txn->notify = notify; diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 1fc74fb53a1..a03047b5392 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -357,12 +357,14 @@ __wt_txn_checkpoint_log( WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; uint8_t *end, *p; size_t recsize; uint32_t i, rectype; const char *fmt; conn = S2C(session); + txn_global = &conn->txn_global; txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; @@ -408,6 +410,15 @@ __wt_txn_checkpoint_log( } /* + * We take and immediately release the visibility lock. + * Acquiring the write lock guarantees that any transaction + * that has written to the log has also made its transaction + * visible at this time. + */ + __wt_writelock(session, &txn_global->visibility_rwlock); + __wt_writeunlock(session, &txn_global->visibility_rwlock); + + /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular to make sure that the * current log file exists. diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 9c02322c526..e19bbc73bb3 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -37,10 +37,10 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) * updated while rolling back, accessing it without a lock would * violate protocol. 
*/ - txn_global = &S2C(session)->txn_global; - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + txn_global = &conn->txn_global; + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &rollback_timestamp, &txn_global->stable_timestamp)); __wt_las_cursor(session, &cursor, &session_flags); @@ -120,11 +120,11 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, } /* - * __txn_abort_newer_row_skip -- + * __txn_abort_newer_insert -- * Apply the update abort check to each entry in an insert skip list */ static void -__txn_abort_newer_row_skip(WT_SESSION_IMPL *session, +__txn_abort_newer_insert(WT_SESSION_IMPL *session, WT_INSERT_HEAD *head, wt_timestamp_t *rollback_timestamp) { WT_INSERT *ins; @@ -134,6 +134,50 @@ __txn_abort_newer_row_skip(WT_SESSION_IMPL *session, } /* + * __txn_abort_newer_col_var -- + * Abort updates on a variable length col leaf page with timestamps newer + * than the rollback timestamp. + */ +static void +__txn_abort_newer_col_var( + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp) +{ + WT_COL *cip; + WT_INSERT_HEAD *ins; + uint32_t i; + + /* Review the changes to the original on-page data items */ + WT_COL_FOREACH(page, cip, i) + if ((ins = WT_COL_UPDATE(page, cip)) != NULL) + __txn_abort_newer_insert(session, + ins, rollback_timestamp); + + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); +} + +/* + * __txn_abort_newer_col_fix -- + * Abort updates on a fixed length col leaf page with timestamps newer than + * the rollback timestamp. 
+ */ +static void +__txn_abort_newer_col_fix( + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp) +{ + WT_INSERT_HEAD *ins; + + /* Review the changes to the original on-page data items */ + if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); + + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); +} + +/* * __txn_abort_newer_row_leaf -- * Abort updates on a row leaf page with timestamps newer than the * rollback timestamp. @@ -152,8 +196,7 @@ __txn_abort_newer_row_leaf( * page. */ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) - __txn_abort_newer_row_skip( - session, insert, rollback_timestamp); + __txn_abort_newer_insert(session, insert, rollback_timestamp); /* * Review updates that belong to keys that are on the disk image, @@ -165,7 +208,7 @@ __txn_abort_newer_row_leaf( session, upd, rollback_timestamp); if ((insert = WT_ROW_INSERT(page, rip)) != NULL) - __txn_abort_newer_row_skip( + __txn_abort_newer_insert( session, insert, rollback_timestamp); } } @@ -182,6 +225,13 @@ __txn_abort_newer_updates( page = ref->page; switch (page->type) { + case WT_PAGE_COL_FIX: + __txn_abort_newer_col_fix(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_VAR: + __txn_abort_newer_col_var(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: /* * There is nothing to do for internal pages, since we aren't @@ -193,9 +243,7 @@ __txn_abort_newer_updates( case WT_PAGE_ROW_LEAF: __txn_abort_newer_row_leaf(session, page, rollback_timestamp); break; - default: - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "is only supported for row store btrees"); + WT_ILLEGAL_VALUE(session); } return (0); @@ -209,14 +257,11 @@ static int __txn_rollback_to_stable_custom_skip( WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) { - WT_UNUSED(session); 
WT_UNUSED(context); + WT_UNUSED(session); /* Review all pages that are in memory. */ - if (ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED) - *skipp = false; - else - *skipp = true; + *skipp = !(ref->state == WT_REF_MEM || ref->state == WT_REF_DELETED); return (0); } @@ -284,8 +329,7 @@ __txn_rollback_to_stable_btree( * Add the btree ID to the bitstring, so we can exclude any * lookaside entries for this btree. */ - __bit_set( - S2C(session)->stable_rollback_bitstring, btree->id); + __bit_set(S2C(session)->stable_rollback_bitstring, btree->id); return (0); } @@ -297,19 +341,15 @@ __txn_rollback_to_stable_btree( if (btree->root.page == NULL) return (0); - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "is only supported for row store btrees"); - /* * Copy the stable timestamp, otherwise we'd need to lock it each time * it's accessed. Even though the stable timestamp isn't supposed to be * updated while rolling back, accessing it without a lock would * violate protocol. 
*/ - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &rollback_timestamp, &txn_global->stable_timestamp)); /* * Ensure the eviction server is out of the file - we don't @@ -333,15 +373,12 @@ static int __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) { WT_TXN_GLOBAL *txn_global; - bool active_txns, stable_set; + bool txn_active; txn_global = &S2C(session)->txn_global; - __wt_readlock(session, &txn_global->rwlock); - stable_set = !__wt_timestamp_iszero(&txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); - if (!stable_set) - WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a " - "stable timestamp"); + if (!txn_global->has_stable_timestamp) + WT_RET_MSG(session, EINVAL, + "rollback_to_stable requires a stable timestamp"); /* * Help the user - see if they have any active transactions. I'd @@ -349,8 +386,8 @@ __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) * require peeking into all open sessions, which isn't really * kosher. 
*/ - WT_RET(__wt_txn_are_any_active(session, &active_txns)); - if (active_txns) + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (txn_active) WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions"); @@ -369,9 +406,8 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) #ifndef HAVE_TIMESTAMPS WT_UNUSED(cfg); - WT_RET_MSG(session, EINVAL, "rollback_to_stable " - "requires a version of WiredTiger built with timestamp " - "support"); + WT_RET_MSG(session, ENOTSUP, "rollback_to_stable " + "requires a version of WiredTiger built with timestamp support"); #else WT_CONNECTION_IMPL *conn; WT_DECL_RET; diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 9e4a1e200cc..275ef941490 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -10,6 +10,83 @@ #ifdef HAVE_TIMESTAMPS /* + * __wt_timestamp_to_hex_string -- + * Convert a timestamp to hex string representation. + */ +int +__wt_timestamp_to_hex_string( + WT_SESSION_IMPL *session, char *hex_timestamp, const wt_timestamp_t *ts_src) +{ + wt_timestamp_t ts; + + __wt_timestamp_set(&ts, ts_src); + + if (__wt_timestamp_iszero(&ts)) { + hex_timestamp[0] = '0'; + hex_timestamp[1] = '\0'; + return (0); + } + +#if WT_TIMESTAMP_SIZE == 8 + { + char *p, v; + + for (p = hex_timestamp; ts.val != 0; ts.val >>= 4) + *p++ = (char)__wt_hex((u_char)(ts.val & 0x0f)); + *p = '\0'; + + /* Reverse the string. */ + for (--p; p > hex_timestamp;) { + v = *p; + *p-- = *hex_timestamp; + *hex_timestamp++ = v; + } + WT_UNUSED(session); + } +#else + { + WT_ITEM hexts; + size_t len; + uint8_t *tsp; + + /* Avoid memory allocation: set up an item guaranteed large enough. */ + hexts.data = hexts.mem = hex_timestamp; + hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1; + /* Trim leading zeros. 
*/ + for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE; + len > 0 && *tsp == 0; + ++tsp, --len) + ; + WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts)); + } +#endif + return (0); +} + +/* + * __wt_verbose_timestamp -- + * Output a verbose message along with the specified timestamp + */ +void +__wt_verbose_timestamp(WT_SESSION_IMPL *session, + const wt_timestamp_t *ts, const char *msg) +{ +#ifdef HAVE_VERBOSE + char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; + + if (__wt_timestamp_to_hex_string(session, timestamp_buf, ts) != 0) + return; + + __wt_verbose(session, + WT_VERB_TIMESTAMP, "Timestamp %s : %s", timestamp_buf, msg); +#else + WT_UNUSED(session); + WT_UNUSED(ts); + WT_UNUSED(msg); +#endif +} + +/* * __wt_txn_parse_timestamp -- * Decodes and sets a timestamp. */ @@ -25,7 +102,7 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, /* Protect against unexpectedly long hex strings. */ if (cval->len > 2 * WT_TIMESTAMP_SIZE) WT_RET_MSG(session, EINVAL, - "Failed to parse %s timestamp '%.*s': too long", + "%s timestamp too long '%.*s'", name, (int)cval->len, cval->str); #if WT_TIMESTAMP_SIZE == 8 @@ -119,10 +196,9 @@ __txn_global_query_timestamp( if (WT_STRING_MATCH("all_committed", cval.str, cval.len)) { if (!txn_global->has_commit_timestamp) return (WT_NOTFOUND); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&ts, &txn_global->commit_timestamp); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&ts, &txn_global->commit_timestamp)); WT_ASSERT(session, !__wt_timestamp_iszero(&ts)); - __wt_readunlock(session, &txn_global->rwlock); /* Compare with the oldest running transaction. 
*/ __wt_readlock(session, &txn_global->commit_timestamp_rwlock); @@ -157,9 +233,8 @@ __txn_global_query_timestamp( } else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { if (!txn_global->has_stable_timestamp) return (WT_NOTFOUND); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&ts, &txn_global->stable_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&ts, &txn_global->stable_timestamp)); } else WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); @@ -181,47 +256,7 @@ __wt_txn_global_query_timestamp( wt_timestamp_t ts; WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); - -#if WT_TIMESTAMP_SIZE == 8 - { - char *p, v; - - for (p = hex_timestamp; ts.val != 0; ts.val >>= 4) - *p++ = (char)__wt_hex((u_char)(ts.val & 0x0f)); - *p = '\0'; - - /* Reverse the string. */ - for (--p; p > hex_timestamp;) { - v = *p; - *p-- = *hex_timestamp; - *hex_timestamp++ = v; - } - } -#else - { - WT_ITEM hexts; - size_t len; - uint8_t *tsp; - - /* - * Keep clang-analyzer happy: it can't tell that ts will be set - * whenever the call below succeeds. - */ - __wt_timestamp_set_zero(&ts); - WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); - - /* Avoid memory allocation: set up an item guaranteed large enough. */ - hexts.data = hexts.mem = hex_timestamp; - hexts.memsize = 2 * WT_TIMESTAMP_SIZE + 1; - /* Trim leading zeros. 
*/ - for (tsp = ts.ts, len = WT_TIMESTAMP_SIZE; - len > 0 && *tsp == 0; - ++tsp, --len) - ; - WT_RET(__wt_raw_to_hex(session, tsp, len, &hexts)); - } -#endif - return (0); + return (__wt_timestamp_to_hex_string(session, hex_timestamp, &ts)); #else WT_UNUSED(hex_timestamp); WT_UNUSED(cfg); @@ -253,9 +288,9 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) if (txn_global->oldest_is_pinned) return (0); - __wt_readlock(session, &txn_global->rwlock); - __wt_timestamp_set(&oldest_timestamp, &txn_global->oldest_timestamp); - __wt_readunlock(session, &txn_global->rwlock); + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &oldest_timestamp, &txn_global->oldest_timestamp)); /* Scan to find the global pinned timestamp. */ if ((ret = __txn_global_query_timestamp( @@ -276,6 +311,8 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session) txn_global->oldest_is_pinned = __wt_timestamp_cmp( &txn_global->pinned_timestamp, &txn_global->oldest_timestamp) == 0; + __wt_verbose_timestamp(session, + &pinned_timestamp, "Updated pinned timestamp"); } __wt_writeunlock(session, &txn_global->rwlock); @@ -388,6 +425,8 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) if (has_commit) { __wt_timestamp_set(&txn_global->commit_timestamp, &commit_ts); txn_global->has_commit_timestamp = true; + __wt_verbose_timestamp(session, &commit_ts, + "Updated global commit timestamp"); } if (has_oldest && (!txn_global->has_oldest_timestamp || @@ -396,6 +435,8 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) __wt_timestamp_set(&txn_global->oldest_timestamp, &oldest_ts); txn_global->has_oldest_timestamp = true; txn_global->oldest_is_pinned = false; + __wt_verbose_timestamp(session, &oldest_ts, + "Updated global oldest timestamp"); } if (has_stable && (!txn_global->has_stable_timestamp || @@ -404,17 +445,18 @@ __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) 
__wt_timestamp_set(&txn_global->stable_timestamp, &stable_ts); txn_global->has_stable_timestamp = true; txn_global->stable_is_pinned = false; + __wt_verbose_timestamp(session, &stable_ts, + "Updated global stable timestamp"); } __wt_writeunlock(session, &txn_global->rwlock); if (has_oldest || has_stable) WT_RET(__wt_txn_update_pinned_timestamp(session)); - + } #else WT_RET_MSG(session, EINVAL, "set_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif - } return (0); } diff --git a/src/third_party/wiredtiger/test/csuite/time_shift_test.sh b/src/third_party/wiredtiger/test/csuite/time_shift_test.sh new file mode 100755 index 00000000000..ae06fd03f36 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/time_shift_test.sh @@ -0,0 +1,116 @@ +#! /bin/sh + +set -e + +# the purpose of this test is to ensure we use monotonic clock instead of +# realtime clock in our code. we had the instances where WT is hanging when +# system clock shifts (for eg: due to NTP servers). this test calculates +# the execution time of a test(test_rwlock), shifts the clock -vely by that +# time period and reexecutes the test. if the difference in the two execution +# times is less than 20% test is considered passed. 20% is selected, based on +# assumption that other factors of the environment will influence the execution +# time by less than 20%. + + +# need to enable long tests to run test_rwlock +export TESTUTIL_ENABLE_LONG_TESTS=1 + +# We will run only when long tests are enabled. 
+test "$TESTUTIL_ENABLE_LONG_TESTS" = "1" || exit 0 + +EXIT_SUCCESS=0 +EXIT_FAILURE=1 + +export DONT_FAKE_MONOTONIC=1 +RUN_OS=$(uname -s) + +# linux we run with cpu affinity, to control the execution time +# if we don't control the execution time this test is not effective +CPU_SET=0-1 +echo "test read write lock for time shifting using libfaketime" + + +# check for program arguments, if not present, print usage +if [ -z $1 ] +then + echo "fail : this test needs libfaketime library with path" + echo "Usage :" + echo " " $0 " <libpath> [cpuset] " + echo " libpath : path to libfaketime library" + echo " cpuset : set of cpu's to be used for taskset on linux" + echo " : default is 0-1 " + exit $EXIT_FAILURE +fi + +# check for the existence of dependent library +if [ ! -r $1 ] +then + echo "fail : $1 , libfaketime library is not readable" + exit $EXIT_FAILURE +fi + +SEC1=`date +%s` +if [ "$RUN_OS" = "Darwin" ] +then + ./test_rwlock +elif [ "$RUN_OS" = "Linux" ] +then + if [ -z $2 ] + then + echo "default taskset value is 0-1" + else + CPU_SET=$2 + fi + taskset -c $CPU_SET ./test_rwlock +else + echo "not able to decide running OS, so exiting" + exit $EXIT_FAILURE +fi + +SEC2=`date +%s` +DIFF1=$((SEC2 - SEC1)) + +# preload libfaketime +if [ "$RUN_OS" = "Darwin" ] +then + export DYLD_FORCE_FLAT_NAMESPACE=y + export DYLD_INSERT_LIBRARIES=$1 + ./test_rwlock & +else + LD_PRELOAD=$1 taskset -c $CPU_SET ./test_rwlock & +fi + +# get pid of test run in background +PID=$! + +sleep 5s +echo "-$DIFF1""s" >| ~/.faketimerc + +wait $PID + +#kept echo statement here so as not to lose in cluster of test msgs. 
+echo "after sleeping for 5 seconds set ~/.faketimerc value as -ve $DIFF1 seconds" +rm ~/.faketimerc + +if [ "$RUN_OS" = "Darwin" ] +then + export DYLD_FORCE_FLAT_NAMESPACE= + export DYLD_INSERT_LIBRARIES= +fi +SEC3=`date +%s` +DIFF2=$((SEC3 - SEC2)) + +PERC=$((((DIFF2 - DIFF1)*100)/DIFF1)) +echo "execution time difference : $PERC %, less than 20% is ok" +echo "normal execution time : $DIFF1 seconds" +echo "fake time reduction by : $DIFF1 seconds" +echo "execution time with -ve time shift : $DIFF2 seconds" + +if [ "$PERC" -le 20 ] +then + echo "pass : execution time is affected $PERC % by -ve time shift" + exit $EXIT_SUCCESS +else + echo "fail : execution time is affected $PERC % by -ve time shift" + exit $EXIT_FAILURE +fi diff --git a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c index d007eb65382..9cb1ab0f4c6 100644 --- a/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt3363_checkpoint_op_races/main.c @@ -81,7 +81,7 @@ main(int argc, char *argv[]) * This test should not run unless long tests flag is set. The test * runs for 15 minutes. 
*/ - if (!testutil_is_flag_set("WT3363_CHECKPOINT_OP_RACES")) + if (!testutil_is_flag_set("TESTUTIL_ENABLE_TIMING_TESTS")) return (EXIT_SUCCESS); opts = &_opts; diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c index 47f3c54325f..0dc7402e181 100644 --- a/src/third_party/wiredtiger/test/format/backup.c +++ b/src/third_party/wiredtiger/test/format/backup.c @@ -36,6 +36,7 @@ static void check_copy(void) { WT_CONNECTION *conn; + WT_DECL_RET; WT_SESSION *session; wts_open(g.home_backup, false, &conn); @@ -44,9 +45,14 @@ check_copy(void) conn->open_session(conn, NULL, NULL, &session), "%s", g.home_backup); - testutil_checkfmt( - session->verify(session, g.uri, NULL), - "%s: %s", g.home_backup, g.uri); + /* + * Verify can return EBUSY if the handle isn't available. Don't yield + * and retry, in the case of LSM, the handle may not be available for + * a long time. + */ + ret = session->verify(session, g.uri, NULL); + testutil_assertfmt(ret == 0 || ret == EBUSY, + "WT_SESSION.verify: %s: %s", g.home_backup, g.uri); testutil_checkfmt(conn->close(conn, NULL), "%s", g.home_backup); } diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 6a58cad5403..031e3bb25af 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -584,10 +584,14 @@ wts_verify(const char *tag) testutil_check(conn->set_timestamp(conn, config_buf)); } - /* Session operations for LSM can return EBUSY. */ + /* + * Verify can return EBUSY if the handle isn't available. Don't yield + * and retry, in the case of LSM, the handle may not be available for + * a long time. 
+ */ ret = session->verify(session, g.uri, "strict"); - if (ret != 0 && !(ret == EBUSY && DATASOURCE("lsm"))) - testutil_die(ret, "session.verify: %s: %s", g.uri, tag); + testutil_assertfmt( + ret == 0 || ret == EBUSY, "session.verify: %s: %s", g.uri, tag); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, diff --git a/src/third_party/wiredtiger/test/recovery/Makefile.am b/src/third_party/wiredtiger/test/recovery/Makefile.am index 3e7fce17d0e..298b9a995b8 100644 --- a/src/third_party/wiredtiger/test/recovery/Makefile.am +++ b/src/third_party/wiredtiger/test/recovery/Makefile.am @@ -2,12 +2,17 @@ AM_CPPFLAGS = -I$(top_builddir) AM_CPPFLAGS +=-I$(top_srcdir)/src/include AM_CPPFLAGS +=-I$(top_srcdir)/test/utility -noinst_PROGRAMS = random-abort truncated-log +noinst_PROGRAMS = random-abort timestamp-abort truncated-log random_abort_SOURCES = random-abort.c random_abort_LDADD = $(top_builddir)/test/utility/libtest_util.la random_abort_LDADD +=$(top_builddir)/libwiredtiger.la random_abort_LDFLAGS = -static +timestamp_abort_SOURCES = timestamp-abort.c +timestamp_abort_LDADD = $(top_builddir)/test/utility/libtest_util.la +timestamp_abort_LDADD +=$(top_builddir)/libwiredtiger.la +timestamp_abort_LDFLAGS = -static + truncated_log_SOURCES = truncated-log.c truncated_log_LDADD = $(top_builddir)/test/utility/libtest_util.la truncated_log_LDADD +=$(top_builddir)/libwiredtiger.la diff --git a/src/third_party/wiredtiger/test/recovery/smoke.sh b/src/third_party/wiredtiger/test/recovery/smoke.sh index ba4d77c642b..6587c7c9f98 100755 --- a/src/third_party/wiredtiger/test/recovery/smoke.sh +++ b/src/third_party/wiredtiger/test/recovery/smoke.sh @@ -8,4 +8,8 @@ $TEST_WRAPPER ./random-abort -t 10 -T 5 $TEST_WRAPPER ./random-abort -m -t 10 -T 5 $TEST_WRAPPER ./random-abort -C -t 10 -T 5 $TEST_WRAPPER ./random-abort -C -m -t 10 -T 5 +$TEST_WRAPPER ./timestamp-abort -t 10 -T 5 +$TEST_WRAPPER ./timestamp-abort -m -t 10 -T 5 +$TEST_WRAPPER ./timestamp-abort -C -t 10 -T 5 
+$TEST_WRAPPER ./timestamp-abort -C -m -t 10 -T 5 $TEST_WRAPPER ./truncated-log diff --git a/src/third_party/wiredtiger/test/recovery/timestamp-abort.c b/src/third_party/wiredtiger/test/recovery/timestamp-abort.c new file mode 100644 index 00000000000..7e912b1fe26 --- /dev/null +++ b/src/third_party/wiredtiger/test/recovery/timestamp-abort.c @@ -0,0 +1,722 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "test_util.h" + +#include <sys/wait.h> +#include <signal.h> + +static char home[1024]; /* Program working dir */ + +/* + * Create three tables that we will write the same data to and verify that + * all the types of usage have the expected data in them after a crash and + * recovery. 
We want: + * 1. A table that is logged and is not involved in timestamps. This table + * simulates a user local table. + * 2. A table that is logged and involved in timestamps. This simulates + * the oplog. + * 3. A table that is not logged and involved in timestamps. This simulates + * a typical collection file. + * + * We also create a fourth table that is not logged and not involved directly + * in timestamps to store the stable timestamp. That way we can know what the + * latest stable timestamp is on checkpoint. + * + * We also create several files that are not WiredTiger tables. The checkpoint + * thread creates a file indicating that a checkpoint has completed. The parent + * process uses this to know when at least one checkpoint is done and it can + * start the timer to abort. + * + * Each worker thread creates its own records file that records the data it + * inserted and it records the timestamp that was used for that insertion. + */ +static const char * const uri_local = "table:local"; +static const char * const uri_oplog = "table:oplog"; +static const char * const uri_collection = "table:collection"; + +static const char * const stable_store = "table:stable"; +static const char * const ckpt_file = "checkpoint_done"; +static bool compat, inmem, use_ts; +static uint64_t global_ts = 1; + +#define MAX_TH 12 +#define MAX_TIME 40 +#define MIN_TH 5 +#define MIN_TIME 10 +#define RECORDS_FILE "records-%" PRIu32 +#define STABLE_PERIOD 100 + +#define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")" +#define ENV_CONFIG_DEF \ + "create,log=(archive=false,file_max=10M,enabled)" +#define ENV_CONFIG_TXNSYNC \ + "create,log=(archive=false,file_max=10M,enabled)," \ + "transaction_sync=(enabled,method=none)" +#define ENV_CONFIG_REC "log=(archive=false,recover=on)" + +#define MAX_CKPT_INTERVAL 5 /* Maximum interval between checkpoints */ +#define MAX_VAL 1024 + +static void usage(void) + WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void +usage(void) +{ + 
fprintf(stderr, + "usage: %s [-h dir] [-T threads] [-t time] [-Cmvz]\n", progname); + exit(EXIT_FAILURE); +} + +typedef struct { + WT_CONNECTION *conn; + uint64_t start; + uint32_t id; +} WT_THREAD_DATA; + +/* + * thread_ckpt_run -- + * Runner function for the checkpoint thread. + */ +static WT_THREAD_RET +thread_ckpt_run(void *arg) +{ + FILE *fp; + WT_RAND_STATE rnd; + WT_SESSION *session; + WT_THREAD_DATA *td; + uint64_t ts; + uint32_t sleep_time; + int i, ret; + bool first_ckpt; + + __wt_random_init(&rnd); + + td = (WT_THREAD_DATA *)arg; + /* + * Keep a separate file with the records we wrote for checking. + */ + (void)unlink(ckpt_file); + if ((ret = td->conn->open_session(td->conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + first_ckpt = true; + ts = 0; + for (i = 0; ;++i) { + sleep_time = __wt_random(&rnd) % MAX_CKPT_INTERVAL; + sleep(sleep_time); + if (use_ts) + ts = global_ts; + /* + * Since this is the default, send in this string even if + * running without timestamps. + */ + testutil_check(session->checkpoint( + session, "use_timestamp=true")); + printf("Checkpoint %d complete. Minimum ts %" PRIu64 "\n", + i, ts); + fflush(stdout); + /* + * Create the checkpoint file so that the parent process knows + * at least one checkpoint has finished and can start its + * timer. + */ + if (first_ckpt) { + testutil_checksys((fp = fopen(ckpt_file, "w")) == NULL); + first_ckpt = false; + testutil_checksys(fclose(fp) != 0); + } + } + /* NOTREACHED */ +} + +/* + * thread_run -- + * Runner function for the worker threads. 
+ */ +static WT_THREAD_RET +thread_run(void *arg) +{ + FILE *fp; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_stable; + WT_ITEM data; + WT_RAND_STATE rnd; + WT_SESSION *session; + WT_THREAD_DATA *td; + uint64_t i, stable_ts; + int ret; + char cbuf[MAX_VAL], lbuf[MAX_VAL], obuf[MAX_VAL]; + char kname[64], tscfg[64]; + + __wt_random_init(&rnd); + memset(cbuf, 0, sizeof(cbuf)); + memset(lbuf, 0, sizeof(lbuf)); + memset(obuf, 0, sizeof(obuf)); + memset(kname, 0, sizeof(kname)); + + td = (WT_THREAD_DATA *)arg; + /* + * Set up the separate file for checking. + */ + testutil_check(__wt_snprintf(cbuf, sizeof(cbuf), RECORDS_FILE, td->id)); + (void)unlink(cbuf); + testutil_checksys((fp = fopen(cbuf, "w")) == NULL); + /* + * Set to line buffering. But that is advisory only. We've seen + * cases where the result files end up with partial lines. + */ + __wt_stream_set_line_buffer(fp); + if ((ret = td->conn->open_session(td->conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + /* + * Open a cursor to each table. + */ + if ((ret = session->open_cursor(session, + uri_collection, NULL, NULL, &cur_coll)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_collection); + if ((ret = session->open_cursor(session, + uri_local, NULL, NULL, &cur_local)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_local); + if ((ret = session->open_cursor(session, + uri_oplog, NULL, NULL, &cur_oplog)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_oplog); + + if ((ret = session->open_cursor( + session, stable_store, NULL, NULL, &cur_stable)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", stable_store); + + /* + * Write our portion of the key space until we're killed. 
+ */ + printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->id, td->start); + for (i = td->start; ; ++i) { + if (use_ts) + stable_ts = global_ts++; + else + stable_ts = 0; + testutil_check(__wt_snprintf( + kname, sizeof(kname), "%" PRIu64, i)); + + testutil_check(session->begin_transaction(session, NULL)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + /* + * Put an informative string into the value so that it + * can be viewed well in a binary dump. + */ + testutil_check(__wt_snprintf(cbuf, sizeof(cbuf), + "COLL: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64, + td->id, stable_ts, i)); + testutil_check(__wt_snprintf(lbuf, sizeof(lbuf), + "LOCAL: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64, + td->id, stable_ts, i)); + testutil_check(__wt_snprintf(obuf, sizeof(obuf), + "OPLOG: thread:%" PRIu64 " ts:%" PRIu64 " key: %" PRIu64, + td->id, stable_ts, i)); + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = cbuf; + cur_coll->set_value(cur_coll, &data); + if ((ret = cur_coll->insert(cur_coll)) != 0) + testutil_die(ret, "WT_CURSOR.insert"); + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = obuf; + cur_oplog->set_value(cur_oplog, &data); + if ((ret = cur_oplog->insert(cur_oplog)) != 0) + testutil_die(ret, "WT_CURSOR.insert"); + if (use_ts) { + testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), + "commit_timestamp=%" PRIx64, stable_ts)); + testutil_check( + session->commit_transaction(session, tscfg)); + } else + testutil_check( + session->commit_transaction(session, NULL)); + /* + * Insert into the local table outside the timestamp txn. + */ + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = lbuf; + cur_local->set_value(cur_local, &data); + if ((ret = cur_local->insert(cur_local)) != 0) + testutil_die(ret, "WT_CURSOR.insert"); + + /* + * Every N records we will record our stable timestamp into the + * stable table. 
That will define our threshold where we + * expect to find records after recovery. + */ + if (i % STABLE_PERIOD == 0) { + if (use_ts) { + /* + * Set both the oldest and stable timestamp + * so that we don't need to maintain read + * availability at older timestamps. + */ + testutil_check(__wt_snprintf( + tscfg, sizeof(tscfg), + "oldest_timestamp=%" PRIx64 + ",stable_timestamp=%" PRIx64, + stable_ts, stable_ts)); + testutil_check( + td->conn->set_timestamp(td->conn, tscfg)); + } + cur_stable->set_key(cur_stable, td->id); + cur_stable->set_value(cur_stable, stable_ts); + testutil_check(cur_stable->insert(cur_stable)); + } + /* + * Save the timestamp and key separately for checking later. + */ + if (fprintf(fp, + "%" PRIu64 " %" PRIu64 "\n", stable_ts, i) < 0) + testutil_die(EIO, "fprintf"); + } + /* NOTREACHED */ +} + +/* + * Child process creates the database and table, and then creates worker + * threads to add data until it is killed by the parent. + */ +static void run_workload(uint32_t) + WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void +run_workload(uint32_t nth) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + WT_THREAD_DATA *td; + wt_thread_t *thr; + uint32_t i; + int ret; + char envconf[512]; + + thr = dcalloc(nth+1, sizeof(*thr)); + td = dcalloc(nth+1, sizeof(WT_THREAD_DATA)); + if (chdir(home) != 0) + testutil_die(errno, "Child chdir: %s", home); + if (inmem) + strcpy(envconf, ENV_CONFIG_DEF); + else + strcpy(envconf, ENV_CONFIG_TXNSYNC); + if (compat) + strcat(envconf, ENV_CONFIG_COMPAT); + + if ((ret = wiredtiger_open(NULL, NULL, envconf, &conn)) != 0) + testutil_die(ret, "wiredtiger_open"); + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + /* + * Create all the tables. 
+ */ + if ((ret = session->create(session, uri_collection, + "key_format=S,value_format=u,log=(enabled=false)")) != 0) + testutil_die(ret, "WT_SESSION.create: %s", uri_collection); + if ((ret = session->create(session, + uri_local, "key_format=S,value_format=u")) != 0) + testutil_die(ret, "WT_SESSION.create: %s", uri_local); + if ((ret = session->create(session, + uri_oplog, "key_format=S,value_format=u")) != 0) + testutil_die(ret, "WT_SESSION.create: %s", uri_oplog); + /* + * Don't log the stable timestamp table so that we know what timestamp + * was stored at the checkpoint. + */ + if ((ret = session->create(session, stable_store, + "key_format=Q,value_format=Q,log=(enabled=false)")) != 0) + testutil_die(ret, "WT_SESSION.create: %s", stable_store); + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "WT_SESSION:close"); + + /* + * Thread 0 is the checkpoint thread. + */ + td[0].conn = conn; + td[0].id = 0; + printf("Create checkpoint thread\n"); + testutil_check(__wt_thread_create( + NULL, &thr[0], thread_ckpt_run, &td[0])); + for (i = 1; i <= nth; ++i) { + td[i].conn = conn; + td[i].start = (UINT64_MAX / nth) * (i - 1); + td[i].id = i; + testutil_check(__wt_thread_create( + NULL, &thr[i], thread_run, &td[i])); + } + /* + * The threads never exit, so the child will just wait here until + * it is killed. 
+ */ + printf("Create %" PRIu32 " writer threads\n", nth); + fflush(stdout); + for (i = 0; i <= nth; ++i) + testutil_check(__wt_thread_join(NULL, thr[i])); + /* + * NOTREACHED + */ + free(thr); + free(td); + exit(EXIT_SUCCESS); +} + +extern int __wt_optind; +extern char *__wt_optarg; + +int +main(int argc, char *argv[]) +{ + struct stat sb; + FILE *fp; + WT_CONNECTION *conn; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_stable; + WT_RAND_STATE rnd; + WT_SESSION *session; + pid_t pid; + uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key; + uint64_t first_miss, middle_coll, middle_local, middle_oplog; + uint64_t stable_fp, stable_val, val[MAX_TH+1]; + uint32_t i, nth, timeout; + int ch, status, ret; + const char *working_dir; + char buf[128], fname[64], kname[64], statname[1024]; + bool fatal, rand_th, rand_time, verify_only; + + (void)testutil_set_progname(argv); + + compat = inmem = false; + use_ts = true; + nth = MIN_TH; + rand_th = rand_time = true; + timeout = MIN_TIME; + verify_only = false; + working_dir = "WT_TEST.timestamp-abort"; + + while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vz")) != EOF) + switch (ch) { + case 'C': + compat = true; + break; + case 'h': + working_dir = __wt_optarg; + break; + case 'm': + inmem = true; + break; + case 'T': + rand_th = false; + nth = (uint32_t)atoi(__wt_optarg); + break; + case 't': + rand_time = false; + timeout = (uint32_t)atoi(__wt_optarg); + break; + case 'v': + verify_only = true; + break; + case 'z': + use_ts = false; + break; + default: + usage(); + } + argc -= __wt_optind; + argv += __wt_optind; + if (argc != 0) + usage(); + + testutil_work_dir_from_path(home, sizeof(home), working_dir); + /* + * If the user wants to verify they need to tell us how many threads + * there were so we can find the old record files. 
+ */ + if (verify_only && rand_th) { + fprintf(stderr, + "Verify option requires specifying number of threads\n"); + exit (EXIT_FAILURE); + } + if (!verify_only) { + testutil_make_work_dir(home); + + __wt_random_init_seed(NULL, &rnd); + if (rand_time) { + timeout = __wt_random(&rnd) % MAX_TIME; + if (timeout < MIN_TIME) + timeout = MIN_TIME; + } + if (rand_th) { + nth = __wt_random(&rnd) % MAX_TH; + if (nth < MIN_TH) + nth = MIN_TH; + } + printf("Parent: compatibility: %s, " + "in-mem log sync: %s, timestamp in use: %s\n", + compat ? "true" : "false", + inmem ? "true" : "false", + use_ts ? "true" : "false"); + printf("Parent: Create %" PRIu32 + " threads; sleep %" PRIu32 " seconds\n", nth, timeout); + /* + * Fork a child to insert as many items. We will then randomly + * kill the child, run recovery and make sure all items we wrote + * exist after recovery runs. + */ + testutil_checksys((pid = fork()) < 0); + + if (pid == 0) { /* child */ + run_workload(nth); + return (EXIT_SUCCESS); + } + + /* parent */ + /* + * Sleep for the configured amount of time before killing + * the child. Start the timeout from the time we notice that + * the file has been created. That allows the test to run + * correctly on really slow machines. Verify the process ID + * still exists in case the child aborts for some reason we + * don't stay in this loop forever. + */ + testutil_check(__wt_snprintf( + statname, sizeof(statname), "%s/%s", home, ckpt_file)); + while (stat(statname, &sb) != 0 && kill(pid, 0) == 0) + sleep(1); + sleep(timeout); + + /* + * !!! It should be plenty long enough to make sure more than + * one log file exists. If wanted, that check would be added + * here. + */ + printf("Kill child\n"); + testutil_checksys(kill(pid, SIGKILL) != 0); + testutil_checksys(waitpid(pid, &status, 0) == -1); + } + /* + * !!! If we wanted to take a copy of the directory before recovery, + * this is the place to do it. 
+ */ + if (chdir(home) != 0) + testutil_die(errno, "parent chdir: %s", home); + testutil_check(__wt_snprintf(buf, sizeof(buf), + "rm -rf ../%s.SAVE && mkdir ../%s.SAVE && cp -rp * ../%s.SAVE", + home, home, home)); + (void)system(buf); + printf("Open database, run recovery and verify content\n"); + + /* + * Open the connection which forces recovery to be run. + */ + if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG_REC, &conn)) != 0) + testutil_die(ret, "wiredtiger_open"); + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + /* + * Open a cursor on all the tables. + */ + if ((ret = session->open_cursor(session, + uri_collection, NULL, NULL, &cur_coll)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_collection); + if ((ret = session->open_cursor(session, + uri_local, NULL, NULL, &cur_local)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_local); + if ((ret = session->open_cursor(session, + uri_oplog, NULL, NULL, &cur_oplog)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri_oplog); + if ((ret = session->open_cursor(session, + stable_store, NULL, NULL, &cur_stable)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", stable_store); + + /* + * Find the biggest stable timestamp value that was saved. 
+ */ + stable_val = 0; + memset(val, 0, sizeof(val)); + while (cur_stable->next(cur_stable) == 0) { + cur_stable->get_key(cur_stable, &key); + cur_stable->get_value(cur_stable, &val[key]); + if (val[key] > stable_val) + stable_val = val[key]; + + if (use_ts) + printf("Stable: key %" PRIu64 " value %" PRIu64 "\n", + key, val[key]); + } + if (use_ts) + printf("Got stable_val %" PRIu64 "\n", stable_val); + + count = 0; + absent_coll = absent_local = absent_oplog = 0; + fatal = false; + for (i = 1; i <= nth; ++i) { + first_miss = middle_coll = middle_local = middle_oplog = 0; + testutil_check(__wt_snprintf( + fname, sizeof(fname), RECORDS_FILE, i)); + if ((fp = fopen(fname, "r")) == NULL) + testutil_die(errno, "fopen: %s", fname); + + /* + * For every key in the saved file, verify that the key exists + * in the table after recovery. If we're doing in-memory + * log buffering we never expect a record missing in the middle, + * but records may be missing at the end. If we did + * write-no-sync, we expect every key to have been recovered. + */ + for (last_key = UINT64_MAX;; ++count, last_key = key) { + ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "\n", + &stable_fp, &key); + if (ret != EOF && ret != 2) { + /* + * If we find a partial line, consider it + * like an EOF. + */ + if (ret == 1 || ret == 0) + break; + testutil_die(errno, "fscanf"); + } + if (ret == EOF) + break; + /* + * If we're unlucky, the last line may be a partially + * written key at the end that can result in a false + * negative error for a missing record. Detect it. 
+ */ + if (last_key != UINT64_MAX && key != last_key + 1) { + printf("%s: Ignore partial record %" PRIu64 + " last valid key %" PRIu64 "\n", + fname, key, last_key); + break; + } + testutil_check(__wt_snprintf( + kname, sizeof(kname), "%" PRIu64, key)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + /* + * The collection table should always only have the + * data as of the checkpoint. + */ + if ((ret = cur_coll->search(cur_coll)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + /* + * If we don't find a record, the stable + * timestamp written to our file better be + * larger than the saved one. + */ + if (!inmem && + stable_fp != 0 && stable_fp <= val[i]) { + printf("%s: COLLECTION no record with " + "key %" PRIu64 " record ts %" PRIu64 + " <= stable ts %" PRIu64 "\n", + fname, key, stable_fp, val[i]); + absent_coll++; + } + if (middle_coll == 0) + first_miss = key; + middle_coll = key; + } else if (middle_coll != 0) { + /* + * We should never find an existing key after + * we have detected one missing. + */ + printf("%s: COLLECTION after absent records %" + PRIu64 "-%" PRIu64 " key %" PRIu64 + " exists\n", + fname, first_miss, middle_coll, key); + fatal = true; + } + /* + * The local table should always have all data. + */ + if ((ret = cur_local->search(cur_local)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + if (!inmem) + printf("%s: LOCAL no record with key %" + PRIu64 "\n", fname, key); + absent_local++; + middle_local = key; + } else if (middle_local != 0) { + /* + * We should never find an existing key after + * we have detected one missing. + */ + printf("%s: LOCAL after absent record at %" + PRIu64 " key %" PRIu64 " exists\n", + fname, middle_local, key); + fatal = true; + } + /* + * The oplog table should always have all data. 
+ */ + if ((ret = cur_oplog->search(cur_oplog)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + if (!inmem) + printf("%s: OPLOG no record with key %" + PRIu64 "\n", fname, key); + absent_oplog++; + middle_oplog = key; + } else if (middle_oplog != 0) { + /* + * We should never find an existing key after + * we have detected one missing. + */ + printf("%s: OPLOG after absent record at %" + PRIu64 " key %" PRIu64 " exists\n", + fname, middle_oplog, key); + fatal = true; + } + } + testutil_checksys(fclose(fp) != 0); + } + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if (fatal) + return (EXIT_FAILURE); + if (!inmem && absent_coll) { + printf("COLLECTION: %" PRIu64 + " record(s) absent from %" PRIu64 "\n", + absent_coll, count); + fatal = true; + } + if (!inmem && absent_local) { + printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n", + absent_local, count); + fatal = true; + } + if (!inmem && absent_oplog) { + printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n", + absent_oplog, count); + fatal = true; + } + if (fatal) + return (EXIT_FAILURE); + printf("%" PRIu64 " records verified\n", count); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp01.py b/src/third_party/wiredtiger/test/suite/test_timestamp01.py index a934753488d..c8938296908 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp01.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp01.py @@ -52,7 +52,7 @@ class test_timestamp01(wttest.WiredTigerTestCase, suite_subprocess): self.session.begin_transaction() self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.commit_transaction( - 'commit_timestamp=' + timestamp_str(1 << 100)), + 'commit_timestamp=' + timestamp_str(1 << 5000)), '/too long/') # One is okay, as is 2**64 - 1 diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp03.py 
b/src/third_party/wiredtiger/test/suite/test_timestamp03.py index 734961e9e98..728200e528a 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp03.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp03.py @@ -62,7 +62,6 @@ class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess): ('use_ts_def', dict(ckptcfg='', val='none')), ('use_ts_false', dict(ckptcfg='use_timestamp=false', val='all')), ('use_ts_true', dict(ckptcfg='use_timestamp=true', val='none')), - ('read_ts', dict(ckptcfg='read_timestamp', val='none')), ] conncfg = [ diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index 146326834db..3af0feed31b 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -30,8 +30,6 @@ # Timestamps: Test that rollback_to_stable obeys expected visibility rules # -import datetime -import random from suite_subprocess import suite_subprocess import wiredtiger, wttest from wtscenario import make_scenarios @@ -50,9 +48,10 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablename scenarios = make_scenarios([ - #('col', dict(extra_config=',key_format=r')), - #('lsm', dict(extra_config=',type=lsm')), - ('row', dict(extra_config=',memory_page_max=32k,leaf_page_max=8k,internal_page_max=8k')), + ('col_fix', dict(empty=1, extra_config=',key_format=r, value_format=8t')), + ('col_var', dict(empty=0, extra_config=',key_format=r')), + #('lsm', dict(empty=0, extra_config=',type=lsm')), + ('row', dict(empty=0, extra_config='')), ]) # Rollback only works for non-durable tables @@ -65,17 +64,21 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): session.begin_transaction(txn_config) c = session.open_cursor(self.uri, None) if missing == False: - actual = dict((k, v) for k, v, pad in c if v != 0) + actual = dict((k, v) for k, v in c if v != 0) 
#print expected #print actual self.assertEqual(actual, expected) # Search for the expected items as well as iterating for k, v in expected.iteritems(): if missing == False: - self.assertEqual(c[k][0], v, "for key " + str(k)) + self.assertEqual(c[k], v, "for key " + str(k)) else: c.set_key(k) - self.assertEqual(c.search(), wiredtiger.WT_NOTFOUND) + if self.empty: + # Fixed-length column-store rows always exist. + self.assertEqual(c.search(), 0) + else: + self.assertEqual(c.search(), wiredtiger.WT_NOTFOUND) c.close() if txn_config: session.commit_transaction() @@ -87,7 +90,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Configure small page sizes to ensure eviction comes through and we have a # somewhat complex tree self.session.create(self.uri, - 'key_format=i,value_format=iS,memory_page_max=16k,leaf_page_max=8k' + self.extra_config) + 'key_format=i,value_format=i,memory_page_max=32k,leaf_page_max=8k,internal_page_max=8k' + + self.extra_config) c = self.session.open_cursor(self.uri) # Insert keys each with timestamp=key, in some order @@ -96,7 +100,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): for k in keys: self.session.begin_transaction() - c[k] = (1, 'the quick brown fox') + c[k] = 1 self.session.commit_transaction('commit_timestamp=' + timestamp_str(k)) # Setup an oldest timestamp to ensure state remains in cache. 
if k == 1: @@ -119,7 +123,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Update the values again in preparation for rolling back more for k in keys: self.session.begin_transaction() - c[k] = (2, 'jumped over the lazy dog') + c[k] = 2 self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + key_range)) # Now we should have: keys 1-100 with value 2 diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp05.py b/src/third_party/wiredtiger/test/suite/test_timestamp05.py new file mode 100644 index 00000000000..d7131cb2004 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp05.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_timestamp05.py +# Timestamps: make sure they don't end up in metadata +# + +from helper import copy_wiredtiger_home +import random +from suite_subprocess import suite_subprocess +import wiredtiger, wttest +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +def timestamp_ret_str(t): + s = timestamp_str(t) + if len(s) % 2 == 1: + s = '0' + s + return s + +class test_timestamp05(wttest.WiredTigerTestCase, suite_subprocess): + uri = 'table:ts05' + + def test_create(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + s = self.session + conn = self.conn + + # Start timestamps at 50 + conn.set_timestamp('oldest_timestamp=50,stable_timestamp=50') + + # Commit at 100 + s.begin_transaction() + s.create(self.uri, 'key_format=i,value_format=S') + s.commit_transaction('commit_timestamp=' + timestamp_str(100)) + + # Make sure the tree is dirty + c = s.open_cursor(self.uri) + c[200] = 'new value' + + # Checkpoint at 50 + s.checkpoint('use_timestamp=true') + + def test_bulk(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + s = self.session + conn = self.conn + + s.create(self.uri, 'key_format=i,value_format=S') + c = s.open_cursor(self.uri, None, 'bulk') + + # Insert keys 1..100 each with timestamp=key, in some order + nkeys = 100 + keys = range(1, nkeys+1) + + for k in keys: + c[k] = 'some value' + + # Start timestamps at 50 + conn.set_timestamp('oldest_timestamp=50,stable_timestamp=50') + + # Commit at 100 + s.begin_transaction() + c.close() + s.commit_transaction('commit_timestamp=' + timestamp_str(100)) + + # Make sure the tree is dirty + c = s.open_cursor(self.uri) + c[200] = 'new value' + + # Checkpoint at 50 + s.checkpoint('use_timestamp=true') + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp07.py b/src/third_party/wiredtiger/test/suite/test_timestamp07.py new file mode 100644 
index 00000000000..c1f70e0cb1a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp07.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_timestamp07.py +# Timestamps: checkpoints and eviction +# + +from helper import copy_wiredtiger_home +import random +from suite_subprocess import suite_subprocess +import wiredtiger, wttest +from wiredtiger import stat +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'ts07_ts_nologged' + tablename2 = 'ts07_nots_logged' + tablename3 = 'ts07_ts_logged' + + types = [ + ('file', dict(uri='file:', use_cg=False, use_index=False)), + ('table-cg', dict(uri='table:', use_cg=True, use_index=False)), + ] + + conncfg = [ + ('nolog', dict(conn_config='create,cache_size=1M,statistics=(fast)', using_log=False)), + ('log', dict(conn_config='create,log=(enabled),cache_size=1M,statistics=(fast)', using_log=True)), + ] + + nkeys = [ + ('100keys', dict(nkeys=100,evicts=False)), + ('500keys', dict(nkeys=500,evicts=True)), +# ('1000keys', dict(nkeys=1000,evicts=True)), + ] + + scenarios = make_scenarios(types, conncfg, nkeys) + + modified_evicted = 0 + + # Binary values. + value = u'\u0001\u0002abcd\u0007\u0004' + value2 = u'\u0001\u0002dcba\u0007\u0004' + value3 = u'\u0001\u0002cdef\u0007\u0004' + + # Check that a cursor (optionally started in a new transaction), sees the + # expected values. 
+ def check(self, session, txn_config, expected): + if txn_config: + #print "Check: txn_config:" + #print txn_config + session.begin_transaction(txn_config) + c = session.open_cursor(self.uri + self.tablename, None) + actual = dict((k, v) for k, v in c if v != 0) + self.maxDiff = None + #print "Expected:" + #print expected + #print "Actual:" + #print actual + self.assertEqual(actual, expected) + # Search for the expected items as well as iterating + for k, v in expected.iteritems(): + self.assertEqual(c[k], v, "for key " + str(k)) + c.close() + if txn_config: + session.commit_transaction() + # + # Take a backup of the database and verify that the value we want to + # check exists in the tables the expected number of times. + # + def backup_check(self, check_value, valcnt, valcnt2, valcnt3): + newdir = "BACKUP" + copy_wiredtiger_home('.', newdir, True) + + conn = self.setUpConnectionOpen(newdir) + session = self.setUpSessionOpen(conn) + c = session.open_cursor(self.uri + self.tablename, None) + c2 = session.open_cursor(self.uri + self.tablename2, None) + c3 = session.open_cursor(self.uri + self.tablename3, None) + # Count how many times the second value is present + count = 0 + for k, v in c: + if check_value in str(v): + # print "check_value found in key " + str(k) + count += 1 + c.close() + # Count how many times the second value is present in the + # non-timestamp table. + count2 = 0 + for k, v in c2: + if check_value in str(v): + # print "check_value found in key " + str(k) + count2 += 1 + c2.close() + # Count how many times the second value is present in the + # logged timestamp table. 
+ count3 = 0 + for k, v in c3: + if check_value in str(v): + count3 += 1 + c3.close() + conn.close() + # print "CHECK BACKUP: Count " + str(count) + " Count2 " + str(count2) + " Count3 " + str(count3) + # print "CHECK BACKUP: Expect value2 count " + str(valcnt) + # print "CHECK BACKUP: 2nd table Expect value2 count " + str(valcnt2) + # print "CHECK BACKUP: 3rd table Expect value2 count " + str(valcnt3) + self.assertEqual(count, valcnt) + self.assertEqual(count2, valcnt2) + self.assertEqual(count3, valcnt3) + + # Return whether or not eviction happened since the last call. + def check_eviction(self): + # Get a statistics cursor and look at the number of dirty pages + # evicted. Keep track of the last read value so we can determine + # if the value changed since the last call to this function. + stat_cursor = self.session.open_cursor('statistics:', None, None) + evict_dirty = stat_cursor[stat.conn.cache_eviction_dirty][2] + + # Return True if the new value is more, False otherwise. + #print "Old: " + str(self.modified_evicted) + # print "New: " + str(evict_dirty) + did_eviction = self.modified_evicted < evict_dirty + stat_cursor.close() + self.modified_evicted = evict_dirty + # print "Evict ret: " + str(ret) + + # XXX we can't guarantee that eviction will always happen, but make + # sure it doesn't happen if not expected. + self.assertTrue(not did_eviction or self.evicts) + + # Check that a cursor sees the expected values after a checkpoint. + def ckpt_backup(self, check_value, valcnt, valcnt2, valcnt3): + + # Take a checkpoint. Make a copy of the database. Open the + # copy and verify whether or not the expected data is in there. 
+ ckptcfg = 'use_timestamp=true' + self.session.checkpoint(ckptcfg) + self.backup_check(check_value, valcnt, valcnt2, valcnt3) + + def test_timestamp07(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + uri = self.uri + self.tablename + uri2 = self.uri + self.tablename2 + uri3 = self.uri + self.tablename3 + # + # Open three tables: + # 1. Table is not logged and uses timestamps. + # 2. Table is logged and does not use timestamps. + # 3. Table is logged and uses timestamps. + # + self.session.create(uri, 'key_format=i,value_format=S,log=(enabled=false)') + c = self.session.open_cursor(uri) + self.session.create(uri2, 'key_format=i,value_format=S') + c2 = self.session.open_cursor(uri2) + self.session.create(uri3, 'key_format=i,value_format=S') + c3 = self.session.open_cursor(uri3) + + # Insert keys 1..nkeys each with timestamp=key, in some order. + orig_keys = range(1, self.nkeys+1) + keys = orig_keys[:] + random.shuffle(keys) + + for k in keys: + c2[k] = self.value + self.session.begin_transaction() + c[k] = self.value + c3[k] = self.value + self.session.commit_transaction('commit_timestamp=' + timestamp_str(k)) + + self.check_eviction() + # Now check that we see the expected state when reading at each + # timestamp. + for i, t in enumerate(orig_keys): + self.check(self.session, 'read_timestamp=' + timestamp_str(t), + dict((k, self.value) for k in orig_keys[:i+1])) + + # Bump the oldest timestamp, we're not going back... + self.assertEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys)) + self.oldts = timestamp_str(self.nkeys) + self.conn.set_timestamp('oldest_timestamp=' + self.oldts) + self.conn.set_timestamp('stable_timestamp=' + self.oldts) + # print "Oldest " + self.oldts + + # Update them and retry. + random.shuffle(keys) + count = 0 + for k in keys: + # Make sure a timestamp cursor is the last one to update. 
This + # tests the scenario for a bug we found where recovery replayed + # the last record written into the log. + # + # print "Key " + str(k) + " to value2" + c2[k] = self.value2 + self.session.begin_transaction() + c[k] = self.value2 + c3[k] = self.value2 + ts = timestamp_str(k + self.nkeys) + self.session.commit_transaction('commit_timestamp=' + ts) + # print "Commit key " + str(k) + " ts " + ts + count += 1 + + self.check_eviction() + + # print "Updated " + str(count) + " keys to value2" + + # Take a checkpoint using the given configuration. Then verify + # whether value2 appears in a copy of that data or not. + valcnt2 = valcnt3 = self.nkeys + valcnt = 0 + self.ckpt_backup(self.value2, valcnt, valcnt2, valcnt3) + # Update the stable timestamp to the latest, but not the oldest + # timestamp and make sure we can see the data. Once the stable + # timestamp is moved we should see all keys with value2. + self.conn.set_timestamp('stable_timestamp=' + \ + timestamp_str(self.nkeys*2)) + self.ckpt_backup(self.value2, self.nkeys, self.nkeys, self.nkeys) + + # If we're not using the log we're done. + if not self.using_log: + return + + # Update the key and retry. This time take a backup and recover. + random.shuffle(keys) + count = 0 + for k in keys: + # Make sure a timestamp cursor is the last one to update. This + # tests the scenario for a bug we found where recovery replayed + # the last record written into the log. + # + # print "Key " + str(k) + " to value3" + c2[k] = self.value3 + self.session.begin_transaction() + c[k] = self.value3 + c3[k] = self.value3 + ts = timestamp_str(k + self.nkeys*2) + self.session.commit_transaction('commit_timestamp=' + ts) + # print "Commit key " + str(k) + " ts " + ts + count += 1 + + self.check_eviction() + # print "Updated " + str(count) + " keys to value3" + + # Flush the log but don't checkpoint + self.session.log_flush('sync=on') + + # Take a backup and then verify whether value3 appears in a copy + # of that data or not. 
Both tables that are logged should see + # all the data regardless of timestamps. The table that is not + # logged should not see any of it. + valcnt = 0 + valcnt2 = valcnt3 = self.nkeys + self.backup_check(self.value3, valcnt, valcnt2, valcnt3) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/utility/test_util.h b/src/third_party/wiredtiger/test/utility/test_util.h index e53018ad4ea..7500df8d5e5 100644 --- a/src/third_party/wiredtiger/test/utility/test_util.h +++ b/src/third_party/wiredtiger/test/utility/test_util.h @@ -117,6 +117,18 @@ typedef struct { } while (0) /* + * testutil_checksys -- + * Complain and quit if a function call fails, returning errno. The error + * test must be specified, not just the call, because system calls fail in a + * variety of ways. + */ +#define testutil_checksys(call) do { \ + if (call) \ + testutil_die( \ + errno, "%s/%d: %s", __func__, __LINE__, #call); \ +} while (0) + +/* * testutil_checkfmt -- * Complain and quit if a function call fails, with additional arguments. */ |