diff options
author | David Hows <howsdav@gmail.com> | 2017-06-27 12:12:12 +1000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-27 12:12:12 +1000 |
commit | bf72bb44985c49c2d80f47500958469ec56d71b6 (patch) | |
tree | 3025dd6fbc30a25a50b97d36adcfdcdf71a6d761 | |
parent | a23f520a8496986c60fd6496b27b21291ff29af7 (diff) | |
download | mongo-bf72bb44985c49c2d80f47500958469ec56d71b6.tar.gz |
WT-3363 Add a test to confirm if operations block waiting for checkpoints to complete (#3462)
* Add a test to confirm if operations conflict with checkpoints
* Move handler functions into test/utility
* Ensure that the checkpoint delays only affect external checkpoints and skip internal ones
* Rename to timing stress, remove an un-needed ifdef
-rw-r--r-- | dist/api_data.py | 9 | ||||
-rw-r--r-- | dist/flags.py | 3 | ||||
-rw-r--r-- | src/config/config_def.c | 116 | ||||
-rw-r--r-- | src/conn/conn_api.c | 46 | ||||
-rw-r--r-- | src/include/connection.h | 6 | ||||
-rw-r--r-- | src/include/extern.h | 1 | ||||
-rw-r--r-- | src/include/flags.h | 1 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 29 | ||||
-rw-r--r-- | test/csuite/Makefile.am | 3 | ||||
-rw-r--r-- | test/csuite/wt3363_checkpoint_op_races/main.c | 268 | ||||
-rw-r--r-- | test/utility/test_util.h | 25 | ||||
-rw-r--r-- | test/utility/thread.c | 282 |
12 files changed, 739 insertions, 50 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 2cc6267d31e..4a4ffae535d 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -401,6 +401,15 @@ connection_runtime_config = [ above 0 configures periodic checkpoints''', min='0', max='100000'), ]), + Config('diagnostic_timing_stress', '', r''' + enable insertion of code that interrupts the usual timing of + operations with a goal of uncovering race conditions and unexpected + blocking. This option is intended for use with internal stress + testing of WiredTiger. Only available if WiredTiger is configured + with --enable-diagnostic. Options are given as a list, such as + <code>"diagnostic_timing_stress=[checkpoint_slow]"</code>''', + type='list', undoc=True, choices=[ + 'checkpoint_slow']), Config('error_prefix', '', r''' prefix string for error messages'''), Config('eviction', '', r''' diff --git a/dist/flags.py b/dist/flags.py index 8edabd69648..43db7a67054 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -8,6 +8,9 @@ flags = { ################################################### # Internal routine flag declarations ################################################### + 'diagnostic_timing_stress' : [ + 'TIMING_STRESS_CHECKPOINT_SLOW', + ], 'log_scan' : [ 'LOGSCAN_FIRST', 'LOGSCAN_FROM_CKP', diff --git a/src/config/config_def.c b/src/config/config_def.c index f737b49866c..30b2b3e0e4d 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -117,6 +117,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "checkpoint", "category", NULL, NULL, confchk_wiredtiger_open_checkpoint_subconfigs, 2 }, + { "diagnostic_timing_stress", "list", + NULL, "choices=[\"checkpoint_slow\"]", + NULL, 0 }, { "error_prefix", "string", NULL, NULL, NULL, 0 }, { "eviction", "category", NULL, NULL, @@ -711,6 +714,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "checkpoint_sync", "boolean", NULL, NULL, NULL, 0 }, { "config_base", "boolean", NULL, NULL, NULL, 0 }, { "create", "boolean", NULL, NULL, NULL, 0 }, + { "diagnostic_timing_stress", "list", + NULL, "choices=[\"checkpoint_slow\"]", + NULL, 0 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -799,6 +805,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "checkpoint_sync", "boolean", NULL, NULL, NULL, 0 }, { "config_base", "boolean", NULL, NULL, NULL, 0 }, { "create", "boolean", NULL, NULL, NULL, 0 }, + { "diagnostic_timing_stress", "list", + NULL, "choices=[\"checkpoint_slow\"]", + NULL, 0 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -886,6 +895,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, NULL, confchk_wiredtiger_open_checkpoint_subconfigs, 2 }, { "checkpoint_sync", "boolean", NULL, NULL, NULL, 0 }, + { "diagnostic_timing_stress", "list", + NULL, "choices=[\"checkpoint_slow\"]", + NULL, 0 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -969,6 +981,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, NULL, confchk_wiredtiger_open_checkpoint_subconfigs, 2 }, { "checkpoint_sync", "boolean", NULL, NULL, NULL, 0 }, + { "diagnostic_timing_stress", "list", + NULL, "choices=[\"checkpoint_slow\"]", + NULL, 0 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -1083,18 +1098,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "WT_CONNECTION.reconfigure", "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8," - "cache_size=100MB,checkpoint=(log_size=0,wait=0),error_prefix=," - "eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," - "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" - ",file_manager=(close_handle_minimum=250,close_idle_time=30," + "cache_size=100MB,checkpoint=(log_size=0,wait=0)," + "diagnostic_timing_stress=,error_prefix=,eviction=(threads_max=8," + "threads_min=1),eviction_checkpoint_target=5," + "eviction_dirty_target=5,eviction_dirty_trigger=20," + "eviction_target=80,eviction_trigger=95," + "file_manager=(close_handle_minimum=250,close_idle_time=30," "close_scan_interval=10),log=(archive=true,prealloc=true," "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," "lsm_merge=true,shared_cache=(chunk=10MB,name=,quota=0,reserve=0," "size=500MB),statistics=none,statistics_log=(json=false," "on_close=false,sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "verbose=", - confchk_WT_CONNECTION_reconfigure, 19 + confchk_WT_CONNECTION_reconfigure, 20 }, { "WT_CONNECTION.set_file_system", "", @@ -1302,57 +1318,57 @@ static const WT_CONFIG_ENTRY config_entries[] = { "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=8," - "threads_min=1),eviction_checkpoint_target=5," - "eviction_dirty_target=5,eviction_dirty_trigger=20," - "eviction_target=80,eviction_trigger=95,exclusive=false," - "extensions=,file_extend=,file_manager=(close_handle_minimum=250," - "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "in_memory=false,log=(archive=true,compressor=,enabled=false," - "file_max=100MB,path=\".\",prealloc=true,recover=on," - "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," - "lsm_merge=true,mmap=true,multiprocess=false,readonly=false," - "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" - ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(json=false,on_close=false,path=\".\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0)," - "transaction_sync=(enabled=false,method=fsync)," + "config_base=true,create=false,diagnostic_timing_stress=," + "direct_io=,encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=5,eviction_dirty_target=5," + "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" + ",exclusive=false,extensions=,file_extend=," + "file_manager=(close_handle_minimum=250,close_idle_time=30," + "close_scan_interval=10),hazard_max=1000,in_memory=false," + "log=(archive=true,compressor=,enabled=false,file_max=100MB," + "path=\".\",prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,readonly=false,session_max=100," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),transaction_sync=(enabled=false,method=fsync)," "use_environment=true,use_environment_priv=false,verbose=," "write_through=", - confchk_wiredtiger_open, 40 + confchk_wiredtiger_open, 41 }, { "wiredtiger_open_all", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=8," - "threads_min=1),eviction_checkpoint_target=5," - "eviction_dirty_target=5,eviction_dirty_trigger=20," - "eviction_target=80,eviction_trigger=95,exclusive=false," - "extensions=,file_extend=,file_manager=(close_handle_minimum=250," - "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "in_memory=false,log=(archive=true,compressor=,enabled=false," - "file_max=100MB,path=\".\",prealloc=true,recover=on," - "zero_fill=false),lsm_manager=(merge=true,worker_thread_max=4)," - "lsm_merge=true,mmap=true,multiprocess=false,readonly=false," - "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" - ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(json=false,on_close=false,path=\".\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0)," - "transaction_sync=(enabled=false,method=fsync)," + "config_base=true,create=false,diagnostic_timing_stress=," + "direct_io=,encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=5,eviction_dirty_target=5," + "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" + ",exclusive=false,extensions=,file_extend=," + "file_manager=(close_handle_minimum=250,close_idle_time=30," + "close_scan_interval=10),hazard_max=1000,in_memory=false," + "log=(archive=true,compressor=,enabled=false,file_max=100MB," + "path=\".\",prealloc=true,recover=on,zero_fill=false)," + "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," + "mmap=true,multiprocess=false,readonly=false,session_max=100," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),transaction_sync=(enabled=false,method=fsync)," "use_environment=true,use_environment_priv=false,verbose=," "version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 41 + confchk_wiredtiger_open_all, 42 }, { "wiredtiger_open_basecfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," - "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," - "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "diagnostic_timing_stress=,direct_io=,encryption=(keyid=,name=," + "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)" + ",eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," @@ -1365,15 +1381,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" ",wait=0),transaction_sync=(enabled=false,method=fsync),verbose=," "version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 35 + confchk_wiredtiger_open_basecfg, 36 }, { "wiredtiger_open_usercfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," - "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," - "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=8,threads_min=1)," - "eviction_checkpoint_target=5,eviction_dirty_target=5," + "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," + "diagnostic_timing_stress=,direct_io=,encryption=(keyid=,name=," + "secretkey=),error_prefix=,eviction=(threads_max=8,threads_min=1)" + ",eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," @@ -1386,7 +1402,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" ",wait=0),transaction_sync=(enabled=false,method=fsync),verbose=," "write_through=", - confchk_wiredtiger_open_usercfg, 34 + confchk_wiredtiger_open_usercfg, 35 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 46c170d2dad..cb0aa17791e 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1143,6 +1143,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_ERR(__wt_statlog_create(session, cfg)); WT_ERR(__wt_sweep_config(session, cfg)); WT_ERR(__wt_verbose_config(session, cfg)); + WT_ERR(__wt_timing_stress_config(session, cfg)); /* Third, merge everything together, creating a new connection state. */ WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); @@ -1894,6 +1895,50 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) } /* + * __wt_timing_stress_config -- + * Set diagnostic stress timing delay configuration. + */ +int +__wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) +{ + static const WT_NAME_FLAG stress_types[] = { + { "checkpoint_slow", WT_TIMING_STRESS_CHECKPOINT_SLOW }, + { NULL, 0 } + }; + WT_CONFIG_ITEM cval, sval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const WT_NAME_FLAG *ft; + uint32_t flags; + + conn = S2C(session); + + WT_RET(__wt_config_gets( + session, cfg, "diagnostic_timing_stress", &cval)); + + flags = 0; + for (ft = stress_types; ft->name != NULL; ft++) { + if ((ret = __wt_config_subgets( + session, &cval, ft->name, &sval)) == 0 && sval.val != 0) { +#ifdef HAVE_DIAGNOSTIC + LF_SET(ft->flag); +#else + WT_RET_MSG(session, EINVAL, + "diagnostic_timing_stress option specified when " + "WiredTiger built without diagnostic support. Add " + "--enable-diagnostic to configure command and " + "rebuild to include support for diagnostic stress " + "timing delays"); +#endif + } + WT_RET_NOTFOUND_OK(ret); + } + + conn->timing_stress_flags = flags; + return (0); +} + +/* * __conn_write_base_config -- * Save the base configuration used to create a database. */ @@ -2359,6 +2404,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session, cval.str, cval.len, &conn->error_prefix)); } WT_ERR(__wt_verbose_config(session, cfg)); + WT_ERR(__wt_timing_stress_config(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval)); conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS; diff --git a/src/include/connection.h b/src/include/connection.h index 56d801cd361..94abe48ffe5 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -410,6 +410,12 @@ struct __wt_connection_impl { int page_size; /* OS page size for mmap alignment */ uint32_t verbose; + /* + * Variable with flags for which subsystems the diagnostic stress timing + * delays have been requested. + */ + uint32_t timing_stress_flags; + #define WT_STDERR(s) (&S2C(s)->wt_stderr) #define WT_STDOUT(s) (&S2C(s)->wt_stdout) WT_FSTREAM wt_stderr, wt_stdout; diff --git a/src/include/extern.h b/src/include/extern.h index e7be1493875..59e853d75f4 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -244,6 +244,7 @@ extern int __wt_conn_remove_encryptor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_EXTRACTOR **extractorp, int *ownp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cache_config(WT_SESSION_IMPL *session, bool reconfigure, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); diff --git a/src/include/flags.h b/src/include/flags.h index 919c0dd2f98..ef66a186fa4 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -78,6 +78,7 @@ #define WT_STAT_TYPE_FAST 0x00000020 #define WT_STAT_TYPE_SIZE 0x00000040 #define WT_STAT_TYPE_TREE_WALK 0x00000080 +#define WT_TIMING_STRESS_CHECKPOINT_SLOW 0x00000001 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 0cbbeedfaaf..73ad96b5518 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -8,6 +8,7 @@ #include "wt_internal.h" +static void __checkpoint_timing_stress(WT_SESSION_IMPL *); static int __checkpoint_lock_dirty_tree( WT_SESSION_IMPL *, bool, bool, bool, const char *[]); static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool); @@ -780,6 +781,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); + __checkpoint_timing_stress(session); + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); /* @@ -1707,3 +1710,29 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) return (ret); } + +/* + * __checkpoint_timing_stress -- + * Optionally add a 10 second delay to a checkpoint to simulate a long + * running checkpoint for debug purposes. The reason for this option is + * finding operations that can block while waiting for a checkpoint to + * complete. + */ +static void +__checkpoint_timing_stress(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * We only want to sleep if the flag is set and the checkpoint comes + * from the API, so check if the session used is either of the two + * sessions set aside for internal checkpoints. + */ + if (conn->ckpt_session != session && + conn->meta_ckpt_session != session && + FLD_ISSET(conn->timing_stress_flags, + WT_TIMING_STRESS_CHECKPOINT_SLOW)) + __wt_sleep(10, 0); +} diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am index f2b4fcacdc8..0b117a6588b 100644 --- a/test/csuite/Makefile.am +++ b/test/csuite/Makefile.am @@ -57,6 +57,9 @@ noinst_PROGRAMS += test_wt3135_search_near_collator test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c noinst_PROGRAMS += test_wt3184_dup_index_collator +test_wt3363_checkpoint_op_races_SOURCES = wt3363_checkpoint_op_races/main.c +noinst_PROGRAMS += test_wt3363_checkpoint_op_races + test_rwlock_SOURCES = rwlock/main.c noinst_PROGRAMS += test_rwlock diff --git a/test/csuite/wt3363_checkpoint_op_races/main.c b/test/csuite/wt3363_checkpoint_op_races/main.c new file mode 100644 index 00000000000..bfe3ef2d140 --- /dev/null +++ b/test/csuite/wt3363_checkpoint_op_races/main.c @@ -0,0 +1,268 @@ +/*- + * Public Domain 2014-2017 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +/* + * JIRA ticket reference: WT-3363 + * + * Test case description: There are a number of operations that we run that we + * expect not to conflict with or block against a running checkpoint. This test + * aims to run repeated checkpoints in a thread, while running an assortment + * of operations that we expect to execute quickly on further threads. To + * ensure that we catch any blockages we introduce a very large delay into the + * checkpoint and measure that no operation takes 1/2 the length of this delay. + * + * Failure mode: We monitor the execution time of all operations and if we find + * any operation taking longer than 1/2 the delay time, we abort dumping a core + * file which can be used to determine what operation was blocked. + */ +void* do_checkpoints(void *); +void* do_ops(void *); +void* monitor(void *); + +/* + * Time delay to introduce into checkpoints in seconds. Should be at-least + * double the maximum time that any one of the operations should take. Currently + * this is set to 10 seconds and we expect no single operation to take longer + * than 5 seconds. + */ +#define MAX_EXECUTION_TIME 10 +#define N_THREADS 10 + +/* + * Number of seconds to execute for. Initially set to 15 minutes, as we need to + * run long enough to be certain we have captured any blockages. In initial + * testing 5 minutes was enough to reproduce the issue, so we run for 3x that + * here to ensure we reproduce before declaring success. + */ +#define RUNTIME 900.0 + +static WT_EVENT_HANDLER event_handler = { + handle_op_error, + handle_op_message, + NULL, + NULL +}; + +int +main(int argc, char *argv[]) +{ + TEST_PER_THREAD_OPTS thread_args[N_THREADS]; + TEST_OPTS *opts, _opts; + pthread_t ckpt_thread, mon_thread, threads[N_THREADS]; + int i; + bool diagnostic; + +#ifdef HAVE_DIAGNOSTIC + diagnostic = true; +#else + diagnostic = false; +#endif + + /* + * This test should not run unless we have compiled with diagnostic + * support and the long tests flag is set. The test will fail when + * attempting to set the option to add the delays to checkpoints if + * diagnostic mode is not enable and runs for 15 minutes. + */ + if (!testutil_enable_long_tests() || !diagnostic) + return (EXIT_SUCCESS); + + opts = &_opts; + opts->unique_id = 0; + memset(opts, 0, sizeof(*opts)); + + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_make_work_dir(opts->home); + + testutil_check(wiredtiger_open(opts->home, &event_handler, + "create,cache_size=1G,diagnostic_timing_stress=[checkpoint_slow]", + &opts->conn)); + + testutil_check(pthread_create( + &ckpt_thread, NULL, do_checkpoints, (void *)opts)); + + for (i = 0; i < N_THREADS; ++i) { + thread_args[i].testopts = opts; + thread_args[i].thread_counter = 0; + thread_args[i].threadnum = i; + testutil_check(pthread_create( + &threads[i], NULL, do_ops, (void *)&thread_args[i])); + } + + /* + * Pass the whole array of thread arguments to the monitoring thread. + * This thread will need to monitor each threads counter to track if it + * is stuck. + */ + testutil_check( + pthread_create(&mon_thread, NULL, monitor, &thread_args)); + + for (i = 0; i < N_THREADS; ++i) + testutil_check(pthread_join(threads[i], NULL)); + + testutil_check(pthread_join(mon_thread, NULL)); + + testutil_check(pthread_join(ckpt_thread, NULL)); + + printf("Success\n"); + + testutil_cleanup(opts); + return (EXIT_SUCCESS); +} + +/* + * Function for repeatedly running checkpoint operations. + */ +void * +do_checkpoints(void *_opts) +{ + TEST_OPTS *opts; + WT_SESSION *session; + time_t now, start; + int ret; + + opts = (TEST_OPTS *)_opts; + (void)time(&start); + (void)time(&now); + + while (difftime(now, start) < RUNTIME) { + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + if ((ret = session->checkpoint(session, "force")) != 0) + if (ret != EBUSY && ret != ENOENT) + testutil_die(ret, "session.checkpoint"); + + testutil_check(session->close(session, NULL)); + + /* + * A short sleep to let operations process and avoid back to + * back checkpoints locking up resources. + */ + sleep(1); + (void)time(&now); + } + + return (NULL); +} + +/* + * Function to monitor running operations and abort to dump core in the event + * that we catch an operation running long. + */ +void * +monitor(void *args) +{ + TEST_PER_THREAD_OPTS *thread_args; + time_t now, start; + int ctr, i, last_ops[N_THREADS]; + + thread_args = (TEST_PER_THREAD_OPTS*)args; + + (void)time(&start); + (void)time(&now); + + memset(last_ops, 0, sizeof(int) + N_THREADS); + + while (difftime(now, start) < RUNTIME) { + + /* + * Checkpoints will run for slightly over MAX_EXECUTION_TIME. + * MAX_EXECUTION_TIME should always be long enough that we can + * complete any single operation in 1/2 that time. + */ + sleep(MAX_EXECUTION_TIME/2); + + for (i = 0; i < N_THREADS; i++) { + ctr = thread_args[i].thread_counter; + + /* Ignore any threads which may not have started yet. */ + if (ctr == 0) + continue; + + /* + * We track how many operations each thread has done. If + * we have slept and the counter remains the same for a + * thread it is stuck and should drop a core so the + * cause of the hang can be investigated. + */ + if (ctr != last_ops[i]) + last_ops[i] = ctr; + else { + printf("Thread %d had a task running" + " for more than %d seconds\n", + i, MAX_EXECUTION_TIME/2); + abort(); + } + } + (void)time(&now); + } + + return (NULL); +} + +/* + * Worker thread. Executes random operations from the set of 6. + */ +void * +do_ops(void *args) +{ + WT_RAND_STATE rnd; + time_t now, start; + + __wt_random_init_seed(NULL, &rnd); + (void)time(&start); + (void)time(&now); + + while (difftime(now, start) < RUNTIME) { + switch (__wt_random(&rnd) % 6) { + case 0: + op_bulk(args); + break; + case 1: + op_create(args); + break; + case 2: + op_cursor(args); + break; + case 3: + op_drop(args); + break; + case 4: + op_bulk_unique(args); + break; + case 5: + op_create_unique(args); + break; + } + (void)time(&now); + } + + return (NULL); +} diff --git a/test/utility/test_util.h b/test/utility/test_util.h index dd198fa0081..2e3c43dca40 100644 --- a/test/utility/test_util.h +++ b/test/utility/test_util.h @@ -72,10 +72,21 @@ typedef struct { bool running; char *uri; volatile uint64_t next_threadid; + uint64_t unique_id; uint64_t max_inserted_id; } TEST_OPTS; /* + * A structure for the data specific to a single thread of those used by the + * group of threads defined below. + */ +typedef struct { + TEST_OPTS *testopts; + int threadnum; + int thread_counter; +} TEST_PER_THREAD_OPTS; + +/* * testutil_assert -- * Complain and quit if something isn't true. */ @@ -198,6 +209,20 @@ void *drealloc(void *, size_t); void *dstrdup(const void *); void *dstrndup(const char *, size_t); const char *example_setup(int, char * const *); + +/* + * The functions below can generate errors that we wish to ignore. We have + * handler functions available for them here, to avoid making tests crash + * prematurely. + */ +int handle_op_error(WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); +int handle_op_message(WT_EVENT_HANDLER *, WT_SESSION *, const char *); +void *op_bulk(void *); +void *op_bulk_unique(void *); +void *op_cursor(void *); +void *op_create(void *); +void *op_create_unique(void *); +void *op_drop(void *); void testutil_clean_work_dir(const char *); void testutil_cleanup(TEST_OPTS *); bool testutil_enable_long_tests(void); diff --git a/test/utility/thread.c b/test/utility/thread.c index 08f49c54c5e..4c5202016a5 100644 --- a/test/utility/thread.c +++ b/test/utility/thread.c @@ -140,3 +140,285 @@ thread_prev(void *arg) testutil_check(session->close(session, NULL)); return (NULL); } + +/* + * Below are a series of functions originally designed for test/fops. These + * threads perform a series of simple API access calls, such as opening and + * closing sessions and cursors. These functions require use of the + * TEST_PER_THREAD_OPTS structure in test_util.h. Additionally there are two + * event handler functions that should be used to suppress "expected" errors + * that these functions generate. An example of the use of these functions and + * structures is in the csuite test wt3363_checkpoint_op_races. + */ + +/* + * Handle errors that generated by series of functions below that we can safely + * ignore. + */ +int +handle_op_error(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *errmsg) +{ + (void)(handler); + (void)(session); + + /* + * Ignore complaints about missing files. It's unlikely but possible + * that checkpoints and cursor open operations can return this due to + * the sequencing of the various ops. + */ + if (error == ENOENT) + return (0); + + /* Ignore complaints about failure to open bulk cursors. */ + if (strstr( + errmsg, "bulk-load is only supported on newly created") != NULL) + return (0); + + return (fprintf(stderr, "%s\n", errmsg) < 0 ? -1 : 0); +} + +/* + * Handle messages generated by the functions below that we can safely ignore. + */ +int +handle_op_message(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message) +{ + (void)(handler); + (void)(session); + + /* Ignore messages about failing to create forced checkpoints. */ + if (strstr(message, "forced or named checkpoint") != NULL) + return (0); + + return (printf("%s\n", message) < 0 ? -1 : 0); +} + +/* + * Create a table and open a bulk cursor on it. + */ +void * +op_bulk(void *arg) +{ + TEST_OPTS *opts; + TEST_PER_THREAD_OPTS *args; + WT_CURSOR *c; + WT_SESSION *session; + int ret; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + if ((ret = session->create(session, opts->uri, NULL)) != 0) + if (ret != EEXIST && ret != EBUSY) + testutil_die(ret, "session.create"); + + if (ret == 0) { + __wt_yield(); + if ((ret = session->open_cursor(session, + opts->uri, NULL, "bulk,checkpoint_wait=false", &c)) == 0) { + testutil_check(c->close(c)); + } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL) + testutil_die(ret, "session.open_cursor bulk"); + } + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} + +/* + * Create a guaranteed unique table and open and close a bulk cursor on it. + */ +void * +op_bulk_unique(void *arg) +{ + TEST_OPTS *opts; + TEST_PER_THREAD_OPTS *args; + WT_CURSOR *c; + WT_RAND_STATE rnd; + WT_SESSION *session; + int ret; + char new_uri[64]; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + __wt_random_init_seed(NULL, &rnd); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + /* Generate a unique object name. */ + testutil_check(__wt_snprintf( + new_uri, sizeof(new_uri), "%s.%u", + opts->uri, __wt_atomic_add64(&opts->unique_id, 1))); + testutil_check(session->create(session, new_uri, NULL)); + + __wt_yield(); + + /* + * Opening a bulk cursor may have raced with a forced checkpoint + * which created a checkpoint of the empty file, and triggers an EINVAL. + */ + if ((ret = session->open_cursor( + session, new_uri, NULL, "bulk,checkpoint_wait=false", &c)) == 0) { + testutil_check(c->close(c)); + } else if (ret != EINVAL && ret != EBUSY) + testutil_die(ret, + "session.open_cursor bulk unique: %s", new_uri); + + while ((ret = session->drop(session, new_uri, __wt_random(&rnd) & 1 ? + "force,checkpoint_wait=false" : "checkpoint_wait=false")) != 0) + if (ret != EBUSY) + testutil_die(ret, "session.drop: %s", new_uri); + else + /* + * The EBUSY is expected when we run with + * checkpoint_wait set to false, so we increment the + * counter while in this loop to avoid false positives. + */ + args->thread_counter++; + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} + +/* + * Open and close cursor on a table. + */ +void * +op_cursor(void *arg) +{ + TEST_OPTS *opts; + TEST_PER_THREAD_OPTS *args; + WT_SESSION *session; + WT_CURSOR *cursor; + int ret; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + if ((ret = session->open_cursor( + session, opts->uri, NULL, NULL, &cursor)) != 0) { + if (ret != ENOENT && ret != EBUSY) + testutil_die(ret, "session.open_cursor"); + } else + testutil_check(cursor->close(cursor)); + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} + +/* + * Create a table. + */ +void * +op_create(void *arg) +{ + TEST_OPTS *opts; + TEST_PER_THREAD_OPTS *args; + WT_SESSION *session; + int ret; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + if ((ret = session->create(session, opts->uri, NULL)) != 0) + if (ret != EEXIST && ret != EBUSY) + testutil_die(ret, "session.create"); + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} + +/* + * Create and drop a unique guaranteed table. + */ +void * +op_create_unique(void *arg) +{ + TEST_OPTS *opts; + TEST_PER_THREAD_OPTS *args; + WT_RAND_STATE rnd; + WT_SESSION *session; + int ret; + char new_uri[64]; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + __wt_random_init_seed(NULL, &rnd); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + /* Generate a unique object name. */ + testutil_check(__wt_snprintf( + new_uri, sizeof(new_uri), "%s.%u", + opts->uri, __wt_atomic_add64(&opts->unique_id, 1))); + testutil_check(session->create(session, new_uri, NULL)); + + __wt_yield(); + while ((ret = session->drop(session, new_uri, __wt_random(&rnd) & 1 ? + "force,checkpoint_wait=false" : "checkpoint_wait=false")) != 0) + if (ret != EBUSY) + testutil_die(ret, "session.drop: %s", new_uri); + else + /* + * The EBUSY is expected when we run with + * checkpoint_wait set to false, so we increment the + * counter while in this loop to avoid false positives. + */ + args->thread_counter++; + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} + +/* + * Drop a table. + */ +void * +op_drop(void *arg) +{ + TEST_PER_THREAD_OPTS *args; + TEST_OPTS *opts; + WT_RAND_STATE rnd; + WT_SESSION *session; + int ret; + + args = (TEST_PER_THREAD_OPTS *)arg; + opts = args->testopts; + __wt_random_init_seed(NULL, &rnd); + + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + + if ((ret = session->drop(session, opts->uri, __wt_random(&rnd) & 1 ? + "force,checkpoint_wait=false" : "checkpoint_wait=false")) != 0) + if (ret != ENOENT && ret != EBUSY) + testutil_die(ret, "session.drop"); + + testutil_check(session->close(session, NULL)); + args->thread_counter++; + + return (NULL); +} |