diff options
251 files changed, 12175 insertions, 3765 deletions
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c index 3cb20ff2b26..e83d6fcceed 100644 --- a/bench/wtperf/config.c +++ b/bench/wtperf/config.c @@ -134,9 +134,11 @@ config_free(CONFIG *cfg) } cleanup_truncate_config(cfg); + free(cfg->base_uri); free(cfg->ckptthreads); + free(cfg->partial_config); free(cfg->popthreads); - free(cfg->base_uri); + free(cfg->reopen_config); free(cfg->workers); free(cfg->workload); } @@ -157,13 +159,19 @@ config_compress(CONFIG *cfg) cfg->compress_ext = NULL; cfg->compress_table = NULL; } else if (strcmp(s, "lz4") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_LZ4 cfg->compress_ext = LZ4_EXT; +#endif cfg->compress_table = LZ4_BLK; } else if (strcmp(s, "snappy") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY cfg->compress_ext = SNAPPY_EXT; +#endif cfg->compress_table = SNAPPY_BLK; } else if (strcmp(s, "zlib") == 0) { +#ifndef HAVE_BUILTIN_EXTENSION_ZLIB cfg->compress_ext = ZLIB_EXT; +#endif cfg->compress_table = ZLIB_BLK; } else { fprintf(stderr, @@ -233,10 +241,6 @@ config_threads(CONFIG *cfg, const char *config, size_t len) goto err; continue; } - if (STRING_MATCH("throttle", k.str, k.len)) { - workp->throttle = (uint64_t)v.val; - continue; - } if (STRING_MATCH("insert", k.str, k.len) || STRING_MATCH("inserts", k.str, k.len)) { if ((workp->insert = v.val) < 0) @@ -254,20 +258,17 @@ config_threads(CONFIG *cfg, const char *config, size_t len) goto err; continue; } - if (STRING_MATCH("update", k.str, k.len) || - STRING_MATCH("updates", k.str, k.len)) { - if ((workp->update = v.val) < 0) - goto err; + if (STRING_MATCH("throttle", k.str, k.len)) { + workp->throttle = (uint64_t)v.val; continue; } if (STRING_MATCH("truncate", k.str, k.len)) { if ((workp->truncate = v.val) != 1) goto err; /* There can only be one Truncate thread. 
*/ - if (cfg->has_truncate != 0) { + if (F_ISSET(cfg, CFG_TRUNCATE)) goto err; - } - cfg->has_truncate = 1; + F_SET(cfg, CFG_TRUNCATE); continue; } if (STRING_MATCH("truncate_pct", k.str, k.len)) { @@ -282,6 +283,29 @@ config_threads(CONFIG *cfg, const char *config, size_t len) workp->truncate_count = (uint64_t)v.val; continue; } + if (STRING_MATCH("update", k.str, k.len) || + STRING_MATCH("updates", k.str, k.len)) { + if ((workp->update = v.val) < 0) + goto err; + continue; + } + if (STRING_MATCH("update_delta", k.str, k.len)) { + if (v.type == WT_CONFIG_ITEM_STRING || + v.type == WT_CONFIG_ITEM_ID) { + if (strncmp(v.str, "rand", 4) != 0) + goto err; + /* Special random value */ + workp->update_delta = INT64_MAX; + F_SET(cfg, CFG_GROW); + } else { + workp->update_delta = v.val; + if (v.val > 0) + F_SET(cfg, CFG_GROW); + if (v.val < 0) + F_SET(cfg, CFG_SHRINK); + } + continue; + } goto err; } if (ret == WT_NOTFOUND) @@ -401,7 +425,12 @@ config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) *(uint32_t *)valueloc = (uint32_t)v->val; break; case CONFIG_STRING_TYPE: - if (v->type != WT_CONFIG_ITEM_STRING) { + /* + * Configuration parsing uses string/ID to distinguish + * between quoted and unquoted values. 
+ */ + if (v->type != WT_CONFIG_ITEM_STRING && + v->type != WT_CONFIG_ITEM_ID) { fprintf(stderr, "wtperf: Error: " "bad string value for \'%.*s=%.*s\'\n", (int)k->len, k->str, (int)v->len, v->str); @@ -430,7 +459,8 @@ config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v) STRING_MATCH("threads", k->str, k->len)) return (config_threads(cfg, v->str, v->len)); - if (v->type != WT_CONFIG_ITEM_STRING) { + if (v->type != WT_CONFIG_ITEM_STRING && + v->type != WT_CONFIG_ITEM_ID) { fprintf(stderr, "wtperf: Error: " "bad string value for \'%.*s=%.*s\'\n", (int)k->len, k->str, (int)v->len, v->str); @@ -634,6 +664,9 @@ config_opt_str(CONFIG *cfg, const char *name, const char *value) int config_sanity(CONFIG *cfg) { + WORKLOAD *workp; + u_int i; + /* Various intervals should be less than the run-time. */ if (cfg->run_time > 0 && ((cfg->checkpoint_threads != 0 && @@ -660,6 +693,36 @@ config_sanity(CONFIG *cfg) "Invalid pareto distribution - should be a percentage\n"); return (EINVAL); } + + if (cfg->value_sz_max < cfg->value_sz) { + if (F_ISSET(cfg, CFG_GROW)) { + fprintf(stderr, "value_sz_max %" PRIu32 + " must be greater than or equal to value_sz %" + PRIu32 "\n", cfg->value_sz_max, cfg->value_sz); + return (EINVAL); + } else + cfg->value_sz_max = cfg->value_sz; + } + if (cfg->value_sz_min > cfg->value_sz) { + if (F_ISSET(cfg, CFG_SHRINK)) { + fprintf(stderr, "value_sz_min %" PRIu32 + " must be less than or equal to value_sz %" + PRIu32 "\n", cfg->value_sz_min, cfg->value_sz); + return (EINVAL); + } else + cfg->value_sz_min = cfg->value_sz; + } + + if (cfg->readonly && cfg->workload != NULL) + for (i = 0, workp = cfg->workload; + i < cfg->workload_cnt; ++i, ++workp) + if (workp->insert != 0 || workp->update != 0 || + workp->truncate != 0) { + fprintf(stderr, + "Invalid workload: insert, update or " + "truncate specified with readonly\n"); + return (EINVAL); + } return (0); } diff --git a/bench/wtperf/runners/evict-btree-readonly.wtperf 
b/bench/wtperf/runners/evict-btree-readonly.wtperf new file mode 100644 index 00000000000..25599fadd8d --- /dev/null +++ b/bench/wtperf/runners/evict-btree-readonly.wtperf @@ -0,0 +1,12 @@ +# wtperf options file: evict btree configuration +conn_config="cache_size=50M,eviction=(threads_max=4),mmap=false" +table_config="type=file" +icount=10000000 +report_interval=5 +run_time=120 +populate_threads=1 +readonly=true +threads=((count=16,reads=1)) +# Add throughput/latency monitoring +max_latency=2000 +sample_interval=5 diff --git a/bench/wtperf/runners/evict-btree.wtperf b/bench/wtperf/runners/evict-btree.wtperf index 24da4dd7902..e7d967e5c63 100644 --- a/bench/wtperf/runners/evict-btree.wtperf +++ b/bench/wtperf/runners/evict-btree.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict btree configuration -conn_config="cache_size=50M" +conn_config="cache_size=50M,eviction=(threads_max=4)" table_config="type=file" icount=10000000 report_interval=5 diff --git a/bench/wtperf/runners/evict-lsm-readonly.wtperf b/bench/wtperf/runners/evict-lsm-readonly.wtperf new file mode 100644 index 00000000000..661b8e21924 --- /dev/null +++ b/bench/wtperf/runners/evict-lsm-readonly.wtperf @@ -0,0 +1,13 @@ +# wtperf options file: evict lsm configuration +conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" +table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" +compact=true +icount=10000000 +report_interval=5 +run_time=120 +populate_threads=1 +readonly=true +threads=((count=16,reads=1)) +# Add throughput/latency monitoring +max_latency=2000 +sample_interval=5 diff --git a/bench/wtperf/runners/evict-lsm.wtperf b/bench/wtperf/runners/evict-lsm.wtperf index ad885d98eb7..b872d429046 100644 --- a/bench/wtperf/runners/evict-lsm.wtperf +++ b/bench/wtperf/runners/evict-lsm.wtperf @@ -1,5 +1,5 @@ # wtperf options file: evict lsm configuration -conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6)" 
+conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6),eviction=(threads_max=4)" table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB" compact=true icount=10000000 diff --git a/bench/wtperf/runners/update-delta-mix1.wtperf b/bench/wtperf/runners/update-delta-mix1.wtperf new file mode 100644 index 00000000000..0f5e75f5347 --- /dev/null +++ b/bench/wtperf/runners/update-delta-mix1.wtperf @@ -0,0 +1,18 @@ +# wtperf options file: Mixed workload where we grow some values and shrink +# others. Mixed load leaning toward growing the dataset. +# +conn_config="cache_size=2GB,checkpoint=(wait=30)" +table_config="type=file,leaf_page_max=32k,leaf_value_max=128k,split_pct=90" +# The values are starting small, insert a lot so our database grows larger than +# cache quickly. +icount=200000 +report_interval=5 +run_time=300 +populate_threads=1 +# +# Run more grow workload threads than shrink threads. +# +threads=((count=4,update=1,update_delta=100),(count=2,update=1,update_delta=-150)) +value_sz=20000 +value_sz_min=1000 +value_sz_max=65536 diff --git a/bench/wtperf/runners/update-delta-mix2.wtperf b/bench/wtperf/runners/update-delta-mix2.wtperf new file mode 100644 index 00000000000..f3ce2a455cc --- /dev/null +++ b/bench/wtperf/runners/update-delta-mix2.wtperf @@ -0,0 +1,18 @@ +# wtperf options file: Mixed workload where we grow some values and shrink +# others. Mixed load leaning toward shrinking the dataset. +# +conn_config="cache_size=2GB,checkpoint=(wait=30)" +table_config="type=file,leaf_page_max=32k,leaf_value_max=128k,split_pct=90" +# The values are starting small, insert a lot so our database grows larger than +# cache quickly. +icount=200000 +report_interval=5 +run_time=300 +populate_threads=1 +# +# Run more shrink workload threads than grow threads. 
+# +threads=((count=2,update=1,update_delta=150),(count=4,update=1,update_delta=-100)) +value_sz=20000 +value_sz_min=1000 +value_sz_max=65536 diff --git a/bench/wtperf/runners/update-delta-mix3.wtperf b/bench/wtperf/runners/update-delta-mix3.wtperf new file mode 100644 index 00000000000..606eb727eef --- /dev/null +++ b/bench/wtperf/runners/update-delta-mix3.wtperf @@ -0,0 +1,18 @@ +# wtperf options file: Mixed workload where we grow some values and shrink +# others. Mixed load leaning toward mostly a balance. +# +conn_config="cache_size=2GB,checkpoint=(wait=30)" +table_config="type=file,leaf_page_max=32k,leaf_value_max=128k,split_pct=90" +# The values are starting small, insert a lot so our database grows larger than +# cache quickly. +icount=200000 +report_interval=5 +run_time=300 +populate_threads=1 +# +# Run a balance of threads. +# +threads=((count=3,update=1,update_delta=100),(count=3,update=1,update_delta=-100)) +value_sz=20000 +value_sz_min=1000 +value_sz_max=65536 diff --git a/bench/wtperf/runners/update-grow-stress.wtperf b/bench/wtperf/runners/update-grow-stress.wtperf new file mode 100644 index 00000000000..f7403e1578d --- /dev/null +++ b/bench/wtperf/runners/update-grow-stress.wtperf @@ -0,0 +1,15 @@ +# wtperf options file: Grow the size of documents while there is cache +# pressure and appends are happening as well. +conn_config="cache_size=2GB,checkpoint=(wait=30)" +table_config="type=file,leaf_page_max=32k,leaf_value_max=128k,split_pct=90" +# The values are starting small, insert a lot so our database grows larger than +# cache quickly. +icount=200000 +report_interval=5 +run_time=240 +populate_threads=1 +# Continue inserting new records. +threads=((count=1,inserts=1,throttle=1000),(count=4,update=1,update_delta=100)) +# Start with small values and let them grow slowly to large values. 
+value_sz=10000 +value_sz_max=65536 diff --git a/bench/wtperf/runners/update-shrink-stress.wtperf b/bench/wtperf/runners/update-shrink-stress.wtperf new file mode 100644 index 00000000000..bbdd9593b59 --- /dev/null +++ b/bench/wtperf/runners/update-shrink-stress.wtperf @@ -0,0 +1,15 @@ +# wtperf options file: Shrink the size of values. Checkpoint frequently +# and insert new records too. +# +conn_config="cache_size=2GB,checkpoint=(wait=30)" +table_config="type=file,leaf_page_max=32k,leaf_value_max=128k,split_pct=90" +# Since we're continually inserting, start with a smaller number initially. +icount=200000 +report_interval=5 +run_time=240 +populate_threads=1 +# Continue inserting new records. +threads=((count=1,inserts=1,throttle=1000),(count=4,update=1,update_delta=-100)) +# Start with moderate values and let them shrink slowly. +value_sz_min=1000 +value_sz=10000 diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index b2e68198e9a..340c400ba7e 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -33,6 +33,7 @@ static const CONFIG default_cfg = { "WT_TEST", /* home */ "WT_TEST", /* monitor dir */ NULL, /* partial logging */ + NULL, /* reopen config */ NULL, /* base_uri */ NULL, /* uris */ NULL, /* helium_mount */ @@ -59,7 +60,7 @@ static const CONFIG default_cfg = { 0, /* in warmup phase */ false, /* Signal for idle cycle thread */ 0, /* total seconds running */ - 0, /* has truncate */ + 0, /* flags */ {NULL, NULL}, /* the truncate queue */ {NULL, NULL}, /* the config queue */ @@ -86,6 +87,7 @@ static int start_threads(CONFIG *, WORKLOAD *, CONFIG_THREAD *, u_int, void *(*)(void *)); static int stop_threads(CONFIG *, u_int, CONFIG_THREAD *); static void *thread_run_wtperf(void *); +static void update_value_delta(CONFIG_THREAD *); static void *worker(void *); static uint64_t wtperf_rand(CONFIG_THREAD *); @@ -104,24 +106,93 @@ get_next_incr(CONFIG *cfg) return (__wt_atomic_add64(&cfg->insert_key, 1)); } +/* + * Each time this function is called 
we will overwrite the first and one + * other element in the value buffer. + */ static void randomize_value(CONFIG_THREAD *thread, char *value_buf) { uint8_t *vb; - uint32_t i; + uint32_t i, max_range, rand_val; + + /* + * Limit how much of the buffer we validate for length, this means + * that only threads that do growing updates will ever make changes to + * values outside of the initial value size, but that's a fair trade + * off for avoiding figuring out how long the value is more accurately + * in this performance sensitive function. + */ + if (thread->workload == NULL || thread->workload->update_delta == 0) + max_range = thread->cfg->value_sz; + else if (thread->workload->update_delta > 0) + max_range = thread->cfg->value_sz_max; + else + max_range = thread->cfg->value_sz_min; /* - * Each time we're called overwrite value_buf[0] and one other - * randomly chosen byte (other than the trailing NUL). - * Make sure we don't write a NUL: keep the value the same length. + * Generate a single random value and re-use it. We generally only + * have small ranges in this function, so avoiding a bunch of calls + * is worthwhile. + */ + rand_val = __wt_random(&thread->rnd); + i = rand_val % (max_range - 1); + + /* + * Ensure we don't write past the end of a value when configured for + * randomly sized values. */ - i = __wt_random(&thread->rnd) % (thread->cfg->value_sz - 1); while (value_buf[i] == '\0' && i > 0) --i; - if (i > 0) { - vb = (uint8_t *)value_buf; - vb[0] = (__wt_random(&thread->rnd) % 255) + 1; - vb[i] = (__wt_random(&thread->rnd) % 255) + 1; + + vb = (uint8_t *)value_buf; + vb[0] = ((rand_val >> 8) % 255) + 1; + /* + * If i happened to be 0, we'll be re-writing the same value + * twice, but that doesn't matter. + */ + vb[i] = ((rand_val >> 16) % 255) + 1; +} + +/* + * Figure out and extend the size of the value string, used for growing + * updates. We know that the value to be updated is in the threads value + * scratch buffer. 
+ */ +static inline void +update_value_delta(CONFIG_THREAD *thread) +{ + CONFIG *cfg; + char * value; + int64_t delta, len, new_len; + + cfg = thread->cfg; + value = thread->value_buf; + delta = thread->workload->update_delta; + len = (int64_t)strlen(value); + + if (delta == INT64_MAX) + delta = __wt_random(&thread->rnd) % + (cfg->value_sz_max - cfg->value_sz); + + /* Ensure we aren't changing across boundaries */ + if (delta > 0 && len + delta > cfg->value_sz_max) + delta = cfg->value_sz_max - len; + else if (delta < 0 && len + delta < cfg->value_sz_min) + delta = cfg->value_sz_min - len; + + /* Bail if there isn't anything to do */ + if (delta == 0) + return; + + if (delta < 0) + value[len + delta] = '\0'; + else { + /* Extend the value by the configured amount. */ + for (new_len = len; + new_len < cfg->value_sz_max && new_len - len < delta; + new_len++) + value[new_len] = 'a'; } } @@ -623,8 +694,10 @@ worker(void *arg) * Copy as much of the previous value as is * safe, and be sure to NUL-terminate. */ - strncpy(value_buf, value, cfg->value_sz); - value_buf[cfg->value_sz - 1] = '\0'; + strncpy(value_buf, + value, cfg->value_sz_max - 1); + if (thread->workload->update_delta != 0) + update_value_delta(thread); if (value_buf[0] == 'a') value_buf[0] = 'b'; else @@ -1517,7 +1590,7 @@ close_reopen(CONFIG *cfg) { int ret; - if (!cfg->reopen_connection) + if (!cfg->readonly && !cfg->reopen_connection) return (0); /* * Reopen the connection. 
We do this so that the workload phase always @@ -1533,7 +1606,7 @@ close_reopen(CONFIG *cfg) return (ret); } if ((ret = wiredtiger_open( - cfg->home, NULL, cfg->conn_config, &cfg->conn)) != 0) { + cfg->home, NULL, cfg->reopen_config, &cfg->conn)) != 0) { lprintf(cfg, ret, 0, "Re-opening the connection failed"); return (ret); } @@ -1595,7 +1668,7 @@ execute_workload(CONFIG *cfg) for (threads = cfg->workers, i = 0, workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) { lprintf(cfg, 0, 1, - "Starting workload #%d: %" PRId64 " threads, inserts=%" + "Starting workload #%u: %" PRId64 " threads, inserts=%" PRId64 ", reads=%" PRId64 ", updates=%" PRId64 ", truncate=%" PRId64 ", throttle=%" PRId64, i + 1, workp->threads, workp->insert, @@ -2194,7 +2267,7 @@ main(int argc, char *argv[]) * the compact operation, but not for the workloads. */ if (cfg->async_threads > 0) { - if (cfg->has_truncate > 0) { + if (F_ISSET(cfg, CFG_TRUNCATE)) { lprintf(cfg, 1, 0, "Cannot run truncate and async\n"); goto err; } @@ -2212,20 +2285,20 @@ main(int argc, char *argv[]) req_len = strlen(",async=(enabled=true,threads=)") + 4; cfg->async_config = dcalloc(req_len, 1); snprintf(cfg->async_config, req_len, - ",async=(enabled=true,threads=%d)", + ",async=(enabled=true,threads=%" PRIu32 ")", cfg->async_threads); } if ((ret = config_compress(cfg)) != 0) goto err; /* You can't have truncate on a random collection. */ - if (cfg->has_truncate && cfg->random_range) { + if (F_ISSET(cfg, CFG_TRUNCATE) && cfg->random_range) { lprintf(cfg, 1, 0, "Cannot run truncate and random_range\n"); goto err; } /* We can't run truncate with more than one table. 
*/ - if (cfg->has_truncate && cfg->table_count > 1) { + if (F_ISSET(cfg, CFG_TRUNCATE) && cfg->table_count > 1) { lprintf(cfg, 1, 0, "Cannot truncate more than 1 table\n"); goto err; } @@ -2297,9 +2370,25 @@ main(int argc, char *argv[]) req_len = strlen(cfg->table_config) + strlen(LOG_PARTIAL_CONFIG) + 1; cfg->partial_config = dcalloc(req_len, 1); - snprintf((char *)cfg->partial_config, req_len, "%s%s", - (char *)cfg->table_config, LOG_PARTIAL_CONFIG); + snprintf(cfg->partial_config, req_len, "%s%s", + cfg->table_config, LOG_PARTIAL_CONFIG); } + /* + * Set the config for reopen. If readonly add in that string. + * If not readonly then just copy the original conn_config. + */ + if (cfg->readonly) + req_len = strlen(cfg->conn_config) + + strlen(READONLY_CONFIG) + 1; + else + req_len = strlen(cfg->conn_config) + 1; + cfg->reopen_config = dcalloc(req_len, 1); + if (cfg->readonly) + snprintf(cfg->reopen_config, req_len, "%s%s", + cfg->conn_config, READONLY_CONFIG); + else + snprintf(cfg->reopen_config, req_len, "%s", + cfg->conn_config); /* Sanity-check the configuration. */ if ((ret = config_sanity(cfg)) != 0) @@ -2357,7 +2446,8 @@ start_threads(CONFIG *cfg, * strings: trailing NUL is included in the size. */ thread->key_buf = dcalloc(cfg->key_sz, 1); - thread->value_buf = dcalloc(cfg->value_sz, 1); + thread->value_buf = dcalloc(cfg->value_sz_max, 1); + /* * Initialize and then toss in a bit of random values if needed. 
*/ diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index 929880b0aef..a2b497b3142 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -94,6 +94,7 @@ typedef struct { int64_t truncate; /* Truncate ratio */ uint64_t truncate_pct; /* Truncate Percent */ uint64_t truncate_count; /* Truncate Count */ + int64_t update_delta; /* Value size change on update */ #define WORKER_INSERT 1 /* Insert */ #define WORKER_INSERT_RMW 2 /* Insert with read-modify-write */ @@ -138,6 +139,7 @@ typedef struct { } THROTTLE_CONFIG; #define LOG_PARTIAL_CONFIG ",log=(enabled=false)" +#define READONLY_CONFIG ",readonly=true" /* * NOTE: If you add any fields to this structure here, you must also add * an initialization in wtperf.c in the default_cfg. @@ -145,7 +147,8 @@ typedef struct { struct __config { /* Configuration structure */ const char *home; /* WiredTiger home */ const char *monitor_dir; /* Monitor output dir */ - const char *partial_config; /* Config string for partial logging */ + char *partial_config; /* Config string for partial logging */ + char *reopen_config; /* Config string for conn reopen */ char *base_uri; /* Object URI */ char **uris; /* URIs if multiple tables */ const char *helium_mount; /* Optional Helium mount point */ @@ -188,7 +191,10 @@ struct __config { /* Configuration structure */ volatile uint32_t totalsec; /* total seconds running */ - u_int has_truncate; /* if there is a truncate workload */ +#define CFG_GROW 0x0001 /* There is a grow workload */ +#define CFG_SHRINK 0x0002 /* There is a shrink workload */ +#define CFG_TRUNCATE 0x0004 /* There is a truncate workload */ + uint32_t flags; /* flags */ /* Queue head for use with the Truncate Logic */ TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head; @@ -331,7 +337,7 @@ generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) static inline void extract_key(char *key_buf, uint64_t *keynop) { - sscanf(key_buf, "%" SCNu64, keynop); + (void)sscanf(key_buf, "%" SCNu64, keynop); } /* 
@@ -364,11 +370,11 @@ dmalloc(size_t len) * Call calloc, dying on failure. */ static inline void * -dcalloc(size_t num, size_t len) +dcalloc(size_t num, size_t size) { void *p; - if ((p = calloc(len, num)) == NULL) + if ((p = calloc(num, size)) == NULL) die(errno, "calloc"); return (p); } @@ -410,11 +416,9 @@ static inline char * dstrndup(const char *str, const size_t len) { char *p; - p = dcalloc(len + 1, 1); - strncpy(p, str, len); - if (p == NULL) - die(errno, "dstrndup"); + p = dcalloc(len + 1, sizeof(char)); + memcpy(p, str, len); return (p); } #endif diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i index 60bbaff56e5..b5e274a17c2 100644 --- a/bench/wtperf/wtperf_opt.i +++ b/bench/wtperf/wtperf_opt.i @@ -145,6 +145,10 @@ DEF_OPT_AS_UINT32(random_range, 0, "insert operations") DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value") DEF_OPT_AS_UINT32(read_range, 0, "scan a range of keys after each search") +DEF_OPT_AS_BOOL(readonly, 0, + "reopen the connection between populate and workload phases in readonly " + "mode. Requires reopen_connection turned on (default). Requires that " + "read be the only workload specified") DEF_OPT_AS_BOOL(reopen_connection, 1, "close and reopen the connection between populate and workload phases") DEF_OPT_AS_UINT32(report_interval, 2, @@ -180,13 +184,17 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' " "which would create 2 threads doing nothing but reads and 8 threads " "each doing 50% inserts and 25% reads and updates. Allowed configuration " - "values are 'count', 'throttle', 'reads', 'inserts', 'updates', 'truncate'," - " 'truncate_pct' and 'truncate_count'. There are " + "values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', " + "'updates', 'truncate', 'truncate_pct' and 'truncate_count'. 
There are " "also behavior modifiers, supported modifiers are 'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", "transaction configuration string, relevant when populate_opts_per_txn " "is nonzero") DEF_OPT_AS_STRING(table_name, "test", "table name") +DEF_OPT_AS_UINT32(value_sz_max, 1000, + "maximum value size when delta updates are present. Default disabled") +DEF_OPT_AS_UINT32(value_sz_min, 1, + "minimum value size when delta updates are present. Default disabled") DEF_OPT_AS_UINT32(value_sz, 100, "value size") DEF_OPT_AS_UINT32(verbose, 1, "verbosity") DEF_OPT_AS_UINT32(warmup, 0, diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index e1f8a05c613..4e1f829c0c5 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -6,6 +6,7 @@ # If the directory exists, it is added to AUTO_SUBDIRS. # If a condition is included, the subdir is made conditional via AM_CONDITIONAL ext/collators/reverse +ext/collators/revint ext/compressors/lz4 LZ4 ext/compressors/nop ext/compressors/snappy SNAPPY @@ -26,10 +27,13 @@ lang/python PYTHON # Make the tests test/bloom test/checkpoint +test/cursor_order test/fops test/format test/huge +test/manydbs test/packing +test/readonly test/recovery test/salvage test/thread diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 875c8b436a8..9251873be73 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -32,6 +32,23 @@ AC_SUBST([LIBTOOL_DEPS]) AC_PROG_CC(cc gcc) AC_PROG_CXX(c++ g++) +AM_PROG_AS(as gas) + +# This is a workaround as part of WT-2459. Currently, clang (v3.7) does not +# support compiling the ASM code we have to perform the CRC checks on PowerPC. +# To compile with clang we need to override the ASM compiler with CCAS to use +# gcc. Unfortunately, doing the compilation in this manner means libtool can't +# determine what tag to use for that one .S file. 
If we catch that we are using +# two different compilers for CC and CCAS and we are on a PowerPC system we +# overload the libtool flags to provide CC by default. +if test "$CC" != "$CCAS"; then + AS_CASE([$host_cpu], + [ppc64*], [AM_LIBTOOLFLAGS+="--tag=CC"], + [elf64lppc], [AM_LIBTOOLFLAGS+="--tag=CC"], + [powerpc*], [AM_LIBTOOLFLAGS+="--tag=CC"], + []) +fi +AC_SUBST(AM_LIBTOOLFLAGS) if test "$GCC" = "yes"; then # The Solaris gcc compiler gets the additional -pthreads flag. @@ -96,6 +113,13 @@ AC_SYS_LARGEFILE AC_C_BIGENDIAN +AC_MSG_CHECKING([for a 64-bit build]) +AC_COMPUTE_INT(ac_cv_sizeof_void_p, [sizeof(void *)]) +if test "$ac_cv_sizeof_void_p" != "8" ; then + AC_MSG_ERROR([WiredTiger requires a 64-bit build.]) +fi +AC_MSG_RESULT(yes) + # Linux requires _GNU_SOURCE to be defined case "$host_os" in linux*) AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE" ;; diff --git a/build_posix/reconf b/build_posix/reconf index 8700c5da43d..16d4002d9b9 100755 --- a/build_posix/reconf +++ b/build_posix/reconf @@ -24,6 +24,7 @@ clean() aclocal.m4 \ auto-includes.chk \ autom4te.cache \ + config.cache \ config.hin \ config.hin~ \ config.log \ diff --git a/build_win/filelist.win b/build_win/filelist.win index 0a313026793..b6a9caf4a74 100644 --- a/build_win/filelist.win +++ b/build_win/filelist.win @@ -155,6 +155,7 @@ src/session/session_compact.c src/session/session_dhandle.c src/session/session_salvage.c src/support/cksum.c +src/support/cond_auto.c src/support/crypto.c src/support/err.c src/support/filename.c diff --git a/dist/api_data.py b/dist/api_data.py index c386c0b345d..02aee1e8825 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -76,12 +76,12 @@ lsm_config = [ Config('bloom', 'true', r''' create bloom filters on LSM tree chunks as they are merged''', type='boolean'), - Config('bloom_config', '', r''' - config string used when creating Bloom filter files, passed - to WT_SESSION::create'''), Config('bloom_bit_count', '16', r''' the number of bits used per item for LSM bloom 
filters''', min='2', max='1000'), + Config('bloom_config', '', r''' + config string used when creating Bloom filter files, passed + to WT_SESSION::create'''), Config('bloom_hash_count', '8', r''' the number of hash values per item used for LSM bloom filters''', @@ -299,6 +299,15 @@ file_meta = file_config + [ the file version'''), ] +lsm_meta = file_config + lsm_config + [ + Config('last', '', r''' + the last allocated chunk ID'''), + Config('chunks', '', r''' + active chunks in the LSM tree'''), + Config('old_chunks', '', r''' + obsolete chunks in the LSM tree'''), +] + table_only_config = [ Config('colgroups', '', r''' comma-separated list of names of column groups. Each column @@ -522,6 +531,9 @@ connection_runtime_config = [ the statistics log server uses a session from the configured session_max''', type='category', subconfig=[ + Config('json', 'false', r''' + encode statistics in JSON format''', + type='boolean'), Config('on_close', 'false', r'''log statistics on database close''', type='boolean'), Config('path', '"WiredTigerStat.%d.%H"', r''' @@ -538,7 +550,8 @@ connection_runtime_config = [ type='list'), Config('timestamp', '"%b %d %H:%M:%S"', r''' a timestamp prepended to each log record, may contain strftime - conversion specifications'''), + conversion specifications, when \c json is configured, defaults + to \c "%FT%Y.000Z"'''), Config('wait', '0', r''' seconds to wait between each write of the log records; setting this value above 0 configures statistics logging''', @@ -655,6 +668,11 @@ wiredtiger_open_common = connection_runtime_config + [ RPC server for primary processes and use RPC for secondary processes). <b>Not yet supported in WiredTiger</b>''', type='boolean'), + Config('readonly', 'false', r''' + open connection in read-only mode. The database must exist. All + methods that may modify a database are disabled. 
See @ref readonly + for more information''', + type='boolean'), Config('session_max', '100', r''' maximum expected number of sessions (including server threads)''', @@ -732,12 +750,16 @@ cursor_runtime_config = [ ] methods = { -'file.meta' : Method(file_meta), - 'colgroup.meta' : Method(colgroup_meta), +'file.config' : Method(file_config), + +'file.meta' : Method(file_meta), + 'index.meta' : Method(index_meta), +'lsm.meta' : Method(lsm_meta), + 'table.meta' : Method(table_meta), 'WT_CURSOR.close' : Method([]), diff --git a/dist/api_err.py b/dist/api_err.py index 09332d508a2..a17c68ee196 100644 --- a/dist/api_err.py +++ b/dist/api_err.py @@ -56,6 +56,8 @@ errors = [ This error is generated when wiredtiger_open is configured to run in-memory, and an insert or update operation requires more than the configured cache size to complete.''', undoc=True), + Error('WT_PERM_DENIED', -31808, + 'permission denied (internal)', undoc=True), ] # Update the #defines in the wiredtiger.in file. diff --git a/dist/filelist b/dist/filelist index edd59435841..350e0c50087 100644 --- a/dist/filelist +++ b/dist/filelist @@ -153,6 +153,7 @@ src/session/session_compact.c src/session/session_dhandle.c src/session/session_salvage.c src/support/cksum.c +src/support/cond_auto.c src/support/crypto.c src/support/err.c src/support/filename.c @@ -163,6 +164,8 @@ src/support/hazard.c src/support/hex.c src/support/huffman.c src/support/pow.c +src/support/power8/crc32.S +src/support/power8/crc32_wrapper.c src/support/rand.c src/support/scratch.c src/support/stat.c diff --git a/dist/flags.py b/dist/flags.py index b97235b965a..f500e3b1ae1 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -99,6 +99,7 @@ flags = { 'CONN_LOG_SERVER_RUN', 'CONN_LSM_MERGE', 'CONN_PANIC', + 'CONN_READONLY', 'CONN_SERVER_ASYNC', 'CONN_SERVER_CHECKPOINT', 'CONN_SERVER_LSM', @@ -114,6 +115,7 @@ flags = { 'SESSION_LOCK_NO_WAIT', 'SESSION_LOCKED_CHECKPOINT', 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_METADATA', 
'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', 'SESSION_LOCKED_TABLE', diff --git a/dist/s_export b/dist/s_export index 1212b5b2c1f..8a2c701d27f 100755 --- a/dist/s_export +++ b/dist/s_export @@ -12,10 +12,7 @@ Darwin) *) # We require GNU nm, which may not be installed. type nm > /dev/null 2>&1 && - (nm --version | grep 'GNU nm') > /dev/null 2>&1 || { - echo 'skipped: GNU nm not found' - exit 0 - } + (nm --version | grep 'GNU nm') > /dev/null 2>&1 || exit 0 NM='nm --extern-only --defined-only --print-file-name $f' ;; esac @@ -28,7 +25,9 @@ check() sed 's/.* //' | egrep -v '^__wt') | sort | - uniq -u > $t + uniq -u | + egrep -v \ + 'zlib_extension_init|lz4_extension_init|snappy_extension_init' > $t test -s $t && { echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" diff --git a/dist/s_funcs.list b/dist/s_funcs.list index ed6cf43bb2f..8d32eecdfb7 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -1,4 +1,6 @@ # List of functions that aren't found by s_funcs, but that's OK. +FUNC_END +FUNC_START WT_CURDUMP_PASS __bit_ffs __bit_nclr diff --git a/dist/s_longlines b/dist/s_longlines index decedb58f44..000f33d51d5 100755 --- a/dist/s_longlines +++ b/dist/s_longlines @@ -9,8 +9,9 @@ l=`(cd .. 
&& find dist -name '*.py' && find src -name '*.in') | sed -e '/dist\/stat_data\.py/d' \ - -e '/support\/stat\.c/d' \ - -e '/include\/extern\.h/d'` + -e '/include\/extern\.h/d' \ + -e '/support\/power8/d' \ + -e '/support\/stat\.c/d'` for f in $l ; do expand -t8 < ../$f | awk -- \ diff --git a/dist/s_string.ok b/dist/s_string.ok index 19fa27cd719..6762521ca76 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -107,11 +107,13 @@ FALLTHROUGH FH FLD FLSv +FLv FNV FORALL FOREACH FULLFSYNC FindFirstFile +Fixup Fk FlushFileBuffers Fprintf @@ -189,6 +191,7 @@ MALLOC MEM MEMALIGN MERCHANTABILITY +METADATA MONGODB MSVC MULTIBLOCK @@ -210,6 +213,7 @@ MySecret NEEDKEY NEEDVALUE NOLL +NOLOCK NONINFRINGEMENT NOTFOUND NOTREACHED @@ -243,6 +247,7 @@ Preload Prepend Qsort RCS +RDNOLOCK RECNO REF's REFs @@ -255,6 +260,7 @@ RNG RPC RUNDIR Radu +Readonly Rebalance RedHat Redistributions @@ -328,6 +334,7 @@ VxWorks WAL WIREDTIGER WRLSN +WRNOLOCK WakeAllConditionVariable Wconditional WeakHashLen @@ -430,6 +437,8 @@ cfg cfkos change's changelog +chdir +checkfmt checkpointed checkpointer checkpointing @@ -437,6 +446,7 @@ checksum checksums children's chk +chmod chongo cip cjoin @@ -501,6 +511,7 @@ datasets datasource datastore dbc +dbs dcalloc decile deciles @@ -670,6 +681,7 @@ inline inmem insertK insertV +inserters instantiation intl intnum @@ -686,6 +698,7 @@ jnr jrx json kb +kbits keycmp keyid keyv @@ -715,6 +728,7 @@ libwiredtiger llll llu loadtext +localTime localtime logf logmgr @@ -744,6 +758,8 @@ majorp malloc marshall marshalled +maxcpu +maxdbs mbll mbss mem @@ -756,6 +772,7 @@ memset memsize metaconf metadata +metadata's metafile mfence minorp @@ -792,7 +809,9 @@ nfilename nhex nlpo nocase +noclear nocrypto +nolock nonliteral noop nop @@ -829,8 +848,11 @@ parserp patchp pathname pathnames +pclose +pcpu perf pfx +popen poptable popthreads portably @@ -838,6 +860,7 @@ pos posint posix postsize +powerpc pragmas pre prealloc @@ -855,6 +878,7 @@ ps psp pthread ptr +ptrdiff 
pushms putK putV @@ -869,6 +893,7 @@ rS rb rbrace rbracket +rdonly rduppo readlock readonly @@ -891,6 +916,7 @@ resize resizing ret retp +revint rf rle rmw @@ -898,6 +924,7 @@ rng rocksdb rotN rotn +rp rpc run's runtime @@ -965,10 +992,12 @@ superset sw sy sys +sz t's tV tablename tcbench +td testutil th tid @@ -1054,9 +1083,12 @@ vsize vsnprintf vtype vunpack +vw +waitpid walk's warmup wb +wiredTiger wiredtiger workFactor wrapup diff --git a/dist/s_style b/dist/s_style index 44a5bdda741..78fb7a6eb03 100755 --- a/dist/s_style +++ b/dist/s_style @@ -18,7 +18,9 @@ if [ $# -ne 1 ]; then find bench examples ext src test \ -name '*.[chisy]' -o -name '*.in' -o -name '*.dox' | - sed -e '/Makefile.in/d' -e '/build_win\/wiredtiger_config.h/d' | + sed -e '/Makefile.in/d' \ + -e '/build_win\/wiredtiger_config.h/d' \ + -e '/support\/power8/d' | xargs $xp -n 1 -I{} sh ./dist/s_style {} else # General style correction and cleanup for a single file diff --git a/dist/s_whitespace b/dist/s_whitespace index d13de4b5989..74820a4f0e9 100755 --- a/dist/s_whitespace +++ b/dist/s_whitespace @@ -36,10 +36,9 @@ for f in `find bench examples ext src test \ -name '*.[chi]' -o \ -name '*.dox' -o \ -name '*.in' -o \ - -name 'Makefile.am'`; do - if expr "$f" : ".*/Makefile.in" > /dev/null; then - continue - fi + -name 'Makefile.am' | + sed -e '/Makefile.in/d' \ + -e '/support\/power8/d'`; do whitespace_and_empty_line $f done diff --git a/dist/s_win b/dist/s_win index 1eb4702d517..0b7d5184037 100755 --- a/dist/s_win +++ b/dist/s_win @@ -44,7 +44,7 @@ win_filelist() f='../build_win/filelist.win' # Process the files for which there's a Windows-specific version, then - # append Windows-only files. (There aren't yet any POSIX-only files.) + # append Windows-only files and discard POSIX-only files. 
(sed \ -e 's;os_posix/os_dir.c;os_win/os_dir.c;' \ -e 's;os_posix/os_dlopen.c;os_win/os_dlopen.c;' \ @@ -71,7 +71,9 @@ win_filelist() -e 's;os_posix/os_sleep.c;os_win/os_sleep.c;' \ -e 's;os_posix/os_thread.c;os_win/os_thread.c;' \ -e 's;os_posix/os_time.c;os_win/os_time.c;' \ - -e 's;os_posix/os_yield.c;os_win/os_yield.c;' + -e 's;os_posix/os_yield.c;os_win/os_yield.c;' \ + -e '/src\/support\/power8\/crc32.S/d' \ + -e '/src\/support\/power8\/crc32_wrapper.c/d' echo 'src/os_win/os_snprintf.c' echo 'src/os_win/os_vsnprintf.c') < filelist | sort > $t cmp $t $f > /dev/null 2>&1 || diff --git a/dist/stat.py b/dist/stat.py index 6dcfccfeab5..7961bf7053f 100644 --- a/dist/stat.py +++ b/dist/stat.py @@ -98,11 +98,11 @@ for line in open('../src/include/wiredtiger.in', 'r'): f.close() compare_srcfile(tmp_file, '../src/include/wiredtiger.in') -def print_func(name, handle, list): +def print_func(name, handle, statlist): '''Print the structures/functions for the stat.c file.''' f.write('\n') f.write('static const char * const __stats_' + name + '_desc[] = {\n') - for l in list: + for l in statlist: f.write('\t"' + l.desc + '",\n') f.write('};\n') @@ -143,7 +143,7 @@ void __wt_stat_''' + name + '_clear_single(WT_' + name.upper() + '''_STATS *stats) { ''') - for l in sorted(list): + for l in statlist: # no_clear: don't clear the value. 
if 'no_clear' in l.flags: f.write('\t\t/* not clearing ' + l.name + ' */\n') @@ -170,7 +170,7 @@ __wt_stat_''' + name + '''_aggregate_single( WT_''' + name.upper() + '_STATS *from, WT_' + name.upper() + '''_STATS *to) { ''') - for l in sorted(list): + for l in statlist: if 'max_aggregate' in l.flags: o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\ '\t\tto->' + l.name + ' = from->' + l.name + ';\n' @@ -190,11 +190,11 @@ __wt_stat_''' + name + '''_aggregate( # Connection level aggregation does not currently have any computation # of a maximum value; I'm leaving in support for it, but don't declare # a temporary variable until it's needed. - for l in sorted(list): + for l in statlist: if 'max_aggregate' in l.flags: f.write('\tint64_t v;\n\n') break; - for l in sorted(list): + for l in statlist: if 'max_aggregate' in l.flags: o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) > ' +\ 'to->' + l.name + ')\n' diff --git a/dist/stat_data.py b/dist/stat_data.py index 41a93961079..bd951e64999 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -12,6 +12,7 @@ # max_aggregate Take the maximum value when aggregating statistics # no_clear Value not cleared when statistics cleared # no_scale Don't scale value per second in the logging tool script +# size Used by timeseries tool, indicates value is a byte count # # The no_clear and no_scale flags are normally always set together (values that # are maintained over time are normally not scaled per second). 
@@ -108,6 +109,8 @@ connection_stats = [ ########################################## # System statistics ########################################## + ConnStat('cond_auto_wait', 'auto adjusting condition wait calls'), + ConnStat('cond_auto_wait_reset', 'auto adjusting condition resets'), ConnStat('cond_wait', 'pthread mutex condition wait calls'), ConnStat('file_open', 'files currently open', 'no_clear,no_scale'), ConnStat('memory_allocation', 'memory allocations'), @@ -123,7 +126,7 @@ connection_stats = [ ########################################## AsyncStat('async_alloc_race', 'number of allocation state races'), AsyncStat('async_alloc_view', 'number of operation slots viewed for allocation'), - AsyncStat('async_cur_queue', 'current work queue length'), + AsyncStat('async_cur_queue', 'current work queue length', 'no_scale'), AsyncStat('async_flush', 'number of flush calls'), AsyncStat('async_full', 'number of times operation allocation failed'), AsyncStat('async_max_queue', 'maximum work queue length', 'no_clear,no_scale'), @@ -138,9 +141,9 @@ connection_stats = [ ########################################## # Block manager statistics ########################################## - BlockStat('block_byte_map_read', 'mapped bytes read'), - BlockStat('block_byte_read', 'bytes read'), - BlockStat('block_byte_write', 'bytes written'), + BlockStat('block_byte_map_read', 'mapped bytes read', 'size'), + BlockStat('block_byte_read', 'bytes read', 'size'), + BlockStat('block_byte_write', 'bytes written', 'size'), BlockStat('block_map_read', 'mapped blocks read'), BlockStat('block_preload', 'blocks pre-loaded'), BlockStat('block_read', 'blocks read'), @@ -149,14 +152,15 @@ connection_stats = [ ########################################## # Cache and eviction statistics ########################################## - CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in 
the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_max', 'maximum bytes configured', 'no_clear,no_scale'), - CacheStat('cache_bytes_overflow', 'tracked bytes belonging to overflow pages in the cache', 'no_clear,no_scale'), - CacheStat('cache_bytes_read', 'bytes read into cache'), - CacheStat('cache_bytes_write', 'bytes written from cache'), + CacheStat('cache_bytes_dirty', 'tracked dirty bytes in the cache', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_internal', 'tracked bytes belonging to internal pages in the cache', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_inuse', 'bytes currently in the cache', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_leaf', 'tracked bytes belonging to leaf pages in the cache', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_max', 'maximum bytes configured', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_overflow', 'tracked bytes belonging to overflow pages in the cache', 'no_clear,no_scale,size'), + CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), + CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), + CacheStat('cache_eviction_aggressive_set', 'eviction currently operating in aggressive mode', 'no_clear,no_scale'), CacheStat('cache_eviction_app', 'pages evicted by application threads'), CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), @@ -168,7 +172,7 @@ connection_stats = [ CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'), CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'), CacheStat('cache_eviction_internal', 'internal pages evicted'), - CacheStat('cache_eviction_maximum_page_size', 'maximum page size at 
eviction', 'no_clear,no_scale'), + CacheStat('cache_eviction_maximum_page_size', 'maximum page size at eviction', 'no_clear,no_scale,size'), CacheStat('cache_eviction_queue_empty', 'eviction server candidate queue empty when topping up'), CacheStat('cache_eviction_queue_not_empty', 'eviction server candidate queue not empty when topping up'), CacheStat('cache_eviction_server_evicting', 'eviction server evicting pages'), @@ -206,17 +210,19 @@ connection_stats = [ ########################################## # Logging statistics ########################################## - LogStat('log_buffer_size', 'total log buffer size', 'no_clear,no_scale'), - LogStat('log_bytes_payload', 'log bytes of payload data'), - LogStat('log_bytes_written', 'log bytes written'), + LogStat('log_buffer_size', 'total log buffer size', 'no_clear,no_scale,size'), + LogStat('log_bytes_payload', 'log bytes of payload data', 'size'), + LogStat('log_bytes_written', 'log bytes written', 'size'), LogStat('log_close_yields', 'yields waiting for previous log file close'), - LogStat('log_compress_len', 'total size of compressed records'), - LogStat('log_compress_mem', 'total in-memory size of compressed records'), + LogStat('log_compress_len', 'total size of compressed records', 'size'), + LogStat('log_compress_mem', 'total in-memory size of compressed records', 'size'), LogStat('log_compress_small', 'log records too small to compress'), LogStat('log_compress_write_fails', 'log records not compressed'), LogStat('log_compress_writes', 'log records compressed'), LogStat('log_flush', 'log flush operations'), - LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'), + LogStat('log_force_write', 'log force write operations'), + LogStat('log_force_write_skip', 'log force write operations skipped'), + LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale,size'), LogStat('log_prealloc_files', 'pre-allocated log files prepared'), LogStat('log_prealloc_max', 'number of 
pre-allocated log files to create', 'no_clear,no_scale'), LogStat('log_prealloc_missed', 'pre-allocated log files not ready and missed'), @@ -227,7 +233,7 @@ connection_stats = [ LogStat('log_scans', 'log scan operations'), LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_coalesced', 'written slots coalesced'), - LogStat('log_slot_consolidated', 'logging bytes consolidated'), + LogStat('log_slot_consolidated', 'logging bytes consolidated', 'size'), LogStat('log_slot_joins', 'consolidated slot joins'), LogStat('log_slot_races', 'consolidated slot join races'), LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'), @@ -236,6 +242,7 @@ connection_stats = [ LogStat('log_sync', 'log sync operations'), LogStat('log_sync_dir', 'log sync_dir operations'), LogStat('log_write_lsn', 'log server thread advances write LSN'), + LogStat('log_write_lsn_skip', 'log server thread write LSN walk skipped'), LogStat('log_writes', 'log write operations'), LogStat('log_zero_fills', 'log files manually zero-filled'), @@ -246,7 +253,7 @@ connection_stats = [ RecStat('rec_page_delete_fast', 'fast-path pages deleted'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale'), + RecStat('rec_split_stashed_bytes', 'split bytes currently awaiting free', 'no_clear,no_scale,size'), RecStat('rec_split_stashed_objects', 'split objects currently awaiting free', 'no_clear,no_scale'), ########################################## @@ -315,7 +322,7 @@ connection_stats = [ YieldStat('page_sleep', 'page acquire time sleeping (usecs)'), ] -connection_stats = sorted(connection_stats, key=attrgetter('name')) +connection_stats = sorted(connection_stats, key=attrgetter('desc')) ########################################## # Data source statistics @@ -333,18 +340,18 @@ dsrc_stats = [ 
CursorStat('cursor_create', 'create calls'), CursorStat('cursor_insert', 'insert calls'), CursorStat('cursor_insert_bulk', 'bulk-loaded cursor-insert calls'), - CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted'), + CursorStat('cursor_insert_bytes', 'cursor-insert key and value bytes inserted', 'size'), CursorStat('cursor_next', 'next calls'), CursorStat('cursor_prev', 'prev calls'), CursorStat('cursor_remove', 'remove calls'), - CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed'), + CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed', 'size'), CursorStat('cursor_reset', 'reset calls'), CursorStat('cursor_restart', 'restarted searches'), CursorStat('cursor_search', 'search calls'), CursorStat('cursor_search_near', 'search near calls'), CursorStat('cursor_truncate', 'truncate calls'), CursorStat('cursor_update', 'update calls'), - CursorStat('cursor_update_bytes', 'cursor-update value bytes updated'), + CursorStat('cursor_update_bytes', 'cursor-update value bytes updated', 'size'), ########################################## # Btree statistics @@ -357,13 +364,13 @@ dsrc_stats = [ BtreeStat('btree_column_variable', 'column-store variable-size leaf pages', 'no_scale'), BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'), BtreeStat('btree_entries', 'number of key/value pairs', 'no_scale'), - BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale'), + BtreeStat('btree_fixed_len', 'fixed-record size', 'max_aggregate,no_scale,size'), BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'), - BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxintlpage', 'maximum internal page size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale'), - BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale'), - 
BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale'), + BtreeStat('btree_maxintlkey', 'maximum internal page key size', 'max_aggregate,no_scale,size'), + BtreeStat('btree_maxintlpage', 'maximum internal page size', 'max_aggregate,no_scale,size'), + BtreeStat('btree_maxleafkey', 'maximum leaf page key size', 'max_aggregate,no_scale,size'), + BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'max_aggregate,no_scale,size'), + BtreeStat('btree_maxleafvalue', 'maximum leaf page value size', 'max_aggregate,no_scale,size'), BtreeStat('btree_overflow', 'overflow pages', 'no_scale'), BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale'), BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale'), @@ -377,7 +384,7 @@ dsrc_stats = [ LSMStat('bloom_miss', 'bloom filter misses'), LSMStat('bloom_page_evict', 'bloom filter pages evicted from cache'), LSMStat('bloom_page_read', 'bloom filter pages read into cache'), - LSMStat('bloom_size', 'total size of bloom filters', 'no_scale'), + LSMStat('bloom_size', 'total size of bloom filters', 'no_scale,size'), LSMStat('lsm_checkpoint_throttle', 'sleep for LSM checkpoint throttle'), LSMStat('lsm_chunk_count', 'chunks in the LSM tree', 'no_scale'), LSMStat('lsm_generation_max', 'highest merge generation in the LSM tree', 'max_aggregate,no_scale'), @@ -387,22 +394,22 @@ dsrc_stats = [ ########################################## # Block manager statistics ########################################## - BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale'), + BlockStat('allocation_size', 'file allocation unit size', 'max_aggregate,no_scale,size'), BlockStat('block_alloc', 'blocks allocated'), - BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'), + BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale,size'), BlockStat('block_extension', 'allocations requiring file extension'), BlockStat('block_free', 'blocks freed'), 
BlockStat('block_magic', 'file magic number', 'max_aggregate,no_scale'), BlockStat('block_major', 'file major version number', 'max_aggregate,no_scale'), BlockStat('block_minor', 'minor version number', 'max_aggregate,no_scale'), - BlockStat('block_reuse_bytes', 'file bytes available for reuse'), - BlockStat('block_size', 'file size in bytes', 'no_scale'), + BlockStat('block_reuse_bytes', 'file bytes available for reuse', 'no_scale,size'), + BlockStat('block_size', 'file size in bytes', 'no_scale,size'), ########################################## # Cache and eviction statistics ########################################## - CacheStat('cache_bytes_read', 'bytes read into cache'), - CacheStat('cache_bytes_write', 'bytes written from cache'), + CacheStat('cache_bytes_read', 'bytes read into cache', 'size'), + CacheStat('cache_bytes_write', 'bytes written from cache', 'size'), CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'), CacheStat('cache_eviction_clean', 'unmodified pages evicted'), CacheStat('cache_eviction_deepen', 'page split during eviction deepened the tree'), @@ -448,8 +455,8 @@ dsrc_stats = [ RecStat('rec_page_match', 'page checksum matches'), RecStat('rec_pages', 'page reconciliation calls'), RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'), - RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression'), - RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression'), + RecStat('rec_prefix_compression', 'leaf page key bytes discarded using prefix compression', 'size'), + RecStat('rec_suffix_compression', 'internal page key bytes discarded using suffix compression', 'size'), ########################################## # Transaction statistics @@ -457,7 +464,7 @@ dsrc_stats = [ TxnStat('txn_update_conflict', 'update conflicts'), ] -dsrc_stats = sorted(dsrc_stats, key=attrgetter('name')) +dsrc_stats = sorted(dsrc_stats, key=attrgetter('desc')) 
########################################## # Cursor Join statistics @@ -468,4 +475,4 @@ join_stats = [ JoinStat('bloom_false_positive', 'bloom filter false positives'), ] -join_stats = sorted(join_stats, key=attrgetter('name')) +join_stats = sorted(join_stats, key=attrgetter('desc')) diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am index 587204efff1..72fd98aff7b 100644 --- a/examples/c/Makefile.am +++ b/examples/c/Makefile.am @@ -12,6 +12,7 @@ noinst_PROGRAMS = \ ex_cursor \ ex_data_source \ ex_encrypt \ + ex_event_handler \ ex_extending \ ex_extractor \ ex_hello \ diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 418c99ad6a3..1c036b75461 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -346,8 +346,7 @@ cursor_ops(WT_SESSION *session) cursor->set_key(cursor, key); if ((ret = cursor->remove(cursor)) != 0) { fprintf(stderr, - "cursor.remove: %s\n", - cursor->session->strerror(cursor->session, ret)); + "cursor.remove: %s\n", wiredtiger_strerror(ret)); return (ret); } /*! [Display an error] */ @@ -359,7 +358,8 @@ cursor_ops(WT_SESSION *session) cursor->set_key(cursor, key); if ((ret = cursor->remove(cursor)) != 0) { fprintf(stderr, - "cursor.remove: %s\n", session->strerror(session, ret)); + "cursor.remove: %s\n", + cursor->session->strerror(cursor->session, ret)); return (ret); } /*! [Display an error thread safe] */ diff --git a/examples/c/ex_async.c b/examples/c/ex_async.c index 584c3e54b87..ecdbd2f4fea 100644 --- a/examples/c/ex_async.c +++ b/examples/c/ex_async.c @@ -218,7 +218,7 @@ main(void) */ ret = conn->close(conn, NULL); - printf("Searched for %d keys\n", ex_asynckeys.num_keys); + printf("Searched for %" PRIu32 " keys\n", ex_asynckeys.num_keys); return (ret); } diff --git a/examples/c/ex_config_parse.c b/examples/c/ex_config_parse.c index 124eff21130..be3c78bedd4 100644 --- a/examples/c/ex_config_parse.c +++ b/examples/c/ex_config_parse.c @@ -30,6 +30,7 @@ * configuration strings. 
*/ +#include <inttypes.h> #include <stdio.h> #include <string.h> @@ -99,7 +100,7 @@ main(void) while ((ret = parser->next(parser, &k, &v)) == 0) { printf("%.*s:", (int)k.len, k.str); if (v.type == WT_CONFIG_ITEM_NUM) - printf("%d\n", (int)v.val); + printf("%" PRId64 "\n", v.val); else printf("%.*s\n", (int)v.len, v.str); } @@ -126,7 +127,7 @@ main(void) "log.file_max configuration: %s", wiredtiger_strerror(ret)); return (ret); } - printf("log file max: %d\n", (int)v.val); + printf("log file max: %" PRId64 "\n", v.val); /*! [nested get] */ ret = parser->close(parser); diff --git a/examples/c/ex_event_handler.c b/examples/c/ex_event_handler.c new file mode 100644 index 00000000000..ba6807cd56d --- /dev/null +++ b/examples/c/ex_event_handler.c @@ -0,0 +1,136 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ex_event_handler.c + * Demonstrate how to use the WiredTiger event handler mechanism. + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <wiredtiger.h> + +static const char *home; + +int handle_wiredtiger_error( + WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); +int handle_wiredtiger_message(WT_EVENT_HANDLER *, WT_SESSION *, const char *); + +/*! [Function event_handler] */ +/* + * Create our own event handler structure to allow us to pass context through + * to event handler callbacks. For this to work the WiredTiger event handler + * must appear first in our custom event handler structure. + */ +typedef struct { + WT_EVENT_HANDLER h; + const char *app_id; +} CUSTOM_EVENT_HANDLER; + +/* + * handle_wiredtiger_error -- + * Function to handle error callbacks from WiredTiger. + */ +int +handle_wiredtiger_error(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *message) +{ + CUSTOM_EVENT_HANDLER *custom_handler; + + /* Cast the handler back to our custom handler. */ + custom_handler = (CUSTOM_EVENT_HANDLER *)handler; + + /* Report the error on the console. */ + fprintf(stderr, + "app_id %s, thread context %p, error %d, message %s\n", + custom_handler->app_id, session, error, message); + + return (0); +} + +/* + * handle_wiredtiger_message -- + * Function to handle message callbacks from WiredTiger. + */ +int +handle_wiredtiger_message( + WT_EVENT_HANDLER *handler, WT_SESSION *session, const char *message) +{ + /* Cast the handler back to our custom handler. */ + printf("app id %s, thread context %p, message %s\n", + ((CUSTOM_EVENT_HANDLER *)handler)->app_id, session, message); + + return (0); +} +/*! 
[Function event_handler] */ + +static int +config_event_handler() +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + + /*! [Configure event_handler] */ + CUSTOM_EVENT_HANDLER event_handler; + + event_handler.h.handle_error = handle_wiredtiger_error; + event_handler.h.handle_message = handle_wiredtiger_message; + /* Set handlers to NULL to use the default handler. */ + event_handler.h.handle_progress = NULL; + event_handler.h.handle_close = NULL; + event_handler.app_id = "example_event_handler"; + + ret = wiredtiger_open(home, + (WT_EVENT_HANDLER *)&event_handler, "create", &conn); + /*! [Configure event_handler] */ + + /* Make an invalid API call, to ensure the event handler works. */ + (void)conn->open_session(conn, NULL, "isolation=invalid", &session); + + if (ret == 0) + ret = conn->close(conn, NULL); + + return (ret); +} + +int +main(void) +{ + /* + * Create a clean test directory for this run of the test program if the + * environment variable isn't already set (as is done by make check). + */ + if (getenv("WIREDTIGER_HOME") == NULL) { + home = "WT_HOME"; + (void)system("rm -rf WT_HOME && mkdir WT_HOME"); + } else + home = NULL; + + return (config_event_handler()); +} diff --git a/examples/c/ex_extractor.c b/examples/c/ex_extractor.c index fff9c79f8e0..8623f4759fc 100644 --- a/examples/c/ex_extractor.c +++ b/examples/c/ex_extractor.c @@ -99,11 +99,13 @@ my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, * key(s). WiredTiger will perform the required operation * (such as a remove()). 
*/ - fprintf(stderr, "EXTRACTOR: index op for year %d: %s %s\n", + fprintf(stderr, + "EXTRACTOR: index op for year %" PRIu16 ": %s %s\n", year, first_name, last_name); result_cursor->set_key(result_cursor, year); if ((ret = result_cursor->insert(result_cursor)) != 0) { - fprintf(stderr, "EXTRACTOR: op year %d: error %d\n", + fprintf(stderr, + "EXTRACTOR: op year %" PRIu16 ": error %d\n", year, ret); return (ret); } @@ -157,7 +159,7 @@ read_index(WT_SESSION *session) */ for (i = 0; i < 10 && RET_OK(ret); i++) { year = (uint16_t)((rand() % YEAR_SPAN) + YEAR_BASE); - printf("Year %d:\n", year); + printf("Year %" PRIu16 ":\n", year); cursor->set_key(cursor, year); if ((ret = cursor->search(cursor)) != 0) break; @@ -181,7 +183,7 @@ read_index(WT_SESSION *session) } } if (!RET_OK(ret)) - fprintf(stderr, "Error %d for year %d\n", ret, year); + fprintf(stderr, "Error %d for year %" PRIu16 "\n", ret, year); ret = cursor->close(cursor); return (ret); @@ -245,7 +247,8 @@ setup_table(WT_SESSION *session) cursor->set_key(cursor, p.id); cursor->set_value(cursor, p.last_name, p.first_name, p.term_start, p.term_end); - fprintf(stderr, "SETUP: table insert %d-%d: %s %s\n", + fprintf(stderr, + "SETUP: table insert %" PRIu16 "-%" PRIu16 ": %s %s\n", p.term_start, p.term_end, p.first_name, p.last_name); ret = cursor->insert(cursor); diff --git a/examples/c/ex_schema.c b/examples/c/ex_schema.c index fdf02d12302..70fc7eb2e62 100644 --- a/examples/c/ex_schema.c +++ b/examples/c/ex_schema.c @@ -69,7 +69,7 @@ main(void) { POP_RECORD *p; WT_CONNECTION *conn; - WT_CURSOR *cursor, *cursor2, *join_cursor; + WT_CURSOR *cursor, *cursor2, *join_cursor, *stat_cursor; WT_SESSION *session; const char *country; uint64_t recno, population; @@ -86,7 +86,8 @@ main(void) } else home = NULL; - if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) { + if ((ret = wiredtiger_open( + home, NULL, "create,statistics=(fast)", &conn)) != 0) { fprintf(stderr, "Error connecting to %s: %s\n", home, 
wiredtiger_strerror(ret)); return (ret); @@ -164,7 +165,8 @@ main(void) ret = cursor->get_key(cursor, &recno); ret = cursor->get_value(cursor, &country, &year, &population); printf("ID %" PRIu64, recno); - printf(": country %s, year %u, population %" PRIu64 "\n", + printf( + ": country %s, year %" PRIu16 ", population %" PRIu64 "\n", country, year, population); } ret = cursor->close(cursor); @@ -185,7 +187,8 @@ main(void) ret = wiredtiger_struct_unpack(session, value.data, value.size, "5sHQ", &country, &year, &population); - printf(": country %s, year %u, population %" PRIu64 "\n", + printf( + ": country %s, year %" PRIu16 ", population %" PRIu64 "\n", country, year, population); } /*! [List the records in the table using raw mode.] */ @@ -201,7 +204,9 @@ main(void) cursor->set_key(cursor, 2); if ((ret = cursor->search(cursor)) == 0) { ret = cursor->get_value(cursor, &country, &year, &population); - printf("ID 2: country %s, year %u, population %" PRIu64 "\n", + printf( + "ID 2: " + "country %s, year %" PRIu16 ", population %" PRIu64 "\n", country, year, population); } /*! [Read population from the primary column group] */ @@ -229,8 +234,8 @@ main(void) cursor->set_key(cursor, "AU\0\0\0"); ret = cursor->search(cursor); ret = cursor->get_value(cursor, &country, &year, &population); - printf("AU: country %s, year %u, population %" PRIu64 "\n", - country, (unsigned int)year, population); + printf("AU: country %s, year %" PRIu16 ", population %" PRIu64 "\n", + country, year, population); /*! [Search in a simple index] */ ret = cursor->close(cursor); @@ -241,8 +246,9 @@ main(void) cursor->set_key(cursor, "USA\0\0", (uint16_t)1900); ret = cursor->search(cursor); ret = cursor->get_value(cursor, &country, &year, &population); - printf("US 1900: country %s, year %u, population %" PRIu64 "\n", - country, (unsigned int)year, population); + printf( + "US 1900: country %s, year %" PRIu16 ", population %" PRIu64 "\n", + country, year, population); /*! 
[Search in a composite index] */ ret = cursor->close(cursor); @@ -255,7 +261,7 @@ main(void) "table:poptable(country,year)", NULL, NULL, &cursor); while ((ret = cursor->next(cursor)) == 0) { ret = cursor->get_value(cursor, &country, &year); - printf("country %s, year %u\n", country, year); + printf("country %s, year %" PRIu16 "\n", country, year); } /*! [Return a subset of values from the table] */ ret = cursor->close(cursor); @@ -273,7 +279,7 @@ main(void) ret = cursor->get_value(cursor, &value); ret = wiredtiger_struct_unpack( session, value.data, value.size, "5sH", &country, &year); - printf("country %s, year %u\n", country, year); + printf("country %s, year %" PRIu16 "\n", country, year); } /*! [Return a subset of values from the table using raw mode] */ ret = cursor->close(cursor); @@ -288,7 +294,7 @@ main(void) while ((ret = cursor->next(cursor)) == 0) { ret = cursor->get_key(cursor, &country, &year); ret = cursor->get_value(cursor, &recno); - printf("row ID %" PRIu64 ": country %s, year %u\n", + printf("row ID %" PRIu64 ": country %s, year %" PRIu16 "\n", recno, country, year); } /*! [Return the table's record number key using an index] */ @@ -305,7 +311,7 @@ main(void) while ((ret = cursor->next(cursor)) == 0) { ret = cursor->get_key(cursor, &country, &year); ret = cursor->get_value(cursor, &population); - printf("population %" PRIu64 ": country %s, year %u\n", + printf("population %" PRIu64 ": country %s, year %" PRIu16 "\n", population, country, year); } /*! [Return a subset of the value columns from an index] */ @@ -320,7 +326,7 @@ main(void) "index:poptable:country_plus_year()", NULL, NULL, &cursor); while ((ret = cursor->next(cursor)) == 0) { ret = cursor->get_key(cursor, &country, &year); - printf("country %s, year %u\n", country, year); + printf("country %s, year %" PRIu16 "\n", country, year); } /*! 
[Access only the index] */ ret = cursor->close(cursor); @@ -350,10 +356,19 @@ main(void) ret = join_cursor->get_value(join_cursor, &country, &year, &population); printf("ID %" PRIu64, recno); - printf(": country %s, year %u, population %" PRIu64 "\n", + printf( + ": country %s, year %" PRIu16 ", population %" PRIu64 "\n", country, year, population); } /*! [Join cursors] */ + + /*! [Statistics cursor join cursor] */ + ret = session->open_cursor(session, + "statistics:join", + join_cursor, NULL, &stat_cursor); + /*! [Statistics cursor join cursor] */ + + ret = stat_cursor->close(stat_cursor); ret = join_cursor->close(join_cursor); ret = cursor2->close(cursor2); ret = cursor->close(cursor); diff --git a/examples/c/ex_stat.c b/examples/c/ex_stat.c index 65402230eb8..6c5c15aacc6 100644 --- a/examples/c/ex_stat.c +++ b/examples/c/ex_stat.c @@ -39,6 +39,7 @@ int print_cursor(WT_CURSOR *); int print_database_stats(WT_SESSION *); int print_file_stats(WT_SESSION *); +int print_join_cursor_stats(WT_SESSION *); int print_overflow_pages(WT_SESSION *); int get_stat(WT_CURSOR *cursor, int stat_field, uint64_t *valuep); int print_derived_stats(WT_SESSION *); @@ -99,6 +100,37 @@ print_file_stats(WT_SESSION *session) } int +print_join_cursor_stats(WT_SESSION *session) +{ + WT_CURSOR *idx_cursor, *join_cursor, *stat_cursor; + int ret; + + ret = session->create( + session, "index:access:idx", "columns=(v)"); + ret = session->open_cursor( + session, "index:access:idx", NULL, NULL, &idx_cursor); + ret = idx_cursor->next(idx_cursor); + ret = session->open_cursor( + session, "join:table:access", NULL, NULL, &join_cursor); + ret = session->join(session, join_cursor, idx_cursor, "compare=gt"); + ret = join_cursor->next(join_cursor); + + /*! [statistics join cursor function] */ + if ((ret = session->open_cursor(session, + "statistics:join", join_cursor, NULL, &stat_cursor)) != 0) + return (ret); + + ret = print_cursor(stat_cursor); + ret = stat_cursor->close(stat_cursor); + /*! 
[statistics join cursor function] */ + + ret = join_cursor->close(join_cursor); + ret = idx_cursor->close(idx_cursor); + + return (ret); +} + +int print_overflow_pages(WT_SESSION *session) { /*! [statistics retrieve by key] */ @@ -204,7 +236,8 @@ main(void) ret = wiredtiger_open(home, NULL, "create,statistics=(all)", &conn); ret = conn->open_session(conn, NULL, NULL, &session); ret = session->create( - session, "table:access", "key_format=S,value_format=S"); + session, "table:access", + "key_format=S,value_format=S,columns=(k,v)"); ret = session->open_cursor( session, "table:access", NULL, NULL, &cursor); @@ -219,6 +252,8 @@ main(void) ret = print_file_stats(session); + ret = print_join_cursor_stats(session); + ret = print_overflow_pages(session); ret = print_derived_stats(session); diff --git a/examples/java/com/wiredtiger/examples/ex_all.java b/examples/java/com/wiredtiger/examples/ex_all.java index 09db8e0fd56..5fe767d49bf 100644 --- a/examples/java/com/wiredtiger/examples/ex_all.java +++ b/examples/java/com/wiredtiger/examples/ex_all.java @@ -326,6 +326,22 @@ public static int cursor_ops(Session session) /*! [Display an error] */ } + { + /*! [Display an error thread safe] */ + try { + String key = "non-existent key"; + cursor.putKeyString(key); + if ((ret = cursor.remove()) != 0) { + System.err.println( + "cursor.remove: " + wiredtiger.wiredtiger_strerror(ret)); + return (ret); + } + } catch (WiredTigerException wte) { /* Catch severe errors. */ + System.err.println("cursor.remove exception: " + wte); + } + /*! [Display an error thread safe] */ + } + /*! [Close the cursor] */ ret = cursor.close(); /*! 
[Close the cursor] */ diff --git a/examples/java/com/wiredtiger/examples/ex_schema.java b/examples/java/com/wiredtiger/examples/ex_schema.java index be1077ee2df..7cc26acb479 100644 --- a/examples/java/com/wiredtiger/examples/ex_schema.java +++ b/examples/java/com/wiredtiger/examples/ex_schema.java @@ -76,7 +76,7 @@ public class ex_schema { throws WiredTigerException { Connection conn; - Cursor cursor, cursor2, join_cursor; + Cursor cursor, cursor2, join_cursor, stat_cursor; Session session; String country; long recno, population; @@ -106,7 +106,7 @@ public class ex_schema { home = null; try { - conn = wiredtiger.open(home, "create"); + conn = wiredtiger.open(home, "create,statistics=(fast)"); session = conn.open_session(null); } catch (WiredTigerException wte) { System.err.println("WiredTigerException: " + wte); @@ -368,6 +368,13 @@ public class ex_schema { ", population " + population); } /*! [Join cursors] */ + + /*! [Statistics cursor join cursor] */ + stat_cursor = session.open_cursor( + "statistics:join", join_cursor, null); + /*! [Statistics cursor join cursor] */ + + ret = stat_cursor.close(); ret = join_cursor.close(); ret = cursor2.close(); ret = cursor.close(); diff --git a/examples/java/com/wiredtiger/examples/ex_stat.java b/examples/java/com/wiredtiger/examples/ex_stat.java index b0b83a2d3b2..f8877a4620e 100644 --- a/examples/java/com/wiredtiger/examples/ex_stat.java +++ b/examples/java/com/wiredtiger/examples/ex_stat.java @@ -92,6 +92,33 @@ public class ex_stat { } int + print_join_cursor_stats(Session session) + throws WiredTigerException + { + Cursor idx_cursor, join_cursor, stat_cursor; + int ret; + + ret = session.create("index:access:idx", "columns=(v)"); + idx_cursor = session.open_cursor("index:access:idx", null, null); + ret = idx_cursor.next(); + join_cursor = session.open_cursor("join:table:access", null, null); + ret = session.join(join_cursor, idx_cursor, "compare=gt"); + ret = join_cursor.next(); + + /*! 
[statistics join cursor function] */ + stat_cursor = session.open_cursor("statistics:join", join_cursor, null); + + ret = print_cursor(stat_cursor); + ret = stat_cursor.close(); + /*! [statistics join cursor function] */ + + ret = join_cursor.close(); + ret = idx_cursor.close(); + + return (ret); + } + + int print_overflow_pages(Session session) throws WiredTigerException { @@ -220,7 +247,8 @@ public class ex_stat { conn = wiredtiger.open(home, "create,statistics=(all)"); session = conn.open_session(null); - ret = session.create("table:access", "key_format=S,value_format=S"); + ret = session.create("table:access", + "key_format=S,value_format=S,columns=(k,v)"); cursor = session.open_cursor("table:access", null, null); cursor.putKeyString("key"); @@ -234,6 +262,8 @@ public class ex_stat { ret = print_file_stats(session); + ret = print_join_cursor_stats(session); + ret = print_overflow_pages(session); ret = print_derived_stats(session); diff --git a/ext/collators/revint/Makefile.am b/ext/collators/revint/Makefile.am new file mode 100644 index 00000000000..8c85c6a4701 --- /dev/null +++ b/ext/collators/revint/Makefile.am @@ -0,0 +1,10 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include + +noinst_LTLIBRARIES = libwiredtiger_revint_collator.la +libwiredtiger_revint_collator_la_SOURCES = revint_collator.c + +# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well +# as installation, it will only build static libraries. As far as I can tell, +# the "approved" libtool way to turn them back on is by adding -rpath. +libwiredtiger_revint_collator_la_LDFLAGS = \ + -avoid-version -module -rpath /nowhere diff --git a/ext/collators/revint/revint_collator.c b/ext/collators/revint/revint_collator.c new file mode 100644 index 00000000000..30b5dc67556 --- /dev/null +++ b/ext/collators/revint/revint_collator.c @@ -0,0 +1,153 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. 
+ * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <errno.h> +#include <stdint.h> +#include <wiredtiger_ext.h> + +/* + * A simple WiredTiger collator for indices having a single integer key, + * where the ordering is descending (reversed). This collator also + * requires that primary key be an integer. + */ + +/* Local collator structure. */ +typedef struct { + WT_COLLATOR collator; /* Must come first */ + WT_EXTENSION_API *wt_api; /* Extension API */ +} REVINT_COLLATOR; + +/* + * revint_compare -- + * WiredTiger reverse integer collation, used for tests. 
+ */ +static int +revint_compare(WT_COLLATOR *collator, + WT_SESSION *session, const WT_ITEM *k1, const WT_ITEM *k2, int *cmp) +{ + const REVINT_COLLATOR *revint_collator; + WT_EXTENSION_API *wtapi; + WT_PACK_STREAM *pstream; + int ret; + int64_t i1, i2, p1, p2; + + i1 = i2 = p1 = p2 = 0; + revint_collator = (const REVINT_COLLATOR *)collator; + wtapi = revint_collator->wt_api; + + /* + * All indices using this collator have an integer key, and the + * primary key is also an integer. A collator is usually passed the + * concatenation of index key and primary key (when available), + * hence we initially unpack using "ii". + * + * A collator may also be called with an item that includes a index + * key and no primary key. Among items having the same index key, + * an item with no primary key should sort before an item with a + * primary key. The reason is that if the application calls + * WT_CURSOR::search on a index key for which there are more than + * one value, the search key will not yet have a primary key. We + * want to position the cursor at the 'first' matching index key so + * that repeated calls to WT_CURSOR::next will see them all. + * + * To keep this code simple, we do not reverse the ordering + * when comparing primary keys. + */ + if ((ret = wtapi->unpack_start( + wtapi, session, "ii", k1->data, k1->size, &pstream)) != 0 || + (ret = wtapi->unpack_int(wtapi, pstream, &i1)) != 0) + goto err; + if ((ret = wtapi->unpack_int(wtapi, pstream, &p1)) != 0) + /* A missing primary key is OK and sorts first. */ + p1 = INT64_MIN; + if ((ret = wtapi->pack_close(wtapi, pstream, NULL)) != 0) + goto err; + + /* Unpack the second pair of numbers. */ + if ((ret = wtapi->unpack_start( + wtapi, session, "ii", k2->data, k2->size, &pstream)) != 0 || + (ret = wtapi->unpack_int(wtapi, pstream, &i2)) != 0) + goto err; + if ((ret = wtapi->unpack_int(wtapi, pstream, &p2)) != 0) + /* A missing primary key is OK and sorts first. 
*/ + p2 = INT64_MIN; + if ((ret = wtapi->pack_close(wtapi, pstream, NULL)) != 0) + goto err; + + /* sorting is reversed */ + if (i1 < i2) + *cmp = 1; + else if (i1 > i2) + *cmp = -1; + /* compare primary keys next, not reversed */ + else if (p1 < p2) + *cmp = -1; + else if (p1 > p2) + *cmp = 1; + else + *cmp = 0; /* index key and primary key are same */ + +err: return (ret); +} + +/* + * revint_terminate -- + * Terminate is called to free the collator and any associated memory. + */ +static int +revint_terminate(WT_COLLATOR *collator, WT_SESSION *session) +{ + (void)session; /* Unused parameters */ + + /* Free the allocated memory. */ + free(collator); + return (0); +} + +/* + * wiredtiger_extension_init -- + * WiredTiger revint collation extension. + */ +int +wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) +{ + REVINT_COLLATOR *revint_collator; + + (void)config; /* Unused parameters */ + + if ((revint_collator = calloc(1, sizeof(REVINT_COLLATOR))) == NULL) + return (errno); + + revint_collator->collator.compare = revint_compare; + revint_collator->collator.terminate = revint_terminate; + revint_collator->wt_api = connection->get_extension_api(connection); + + return (connection->add_collator( + connection, "revint", &revint_collator->collator, NULL)); +} diff --git a/ext/compressors/lz4/lz4_compress.c b/ext/compressors/lz4/lz4_compress.c index 062307b721a..35159d0fa76 100644 --- a/ext/compressors/lz4/lz4_compress.c +++ b/ext/compressors/lz4/lz4_compress.c @@ -26,13 +26,15 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include <wt_internal.h> - #include <lz4.h> #include <errno.h> #include <stdlib.h> #include <string.h> +#include <wiredtiger_config.h> +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + /* Local compressor structure. 
*/ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -62,6 +64,22 @@ typedef struct { uint32_t unused; /* Guaranteed to be 0 */ } LZ4_PREFIX; +#ifdef WORDS_BIGENDIAN +/* + * lz4_bswap32 -- + * 32-bit unsigned little-endian to/from big-endian value. + */ +static inline uint32_t +lz4_bswap32(uint32_t v) +{ + return ( + ((v << 24) & 0xff000000) | + ((v << 8) & 0x00ff0000) | + ((v >> 8) & 0x0000ff00) | + ((v >> 24) & 0x000000ff) + ); +} + /* * lz4_prefix_swap -- * The additional information is written in little-endian format, handle @@ -70,15 +88,12 @@ typedef struct { static inline void lz4_prefix_swap(LZ4_PREFIX *prefix) { -#ifdef WORDS_BIGENDIAN - prefix->compressed_len = __wt_bswap32(prefix->compressed_len); - prefix->uncompressed_len = __wt_bswap32(prefix->uncompressed_len); - prefix->useful_len = __wt_bswap32(prefix->useful_len); - prefix->unused = __wt_bswap32(prefix->unused); -#else - WT_UNUSED(prefix); -#endif + prefix->compressed_len = lz4_bswap32(prefix->compressed_len); + prefix->uncompressed_len = lz4_bswap32(prefix->uncompressed_len); + prefix->useful_len = lz4_bswap32(prefix->useful_len); + prefix->unused = lz4_bswap32(prefix->unused); } +#endif /* * lz4_error -- @@ -127,7 +142,9 @@ lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, prefix.uncompressed_len = (uint32_t)src_len; prefix.useful_len = (uint32_t)src_len; prefix.unused = 0; +#ifdef WORDS_BIGENDIAN lz4_prefix_swap(&prefix); +#endif memcpy(dst, &prefix, sizeof(LZ4_PREFIX)); *result_lenp = (size_t)lz4_len + sizeof(LZ4_PREFIX); @@ -163,7 +180,9 @@ lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, * decompressed bytes to return from the start of the source buffer. */ memcpy(&prefix, src, sizeof(LZ4_PREFIX)); +#ifdef WORDS_BIGENDIAN lz4_prefix_swap(&prefix); +#endif /* * Decompress, starting after the prefix bytes. 
Use safe decompression: @@ -278,7 +297,9 @@ lz4_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, prefix.uncompressed_len = (uint32_t)sourceSize; prefix.useful_len = offsets[slot]; prefix.unused = 0; +#ifdef WORDS_BIGENDIAN lz4_prefix_swap(&prefix); +#endif memcpy(dst, &prefix, sizeof(LZ4_PREFIX)); *result_slotsp = slot; diff --git a/ext/compressors/snappy/snappy_compress.c b/ext/compressors/snappy/snappy_compress.c index fcefb8bb575..981e334a2de 100644 --- a/ext/compressors/snappy/snappy_compress.c +++ b/ext/compressors/snappy/snappy_compress.c @@ -26,13 +26,15 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include <wt_internal.h> - #include <snappy-c.h> #include <errno.h> #include <stdlib.h> #include <string.h> +#include <wiredtiger_config.h> +#include <wiredtiger.h> +#include <wiredtiger_ext.h> + /* Local compressor structure. */ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ @@ -40,6 +42,27 @@ typedef struct { WT_EXTENSION_API *wt_api; /* Extension API */ } SNAPPY_COMPRESSOR; +#ifdef WORDS_BIGENDIAN +/* + * snappy_bswap64 -- + * 64-bit unsigned little-endian to/from big-endian value. + */ +static inline uint64_t +snappy_bswap64(uint64_t v) +{ + return ( + ((v << 56) & 0xff00000000000000UL) | + ((v << 40) & 0x00ff000000000000UL) | + ((v << 24) & 0x0000ff0000000000UL) | + ((v << 8) & 0x000000ff00000000UL) | + ((v >> 8) & 0x00000000ff000000UL) | + ((v >> 24) & 0x0000000000ff0000UL) | + ((v >> 40) & 0x000000000000ff00UL) | + ((v >> 56) & 0x00000000000000ffUL) + ); +} +#endif + /* * wt_snappy_error -- * Output an error message, and return a standard error code. @@ -109,7 +132,7 @@ wt_snappy_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, * Store the value in little-endian format. 
*/ #ifdef WORDS_BIGENDIAN - snaplen = __wt_bswap64(snaplen); + snaplen = snappy_bswap64(snaplen); #endif *(size_t *)dst = snaplen; } else @@ -142,7 +165,7 @@ wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, */ snaplen = *(size_t *)src; #ifdef WORDS_BIGENDIAN - snaplen = __wt_bswap64(snaplen); + snaplen = snappy_bswap64(snaplen); #endif if (snaplen + sizeof(size_t) > src_len) { (void)wt_api->err_printf(wt_api, diff --git a/src/async/async_op.c b/src/async/async_op.c index 130c704757b..970c33c3360 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -349,14 +349,8 @@ __wt_async_op_init(WT_SESSION_IMPL *session) WT_ERR(__async_op_init(conn, op, i)); } return (0); -err: - if (async->async_ops != NULL) { - __wt_free(session, async->async_ops); - async->async_ops = NULL; - } - if (async->async_queue != NULL) { - __wt_free(session, async->async_queue); - async->async_queue = NULL; - } + +err: __wt_free(session, async->async_ops); + __wt_free(session, async->async_queue); return (ret); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 03059c8f23a..812bf99acfb 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -812,8 +812,7 @@ __ckpt_string(WT_SESSION_IMPL *session, WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci)); WT_RET(__wt_buf_fmt(session, buf, - "version=%d", - ci->version)); + "version=%" PRIu8, ci->version)); if (ci->root_offset == WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]")); else diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index dceaae8bb99..0bb75d129e1 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -69,6 +69,21 @@ __bm_checkpoint(WT_BM *bm, } /* + * __bm_checkpoint_readonly -- + * Write a buffer into a block, creating a checkpoint; readonly version. 
+ */ +static int +__bm_checkpoint_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_cksum) +{ + WT_UNUSED(buf); + WT_UNUSED(ckptbase); + WT_UNUSED(data_cksum); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_checkpoint_load -- * Load a checkpoint. */ @@ -113,6 +128,16 @@ __bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_checkpoint_resolve_readonly -- + * Resolve the checkpoint; readonly version. + */ +static int +__bm_checkpoint_resolve_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_checkpoint_unload -- * Unload a checkpoint point. */ @@ -161,6 +186,16 @@ __bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_compact_end_readonly -- + * End a block manager compaction; readonly version. + */ +static int +__bm_compact_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_page_skip -- * Return if a page is useful for compaction. */ @@ -173,6 +208,21 @@ __bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session, } /* + * __bm_compact_page_skip_readonly -- + * Return if a page is useful for compaction; readonly version. + */ +static int +__bm_compact_page_skip_readonly(WT_BM *bm, WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_size, bool *skipp) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + WT_UNUSED(skipp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_skip -- * Return if a file can be compacted. */ @@ -183,6 +233,18 @@ __bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, bool *skipp) } /* + * __bm_compact_skip_readonly -- + * Return if a file can be compacted; readonly version. + */ +static int +__bm_compact_skip_readonly(WT_BM *bm, WT_SESSION_IMPL *session, bool *skipp) +{ + WT_UNUSED(skipp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_start -- * Start a block manager compaction. 
*/ @@ -193,6 +255,16 @@ __bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_compact_start_readonly -- + * Start a block manager compaction; readonly version. + */ +static int +__bm_compact_start_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_free -- * Free a block of space to the underlying file. */ @@ -204,6 +276,20 @@ __bm_free(WT_BM *bm, } /* + * __bm_free_readonly -- + * Free a block of space to the underlying file; readonly version. + */ +static int +__bm_free_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_is_mapped -- * Return if the file is mapped into memory. */ @@ -226,6 +312,31 @@ __bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_salvage_end_readonly -- + * End a block manager salvage; readonly version. + */ +static int +__bm_salvage_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* + * __bm_salvage_next_readonly -- + * Return the next block from the file; readonly version. + */ +static int +__bm_salvage_next_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *eofp) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_sizep); + WT_UNUSED(eofp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_salvage_next -- * Return the next block from the file. */ @@ -248,6 +359,16 @@ __bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_salvage_start_readonly -- + * Start a block manager salvage; readonly version. + */ +static int +__bm_salvage_start_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_salvage_valid -- * Inform salvage a block is valid. 
*/ @@ -260,6 +381,21 @@ __bm_salvage_valid(WT_BM *bm, } /* + * __bm_salvage_valid_readonly -- + * Inform salvage a block is valid; readonly version. + */ +static int +__bm_salvage_valid_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, bool valid) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + WT_UNUSED(valid); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_stat -- * Block-manager statistics. */ @@ -283,6 +419,18 @@ __bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, bool async) } /* + * __bm_sync_readonly -- + * Flush a file to disk; readonly version. + */ +static int +__bm_sync_readonly(WT_BM *bm, WT_SESSION_IMPL *session, bool async) +{ + WT_UNUSED(async); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_verify_addr -- * Verify an address. */ @@ -327,6 +475,23 @@ __bm_write(WT_BM *bm, WT_SESSION_IMPL *session, } /* + * __bm_write_readonly -- + * Write a buffer into a block, returning the block's address cookie; + * readonly version. + */ +static int +__bm_write_readonly(WT_BM *bm, WT_SESSION_IMPL *session, + WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_cksum) +{ + WT_UNUSED(buf); + WT_UNUSED(addr); + WT_UNUSED(addr_sizep); + WT_UNUSED(data_cksum); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_write_size -- * Return the buffer size required to write a block. */ @@ -337,84 +502,68 @@ __bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep) } /* + * __bm_write_size_readonly -- + * Return the buffer size required to write a block; readonly version. + */ +static int +__bm_write_size_readonly(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep) +{ + WT_UNUSED(sizep); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_method_set -- * Set up the legal methods. 
*/ static void __bm_method_set(WT_BM *bm, bool readonly) { + bm->addr_invalid = __bm_addr_invalid; + bm->addr_string = __bm_addr_string; + bm->block_header = __bm_block_header; + bm->checkpoint = __bm_checkpoint; + bm->checkpoint_load = __bm_checkpoint_load; + bm->checkpoint_resolve = __bm_checkpoint_resolve; + bm->checkpoint_unload = __bm_checkpoint_unload; + bm->close = __bm_close; + bm->compact_end = __bm_compact_end; + bm->compact_page_skip = __bm_compact_page_skip; + bm->compact_skip = __bm_compact_skip; + bm->compact_start = __bm_compact_start; + bm->free = __bm_free; + bm->is_mapped = __bm_is_mapped; + bm->preload = __wt_bm_preload; + bm->read = __wt_bm_read; + bm->salvage_end = __bm_salvage_end; + bm->salvage_next = __bm_salvage_next; + bm->salvage_start = __bm_salvage_start; + bm->salvage_valid = __bm_salvage_valid; + bm->size = __wt_block_manager_size; + bm->stat = __bm_stat; + bm->sync = __bm_sync; + bm->verify_addr = __bm_verify_addr; + bm->verify_end = __bm_verify_end; + bm->verify_start = __bm_verify_start; + bm->write = __bm_write; + bm->write_size = __bm_write_size; + if (readonly) { - bm->addr_invalid = __bm_addr_invalid; - bm->addr_string = __bm_addr_string; - bm->block_header = __bm_block_header; - bm->checkpoint = (int (*)(WT_BM *, WT_SESSION_IMPL *, - WT_ITEM *, WT_CKPT *, bool))__bm_readonly; - bm->checkpoint_load = __bm_checkpoint_load; - bm->checkpoint_resolve = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->checkpoint_unload = __bm_checkpoint_unload; - bm->close = __bm_close; - bm->compact_end = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *, - const uint8_t *, size_t, bool *))__bm_readonly; - bm->compact_skip = (int (*) - (WT_BM *, WT_SESSION_IMPL *, bool *))__bm_readonly; - bm->compact_start = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->free = (int (*)(WT_BM *, - WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly; - bm->is_mapped = 
__bm_is_mapped; - bm->preload = __wt_bm_preload; - bm->read = __wt_bm_read; - bm->salvage_end = (int (*) - (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *, - uint8_t *, size_t *, bool *))__bm_readonly; - bm->salvage_start = (int (*) - (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->salvage_valid = (int (*)(WT_BM *, - WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly; - bm->size = __wt_block_manager_size; - bm->stat = __bm_stat; - bm->sync = - (int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly; - bm->verify_addr = __bm_verify_addr; - bm->verify_end = __bm_verify_end; - bm->verify_start = __bm_verify_start; - bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *, - WT_ITEM *, uint8_t *, size_t *, bool))__bm_readonly; - bm->write_size = (int (*) - (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly; - } else { - bm->addr_invalid = __bm_addr_invalid; - bm->addr_string = __bm_addr_string; - bm->block_header = __bm_block_header; - bm->checkpoint = __bm_checkpoint; - bm->checkpoint_load = __bm_checkpoint_load; - bm->checkpoint_resolve = __bm_checkpoint_resolve; - bm->checkpoint_unload = __bm_checkpoint_unload; - bm->close = __bm_close; - bm->compact_end = __bm_compact_end; - bm->compact_page_skip = __bm_compact_page_skip; - bm->compact_skip = __bm_compact_skip; - bm->compact_start = __bm_compact_start; - bm->free = __bm_free; - bm->is_mapped = __bm_is_mapped; - bm->preload = __wt_bm_preload; - bm->read = __wt_bm_read; - bm->salvage_end = __bm_salvage_end; - bm->salvage_next = __bm_salvage_next; - bm->salvage_start = __bm_salvage_start; - bm->salvage_valid = __bm_salvage_valid; - bm->size = __wt_block_manager_size; - bm->stat = __bm_stat; - bm->sync = __bm_sync; - bm->verify_addr = __bm_verify_addr; - bm->verify_end = __bm_verify_end; - bm->verify_start = __bm_verify_start; - bm->write = __bm_write; - bm->write_size = __bm_write_size; + bm->checkpoint = __bm_checkpoint_readonly; + bm->checkpoint_resolve = 
__bm_checkpoint_resolve_readonly; + bm->compact_end = __bm_compact_end_readonly; + bm->compact_page_skip = __bm_compact_page_skip_readonly; + bm->compact_skip = __bm_compact_skip_readonly; + bm->compact_start = __bm_compact_start_readonly; + bm->free = __bm_free_readonly; + bm->salvage_end = __bm_salvage_end_readonly; + bm->salvage_next = __bm_salvage_next_readonly; + bm->salvage_start = __bm_salvage_start_readonly; + bm->salvage_valid = __bm_salvage_valid_readonly; + bm->sync = __bm_sync_readonly; + bm->write = __bm_write_readonly; + bm->write_size = __bm_write_size_readonly; } } diff --git a/src/block/block_open.c b/src/block/block_open.c index d9b2f908737..adb745c99e7 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -369,7 +369,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build only " "supports major/minor versions up to %d/%d, and the file " - "is version %d/%d", + "is version %" PRIu16 "/%" PRIu16, WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, desc->majorv, desc->minorv); diff --git a/src/block/block_write.c b/src/block/block_write.c index 4c6ac198fe4..e05a430832e 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -206,10 +206,16 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t cksum; bool local_locked; - blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; /* + * Clear the block header to ensure all of it is initialized, even the + * unused fields. + */ + blk = WT_BLOCK_HEADER_REF(buf->mem); + memset(blk, 0, sizeof(*blk)); + + /* * Swap the page-header as needed; this doesn't belong here, but it's * the best place to catch all callers. 
*/ diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 12df19a7e04..9cc56c56452 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -96,14 +96,13 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; - bool block_manager_begin, skip; + bool skip; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ref = NULL; - block_manager_begin = false; WT_STAT_FAST_DATA_INCR(session, session_compact); @@ -123,24 +122,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * - * There are three ways we call reconciliation: checkpoints, threads - * writing leaf pages (usually in preparation for a checkpoint or if - * closing a file), and eviction. - * - * We're holding the schema lock which serializes with checkpoints. - */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - - /* - * Get the tree handle's flush lock which blocks threads writing leaf - * pages. + * There are two ways we call reconciliation: checkpoints and eviction. + * Get the tree's flush lock which blocks threads writing pages for + * checkpoints. */ __wt_spin_lock(session, &btree->flush_lock); - /* Start compaction. */ - WT_ERR(bm->compact_start(bm, session)); - block_manager_begin = true; - /* Walk the tree reviewing pages to see if they should be re-written. */ for (;;) { /* @@ -170,9 +157,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); - if (block_manager_begin) - WT_TRET(bm->compact_end(bm, session)); - /* Unblock threads writing leaf pages. 
*/ __wt_spin_unlock(session, &btree->flush_lock); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index a083ec4016e..7475c0f1312 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -51,7 +51,8 @@ restart: if (cbt->btree->type == BTREE_ROW) { key.data = WT_INSERT_KEY(current); key.size = WT_INSERT_KEY_SIZE(current); - WT_RET(__wt_search_insert(session, cbt, &key)); + WT_RET(__wt_search_insert( + session, cbt, cbt->ins_head, &key)); } else cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index c11b7d35de6..1f3ac443495 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -173,13 +173,18 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) */ break; case BTREE_COL_VAR: + /* The search function doesn't check for empty pages. */ + if (page->pg_var_entries == 0) + return (false); + WT_ASSERT(session, cbt->slot < page->pg_var_entries); + /* - * If search returned an insert object, there may or may not be - * a matching on-page object, we have to check. Variable-length - * column-store pages don't map one-to-one to keys, but have - * "slots", check if search returned a valid slot. + * Column-store updates aren't stored on the page, instead they + * are stored as "insert" objects. If search returned an insert + * object we can't return, the returned on-page object must be + * checked for a match. */ - if (cbt->slot >= page->pg_var_entries) + if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) return (false); /* @@ -194,6 +199,11 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) return (false); break; case BTREE_ROW: + /* The search function doesn't check for empty pages. */ + if (page->pg_row_entries == 0) + return (false); + WT_ASSERT(session, cbt->slot < page->pg_row_entries); + /* * See above: for row-store, no insert object can have the same * key as an on-page object, we're done. 
@@ -201,15 +211,6 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) if (cbt->ins != NULL) return (false); - /* - * Check if searched returned a valid slot (the failure mode is - * an empty page, the search function doesn't check, and so the - * more exact test is "page->pg_row_entries == 0", but this test - * mirrors the column-store test). - */ - if (cbt->slot >= page->pg_row_entries) - return (false); - /* Updates are stored on the page, check for a delete. */ if (page->pg_row_upd != NULL && (upd = __wt_txn_read( session, page->pg_row_upd[cbt->slot])) != NULL) { @@ -1162,22 +1163,14 @@ int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) { WT_BTREE *btree; - WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; - cbt = (start != NULL) ? start : stop; - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; + session = (WT_SESSION_IMPL *)start->iface.session; + btree = start->btree; WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* - * We always delete in a forward direction because it's faster, assert - * our caller provided us with a start cursor. - */ - WT_ASSERT(session, start != NULL); - - /* * For recovery, log the start and stop keys for a truncate operation, * not the individual records removed. On the other hand, for rollback * we need to keep track of all the in-memory operations. 
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 795111d53f9..1f739c9572e 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -337,8 +337,7 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) copy = WT_ROW_KEY_COPY(rip); (void)__wt_row_leaf_key_info( page, copy, &ikey, NULL, NULL, NULL); - if (ikey != NULL) - __wt_free(session, ikey); + __wt_free(session, ikey); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 2db3ca7d984..1d33a7e7c9a 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -36,7 +36,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) btree = S2BT(session); /* Checkpoint files are readonly. */ - readonly = dhandle->checkpoint != NULL; + readonly = (dhandle->checkpoint != NULL || + F_ISSET(S2C(session), WT_CONN_READONLY)); /* Get the checkpoint information for this name/checkpoint pair. */ WT_CLEAR(ckpt); @@ -349,7 +350,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* Initialize locks. */ WT_RET(__wt_rwlock_alloc( session, &btree->ovfl_lock, "btree overflow lock")); - WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock")); + WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ btree->modified = 0; /* Clean */ diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index ac9faef4ff2..5cf6a9bf2bc 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -281,10 +281,8 @@ err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. 
*/ - if (upd != NULL) - __wt_free(session, upd); - if (first_upd != NULL) - __wt_free_update_list(session, first_upd); + __wt_free(session, upd); + __wt_free_update_list(session, first_upd); __wt_scr_free(session, ¤t_key); __wt_scr_free(session, &las_addr); @@ -460,12 +458,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; - bool busy, cache_work, oldgen, stalled; + bool busy, cache_work, evict_soon, stalled; int force_attempts; btree = S2BT(session); - for (oldgen = stalled = false, + for (evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: @@ -486,7 +484,16 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); - oldgen = LF_ISSET(WT_READ_WONT_NEED) || + + /* + * If configured to not trash the cache, leave the page + * generation unset, we'll set it before returning to + * the oldest read generation, so the page is forcibly + * evicted as soon as possible. We don't do that set + * here because we don't want to evict the page before + * we "acquire" it. + */ + evict_soon = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: @@ -575,20 +582,24 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags } /* - * If we read the page and we are configured to not - * trash the cache, set the oldest read generation so - * the page is forcibly evicted as soon as possible. + * If we read the page and are configured to not trash + * the cache, and no other thread has already used the + * page, set the oldest read generation so the page is + * forcibly evicted as soon as possible. * - * Otherwise, update the page's read generation. 
+ * Otherwise, if we read the page, or, if configured to + * update the page's read generation and the page isn't + * already flagged for forced eviction, update the page + * read generation. */ page = ref->page; - if (oldgen && page->read_gen == WT_READGEN_NOTSET) - __wt_page_evict_soon(page); - else if (!LF_ISSET(WT_READ_NO_GEN) && - page->read_gen != WT_READGEN_OLDEST && - page->read_gen < __wt_cache_read_gen(session)) - page->read_gen = - __wt_cache_read_gen_bump(session); + if (page->read_gen == WT_READGEN_NOTSET) { + if (evict_soon) + __wt_page_evict_soon(page); + else + __wt_cache_read_gen_new(session, page); + } else if (!LF_ISSET(WT_READ_NO_GEN)) + __wt_cache_read_gen_bump(session, page); skip_evict: /* * Check if we need an autocommit transaction. diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 86360e83ddf..d94eb2ddd80 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -412,6 +412,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_UNUSED(cfg); btree = S2BT(session); + evict_reset = false; /* * If the tree has never been written to disk, we're done, rebalance @@ -438,7 +439,8 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) * cache is the root page, and that cannot be evicted; however, this way * eviction ignores the tree entirely.) */ - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); + WT_ERR(__wt_evict_file_exclusive_on(session)); + evict_reset = true; /* Recursively walk the tree. */ switch (rs->type) { @@ -470,7 +472,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) btree->root.page = rs->root; rs->root = NULL; -err: /* Discard any leftover root page we created. */ +err: if (evict_reset) + __wt_evict_file_exclusive_off(session); + + /* Discard any leftover root page we created. 
*/ if (rs->root != NULL) { __wt_page_modify_clear(session, rs->root); __wt_page_out(session, &rs->root); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 8d78bda79fb..0e064d306b6 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1206,8 +1206,7 @@ __slvg_col_build_internal( __wt_root_ref_init(&ss->root_ref, page, true); if (0) { -err: if (addr != NULL) - __wt_free(session, addr); +err: __wt_free(session, addr); __wt_page_out(session, &page); } return (ret); @@ -1868,8 +1867,7 @@ __slvg_row_build_internal( __wt_root_ref_init(&ss->root_ref, page, false); if (0) { -err: if (addr != NULL) - __wt_free(session, addr); +err: __wt_free(session, addr); __wt_page_out(session, &page); } return (ret); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index bd38451d5d1..4f16a290958 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -151,8 +151,7 @@ __wt_split_stash_discard_all( for (i = 0, stash = session->split_stash; i < session->split_stash_cnt; ++i, ++stash) - if (stash->p != NULL) - __wt_free(session_safe, stash->p); + __wt_free(session_safe, stash->p); __wt_free(session_safe, session->split_stash); session->split_stash_cnt = session->split_stash_alloc = 0; @@ -1383,11 +1382,27 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; bool parent_hazard; + btree = S2BT(session); + + /* + * Disallow internal splits during the final pass of a checkpoint. Most + * splits are already disallowed during checkpoints, but an important + * exception is insert splits. 
The danger is an insert split creates a + * new chunk of the namespace, and then the internal split will move it + * to a different part of the tree where it will be written; in other + * words, in one part of the tree we'll skip the newly created insert + * split chunk, but we'll write it upon finding it in a different part + * of the tree. + */ + if (btree->checkpointing != WT_CKPT_OFF) + return (__split_internal_unlock(session, page, page_hazard)); + /* * Page splits trickle up the tree, that is, as leaf pages grow large * enough and are evicted, they'll split into their parent. And, as @@ -1771,8 +1786,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); /* - * The new page is dirty by definition, column-store splits update the - * page-modify structure, so create it now. + * The new page is dirty by definition, plus column-store splits update + * the page-modify structure, so create it now. */ WT_ERR(__wt_page_modify_init(session, right)); __wt_page_modify_set(session, right); @@ -1813,15 +1828,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * We modified the page above, which will have set the first dirty - * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Set the first dirty - * transaction to an impossibly old value so this page is never skipped - * in a checkpoint. - */ - right->modify->first_dirty_txn = WT_TXN_FIRST; - - /* * Calculate how much memory we're moving: figure out how deep the skip * list stack is for the element we are moving, and the memory used by * the item's list of updates. @@ -1919,6 +1925,24 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* + * We perform insert splits concurrently with checkpoints, where the + * requirement is a checkpoint must include either the original page + * or both new pages. 
The page we're splitting is dirty, but that's + * insufficient: set the first dirty transaction to an impossibly old + * value so this page is not skipped by a checkpoint. + */ + page->modify->first_dirty_txn = WT_TXN_FIRST; + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we installed may be older than that. Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. + */ + right->modify->first_dirty_txn = WT_TXN_FIRST; + + /* * Update the page accounting. * * XXX @@ -1928,10 +1952,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. On successful return, the original page is no - * longer locked, so we cannot safely look at it. + * The act of splitting into the parent releases the pages for eviction; + * ensure the page contents are consistent. + */ + WT_WRITE_BARRIER(); + + /* + * Split into the parent. */ - page = NULL; if ((ret = __split_parent( session, ref, split_ref, 2, parent_incr, false, true)) == 0) return (0); @@ -1941,7 +1969,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Reset the split column-store page record. 
*/ - page->modify->mod_split_recno = WT_RECNO_OOB; + if (type != WT_PAGE_ROW_LEAF) + page->modify->mod_split_recno = WT_RECNO_OOB; /* * Clear the allocated page's reference to the moved insert list element diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 5cbd8d1e996..57056eb5c99 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -17,18 +17,18 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; - uint64_t saved_snap_min; + uint64_t oldest_id, saved_snap_min; uint32_t flags; - bool evict_reset; + conn = S2C(session); btree = S2BT(session); - walk = NULL; txn = &session->txn; saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; @@ -56,6 +56,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) return (0); } + /* + * Save the oldest transaction ID we need to keep around. + * Otherwise, in a busy system, we could be updating pages so + * fast that write leaves never catches up. We deliberately + * have no transaction running at this point that would keep + * the oldest ID from moving forwards as we walk the tree. + */ + oldest_id = __wt_txn_oldest_id(session); + flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); @@ -64,13 +73,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * Write dirty pages if nobody beat us to it. Don't - * try to write the hottest pages: checkpoint will have - * to visit them anyway. + * try to write hot pages (defined as pages that have + * been updated since the write phase leaves started): + * checkpoint will have to visit them anyway. 
*/ page = walk->page; if (__wt_page_is_modified(page) && - __wt_txn_visible_all( - session, page->modify->update_txn)) { + WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; @@ -105,19 +114,18 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_spin_lock(session, &btree->flush_lock); /* - * When internal pages are being reconciled by checkpoint their - * child pages cannot disappear from underneath them or be split - * into them, nor can underlying blocks be freed until the block - * lists for the checkpoint are stable. Set the checkpointing - * flag to block eviction of dirty pages until the checkpoint's - * internal page pass is complete, then wait for any existing - * eviction to complete. + * In the final checkpoint pass, child pages cannot be evicted + * from underneath internal pages nor can underlying blocks be + * freed until the checkpoint's block lists are stable. Also, + * we cannot split child pages into parents unless we know the + * final pass will write a consistent view of that namespace. + * Set the checkpointing flag to block such actions and wait for + * any problematic eviction or page splits to complete. */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - if (evict_reset) - __wt_evict_file_exclusive_off(session); + WT_ERR(__wt_evict_file_exclusive_on(session)); + __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); @@ -215,7 +223,7 @@ err: /* On error, clear any left-over tree walk. */ * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); + conn->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); @@ -249,7 +257,8 @@ err: /* On error, clear any left-over tree walk. 
*/ * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ - if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) + if (ret == 0 && + syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); @@ -260,24 +269,18 @@ err: /* On error, clear any left-over tree walk. */ * Cache operations. */ int -__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op) +__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op) { - WT_DECL_RET; - WT_BTREE *btree; - - btree = S2BT(session); - switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_CLOSE: /* - * Set the checkpoint reference for reconciliation; it's ugly, - * but drilling a function parameter path from our callers to - * the reconciliation of the tree's root page is going to be - * worse. + * Make sure the checkpoint reference is set for + * reconciliation; it's ugly, but drilling a function parameter + * path from our callers to the reconciliation of the tree's + * root page is going to be worse. 
*/ - WT_ASSERT(session, btree->ckpt == NULL); - btree->ckpt = ckptbase; + WT_ASSERT(session, S2BT(session)->ckpt != NULL); break; case WT_SYNC_DISCARD: case WT_SYNC_WRITE_LEAVES: @@ -287,23 +290,10 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op) switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_WRITE_LEAVES: - WT_ERR(__sync_file(session, op)); - break; + return (__sync_file(session, op)); case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: - WT_ERR(__wt_evict_file(session, op)); - break; + return (__wt_evict_file(session, op)); + WT_ILLEGAL_VALUE(session); } - -err: switch (op) { - case WT_SYNC_CHECKPOINT: - case WT_SYNC_CLOSE: - btree->ckpt = NULL; - break; - case WT_SYNC_DISCARD: - case WT_SYNC_WRITE_LEAVES: - break; - } - - return (ret); } diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index ae2c20be1b6..952298f2456 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -226,7 +226,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); - WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 55b11d7b2d1..bb8a750d848 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -583,14 +583,14 @@ restart: /* break; } WT_ERR(ret); + couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. 
*/ if (WT_PAGE_IS_INTERNAL(ref->page)) { -descend: couple = ref; - empty_internal = true; +descend: empty_internal = true; /* * There's a split race when a cursor is setting @@ -649,7 +649,6 @@ descend: couple = ref; */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) { - couple = ref; if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 645d98d9c9b..fd60b12538a 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -25,6 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; @@ -60,6 +61,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); + mod = page->modify; /* * Delete, insert or update a column-store entry. @@ -105,17 +107,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_append, ins_headp, 1); - ins_headp = &page->modify->mod_append[0]; + page, mod->mod_append, ins_headp, 1); + ins_headp = &mod->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, 1); - ins_headp = &page->modify->mod_update[0]; + page, mod->mod_update, ins_headp, 1); + ins_headp = &mod->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, + page, mod->mod_update, ins_headp, page->pg_var_entries); - ins_headp = &page->modify->mod_update[cbt->slot]; + ins_headp = &mod->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. 
*/ @@ -135,6 +137,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; + /* + * Check for insert split and checkpoint races in column-store: + * it's easy (as opposed to in row-store) and a difficult bug to + * otherwise diagnose. + */ + WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB || + (recno != WT_RECNO_OOB && mod->mod_split_recno > recno)); + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 3aa31044b82..4730267a545 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -77,6 +77,7 @@ __wt_col_search(WT_SESSION_IMPL *session, int depth; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); @@ -116,12 +117,19 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } -restart_root: + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + /* Search the internal pages of the tree. */ current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -138,10 +146,8 @@ restart_page: page = current->page; * on the page), check for an internal page split race. */ if (__wt_split_descent_race( - session, current, parent_pindex)) { - WT_RET(__wt_page_release(session, current, 0)); - goto restart_root; - } + session, current, parent_pindex)) + goto restart; goto descend; } @@ -178,8 +184,14 @@ descend: /* /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. 
+ * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -188,7 +200,7 @@ descend: /* continue; } if (ret == WT_RESTART) - goto restart_page; + goto restart; return (ret); } @@ -199,7 +211,6 @@ descend: /* leaf_only: page = current->page; cbt->ref = current; - cbt->recno = recno; /* * Don't bother searching if the caller is appending a new record where @@ -213,13 +224,6 @@ leaf_only: } /* - * Set the on-page slot to an impossible value larger than any possible - * slot (it's used to interpret the search function's return after the - * search returns an insert list for a page that has no entries). - */ - cbt->slot = UINT32_MAX; - - /* * Search the leaf page. * * Search after a page is pinned does a search of the pinned page before @@ -232,28 +236,38 @@ leaf_only: * that's impossibly large for the page. We do have additional setup to * do in that case, the record may be appended to the page. */ - cbt->compare = 0; if (page->type == WT_PAGE_COL_FIX) { if (recno < page->pg_fix_recno) { + cbt->recno = page->pg_fix_recno; cbt->compare = 1; return (0); } if (recno >= page->pg_fix_recno + page->pg_fix_entries) { cbt->recno = page->pg_fix_recno + page->pg_fix_entries; goto past_end; - } else + } else { + cbt->recno = recno; + cbt->compare = 0; ins_head = WT_COL_UPDATE_SINGLE(page); + } } else { if (recno < page->pg_var_recno) { + cbt->recno = page->pg_var_recno; + cbt->slot = 0; cbt->compare = 1; return (0); } if ((cip = __col_var_search(page, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(page); + cbt->slot = page->pg_var_entries == 0 ? 
+ 0 : page->pg_var_entries - 1; goto past_end; } else { + cbt->recno = recno; cbt->slot = WT_COL_SLOT(page, cip); + cbt->compare = 0; ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); + F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH); } } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 8b9e858ec18..9fff092d079 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -52,6 +52,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__wt_scr_alloc(session, 0, &key)); WT_RET(__wt_scr_alloc(session, (uint32_t)__bitstr_size(page->pg_row_entries), &tmp)); + memset(tmp->mem, 0, tmp->memsize); if ((gap = btree->key_gap) == 0) gap = 1; diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 28c55a4ccd0..6169a0a810a 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -9,18 +9,17 @@ #include "wt_internal.h" /* - * __wt_search_insert_append -- + * __search_insert_append -- * Fast append search of a row-store insert list, creating a skiplist stack * as we go. */ static inline int -__wt_search_insert_append(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep) +__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; int cmp, i; @@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, collator = btree->collator; *donep = 0; - inshead = cbt->ins_head; - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); key.data = WT_INSERT_KEY(ins); key.size = WT_INSERT_KEY_SIZE(ins); @@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, */ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) { cbt->ins_stack[i] = (i == 0) ? &ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? 
+ &ins_head->tail[i]->next[i] : &ins_head->head[i]; cbt->next_stack[i] = NULL; } cbt->compare = -cmp; cbt->ins = ins; + cbt->ins_head = ins_head; *donep = 1; } return (0); @@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, * Search a row-store insert list, creating a skiplist stack as we go. */ int -__wt_search_insert( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key) +__wt_search_insert(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins, **insp, *last_ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; size_t match, skiphigh, skiplow; int cmp, i; btree = S2BT(session); collator = btree->collator; - inshead = cbt->ins_head; cmp = 0; /* -Wuninitialized */ /* @@ -86,7 +83,7 @@ __wt_search_insert( */ match = skiphigh = skiplow = 0; ins = last_ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { if ((ins = *insp) == NULL) { cbt->next_stack[i] = NULL; cbt->ins_stack[i--] = insp--; @@ -128,6 +125,7 @@ __wt_search_insert( */ cbt->compare = -cmp; cbt->ins = (ins != NULL) ? ins : last_ins; + cbt->ins_head = ins_head; return (0); } @@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_COLLATOR *collator; WT_DECL_RET; + WT_INSERT_HEAD *ins_head; WT_ITEM *item; WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; @@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session, goto leaf_only; } + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + skiphigh = skiplow = 0; + } + /* Search the internal pages of the tree. 
*/ -restart_root: current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -419,20 +426,20 @@ restart_page: page = current->page; */ if (pindex->entries == base) { append: if (__wt_split_descent_race( - session, current, parent_pindex)) { - if ((ret = __wt_page_release( - session, current, 0)) != 0) - return (ret); - - skiplow = skiphigh = 0; - goto restart_root; - } + session, current, parent_pindex)) + goto restart; } descend: /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -440,10 +447,8 @@ descend: /* current = descent; continue; } - if (ret == WT_RESTART) { - skiphigh = skiplow = 0; - goto restart_page; - } + if (ret == WT_RESTART) + goto restart; return (ret); } @@ -456,6 +461,12 @@ leaf_only: cbt->ref = current; /* + * Clear current now that we have moved the reference into the btree + * cursor, so that cleanup never releases twice. + */ + current = NULL; + + /* * In the case of a right-side tree descent during an insert, do a fast * check for an append to the page, try to catch cursors appending data * into the tree. 
@@ -479,24 +490,18 @@ leaf_only: cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (page->pg_row_entries - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); - - /* - * Don't leave the insert list head set, code external to the - * search uses it. - */ - cbt->ins_head = NULL; } /* @@ -589,16 +594,16 @@ leaf_match: cbt->compare = 0; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } /* If there's no insert list, we're done. */ - if (WT_SKIP_FIRST(cbt->ins_head) == NULL) + if (WT_SKIP_FIRST(ins_head) == NULL) return (0); /* @@ -606,23 +611,16 @@ leaf_match: cbt->compare = 0; * catch cursors repeatedly inserting at a single point. */ if (insert) { - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); } - WT_ERR(__wt_search_insert(session, cbt, srch_key)); + WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key)); return (0); -err: /* - * Release the current page if the search started at the root. If the - * search didn't start at the root we should never have gone looking - * beyond the start page. 
- */ - WT_ASSERT(session, leaf == NULL || leaf == current); - if (leaf == NULL) - WT_TRET(__wt_page_release(session, current, 0)); +err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } @@ -660,19 +658,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert * list. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - - /* + * * Walk down the list until we find a level with at least 50 entries, * that's where we'll start rolling random numbers. The value 50 is * used to ignore levels with only a few entries, that is, levels which * are potentially badly skewed. */ - for (ins_head = cbt->ins_head, - level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { start = &ins_head->head[level]; for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) @@ -767,6 +762,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) ins = ins->next[0]; cbt->ins = ins; + cbt->ins_head = ins_head; cbt->compare = 0; return (0); @@ -786,11 +782,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_REF *current, *descent; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); -restart_root: - /* Walk the internal pages of the tree. */ + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + + /* Search the internal pages of the tree. 
*/ current = &btree->root; for (;;) { page = current->page; @@ -802,22 +806,19 @@ restart_root: __wt_random(&session->rnd) % pindex->entries]; /* - * Swap the parent page for the child page; return on error, - * the swap function ensures we're holding nothing on failure. + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. */ if ((ret = __wt_page_swap( session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } - /* - * Restart is returned if we find a page that's been split; the - * held page isn't discarded when restart is returned, discard - * it and restart the search from the top of the tree. - */ - if (ret == WT_RESTART && - (ret = __wt_page_release(session, current, 0)) == 0) - goto restart_root; + if (ret == WT_RESTART) + goto restart; return (ret); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 1ef8dd32bb4..8796ec6b2fc 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -58,6 +58,8 @@ __wt_las_create(WT_SESSION_IMPL *session) conn = S2C(session); + if (F_ISSET(conn, WT_CONN_READONLY)) + return (0); /* * Done at startup: we cannot do it on demand because we require the * schema lock to create and drop the table, and it may not always be @@ -203,7 +205,7 @@ __wt_las_cursor( * useful more than once. 
*/ *session_flags = - F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); conn = S2C(session); diff --git a/src/config/config.c b/src/config/config.c index f480ab83dbd..96ef7a4e62a 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -16,9 +16,9 @@ static int __config_err(WT_CONFIG *conf, const char *msg, int err) { WT_RET_MSG(conf->session, err, - "Error parsing '%.*s' at byte %u: %s", + "Error parsing '%.*s' at offset %" WT_PTRDIFFT_FMT ": %s", (int)(conf->end - conf->orig), conf->orig, - (u_int)(conf->cur - conf->orig), msg); + conf->cur - conf->orig, msg); } /* diff --git a/src/config/config_def.c b/src/config/config_def.c index 879de670695..c752e5eb265 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -99,6 +99,7 @@ static const WT_CONFIG_CHECK static const WT_CONFIG_CHECK confchk_wiredtiger_open_statistics_log_subconfigs[] = { + { "json", "boolean", NULL, NULL, NULL, 0 }, { "on_close", "boolean", NULL, NULL, NULL, 0 }, { "path", "string", NULL, NULL, NULL, 0 }, { "sources", "list", NULL, NULL, NULL, 0 }, @@ -146,7 +147,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," @@ -390,6 +391,61 @@ static const WT_CONFIG_CHECK confchk_colgroup_meta[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_file_config[] = { + { "allocation_size", "int", + NULL, "min=512B,max=128MB", + NULL, 0 }, + { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "block_allocation", "string", + NULL, "choices=[\"first\",\"best\"]", + NULL, 0 }, + { "block_compressor", "string", NULL, NULL, NULL, 0 }, + { "cache_resident", 
"boolean", NULL, NULL, NULL, 0 }, + { "checksum", "string", + NULL, "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL, 0 }, + { "collator", "string", NULL, NULL, NULL, 0 }, + { "columns", "list", NULL, NULL, NULL, 0 }, + { "dictionary", "int", NULL, "min=0", NULL, 0 }, + { "encryption", "category", + NULL, NULL, + confchk_WT_SESSION_create_encryption_subconfigs, 2 }, + { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, + { "huffman_key", "string", NULL, NULL, NULL, 0 }, + { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, + { "internal_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, + { "key_gap", "int", NULL, "min=0", NULL, 0 }, + { "leaf_item_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_key_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "leaf_value_max", "int", NULL, "min=0", NULL, 0 }, + { "log", "category", + NULL, NULL, + confchk_WT_SESSION_create_log_subconfigs, 1 }, + { "memory_page_max", "int", + NULL, "min=512B,max=10TB", + NULL, 0 }, + { "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 }, + { "os_cache_max", "int", NULL, "min=0", NULL, 0 }, + { "prefix_compression", "boolean", NULL, NULL, NULL, 0 }, + { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, + { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, + { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "value_format", "format", + __wt_struct_confchk, NULL, + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_file_meta[] = { { "allocation_size", "int", NULL, "min=512B,max=128MB", @@ -465,6 +521,67 @@ static const WT_CONFIG_CHECK 
confchk_index_meta[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_lsm_meta[] = { + { "allocation_size", "int", + NULL, "min=512B,max=128MB", + NULL, 0 }, + { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "block_allocation", "string", + NULL, "choices=[\"first\",\"best\"]", + NULL, 0 }, + { "block_compressor", "string", NULL, NULL, NULL, 0 }, + { "cache_resident", "boolean", NULL, NULL, NULL, 0 }, + { "checksum", "string", + NULL, "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL, 0 }, + { "chunks", "string", NULL, NULL, NULL, 0 }, + { "collator", "string", NULL, NULL, NULL, 0 }, + { "columns", "list", NULL, NULL, NULL, 0 }, + { "dictionary", "int", NULL, "min=0", NULL, 0 }, + { "encryption", "category", + NULL, NULL, + confchk_WT_SESSION_create_encryption_subconfigs, 2 }, + { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, + { "huffman_key", "string", NULL, NULL, NULL, 0 }, + { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, + { "internal_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, + { "key_gap", "int", NULL, "min=0", NULL, 0 }, + { "last", "string", NULL, NULL, NULL, 0 }, + { "leaf_item_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_key_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "leaf_value_max", "int", NULL, "min=0", NULL, 0 }, + { "log", "category", + NULL, NULL, + confchk_WT_SESSION_create_log_subconfigs, 1 }, + { "lsm", "category", + NULL, NULL, + confchk_WT_SESSION_create_lsm_subconfigs, 11 }, + { "memory_page_max", "int", + NULL, "min=512B,max=10TB", + NULL, 0 }, + { "old_chunks", "string", NULL, NULL, NULL, 0 }, + { "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 }, + 
{ "os_cache_max", "int", NULL, "min=0", NULL, 0 }, + { "prefix_compression", "boolean", NULL, NULL, NULL, 0 }, + { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, + { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, + { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "value_format", "format", + __wt_struct_confchk, NULL, + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_table_meta[] = { { "app_metadata", "string", NULL, NULL, NULL, 0 }, { "colgroups", "list", NULL, NULL, NULL, 0 }, @@ -544,6 +661,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -554,7 +672,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -624,6 +742,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -634,7 +753,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + 
confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -701,6 +820,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -711,7 +831,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -776,6 +896,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -786,7 +907,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -853,7 +974,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "file_max=100MB,path=,prealloc=,recover=on,zero_fill=0)," "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=," "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - 
"statistics=none,statistics_log=(on_close=0," + "statistics=none,statistics_log=(json=0,on_close=0," "path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", confchk_WT_CONNECTION_reconfigure, 18 @@ -980,6 +1101,20 @@ static const WT_CONFIG_ENTRY config_entries[] = { "app_metadata=,collator=,columns=,source=,type=file", confchk_colgroup_meta, 5 }, + { "file.config", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checksum=uncompressed," + "collator=,columns=,dictionary=0,encryption=(keyid=,name=)," + "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB," + "key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0," + "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=)," + "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0," + "prefix_compression=0,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_file_config, 33 + }, { "file.meta", "allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=," @@ -1000,6 +1135,23 @@ static const WT_CONFIG_ENTRY config_entries[] = { "index_key_columns=,key_format=u,source=,type=file,value_format=u", confchk_index_meta, 10 }, + { "lsm.meta", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checksum=uncompressed,chunks=" + ",collator=,columns=,dictionary=0,encryption=(keyid=,name=)," + "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB," + "key_format=u,key_gap=10,last=,leaf_item_max=0,leaf_key_max=0," + "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=)," + "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=," + "bloom_hash_count=8,bloom_oldest=0,chunk_count_limit=0," + 
"chunk_max=5GB,chunk_size=10MB,merge_max=15,merge_min=0)," + "memory_page_max=5MB,old_chunks=,os_cache_dirty_max=0," + "os_cache_max=0,prefix_compression=0,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_lsm_meta, 37 + }, { "table.meta", "app_metadata=,colgroups=,collator=,columns=,key_format=u," "value_format=u", @@ -1017,14 +1169,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment=," "use_environment_priv=0,verbose=,write_through=", - confchk_wiredtiger_open, 37 + confchk_wiredtiger_open, 38 }, { "wiredtiger_open_all", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1038,15 +1190,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," 
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment=," "use_environment_priv=0,verbose=,version=(major=0,minor=0)," "write_through=", - confchk_wiredtiger_open_all, 38 + confchk_wiredtiger_open_all, 39 }, { "wiredtiger_open_basecfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1059,14 +1211,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),verbose=," "version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 32 + confchk_wiredtiger_open_basecfg, 33 }, { "wiredtiger_open_usercfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1079,14 +1231,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," 
"transaction_sync=(enabled=0,method=fsync),verbose=," "write_through=", - confchk_wiredtiger_open_usercfg, 31 + confchk_wiredtiger_open_usercfg, 32 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/conn/api_strerror.c b/src/conn/api_strerror.c index edb11957556..87864f7f4b0 100644 --- a/src/conn/api_strerror.c +++ b/src/conn/api_strerror.c @@ -40,6 +40,8 @@ __wt_wiredtiger_error(int error) return ("WT_RUN_RECOVERY: recovery must be run to continue"); case WT_CACHE_FULL: return ("WT_CACHE_FULL: operation would overflow cache"); + case WT_PERM_DENIED: + return ("WT_PERM_DENIED: permission denied (internal)"); } /* diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 27977de63b2..6d115c8fdcd 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -772,6 +772,19 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) conn->extension_api.transaction_visible = __wt_ext_transaction_visible; conn->extension_api.version = wiredtiger_version; + /* Streaming pack/unpack API */ + conn->extension_api.pack_start = __wt_ext_pack_start; + conn->extension_api.unpack_start = __wt_ext_unpack_start; + conn->extension_api.pack_close = __wt_ext_pack_close; + conn->extension_api.pack_item = __wt_ext_pack_item; + conn->extension_api.pack_int = __wt_ext_pack_int; + conn->extension_api.pack_str = __wt_ext_pack_str; + conn->extension_api.pack_uint = __wt_ext_pack_uint; + conn->extension_api.unpack_item = __wt_ext_unpack_item; + conn->extension_api.unpack_int = __wt_ext_unpack_int; + conn->extension_api.unpack_str = __wt_ext_unpack_str; + conn->extension_api.unpack_uint = __wt_ext_unpack_uint; + return (&conn->extension_api); } @@ -1109,6 +1122,29 @@ __conn_config_append(const char *cfg[], const char *config) } /* + * __conn_config_readonly -- + * Append an entry to a config stack that overrides some settings + * when read-only is configured. + */ +static void +__conn_config_readonly(const char *cfg[]) +{ + const char *readonly; + + /* + * Override certain settings. 
In general we override the options + * whose default conflicts. Other settings at odds will return + * an error and will be checked when those settings are processed. + */ + readonly="checkpoint=(wait=0)," + "config_base=false," + "create=false," + "log=(archive=false,prealloc=false)," + "lsm_manager=(merge=false),"; + __conn_config_append(cfg, readonly); +} + +/* * __conn_config_check_version -- * Check if a configuration version isn't compatible. */ @@ -1382,7 +1418,7 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_FH *fh; size_t len; wt_off_t size; - bool exist, is_create; + bool bytelock, exist, is_create; char buf[256]; conn = S2C(session); @@ -1391,6 +1427,10 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_config_gets(session, cfg, "create", &cval)); is_create = cval.val != 0; + if (F_ISSET(conn, WT_CONN_READONLY)) + is_create = false; + + bytelock = true; __wt_spin_lock(session, &__wt_process.spinlock); /* @@ -1448,47 +1488,89 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) exist = false; if (!is_create) WT_ERR(__wt_exist(session, WT_WIREDTIGER, &exist)); - WT_ERR(__wt_open(session, - WT_SINGLETHREAD, is_create || exist, false, 0, &conn->lock_fh)); + ret = __wt_open(session, + WT_SINGLETHREAD, is_create || exist, false, 0, &conn->lock_fh); /* - * Lock a byte of the file: if we don't get the lock, some other process - * is holding it, we're done. The file may be zero-length, and that's - * OK, the underlying call supports locking past the end-of-file. + * If this is a read-only connection and we cannot grab the lock + * file, check if it is because there is not write permission or + * if the file does not exist. If so, then ignore the error. + * XXX Ignoring the error does allow multiple read-only + * connections to exist at the same time on a read-only directory. 
*/ - if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, true) != 0) - WT_ERR_MSG(session, EBUSY, - "WiredTiger database is already being managed by another " - "process"); + if (F_ISSET(conn, WT_CONN_READONLY)) { + /* + * If we got an expected permission or non-existence error + * then skip the byte lock. + */ + ret = __wt_map_error_rdonly(ret); + if (ret == WT_NOTFOUND || ret == WT_PERM_DENIED) { + bytelock = false; + ret = 0; + } + } + WT_ERR(ret); + if (bytelock) { + /* + * Lock a byte of the file: if we don't get the lock, some other + * process is holding it, we're done. The file may be + * zero-length, and that's OK, the underlying call supports + * locking past the end-of-file. + */ + if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, true) != 0) + WT_ERR_MSG(session, EBUSY, + "WiredTiger database is already being managed by " + "another process"); - /* - * If the size of the lock file is non-zero, we created it (or won a - * locking race with the thread that created it, it doesn't matter). - * - * Write something into the file, zero-length files make me nervous. - * - * The test against the expected length is sheer paranoia (the length - * should be 0 or correct), but it shouldn't hurt. - */ + /* + * If the size of the lock file is non-zero, we created it (or + * won a locking race with the thread that created it, it + * doesn't matter). + * + * Write something into the file, zero-length files make me + * nervous. + * + * The test against the expected length is sheer paranoia (the + * length should be 0 or correct), but it shouldn't hurt. 
+ */ #define WT_SINGLETHREAD_STRING "WiredTiger lock file\n" - WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); - if (size != strlen(WT_SINGLETHREAD_STRING)) - WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0, - strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING)); + WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); + if (size != strlen(WT_SINGLETHREAD_STRING)) + WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0, + strlen(WT_SINGLETHREAD_STRING), + WT_SINGLETHREAD_STRING)); + + } /* We own the lock file, optionally create the WiredTiger file. */ - WT_ERR(__wt_open(session, WT_WIREDTIGER, is_create, false, 0, &fh)); + ret = __wt_open(session, WT_WIREDTIGER, is_create, false, 0, &fh); /* - * Lock the WiredTiger file (for backward compatibility reasons as - * described above). Immediately release the lock, it's just a test. + * If we're read-only, check for success as well as handled errors. + * Even if we're able to open the WiredTiger file successfully, we + * do not try to lock it. The lock file test above is the only + * one we do for read-only. */ - if (__wt_bytelock(fh, (wt_off_t)0, true) != 0) { - WT_ERR_MSG(session, EBUSY, - "WiredTiger database is already being managed by another " - "process"); + if (F_ISSET(conn, WT_CONN_READONLY)) { + ret = __wt_map_error_rdonly(ret); + if (ret == 0 || ret == WT_NOTFOUND || ret == WT_PERM_DENIED) + ret = 0; + WT_ERR(ret); + } else { + WT_ERR(ret); + + /* + * Lock the WiredTiger file (for backward compatibility reasons + * as described above). Immediately release the lock, it's + * just a test. + */ + if (__wt_bytelock(fh, (wt_off_t)0, true) != 0) { + WT_ERR_MSG(session, EBUSY, + "WiredTiger database is already being managed by " + "another process"); + } + WT_ERR(__wt_bytelock(fh, (wt_off_t)0, false)); } - WT_ERR(__wt_bytelock(fh, (wt_off_t)0, false)); /* * We own the database home, figure out if we're creating it. 
There are @@ -1502,11 +1584,21 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) conn->is_new = exist ? 0 : 1; if (conn->is_new) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR_MSG(session, EINVAL, "Creating a new database is" + " incompatible with read-only configuration."); len = (size_t)snprintf(buf, sizeof(buf), "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING); WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf)); WT_ERR(__wt_fsync(session, fh)); } else { + /* + * Although exclusive and the read-only configuration settings + * are at odds, we do not have to check against read-only here + * because it falls out from earlier code in this function + * preventing creation and confirming the database + * already exists. + */ WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval)); if (cval.val != 0) WT_ERR_MSG(session, EEXIST, @@ -1602,6 +1694,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "fileops", WT_VERB_FILEOPS }, { "log", WT_VERB_LOG }, { "lsm", WT_VERB_LSM }, + { "lsm_manager", WT_VERB_LSM_MANAGER }, { "metadata", WT_VERB_METADATA }, { "mutex", WT_VERB_MUTEX }, { "overflow", WT_VERB_OVERFLOW }, @@ -1736,6 +1829,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) "exclusive=," "in_memory=," "log=(recover=)," + "readonly=," "use_environment_priv=," "verbose=,", &base_config)); WT_ERR(__wt_config_init(session, &parser, base_config)); @@ -1808,7 +1902,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const WT_NAME_FLAG *ft; WT_SESSION_IMPL *session; bool config_base_set; - const char *enc_cfg[] = { NULL, NULL }; + const char *enc_cfg[] = { NULL, NULL }, *merge_cfg; char version[64]; /* Leave lots of space for optional additional configuration. 
*/ @@ -1819,6 +1913,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, conn = NULL; session = NULL; + merge_cfg = NULL; WT_RET(__wt_library_init()); @@ -1860,6 +1955,16 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session, cval.str, cval.len, &conn->error_prefix)); /* + * We need to look for read-only early so that we can use it + * in __conn_single and whether to use the base config file. + * XXX that means we can only make the choice in __conn_single if the + * user passes it in via the config string to wiredtiger_open. + */ + WT_ERR(__wt_config_gets(session, cfg, "readonly", &cval)); + if (cval.val) + F_SET(conn, WT_CONN_READONLY); + + /* * XXX ideally, we would check "in_memory" here, so we could completely * avoid having a database directory. However, it can be convenient to * pass "in_memory" via the WIREDTIGER_CONFIG environment variable, and @@ -1883,6 +1988,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, * 4. the config passed in by the application * 5. user configuration file (optional) * 6. environment variable settings (optional) + * 7. overrides for a read-only connection * * Clear the entries we added to the stack, we're going to build it in * order. @@ -1898,8 +2004,8 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, (int)sizeof(version), ENOMEM); __conn_config_append(cfg, version); - /* Ignore the base_config file if we config_base set to false. */ - if (config_base_set) + /* Ignore the base_config file if config_base_set is false. */ + if (config_base_set || F_ISSET(conn, WT_CONN_READONLY)) WT_ERR( __conn_config_file(session, WT_BASECONFIG, false, cfg, i1)); __conn_config_append(cfg, config); @@ -1909,7 +2015,35 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* * Merge the full configuration stack and save it for reconfiguration. 
*/ - WT_ERR(__wt_config_merge(session, cfg, NULL, &conn->cfg)); + WT_ERR(__wt_config_merge(session, cfg, NULL, &merge_cfg)); + /* + * The read-only setting may have been set in a configuration file. + * Get it again so that we can override other configuration settings + * before they are processed by the subsystems. + */ + WT_ERR(__wt_config_gets(session, cfg, "readonly", &cval)); + if (cval.val) + F_SET(conn, WT_CONN_READONLY); + if (F_ISSET(conn, WT_CONN_READONLY)) { + /* + * Create a new stack with the merged configuration as the + * base. The read-only string will use entry 1 and then + * we'll merge it again. + */ + cfg[0] = merge_cfg; + cfg[1] = NULL; + cfg[2] = NULL; + /* + * We override some configuration settings for read-only. + * Other settings that conflict with and are an error with + * read-only are tested in their individual locations later. + */ + __conn_config_readonly(cfg); + WT_ERR(__wt_config_merge(session, cfg, NULL, &conn->cfg)); + } else { + conn->cfg = merge_cfg; + merge_cfg = NULL; + } /* * Configuration ... @@ -2082,6 +2216,7 @@ err: /* Discard the scratch buffers. */ __wt_scr_free(session, &i2); __wt_scr_free(session, &i3); + __wt_free(session, merge_cfg); /* * We may have allocated scratch memory when using the dummy session or * the subsequently created real session, and we don't want to tie down diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 1831aad5895..9a2c394e9a6 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -140,6 +140,12 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_cache_config(session, false, cfg)); /* + * The lowest possible page read-generation has a special meaning, it + * marks a page for forcible eviction; don't let it happen by accident. + */ + cache->read_gen = WT_READGEN_START_VALUE; + + /* * The target size must be lower than the trigger size or we will never * get any work done. 
*/ @@ -147,8 +153,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_ERR(__wt_cond_alloc(session, - "cache eviction server", false, &cache->evict_cond)); + WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server", + false, 10000, WT_MILLION, &cache->evict_cond)); WT_ERR(__wt_cond_alloc(session, "eviction waiters", false, &cache->evict_waiter_cond)); WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction")); @@ -246,7 +252,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) " bytes dirty and %" PRIu64 " pages dirty", cache->bytes_dirty, cache->pages_dirty); - WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond)); __wt_spin_destroy(session, &cache->evict_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 60136a71b99..5019ab59fe3 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -129,7 +129,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool evict_reset, marked_dead, no_schema_lock; + bool marked_dead, no_schema_lock; btree = S2BT(session); bm = btree->bm; @@ -139,8 +139,8 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) if (!F_ISSET(dhandle, WT_DHANDLE_OPEN)) return (0); - /* Ensure that we aren't racing with the eviction server */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); + /* Turn off eviction. 
*/ + WT_RET(__wt_evict_file_exclusive_on(session)); /* * If we don't already have the schema lock, make it an error to try @@ -176,23 +176,19 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { if (force && (bm == NULL || !bm->is_mapped(bm, session))) { F_SET(session->dhandle, WT_DHANDLE_DEAD); + marked_dead = true; - /* - * Reset the tree's eviction priority, and the tree is - * evictable by definition. - */ + /* Reset the tree's eviction priority (if any). */ __wt_evict_priority_clear(session); - F_CLR(S2BT(session), WT_BTREE_NO_EVICTION); - - marked_dead = true; } if (!marked_dead || final) WT_ERR(__wt_checkpoint_close(session, final)); } WT_TRET(__wt_btree_close(session)); + /* - * If we marked a handle as dead it will be closed by sweep, via + * If we marked a handle dead it will be closed by sweep, via * another call to sync and close. */ if (!marked_dead) { @@ -206,12 +202,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) err: __wt_spin_unlock(session, &dhandle->close_lock); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - if (no_schema_lock) F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK); + __wt_evict_file_exclusive_off(session); + return (ret); } @@ -355,42 +350,52 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); /* * __conn_btree_apply_internal -- - * Apply a function to the open btree handles. + * Apply a function to an open data handle. */ static int __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_DECL_RET; + bool skip; + + /* Always apply the name function, if supplied. 
*/ + skip = false; + if (name_func != NULL) + WT_RET(name_func(session, dhandle->name, &skip)); + + /* If there is no file function, don't bother locking the handle */ + if (file_func == NULL || skip) + return (0); /* * We need to pull the handle into the session handle cache and make * sure it's referenced to stop other internal code dropping the handle * (e.g in LSM when cleaning up obsolete chunks). */ - ret = __wt_session_get_btree(session, - dhandle->name, dhandle->checkpoint, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock(session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single(session, dhandle->name, - dhandle->checkpoint, func, cfg); + if ((ret = __wt_session_get_btree(session, + dhandle->name, dhandle->checkpoint, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + + WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock(session, false)); + else + WT_TRET(__wt_session_release_btree(session)); return (ret); } /* * __wt_conn_btree_apply -- - * Apply a function to all open btree handles apart from the metadata. + * Apply a function to all open btree handles with the given URI. 
*/ int -__wt_conn_btree_apply(WT_SESSION_IMPL *session, - bool apply_checkpoints, const char *uri, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +__wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -407,116 +412,27 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - strcmp(uri, dhandle->name) == 0 && - (apply_checkpoints || dhandle->checkpoint == NULL)) - WT_RET(__conn_btree_apply_internal( - session, dhandle, func, cfg)); + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->checkpoint != NULL || + strcmp(uri, dhandle->name) != 0) + continue; + WT_RET(__conn_btree_apply_internal( + session, dhandle, file_func, name_func, cfg)); + } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - (apply_checkpoints || - dhandle->checkpoint == NULL) && - WT_PREFIX_MATCH(dhandle->name, "file:") && - !WT_IS_METADATA(session, dhandle)) - WT_RET(__conn_btree_apply_internal( - session, dhandle, func, cfg)); - } - - return (0); -} - -/* - * __wt_conn_btree_apply_single_ckpt -- - * Decode any checkpoint information from the configuration string then - * call btree apply single. 
- */ -int -__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, - const char *uri, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONFIG_ITEM cval; - WT_DECL_RET; - const char *checkpoint; - - checkpoint = NULL; - - /* - * This function exists to handle checkpoint configuration. Callers - * that never open a checkpoint call the underlying function directly. - */ - WT_RET_NOTFOUND_OK( - __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); - if (cval.len != 0) { - /* - * The internal checkpoint name is special, find the last - * unnamed checkpoint of the object. - */ - if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { - WT_RET(__wt_meta_checkpoint_last_name( - session, uri, &checkpoint)); - } else - WT_RET(__wt_strndup( - session, cval.str, cval.len, &checkpoint)); - } - - ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg); - - __wt_free(session, checkpoint); - - return (ret); -} - -/* - * __wt_conn_btree_apply_single -- - * Apply a function to a single btree handle that couldn't be locked - * (attempting to get the handle returned EBUSY). 
- */ -int -__wt_conn_btree_apply_single(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - uint64_t bucket, hash; - - conn = S2C(session); - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - hash = __wt_hash_city64(uri, strlen(uri)); - bucket = hash % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - (hash == dhandle->name_hash && - strcmp(uri, dhandle->name) == 0) && - ((dhandle->checkpoint == NULL && checkpoint == NULL) || - (dhandle->checkpoint != NULL && checkpoint != NULL && - strcmp(dhandle->checkpoint, checkpoint) == 0))) { - /* - * We're holding the handle list lock which locks out - * handle open (which might change the state of the - * underlying object). However, closing a handle - * doesn't require the handle list lock, lock out - * closing the handle and then confirm the handle is - * still open. 
- */ - __wt_spin_lock(session, &dhandle->close_lock); - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - WT_WITH_DHANDLE(session, dhandle, - ret = func(session, cfg)); - } - __wt_spin_unlock(session, &dhandle->close_lock); - WT_RET(ret); + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->checkpoint != NULL || + !WT_PREFIX_MATCH(dhandle->name, "file:") || + WT_IS_METADATA(session, dhandle)) + continue; + WT_RET(__conn_btree_apply_internal( + session, dhandle, file_func, name_func, cfg)); } + } return (0); } diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 12b4e87e921..16717597f4d 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -56,6 +56,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); + WT_RET(__wt_spin_init(session, &conn->metadata_lock, "metadata")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -123,7 +124,8 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) * underlying file-close code uses the mutex to guard lists of * open files. */ - WT_TRET(__wt_close(session, &conn->lock_fh)); + if (conn->lock_fh) + WT_TRET(__wt_close(session, &conn->lock_fh)); /* Remove from the list of connections. 
*/ __wt_spin_lock(session, &__wt_process.spinlock); @@ -143,6 +145,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); __wt_spin_destroy(session, &conn->las_lock); + __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 60f46288072..757d69bf240 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -133,10 +133,17 @@ __logmgr_config( FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); - if (cval.val != 0) + if (cval.val != 0) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_RET_MSG(session, EINVAL, + "Read-only configuration incompatible with " + "zero-filling log files"); FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); + } WT_RET(__logmgr_sync_cfg(session, cfg)); + if (conn->log_cond != NULL) + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); } @@ -463,7 +470,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - WT_ERR(__wt_cond_signal( + WT_ERR(__wt_cond_auto_signal( session, conn->log_wrlsn_cond)); /* * We do not want to wait potentially a second @@ -633,7 +640,7 @@ restart: if (slot->slot_start_lsn.l.offset != slot->slot_last_offset) slot->slot_start_lsn.l.offset = - slot->slot_last_offset; + (uint32_t)slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; WT_ERR(__wt_cond_signal( @@ -662,31 +669,54 @@ __log_wrlsn_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_LOG *log; + WT_LSN prev; WT_SESSION_IMPL *session; int yield; + bool did_work; session = arg; conn = S2C(session); + log = conn->log; yield = 0; + WT_INIT_LSN(&prev); + did_work = false; while 
(F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* - * Write out any log record buffers. + * Write out any log record buffers if anything was done + * since last time. Only call the function to walk the + * slots if the system is not idle. On an idle system + * the alloc_lsn will not advance and the written lsn will + * match the alloc_lsn. */ - WT_ERR(__wt_log_wrlsn(session, &yield)); + if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 || + __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0) + WT_ERR(__wt_log_wrlsn(session, &yield)); + else + WT_STAT_FAST_CONN_INCR(session, log_write_lsn_skip); + prev = log->alloc_lsn; + if (yield == 0) + did_work = true; + else + did_work = false; /* * If __wt_log_wrlsn did work we want to yield instead of sleep. */ if (yield++ < WT_THOUSAND) __wt_yield(); else - WT_ERR(__wt_cond_wait( - session, conn->log_wrlsn_cond, 10000)); + /* + * Send in false because if we did any work we would + * not be on this path. + */ + WT_ERR(__wt_cond_auto_wait( + session, conn->log_wrlsn_cond, did_work)); } /* * On close we need to do this one more time because there could * be straggling log writes that need to be written. */ - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); WT_ERR(__wt_log_wrlsn(session, NULL)); if (0) { err: __wt_err(session, ret, "log wrlsn server error"); @@ -701,12 +731,13 @@ err: __wt_err(session, ret, "log wrlsn server error"); static WT_THREAD_RET __log_server(void *arg) { + struct timespec start, now; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; - int freq_per_sec; - bool locked, signalled; + uint64_t timediff; + bool did_work, locked, signalled; session = arg; conn = S2C(session); @@ -714,11 +745,10 @@ __log_server(void *arg) locked = signalled = false; /* - * Set this to the number of times per second we want to force out the - * log slot buffer. + * Set this to the number of milliseconds we want to run archive and + * pre-allocation. 
Start it so that we run on the first time through. */ -#define WT_FORCE_PER_SECOND 20 - freq_per_sec = WT_FORCE_PER_SECOND; + timediff = WT_THOUSAND; /* * The log server thread does a variety of work. It forces out any @@ -731,6 +761,7 @@ __log_server(void *arg) * don't want log records sitting in the buffer over the time it * takes to sync out an earlier file. */ + did_work = true; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Slots depend on future activity. Force out buffered @@ -739,15 +770,14 @@ __log_server(void *arg) * and a buffer may need to wait for the write_lsn to advance * in the case of a synchronous buffer. We end up with a hang. */ - WT_ERR_BUSY_OK(__wt_log_force_write(session, 0)); + WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work)); /* * We don't want to archive or pre-allocate files as often as * we want to force out log buffers. Only do it once per second * or if the condition was signalled. */ - if (--freq_per_sec <= 0 || signalled) { - freq_per_sec = WT_FORCE_PER_SECOND; + if (timediff >= WT_THOUSAND || signalled) { /* * Perform log pre-allocation. @@ -788,8 +818,12 @@ __log_server(void *arg) } /* Wait until the next event. 
*/ - WT_ERR(__wt_cond_wait_signal(session, conn->log_cond, - WT_MILLION / WT_FORCE_PER_SECOND, &signalled)); + + WT_ERR(__wt_epoch(session, &start)); + WT_ERR(__wt_cond_auto_wait_signal(session, conn->log_cond, + did_work, &signalled)); + WT_ERR(__wt_epoch(session, &now)); + timediff = WT_TIMEDIFF_MS(now, start); } if (0) { @@ -901,8 +935,9 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) */ WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); - WT_RET(__wt_cond_alloc(conn->log_wrlsn_session, - "log write lsn server", false, &conn->log_wrlsn_cond)); + WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, + "log write lsn server", false, 10000, WT_MILLION, + &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -916,13 +951,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - WT_RET(__wt_cond_signal(session, conn->log_cond)); + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); - WT_RET(__wt_cond_alloc(conn->log_session, - "log server", false, &conn->log_cond)); + WT_RET(__wt_cond_auto_alloc(conn->log_session, + "log server", false, 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. 
@@ -958,7 +993,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_cond)); + WT_TRET(__wt_cond_auto_signal(session, conn->log_cond)); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -973,7 +1008,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_auto_signal(session, conn->log_wrlsn_cond)); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -994,9 +1029,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 58577b4587d..aff422654d7 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -210,10 +210,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* * If hash arrays were allocated, free them now. 
*/ - if (s->dhhash != NULL) - __wt_free(session, s->dhhash); - if (s->tablehash != NULL) - __wt_free(session, s->tablehash); + __wt_free(session, s->dhhash); + __wt_free(session, s->tablehash); __wt_free(session, s->hazard); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 08ad105c725..d6e59a50da5 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -86,6 +86,11 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) conn->stat_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets( + session, cfg, "statistics_log.json", &cval)); + if (cval.val != 0) + FLD_SET(conn->stat_flags, WT_CONN_STAT_JSON); + + WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); if (cval.val != 0) FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE); @@ -97,6 +102,10 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (!*runp && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) return (0); + /* + * If any statistics logging is done, this must not be a read-only + * connection. + */ WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval)); WT_RET(__wt_config_subinit(session, &objectconf, &cval)); for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) @@ -132,9 +141,24 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); - WT_ERR(__wt_config_gets( - session, cfg, "statistics_log.timestamp", &cval)); - WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format)); + /* + * When using JSON format, use the same timestamp format as MongoDB by + * default. 
+ */ + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + ret = __wt_config_gets( + session, &cfg[1], "statistics_log.timestamp", &cval); + if (ret == WT_NOTFOUND) + WT_ERR(__wt_strdup( + session, "%FT%T.000Z", &conn->stat_format)); + WT_ERR_NOTFOUND_OK(ret); + } + if (conn->stat_format == NULL) { + WT_ERR(__wt_config_gets( + session, cfg, "statistics_log.timestamp", &cval)); + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &conn->stat_format)); + } err: __stat_sources_free(session, &sources); return (ret); @@ -149,22 +173,25 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; - WT_CURSOR_STAT *cst; WT_DECL_ITEM(tmp); WT_DECL_RET; - int64_t *stats; - int i; - const char *desc, *uri; + int64_t val; + size_t prefixlen; + const char *desc, *endprefix, *valstr, *uri; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + bool first, groupfirst; conn = S2C(session); + cursor = NULL; + + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + first = groupfirst = true; /* Build URI and configuration string. */ if (conn_stats) uri = "statistics:"; else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name)); uri = tmp->data; } @@ -175,31 +202,54 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) * If we don't find an underlying object, silently ignore it, the object * may exist only intermittently. 
*/ - switch (ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) { - case 0: - cst = (WT_CURSOR_STAT *)cursor; - for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) { - if (conn_stats) - WT_ERR(__wt_stat_connection_desc(cst, i, - &desc)); - else - WT_ERR(__wt_stat_dsrc_desc(cst, i, &desc)); + if ((ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) != 0) { + if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND) + ret = 0; + goto err; + } + + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + WT_ERR(__wt_fprintf(conn->stat_fp, + "{\"version\":\"%s\",\"localTime\":\"%s\"", + WIREDTIGER_VERSION_STRING, conn->stat_stamp)); + WT_ERR(__wt_fprintf(conn->stat_fp, ",\"wiredTiger\":{")); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val)); + /* Check if we are starting a new section. */ + endprefix = strchr(desc, ':'); + prefixlen = WT_PTRDIFF(endprefix, desc); + WT_ASSERT(session, endprefix != NULL); + if (first || + tmp->size != prefixlen || + strncmp(desc, tmp->data, tmp->size) != 0) { + WT_ERR(__wt_buf_set( + session, tmp, desc, prefixlen)); + WT_ERR(__wt_fprintf(conn->stat_fp, + "%s\"%.*s\":{", first ? "" : "},", + (int)prefixlen, desc)); + first = false; + groupfirst = true; + } + WT_ERR(__wt_fprintf(conn->stat_fp, + "%s\"%s\":%" PRId64, + groupfirst ? 
"" : ",", endprefix + 2, val)); + groupfirst = false; + } + WT_ERR_NOTFOUND_OK(ret); + WT_ERR(__wt_fprintf(conn->stat_fp, "}}}\n")); + } else { + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val)); WT_ERR(__wt_fprintf(conn->stat_fp, "%s %" PRId64 " %s %s\n", - conn->stat_stamp, stats[i], name, desc)); + conn->stat_stamp, val, name, desc)); } - WT_ERR(cursor->close(cursor)); - break; - case EBUSY: - case ENOENT: - case WT_NOTFOUND: - ret = 0; - break; - default: - break; + WT_ERR_NOTFOUND_OK(ret); } err: __wt_scr_free(session, &tmp); + if (cursor != NULL) + WT_TRET(cursor->close(cursor)); return (ret); } @@ -342,7 +392,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) if (conn->stat_sources != NULL) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_btree_apply( - session, false, NULL, __statlog_apply, NULL)); + session, NULL, __statlog_apply, NULL, NULL)); WT_RET(ret); } diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 7628076e605..cc0aa5a1322 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -91,9 +91,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session) goto err; /* - * Mark the handle as dead and close the underlying file - * handle. Closing the handle decrements the open file count, - * meaning the close loop won't overrun the configured minimum. + * Mark the handle dead and close the underlying file handle. + * Closing the handle decrements the open file count, meaning the close + * loop won't overrun the configured minimum. */ ret = __wt_conn_btree_sync_and_close(session, false, true); @@ -163,7 +163,7 @@ __sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp) !F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; - /* If the handle is marked "dead", flush it from cache. */ + /* If the handle is marked dead, flush it from cache. 
*/ WT_WITH_DHANDLE(session, dhandle, ret = __wt_conn_btree_sync_and_close(session, false, false)); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index d7d74da48d4..2fb0c464a76 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -8,12 +8,12 @@ #include "wt_internal.h" -static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); +static int __backup_all(WT_SESSION_IMPL *); static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, bool); -static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]); static int __backup_list_append( WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *); +static int __backup_list_uri_append(WT_SESSION_IMPL *, const char *, bool *); static int __backup_start( WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]); static int __backup_stop(WT_SESSION_IMPL *); @@ -103,22 +103,22 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_notsup, /* get-value */ - __wt_cursor_notsup, /* set-key */ - __wt_cursor_notsup, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curbackup_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curbackup_reset, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curbackup_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value_notsup, /* get-value */ + __wt_cursor_set_key_notsup, /* set-key */ + __wt_cursor_set_value_notsup, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curbackup_next, /* next */ + __wt_cursor_notsup, /* prev */ + 
__curbackup_reset, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curbackup_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_BACKUP *cb; WT_DECL_RET; @@ -140,8 +140,9 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, * Start the backup and fill in the cursor's list. Acquire the schema * lock, we need a consistent view when creating a copy. */ - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __backup_start(session, cb, cfg)); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __backup_start(session, cb, cfg))); WT_ERR(ret); /* __wt_cursor_init is last so we don't have to clean up on error. */ @@ -241,7 +242,7 @@ __backup_start( if (!target_list) { WT_ERR(__backup_log_append(session, cb, true)); - WT_ERR(__backup_all(session, cb)); + WT_ERR(__backup_all(session)); } /* Add the hot backup and standard WiredTiger files to the list. */ @@ -332,55 +333,14 @@ __backup_stop(WT_SESSION_IMPL *session) * Backup all objects in the database. */ static int -__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) +__backup_all(WT_SESSION_IMPL *session) { - WT_CONFIG_ITEM cval; - WT_CURSOR *cursor; WT_DECL_RET; - const char *key, *value; - - cursor = NULL; - - /* Copy all of the metadata entries to the hot backup file. */ - WT_RET(__wt_metadata_cursor(session, &cursor)); - while ((ret = cursor->next(cursor)) == 0) { - WT_ERR(cursor->get_key(cursor, &key)); - WT_ERR(cursor->get_value(cursor, &value)); - WT_ERR(__wt_fprintf(cb->bfp, "%s\n%s\n", key, value)); - - /* - * While reading the metadata file, check there are no "sources" - * or "types" which can't support hot backup. 
This checks for - * a data source that's non-standard, which can't be backed up, - * but is also sanity checking: if there's an entry backed by - * anything other than a file or lsm entry, we're confused. - */ - if ((ret = __wt_config_getones( - session, value, "type", &cval)) == 0 && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm")) - WT_ERR_MSG(session, ENOTSUP, - "hot backup is not supported for objects of " - "type %.*s", (int)cval.len, cval.str); - WT_ERR_NOTFOUND_OK(ret); - if ((ret =__wt_config_getones( - session, value, "source", &cval)) == 0 && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:")) - WT_ERR_MSG(session, ENOTSUP, - "hot backup is not supported for objects of " - "source %.*s", (int)cval.len, cval.str); - WT_ERR_NOTFOUND_OK(ret); - } - WT_ERR_NOTFOUND_OK(ret); - - WT_ERR(__wt_metadata_cursor_release(session, &cursor)); /* Build a list of the file objects that need to be copied. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_btree_apply(session, __backup_list_all_append, NULL)); + __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); -err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -430,11 +390,11 @@ __backup_uri(WT_SESSION_IMPL *session, */ if (WT_PREFIX_MATCH(uri, "log:")) { *log_only = !target_list; - WT_ERR(__wt_backup_list_uri_append(session, uri, NULL)); + WT_ERR(__backup_list_uri_append(session, uri, NULL)); } else { *log_only = false; WT_ERR(__wt_schema_worker(session, - uri, NULL, __wt_backup_list_uri_append, cfg, 0)); + uri, NULL, __backup_list_uri_append, cfg, 0)); } } WT_ERR_NOTFOUND_OK(ret); @@ -471,12 +431,12 @@ __wt_backup_file_remove(WT_SESSION_IMPL *session) } /* - * __wt_backup_list_uri_append -- + * __backup_list_uri_append -- * Append a new file name to the list, allocate space as necessary. * Called via the schema_worker function. 
*/ -int -__wt_backup_list_uri_append( +static int +__backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, bool *skip) { WT_CURSOR_BACKUP *cb; @@ -485,11 +445,31 @@ __wt_backup_list_uri_append( cb = session->bkp_cursor; WT_UNUSED(skip); + /* + * While reading the metadata file, check there are no data sources + * that can't support hot backup. This checks for a data source that's + * non-standard, which can't be backed up, but is also sanity checking: + * if there's an entry backed by anything other than a file or lsm + * entry, we're confused. + */ if (WT_PREFIX_MATCH(name, "log:")) { WT_RET(__backup_log_append(session, cb, false)); return (0); } + if (!WT_PREFIX_MATCH(name, "file:") && + !WT_PREFIX_MATCH(name, "colgroup:") && + !WT_PREFIX_MATCH(name, "index:") && + !WT_PREFIX_MATCH(name, "lsm:") && + !WT_PREFIX_MATCH(name, "table:")) + WT_RET_MSG(session, ENOTSUP, + "hot backup is not supported for objects of type %s", + name); + + /* Ignore the lookaside table. */ + if (strcmp(name, WT_LAS_URI) == 0) + return (0); + /* Add the metadata entry to the backup file. */ WT_RET(__wt_metadata_search(session, name, &value)); WT_RET(__wt_fprintf(cb->bfp, "%s\n%s\n", name, value)); @@ -503,34 +483,6 @@ __wt_backup_list_uri_append( } /* - * __backup_list_all_append -- - * Append a new file name to the list, allocate space as necessary. - * Called via the __wt_meta_btree_apply function. - */ -static int -__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_CURSOR_BACKUP *cb; - const char *name; - - WT_UNUSED(cfg); - - cb = session->bkp_cursor; - name = session->dhandle->name; - - /* Ignore files in the process of being bulk-loaded. */ - if (F_ISSET(S2BT(session), WT_BTREE_BULK)) - return (0); - - /* Ignore the lookaside table. */ - if (strcmp(name, WT_LAS_URI) == 0) - return (0); - - /* Add the file to the list of files to be copied. 
*/ - return (__backup_list_append(session, cb, name)); -} - -/* * __backup_list_append -- * Append a new file name to the list, allocate space as necessary. */ @@ -541,7 +493,6 @@ __backup_list_append( WT_CURSOR_BACKUP_ENTRY *p; WT_DATA_HANDLE *old_dhandle; WT_DECL_RET; - bool need_handle; const char *name; /* Leave a NULL at the end to mark the end of the list. */ @@ -551,11 +502,26 @@ __backup_list_append( p[0].name = p[1].name = NULL; p[0].handle = p[1].handle = NULL; - need_handle = false; name = uri; + + /* + * If it's a file in the database, get a handle for the underlying + * object (this handle blocks schema level operations, for example + * WT_SESSION.drop or an LSM file discard after level merging). + * + * If the handle is busy (e.g., it is being bulk-loaded), silently skip + * it. We have a special fake checkpoint in the metadata, and recovery + * will recreate an empty file. + */ if (WT_PREFIX_MATCH(uri, "file:")) { - need_handle = true; name += strlen("file:"); + + old_dhandle = session->dhandle; + ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); + p->handle = session->dhandle; + session->dhandle = old_dhandle; + if (ret != 0) + return (ret == EBUSY ? 0 : ret); } /* @@ -569,20 +535,6 @@ __backup_list_append( */ WT_RET(__wt_strdup(session, name, &p->name)); - /* - * If it's a file in the database, get a handle for the underlying - * object (this handle blocks schema level operations, for example - * WT_SESSION.drop or an LSM file discard after level merging). 
- */ - if (need_handle) { - old_dhandle = session->dhandle; - if ((ret = - __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0) - p->handle = session->dhandle; - session->dhandle = old_dhandle; - WT_RET(ret); - } - ++cb->list_next; return (0); } diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index 1b2fec0eb89..e0d270e4245 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -27,21 +27,21 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_noop, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_noop, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ __curconfig_close); WT_CURSOR_CONFIG *cconfig; WT_CURSOR *cursor; diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index 2a598c99523..804c24a3d2e 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -449,22 +449,22 @@ __wt_curds_open( const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR 
**cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curds_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curds_next, /* next */ - __curds_prev, /* prev */ - __curds_reset, /* reset */ - __curds_search, /* search */ - __curds_search_near, /* search-near */ - __curds_insert, /* insert */ - __curds_update, /* update */ - __curds_remove, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curds_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curds_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curds_next, /* next */ + __curds_prev, /* prev */ + __curds_reset, /* reset */ + __curds_search, /* search */ + __curds_search_near, /* search-near */ + __curds_insert, /* insert */ + __curds_update, /* update */ + __curds_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 3324efd96cc..a7b1c98871a 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -348,22 +348,22 @@ int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __curdump_get_key, /* get-key */ - __curdump_get_value, /* get-value */ - __curdump_set_key, /* set-key */ - __curdump_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curdump_next, /* next */ - __curdump_prev, /* prev */ - __curdump_reset, /* reset */ - __curdump_search, /* search */ - __curdump_search_near, /* search-near */ - __curdump_insert, /* insert */ - __curdump_update, /* update */ - __curdump_remove, /* remove 
*/ - __wt_cursor_notsup, /* reconfigure */ - __curdump_close); /* close */ + __curdump_get_key, /* get-key */ + __curdump_get_value, /* get-value */ + __curdump_set_key, /* set-key */ + __curdump_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curdump_next, /* next */ + __curdump_prev, /* prev */ + __curdump_reset, /* reset */ + __curdump_search, /* search */ + __curdump_search_near, /* search-near */ + __curdump_insert, /* insert */ + __curdump_update, /* update */ + __curdump_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curdump_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_DUMP *cdump; WT_CURSOR_JSON *json; diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 8bbe1cc8eda..fac903b4770 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -397,22 +397,22 @@ __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curfile_compare, /* compare */ - __curfile_equals, /* equals */ - __curfile_next, /* next */ - __curfile_prev, /* prev */ - __curfile_reset, /* reset */ - __curfile_search, /* search */ - __curfile_search_near, /* search-near */ - __curfile_insert, /* insert */ - __curfile_update, /* update */ - __curfile_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - __curfile_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curfile_compare, /* compare */ + __curfile_equals, /* equals */ + __curfile_next, /* next */ + __curfile_prev, /* prev */ + __curfile_reset, /* reset */ + __curfile_search, /* search */ + __curfile_search_near, /* search-near */ + __curfile_insert, /* insert */ + 
__curfile_update, /* update */ + __curfile_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __curfile_close); /* close */ WT_BTREE *btree; WT_CONFIG_ITEM cval; WT_CURSOR *cursor; diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 6822055131a..dbe8046ca21 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -386,22 +386,22 @@ __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __curindex_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __curindex_set_value, /* set-value */ - __curindex_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curindex_next, /* next */ - __curindex_prev, /* prev */ - __curindex_reset, /* reset */ - __curindex_search, /* search */ - __curindex_search_near, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curindex_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __curindex_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __curindex_set_value, /* set-value */ + __curindex_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curindex_next, /* next */ + __curindex_prev, /* prev */ + __curindex_reset, /* reset */ + __curindex_search, /* search */ + __curindex_search_near, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curindex_close); /* close */ WT_CURSOR_INDEX *cindex; WT_CURSOR *cursor; WT_DECL_ITEM(tmp); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 2cbefa68c5e..38a83217933 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -8,6 +8,9 @@ #include "wt_internal.h" +static int __curjoin_insert_endpoint(WT_SESSION_IMPL *, 
+ WT_CURSOR_JOIN_ENTRY *, u_int, WT_CURSOR_JOIN_ENDPOINT **); + /* * __curjoin_entry_iter_init -- * Initialize an iteration for the index managed by a join entry. @@ -17,49 +20,56 @@ static int __curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp) { - WT_CURSOR *newcur; WT_CURSOR *to_dup; WT_DECL_RET; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; const char *def_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), NULL }; - const char *uri, **config; - char *uribuf; + const char *urimain, **config; + char *mainbuf, *uri; WT_CURSOR_JOIN_ITER *iter; size_t size; iter = NULL; - uribuf = NULL; + mainbuf = uri = NULL; to_dup = entry->ends[0].cursor; - uri = to_dup->uri; if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW)) config = &raw_cfg[0]; else config = &def_cfg[0]; + size = strlen(to_dup->internal_uri) + 3; + WT_ERR(__wt_calloc(session, size, 1, &uri)); + snprintf(uri, size, "%s()", to_dup->internal_uri); + urimain = cjoin->table->name; if (cjoin->projection != NULL) { - size = strlen(uri) + strlen(cjoin->projection) + 1; - WT_ERR(__wt_calloc(session, size, 1, &uribuf)); - snprintf(uribuf, size, "%s%s", uri, cjoin->projection); - uri = uribuf; + size = strlen(urimain) + strlen(cjoin->projection) + 1; + WT_ERR(__wt_calloc(session, size, 1, &mainbuf)); + snprintf(mainbuf, size, "%s%s", urimain, cjoin->projection); + urimain = mainbuf; } - WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config, - &newcur)); - WT_ERR(__wt_cursor_dup_position(to_dup, newcur)); + WT_ERR(__wt_calloc_one(session, &iter)); + WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config, + &iter->cursor)); + WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor)); + WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config, + &iter->main)); iter->cjoin = cjoin; iter->session = session; iter->entry = entry; - iter->cursor = newcur; - 
iter->advance = false; + iter->positioned = false; + iter->isequal = (entry->ends_next == 1 && + WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ); *iterp = iter; if (0) { err: __wt_free(session, iter); } - __wt_free(session, uribuf); + __wt_free(session, mainbuf); + __wt_free(session, uri); return (ret); } @@ -72,18 +82,70 @@ static int __curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf, size_t bufsize, WT_ITEM *item) { - WT_DECL_RET; WT_SESSION *wtsession; size_t sz; wtsession = (WT_SESSION *)session; - WT_ERR(wiredtiger_struct_size(wtsession, &sz, "r", r)); + WT_RET(wiredtiger_struct_size(wtsession, &sz, "r", r)); WT_ASSERT(session, sz < bufsize); - WT_ERR(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); + WT_RET(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); item->size = sz; item->data = buf; + return (0); +} + +/* + * __curjoin_split_key -- + * Copy the primary key from a cursor (either main table or index) + * to another cursor. When copying from an index file, the index + * key is also returned. + * + */ +static int +__curjoin_split_key(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_ITEM *idxkey, WT_CURSOR *tocur, WT_CURSOR *fromcur, + const char *repack_fmt, bool isindex) +{ + WT_CURSOR *firstcg_cur; + WT_CURSOR_INDEX *cindex; + WT_ITEM *keyp; + const uint8_t *p; -err: return (ret); + if (isindex) { + cindex = ((WT_CURSOR_INDEX *)fromcur); + /* + * Repack tells us where the index key ends; advance past + * that to get where the raw primary key starts. + */ + WT_RET(__wt_struct_repack(session, cindex->child->key_format, + repack_fmt != NULL ? 
repack_fmt : cindex->iface.key_format, + &cindex->child->key, idxkey)); + WT_ASSERT(session, cindex->child->key.size > idxkey->size); + tocur->key.data = (uint8_t *)idxkey->data + idxkey->size; + tocur->key.size = cindex->child->key.size - idxkey->size; + if (WT_CURSOR_RECNO(tocur)) { + p = (const uint8_t *)tocur->key.data; + WT_RET(__wt_vunpack_uint(&p, tocur->key.size, + &tocur->recno)); + } else + tocur->recno = 0; + } else { + firstcg_cur = ((WT_CURSOR_TABLE *)fromcur)->cg_cursors[0]; + keyp = &firstcg_cur->key; + if (WT_CURSOR_RECNO(tocur)) { + WT_ASSERT(session, keyp->size == sizeof(uint64_t)); + tocur->recno = *(uint64_t *)keyp->data; + WT_RET(__curjoin_pack_recno(session, tocur->recno, + cjoin->recno_buf, sizeof(cjoin->recno_buf), + &tocur->key)); + } else { + WT_ITEM_SET(tocur->key, *keyp); + tocur->recno = 0; + } + idxkey->data = NULL; + idxkey->size = 0; + } + return (0); } /* @@ -92,45 +154,24 @@ err: return (ret); * */ static int -__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, - uint64_t *rp) +__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor) { - WT_CURSOR *firstcg_cur; - WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; - WT_SESSION_IMPL *session; - uint64_t r; - - if (iter->advance) - WT_ERR(iter->cursor->next(iter->cursor)); + if (iter->positioned) + WT_RET(iter->cursor->next(iter->cursor)); else - iter->advance = true; - - session = iter->session; - cjoin = iter->cjoin; + iter->positioned = true; /* * Set our key to the primary key, we'll also need this * to check membership. 
*/ - if (iter->entry->index != NULL) - firstcg_cur = ((WT_CURSOR_INDEX *)iter->cursor)->cg_cursors[0]; - else - firstcg_cur = ((WT_CURSOR_TABLE *)iter->cursor)->cg_cursors[0]; - if (WT_CURSOR_RECNO(&cjoin->iface)) { - r = *(uint64_t *)firstcg_cur->key.data; - WT_ERR(__curjoin_pack_recno(session, r, cjoin->recno_buf, - sizeof(cjoin->recno_buf), primkey)); - *rp = r; - } else { - WT_ITEM_SET(*primkey, firstcg_cur->key); - *rp = 0; - } - iter->curkey = primkey; + WT_RET(__curjoin_split_key(iter->session, iter->cjoin, &iter->idxkey, + cursor, iter->cursor, iter->entry->repack_format, + iter->entry->index != NULL)); + iter->curkey = &cursor->key; iter->entry->stats.actual_count++; iter->entry->stats.accesses++; - -err: return (ret); + return (0); } /* @@ -141,17 +182,15 @@ err: return (ret); static int __curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter) { - WT_DECL_RET; - - if (iter->advance) { - WT_ERR(iter->cursor->reset(iter->cursor)); - WT_ERR(__wt_cursor_dup_position( + if (iter->positioned) { + WT_RET(iter->cursor->reset(iter->cursor)); + WT_RET(iter->main->reset(iter->main)); + WT_RET(__wt_cursor_dup_position( iter->cjoin->entries[0].ends[0].cursor, iter->cursor)); - iter->advance = false; + iter->positioned = false; iter->entry->stats.actual_count = 0; } - -err: return (ret); + return (0); } /* @@ -162,7 +201,7 @@ err: return (ret); static bool __curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter) { - return (iter->advance); + return (iter->positioned); } /* @@ -177,6 +216,8 @@ __curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter) if (iter->cursor != NULL) WT_TRET(iter->cursor->close(iter->cursor)); + if (iter->main != NULL) + WT_TRET(iter->main->close(iter->main)); __wt_free(iter->session, iter); return (ret); @@ -232,10 +273,8 @@ __curjoin_get_value(WT_CURSOR *cursor, ...) 
!__curjoin_entry_iter_ready(iter)) WT_ERR_MSG(session, EINVAL, "join cursor must be advanced with next()"); - if (iter->entry->index != NULL) - WT_ERR(__wt_curindex_get_valuev(iter->cursor, ap)); - else - WT_ERR(__wt_curtable_get_valuev(iter->cursor, ap)); + + WT_ERR(__wt_curtable_get_valuev(iter->main, ap)); err: va_end(ap); API_END_RET(session, ret); @@ -251,43 +290,26 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, { WT_COLLATOR *collator; WT_CURSOR *c; - WT_CURSOR_INDEX *cindex; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; WT_DECL_RET; WT_DECL_ITEM(uribuf); - WT_ITEM curkey, curvalue, *k; - WT_TABLE *maintable; + WT_ITEM curkey, curvalue; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; - const char *mainkey_str, *p; - void *allocbuf; - size_t mainkey_len, size; - u_int i; + const char *uri; + size_t size; int cmp, skip; c = NULL; - allocbuf = NULL; skip = 0; - if (entry->index != NULL) { + if (entry->index != NULL) /* - * Open a cursor having a projection of the keys of the - * index we're comparing against. Open it raw, we're - * going to compare it to the raw keys of the - * reference cursors. + * Open the raw index. We're avoiding any references + * to the main table, they may be expensive. */ - maintable = ((WT_CURSOR_TABLE *)entry->main)->table; - mainkey_str = maintable->colconf.str + 1; - for (p = mainkey_str, i = 0; - p != NULL && i < maintable->nkey_columns; i++) - p = strchr(p + 1, ','); - WT_ASSERT(session, p != 0); - mainkey_len = WT_PTRDIFF(p, mainkey_str); - size = strlen(entry->index->name) + mainkey_len + 3; - WT_ERR(__wt_scr_alloc(session, size, &uribuf)); - WT_ERR(__wt_buf_fmt(session, uribuf, "%s(%.*s)", - entry->index->name, (int)mainkey_len, mainkey_str)); - } else { + uri = entry->index->source; + else { /* * For joins on the main table, we just need the primary * key for comparison, we don't need any values. 
@@ -296,35 +318,38 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_ERR(__wt_scr_alloc(session, size, &uribuf)); WT_ERR(__wt_buf_fmt(session, uribuf, "%s()", cjoin->table->name)); + uri = uribuf->data; } - WT_ERR(__wt_open_cursor( - session, uribuf->data, &cjoin->iface, raw_cfg, &c)); + WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c)); /* Initially position the cursor if necessary. */ endmax = &entry->ends[entry->ends_next]; - if ((end = &entry->ends[0]) < endmax && - F_ISSET(end, WT_CURJOIN_END_GE)) { - WT_ERR(__wt_cursor_dup_position(end->cursor, c)); - if (end->flags == WT_CURJOIN_END_GE) - skip = 1; + if ((end = &entry->ends[0]) < endmax) { + if (F_ISSET(end, WT_CURJOIN_END_GT) || + WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) { + WT_ERR(__wt_cursor_dup_position(end->cursor, c)); + if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE) + skip = 1; + } else if (F_ISSET(end, WT_CURJOIN_END_LT)) { + if ((ret = c->next(c)) == WT_NOTFOUND) + goto done; + WT_ERR(ret); + } else + WT_ERR(__wt_illegal_value(session, NULL)); } collator = (entry->index == NULL) ? NULL : entry->index->collator; while (ret == 0) { WT_ERR(c->get_key(c, &curkey)); if (entry->index != NULL) { - cindex = (WT_CURSOR_INDEX *)c; - if (cindex->index->extractor == NULL) { - /* - * Repack so it's comparable to the - * reference endpoints. - */ - k = &cindex->child->key; - WT_ERR(__wt_struct_repack(session, - cindex->child->key_format, - entry->main->value_format, k, &curkey, - &allocbuf)); - } else - curkey = cindex->child->key; + /* + * Repack so it's comparable to the + * reference endpoints. + */ + WT_ERR(__wt_struct_repack(session, + c->key_format, + (entry->repack_format != NULL ? 
+ entry->repack_format : entry->index->idxkey_format), + &c->key, &curkey)); } for (end = &entry->ends[skip]; end < endmax; end++) { WT_ERR(__wt_compare(session, collator, &curkey, @@ -345,8 +370,12 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, goto done; } } - if (entry->index != NULL) - WT_ERR(c->get_value(c, &curvalue)); + if (entry->index != NULL) { + curvalue.data = + (unsigned char *)curkey.data + curkey.size; + WT_ASSERT(session, c->key.size > curkey.size); + curvalue.size = c->key.size - curkey.size; + } else WT_ERR(c->get_key(c, &curvalue)); WT_ERR(__wt_bloom_insert(bloom, &curvalue)); @@ -361,7 +390,6 @@ done: err: if (c != NULL) WT_TRET(c->close(c)); __wt_scr_free(session, &uribuf); - __wt_free(session, allocbuf); return (ret); } @@ -375,27 +403,23 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, { WT_CURSOR *cursor; WT_CURSOR_INDEX *cindex; - WT_DECL_RET; WT_ITEM *k; uint64_t r; - void *allocbuf; - allocbuf = NULL; if ((cursor = endpoint->cursor) != NULL) { if (entry->index != NULL) { /* Extract and save the index's logical key. */ cindex = (WT_CURSOR_INDEX *)endpoint->cursor; - WT_ERR(__wt_struct_repack(session, + WT_RET(__wt_struct_repack(session, cindex->child->key_format, - cindex->iface.key_format, - &cindex->child->key, &endpoint->key, &allocbuf)); - if (allocbuf != NULL) - F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); + (entry->repack_format != NULL ? 
+ entry->repack_format : cindex->iface.key_format), + &cindex->child->key, &endpoint->key)); } else { k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key; if (WT_CURSOR_RECNO(cursor)) { r = *(uint64_t *)k->data; - WT_ERR(__curjoin_pack_recno(session, r, + WT_RET(__curjoin_pack_recno(session, r, endpoint->recno_buf, sizeof(endpoint->recno_buf), &endpoint->key)); @@ -404,10 +428,7 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, endpoint->key = *k; } } - if (0) { -err: __wt_free(session, allocbuf); - } - return (ret); + return (0); } /* @@ -419,8 +440,13 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) { WT_BLOOM *bloom; WT_DECL_RET; + WT_CURSOR *origcur; WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2; WT_CURSOR_JOIN_ENDPOINT *end; + const char *def_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), NULL }; + const char *raw_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), "raw", NULL }; uint32_t f, k; if (cjoin->entries_next == 0) @@ -429,9 +455,27 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) "cursors"); je = &cjoin->entries[0]; + jeend = &cjoin->entries[cjoin->entries_next]; + + /* + * For a single compare=le endpoint in the first iterated entry, + * construct a companion compare=ge endpoint that will actually + * be iterated. + */ + if (((je = cjoin->entries) != jeend) && + je->ends_next == 1 && F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) { + origcur = je->ends[0].cursor; + WT_RET(__curjoin_insert_endpoint(session, je, 0, &end)); + WT_RET(__wt_open_cursor(session, origcur->uri, + (WT_CURSOR *)cjoin, + F_ISSET(origcur, WT_CURSTD_RAW) ? 
raw_cfg : def_cfg, + &end->cursor)); + WT_RET(end->cursor->next(end->cursor)); + end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | + WT_CURJOIN_END_OWN_CURSOR; + } WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter)); - jeend = &cjoin->entries[cjoin->entries_next]; for (je = cjoin->entries; je < jeend; je++) { __wt_stat_join_init_single(&je->stats); for (end = &je->ends[0]; end < &je->ends[je->ends_next]; @@ -449,6 +493,10 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT); if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { + if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) + WT_RET_MSG(session, EINVAL, + "join cursors with Bloom filters cannot be " + "used with read-uncommitted isolation"); if (je->bloom == NULL) { /* * Look for compatible filters to be shared, @@ -520,35 +568,34 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, { WT_COLLATOR *collator; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; - WT_DECL_RET; int cmp; collator = (entry->index != NULL) ? entry->index->collator : NULL; endmax = &entry->ends[entry->ends_next]; for (end = &entry->ends[skip_left ? 
1 : 0]; end < endmax; end++) { - WT_ERR(__wt_compare(session, collator, curkey, &end->key, + WT_RET(__wt_compare(session, collator, curkey, &end->key, &cmp)); if (!F_ISSET(end, WT_CURJOIN_END_LT)) { if (cmp < 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } else { if (cmp > 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp < 0 && !F_ISSET(end, WT_CURJOIN_END_LT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } } -err: return (ret); + return (0); } typedef struct { WT_CURSOR iface; WT_CURSOR_JOIN_ENTRY *entry; - int ismember; + bool ismember; } WT_CURJOIN_EXTRACTOR; /* @@ -584,8 +631,8 @@ __curjoin_extract_insert(WT_CURSOR *cursor) { ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false); if (ret == WT_NOTFOUND) ret = 0; - else - cextract->ismember = 1; + else if (ret == 0) + cextract->ismember = true; return (ret); } @@ -602,27 +649,29 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURJOIN_EXTRACTOR extract_cursor; WT_CURSOR *c; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_notsup, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __curjoin_extract_insert, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* reconfigure */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + 
__wt_cursor_notsup, /* prev */ + __wt_cursor_notsup, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __curjoin_extract_insert, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup); /* close */ WT_DECL_RET; WT_INDEX *idx; WT_ITEM *key, v; bool bloom_found; + if (skip_left && entry->ends_next == 1) + return (0); /* no checks to make */ key = cjoin->iter->curkey; entry->stats.accesses++; bloom_found = false; @@ -645,24 +694,35 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, bloom_found = true; } if (entry->index != NULL) { - memset(&v, 0, sizeof(v)); /* Keep lint quiet. */ - c = entry->main; - c->set_key(c, key); - if ((ret = c->search(c)) == 0) - ret = c->get_value(c, &v); - else if (ret == WT_NOTFOUND) - WT_ERR_MSG(session, WT_ERROR, - "main table for join is missing entry."); - WT_TRET(c->reset(c)); - WT_ERR(ret); + /* + * If this entry is used by the iterator, then we already + * have the index key, and we won't have to do any extraction + * either. + */ + if (entry == cjoin->iter->entry) + WT_ITEM_SET(v, cjoin->iter->idxkey); + else { + memset(&v, 0, sizeof(v)); /* Keep lint quiet. 
*/ + c = entry->main; + c->set_key(c, key); + if ((ret = c->search(c)) == 0) + ret = c->get_value(c, &v); + else if (ret == WT_NOTFOUND) + WT_ERR_MSG(session, WT_ERROR, + "main table for join is missing entry"); + WT_TRET(c->reset(c)); + WT_ERR(ret); + } } else - v = *key; + WT_ITEM_SET(v, *key); - if ((idx = entry->index) != NULL && idx->extractor != NULL) { + if ((idx = entry->index) != NULL && idx->extractor != NULL && + entry != cjoin->iter->entry) { + WT_CLEAR(extract_cursor); extract_cursor.iface = iface; extract_cursor.iface.session = &session->iface; extract_cursor.iface.key_format = idx->exkey_format; - extract_cursor.ismember = 0; + extract_cursor.ismember = false; extract_cursor.entry = entry; WT_ERR(idx->extractor->extract(idx->extractor, &session->iface, key, &v, &extract_cursor.iface)); @@ -685,7 +745,9 @@ err: if (ret == WT_NOTFOUND && bloom_found) static int __curjoin_next(WT_CURSOR *cursor) { + WT_CURSOR *c; WT_CURSOR_JOIN *cjoin; + WT_CURSOR_JOIN_ITER *iter; WT_DECL_RET; WT_SESSION_IMPL *session; bool skip_left; @@ -701,9 +763,11 @@ __curjoin_next(WT_CURSOR *cursor) if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED)) WT_ERR(__curjoin_init_iter(session, cjoin)); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + iter = cjoin->iter; + nextkey: - if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key, - &cursor->recno)) == 0) { + if ((ret = __curjoin_entry_iter_next(iter, cursor)) == 0) { F_SET(cursor, WT_CURSTD_KEY_EXT); /* @@ -715,11 +779,31 @@ nextkey: for (i = 0; i < cjoin->entries_next; i++) { ret = __curjoin_entry_member(session, cjoin, &cjoin->entries[i], skip_left); - if (ret == WT_NOTFOUND) + if (ret == WT_NOTFOUND) { + /* + * If this is compare=eq on our outer iterator, + * and we've moved past it, we're done. 
+ */ + if (iter->isequal && i == 0) + break; goto nextkey; + } skip_left = false; WT_ERR(ret); } + } else if (ret != WT_NOTFOUND) + WT_ERR(ret); + + if (ret == 0) { + /* + * Position the 'main' cursor, this will be used to + * retrieve values from the cursor join. + */ + c = iter->main; + c->set_key(c, iter->curkey); + if ((ret = c->search(c)) != 0) + WT_ERR(c->search(c)); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } if (0) { @@ -785,10 +869,11 @@ __curjoin_close(WT_CURSOR *cursor) for (end = &entry->ends[0]; end < &entry->ends[entry->ends_next]; end++) { F_CLR(end->cursor, WT_CURSTD_JOINED); - if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY)) - __wt_free(session, end->key.data); + if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR)) + WT_TRET(end->cursor->close(end->cursor)); } __wt_free(session, entry->ends); + __wt_free(session, entry->repack_format); } if (cjoin->iter != NULL) @@ -810,22 +895,22 @@ __wt_curjoin_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __curjoin_get_key, /* get-key */ - __curjoin_get_value, /* get-value */ - __wt_cursor_notsup, /* set-key */ - __wt_cursor_notsup, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curjoin_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curjoin_reset, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curjoin_close); /* close */ + __curjoin_get_key, /* get-key */ + __curjoin_get_value, /* get-value */ + __wt_cursor_set_key_notsup, /* set-key */ + __wt_cursor_set_value_notsup, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curjoin_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curjoin_reset, /* reset */ + __wt_cursor_notsup, /* 
search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curjoin_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_JOIN *cjoin; WT_DECL_ITEM(tmp); @@ -891,22 +976,22 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range, uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count) { + WT_CURSOR_INDEX *cindex; + WT_CURSOR_JOIN_ENDPOINT *end; WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; - WT_CURSOR_JOIN_ENDPOINT *end, *newend; bool hasins, needbloom, range_eq; - u_int i, ins, nonbloom; + char *main_uri, *newformat; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; - char *main_uri; - size_t namesize, newsize; + size_t len, newsize; + u_int i, ins, nonbloom; entry = NULL; hasins = needbloom = false; ins = 0; /* -Wuninitialized */ main_uri = NULL; nonbloom = 0; /* -Wuninitialized */ - namesize = strlen(cjoin->table->name); for (i = 0; i < cjoin->entries_next; i++) { if (cjoin->entries[i].index == idx) { @@ -982,13 +1067,13 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) || (F_ISSET(end, WT_CURJOIN_END_LT) && ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) || - (end->flags == WT_CURJOIN_END_EQ && + (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) != 0)) WT_ERR_MSG(session, EINVAL, "join has overlapping ranges"); if (range == WT_CURJOIN_END_EQ && - end->flags == WT_CURJOIN_END_EQ && + WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) WT_ERR_MSG(session, EINVAL, "compare=eq can only be combined " @@ -1013,31 +1098,70 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, entry->bloom_hash_count = 
WT_MAX(entry->bloom_hash_count, bloom_hash_count); } - WT_ERR(__wt_realloc_def(session, &entry->ends_allocated, - entry->ends_next + 1, &entry->ends)); - if (!hasins) - ins = entry->ends_next; - newend = &entry->ends[ins]; - memmove(newend + 1, newend, - (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); - memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); - entry->ends_next++; - newend->cursor = ref_cursor; - F_SET(newend, range); + WT_ERR(__curjoin_insert_endpoint(session, entry, + hasins ? ins : entry->ends_next, &end)); + end->cursor = ref_cursor; + F_SET(end, range); /* Open the main file with a projection of the indexed columns. */ - if (entry->main == NULL && entry->index != NULL) { - namesize = strlen(cjoin->table->name); - newsize = namesize + entry->index->colconf.len + 1; + if (entry->main == NULL && idx != NULL) { + newsize = strlen(cjoin->table->name) + idx->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); snprintf(main_uri, newsize, "%s%.*s", - cjoin->table->name, (int)entry->index->colconf.len, - entry->index->colconf.str); + cjoin->table->name, (int)idx->colconf.len, + idx->colconf.str); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); + if (idx->extractor == NULL) { + /* + * Add no-op padding so trailing 'u' formats are not + * transformed to 'U'. This matches what happens in + * the index. We don't do this when we have an + * extractor, extractors already use the padding + * byte trick. + */ + len = strlen(entry->main->value_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &newformat)); + snprintf(newformat, len, "%s0x", + entry->main->value_format); + __wt_free(session, entry->main->value_format); + entry->main->value_format = newformat; + } + + /* + * When we are repacking index keys to remove the primary + * key, we never want to transform trailing 'u'. Use no-op + * padding to force this. 
+ */ + cindex = (WT_CURSOR_INDEX *)ref_cursor; + len = strlen(cindex->iface.key_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &entry->repack_format)); + snprintf(entry->repack_format, len, "%s0x", + cindex->iface.key_format); } -err: if (main_uri != NULL) - __wt_free(session, main_uri); +err: __wt_free(session, main_uri); return (ret); } + +/* + * __curjoin_insert_endpoint -- + * Insert a new entry into the endpoint array for the join entry. + */ +static int +__curjoin_insert_endpoint(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, + u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp) +{ + WT_CURSOR_JOIN_ENDPOINT *newend; + + WT_RET(__wt_realloc_def(session, &entry->ends_allocated, + entry->ends_next + 1, &entry->ends)); + newend = &entry->ends[pos]; + memmove(newend + 1, newend, + (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); + memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); + entry->ends_next++; + *newendp = newend; + + return (0); +} diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index 3fcd8a86066..0a13803da5d 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -347,22 +347,22 @@ __wt_curlog_open(WT_SESSION_IMPL *session, { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curlog_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curlog_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curlog_reset, /* reset */ - __curlog_search, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curlog_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curlog_compare, /* compare 
*/ + __wt_cursor_equals, /* equals */ + __curlog_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curlog_reset, /* reset */ + __curlog_search, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curlog_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LOG *cl; WT_DECL_RET; @@ -397,7 +397,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session, * The user may be trying to read a log record they just wrote. * Log records may be buffered, so force out any now. */ - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); /* Log cursors block archiving. */ WT_ERR(__wt_readlock(session, log->log_archive_lock)); diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index df66ef34ddd..3d702e2ea8c 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -31,6 +31,58 @@ } while (0) /* + * __wt_schema_create_final -- + * Create a single configuration line from a set of configuration strings, + * including all of the defaults declared for a session.create, and stripping + * any configuration strings that don't belong in a session.create. Here for + * the wt dump command utility, which reads a set of configuration strings and + * needs to add in the defaults and then collapse them into single string for + * a subsequent load. + */ +int +__wt_schema_create_final( + WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret) +{ + WT_DECL_RET; + u_int i; + const char **cfg; + + /* + * Count the entries in the original, + * Allocate a copy with the defaults as the first entry, + * Collapse the whole thing into a single configuration string (which + * also strips any entries that don't appear in the first entry). 
+ */ + for (i = 0; cfg_arg[i] != NULL; ++i) + ; + WT_RET(__wt_calloc_def(session, i + 2, &cfg)); + cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_create); + for (i = 0; cfg_arg[i] != NULL; ++i) + cfg[i + 1] = cfg_arg[i]; + cfg[i + 1] = NULL; + + ret = __wt_config_collapse(session, cfg, value_ret); + + __wt_free(session, cfg); + return (ret); +} + +/* + * __schema_create_strip -- + * Discard any configuration information from a schema entry that is not + * applicable to a session.create call. Here for the metadata:create URI. + */ +static int +__schema_create_strip( + WT_SESSION_IMPL *session, const char *value, char **value_ret) +{ + const char *cfg[] = + { WT_CONFIG_BASE(session, WT_SESSION_create), value, NULL }; + + return (__wt_config_collapse(session, cfg, value_ret)); +} + +/* * __curmetadata_setkv -- * Copy key/value into the public cursor, stripping internal metadata for * "create-only" cursors. @@ -49,8 +101,7 @@ __curmetadata_setkv(WT_CURSOR_METADATA *mdc, WT_CURSOR *fc) c->key.data = fc->key.data; c->key.size = fc->key.size; if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { - WT_RET(__wt_schema_create_strip( - session, fc->value.data, NULL, &value)); + WT_RET(__schema_create_strip(session, fc->value.data, &value)); ret = __wt_buf_set( session, &c->value, value, strlen(value) + 1); __wt_free(session, value); @@ -92,8 +143,7 @@ __curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor) WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value)); if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { - ret = __wt_schema_create_strip( - session, value, NULL, &stripped); + ret = __schema_create_strip(session, value, &stripped); __wt_free(session, value); WT_RET(ret); value = stripped; @@ -448,22 +498,22 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key 
*/ - __wt_cursor_set_value, /* set-value */ - __curmetadata_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curmetadata_next, /* next */ - __curmetadata_prev, /* prev */ - __curmetadata_reset, /* reset */ - __curmetadata_search, /* search */ - __curmetadata_search_near, /* search-near */ - __curmetadata_insert, /* insert */ - __curmetadata_update, /* update */ - __curmetadata_remove, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curmetadata_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curmetadata_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curmetadata_next, /* next */ + __curmetadata_prev, /* prev */ + __curmetadata_reset, /* reset */ + __curmetadata_search, /* search */ + __curmetadata_search_near, /* search-near */ + __curmetadata_insert, /* insert */ + __curmetadata_update, /* update */ + __curmetadata_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curmetadata_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_METADATA *mdc; WT_DECL_RET; diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 00a6ade21c6..f7a8f5fc866 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -200,8 +200,6 @@ __curstat_next(WT_CURSOR *cursor) if (cst->notinitialized) { WT_ERR(__wt_curstat_init( session, cursor->internal_uri, NULL, cst->cfg, cst)); - if (cst->next_set != NULL) - WT_ERR((*cst->next_set)(session, cst, true, true)); cst->notinitialized = false; } @@ -209,6 +207,8 @@ __curstat_next(WT_CURSOR *cursor) if (cst->notpositioned) { cst->notpositioned = false; cst->key = WT_STAT_KEY_MIN(cst); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, true, true)); } else if (cst->key < WT_STAT_KEY_MAX(cst)) ++cst->key; else if (cst->next_set != NULL) @@ -244,8 +244,6 @@ __curstat_prev(WT_CURSOR *cursor) if (cst->notinitialized) { 
WT_ERR(__wt_curstat_init( session, cursor->internal_uri, NULL, cst->cfg, cst)); - if (cst->next_set != NULL) - WT_ERR((*cst->next_set)(session, cst, false, true)); cst->notinitialized = false; } @@ -253,6 +251,8 @@ __curstat_prev(WT_CURSOR *cursor) if (cst->notpositioned) { cst->notpositioned = false; cst->key = WT_STAT_KEY_MAX(cst); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, false, true)); } else if (cst->key > WT_STAT_KEY_MIN(cst)) --cst->key; else if (cst->next_set != NULL) @@ -449,7 +449,6 @@ __curstat_join_next_set(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst, WT_JOIN_STATS_GROUP *join_group; ssize_t pos; - WT_ASSERT(session, WT_STREQ(cst->iface.uri, "statistics:join")); join_group = &cst->u.join_stats_group; cjoin = join_group->join_cursor; if (init) @@ -504,14 +503,13 @@ __curstat_join_init(WT_SESSION_IMPL *session, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) { WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; WT_UNUSED(cfg); if (curjoin == NULL && cst->u.join_stats_group.join_cursor != NULL) curjoin = &cst->u.join_stats_group.join_cursor->iface; if (curjoin == NULL || !WT_PREFIX_MATCH(curjoin->uri, "join:")) - WT_ERR_MSG(session, EINVAL, + WT_RET_MSG(session, EINVAL, "join cursor must be used with statistics:join"); cjoin = (WT_CURSOR_JOIN *)curjoin; memset(&cst->u.join_stats_group, 0, sizeof(WT_JOIN_STATS_GROUP)); @@ -522,8 +520,7 @@ __curstat_join_init(WT_SESSION_IMPL *session, cst->stats_count = sizeof(WT_JOIN_STATS) / sizeof(int64_t); cst->stats_desc = __curstat_join_desc; cst->next_set = __curstat_join_next_set; - -err: return (ret); + return (0); } /* @@ -544,25 +541,28 @@ __wt_curstat_init(WT_SESSION_IMPL *session, dsrc_uri = uri + strlen("statistics:"); if (WT_STREQ(dsrc_uri, "join")) - return (__curstat_join_init(session, curjoin, cfg, cst)); + WT_RET(__curstat_join_init(session, curjoin, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:")) - return ( + else if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:")) + WT_RET( 
__wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "file:")) - return (__curstat_file_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "file:")) + WT_RET(__curstat_file_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "index:")) - return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "index:")) + WT_RET(__wt_curstat_index_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "lsm:")) - return (__wt_curstat_lsm_init(session, dsrc_uri, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "lsm:")) + WT_RET(__wt_curstat_lsm_init(session, dsrc_uri, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "table:")) - return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "table:")) + WT_RET(__wt_curstat_table_init(session, dsrc_uri, cfg, cst)); - return (__wt_bad_object_type(session, uri)); + else + return (__wt_bad_object_type(session, uri)); + + return (0); } /* @@ -575,22 +575,22 @@ __wt_curstat_open(WT_SESSION_IMPL *session, { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, - __curstat_get_key, /* get-key */ - __curstat_get_value, /* get-value */ - __curstat_set_key, /* set-key */ - __curstat_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curstat_next, /* next */ - __curstat_prev, /* prev */ - __curstat_reset, /* reset */ - __curstat_search, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curstat_close); /* close */ + __curstat_get_key, /* get-key */ + __curstat_get_value, /* get-value */ + __curstat_set_key, /* set-key */ + __curstat_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curstat_next, /* next */ + __curstat_prev, /* 
prev */ + __curstat_reset, /* reset */ + __curstat_search, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curstat_close); /* close */ WT_CONFIG_ITEM cval, sval; WT_CURSOR *cursor; WT_CURSOR_STAT *cst; diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 051f36c8854..7839971f975 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -9,27 +9,108 @@ #include "wt_internal.h" /* + * __wt_cursor_noop -- + * Cursor noop. + */ +int +__wt_cursor_noop(WT_CURSOR *cursor) +{ + WT_UNUSED(cursor); + + return (0); +} + +/* * __wt_cursor_notsup -- * Unsupported cursor actions. */ int __wt_cursor_notsup(WT_CURSOR *cursor) { - WT_UNUSED(cursor); + WT_SESSION_IMPL *session; - return (ENOTSUP); + session = (WT_SESSION_IMPL *)cursor->session; + WT_RET_MSG(session, ENOTSUP, "Unsupported cursor operation"); } /* - * __wt_cursor_noop -- - * Cursor noop. + * __wt_cursor_get_value_notsup -- + * WT_CURSOR.get_value not-supported. */ int -__wt_cursor_noop(WT_CURSOR *cursor) +__wt_cursor_get_value_notsup(WT_CURSOR *cursor, ...) { - WT_UNUSED(cursor); + return (__wt_cursor_notsup(cursor)); +} - return (0); +/* + * __wt_cursor_set_key_notsup -- + * WT_CURSOR.set_key not-supported. + */ +void +__wt_cursor_set_key_notsup(WT_CURSOR *cursor, ...) +{ + (void)__wt_cursor_notsup(cursor); +} + +/* + * __wt_cursor_set_value_notsup -- + * WT_CURSOR.set_value not-supported. + */ +void +__wt_cursor_set_value_notsup(WT_CURSOR *cursor, ...) +{ + (void)__wt_cursor_notsup(cursor); +} + +/* + * __wt_cursor_compare_notsup -- + * Unsupported cursor comparison. + */ +int +__wt_cursor_compare_notsup(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_UNUSED(b); + WT_UNUSED(cmpp); + + return (__wt_cursor_notsup(a)); +} + +/* + * __wt_cursor_equals_notsup -- + * Unsupported cursor equality. 
+ */ +int +__wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp) +{ + WT_UNUSED(other); + WT_UNUSED(equalp); + + return (__wt_cursor_notsup(cursor)); +} + +/* + * __wt_cursor_search_near_notsup -- + * Unsupported cursor search-near. + */ +int +__wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) +{ + WT_UNUSED(exact); + + return (__wt_cursor_notsup(cursor)); +} + +/* + * __wt_cursor_reconfigure_notsup -- + * Unsupported cursor reconfiguration. + */ +int +__wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) +{ + WT_UNUSED(config); + + return (__wt_cursor_notsup(cursor)); } /* @@ -46,13 +127,12 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor) * cursors in a session. Reconfigure is left open in case it's possible * in the future to change these configurations. */ - cursor->compare = - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->compare = __wt_cursor_compare_notsup; cursor->next = __wt_cursor_notsup; cursor->prev = __wt_cursor_notsup; cursor->reset = __wt_cursor_noop; cursor->search = __wt_cursor_notsup; - cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->search_near = __wt_cursor_search_near_notsup; cursor->insert = __wt_cursor_notsup; cursor->update = __wt_cursor_notsup; cursor->remove = __wt_cursor_notsup; @@ -628,7 +708,7 @@ __wt_cursor_init(WT_CURSOR *cursor, } else { WT_RET( __wt_config_gets_def(session, cfg, "readonly", 0, &cval)); - if (cval.val != 0) { + if (cval.val != 0 || F_ISSET(S2C(session), WT_CONN_READONLY)) { cursor->insert = __wt_cursor_notsup; cursor->update = __wt_cursor_notsup; cursor->remove = __wt_cursor_notsup; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index d986577f640..9eb88ec6fcd 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -79,22 +79,22 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, WT_CURSOR *cur, WT_CURSOR_TABLE *ctable, int (*f)(WT_CURSOR *)) { WT_CURSOR_STATIC_INIT(iface, - 
__wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_notsup, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __curextract_insert, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* reconfigure */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_notsup, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __curextract_insert, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup); /* close */ WT_CURSOR_EXTRACTOR extract_cursor; WT_DECL_RET; WT_ITEM key, value; @@ -842,22 +842,22 @@ __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_curtable_get_key, /* get-key */ - __wt_curtable_get_value, /* get-value */ - __wt_curtable_set_key, /* set-key */ - __wt_curtable_set_value, /* set-value */ - __curtable_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curtable_next, /* next */ - __curtable_prev, /* prev */ - __curtable_reset, /* reset */ - __curtable_search, /* search */ - __curtable_search_near, /* search-near */ - __curtable_insert, /* insert */ - __curtable_update, /* update */ - __curtable_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - 
__curtable_close); /* close */ + __wt_curtable_get_key, /* get-key */ + __wt_curtable_get_value, /* get-value */ + __wt_curtable_set_key, /* set-key */ + __wt_curtable_set_value, /* set-value */ + __curtable_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curtable_next, /* next */ + __curtable_prev, /* prev */ + __curtable_reset, /* reset */ + __curtable_search, /* search */ + __curtable_search_near, /* search-near */ + __curtable_insert, /* insert */ + __curtable_update, /* update */ + __curtable_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __curtable_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_CURSOR_TABLE *ctable; diff --git a/src/docs/checkpoint.dox b/src/docs/checkpoint.dox index 523c0887859..ec28fea13c3 100644 --- a/src/docs/checkpoint.dox +++ b/src/docs/checkpoint.dox @@ -23,11 +23,16 @@ All transactional updates committed before a checkpoint are made durable by the checkpoint, therefore the frequency of checkpoints limits the volume of data that may be lost due to application or system failure. -When WiredTiger data sources are first opened, they are opened in the -state of the most recent checkpoint taken on the file, in other words, -updates after the most recent checkpoint will not appear in the data -source. If no checkpoint is found when the data source is opened, the -data source will appear empty. +Data sources that are involved in an exclusive operation when the +checkpoint starts, including bulk load, verify or salvage, will be skipped +by the checkpoint. Operations requiring exclusive access may fail with +an \c EBUSY error if attempted during a checkpoint. + +When data sources are first opened, they are opened in the state of the +most recent checkpoint taken on the file, in other words, updates after the +most recent checkpoint will not appear in the data source. If no +checkpoint is found when the data source is opened, the data source will +appear empty. 
@section checkpoint_server Automatic checkpoints @@ -54,15 +59,16 @@ checkpoint cursor is closed. @section checkpoint_naming Checkpoint naming -Additionally, checkpoints that do not include LSM trees may optionally -be given names by the application. Checkpoints named by the application -persist until explicitly discarded or the application creates a new -checkpoint with the same name (which replaces the previous checkpoint -of that name). If the previous checkpoint cannot be replaced, either -because a cursor is reading from the previous checkpoint, or backups are -in progress, the checkpoint will fail. Because named checkpoints -persist until discarded or replaced, they can be used to periodically -snapshot data for later use. +Additionally, checkpoints that do not include LSM trees may optionally be +given names by the application. Because named checkpoints persist until +discarded or replaced, they can be used to periodically snapshot data for +later use. + +Checkpoints named by the application persist until explicitly discarded or +the application creates a new checkpoint with the same name (which replaces +the previous checkpoint of that name). If the previous checkpoint cannot be +replaced, either because a cursor is reading from the previous checkpoint, +or backups are in progress, the checkpoint will fail. Internal checkpoints (that is, checkpoints not named by the application) use the reserved name "WiredTigerCheckpoint". Applications can open the diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index e2b376d5e3f..0f5c56d25ce 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -41,7 +41,7 @@ by default and commands that only read data will not run recovery. Perform a backup of a database or set of data sources. 
The \c backup command performs a backup of the database, copying the -database files to a \c specified directory, which can be subsequently +underlying files to a \c specified directory, which can be subsequently opened as a WiredTiger database. See @ref backup for more information, and @ref file_permissions for specifics on the copied file permissions. @@ -58,10 +58,10 @@ the named data sources. <hr> @section util_compact wt compact -Compact a table or file. +Compact a table. -The \c compact command attempts to rewrite the specified table or file -to consume less disk space. +The \c compact command attempts to rewrite the specified table to +consume less disk space. @subsection util_compact_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> @@ -71,7 +71,7 @@ The \c compact command has no command-specific options. <hr> @section util_create wt create -Create a table or file. +Create a table. The \c create command creates the specified \c uri with the specified configuration. It is equivalent to a call to WT_SESSION::create with @@ -88,7 +88,7 @@ Include a configuration string to be passed to WT_SESSION::create. <hr> @section util_drop wt drop -Drop a table or file. +Drop a table. The \c drop command drops the specified \c uri. It is equivalent to a call to WT_SESSION::drop with the "force" configuration argument. @@ -136,10 +136,10 @@ printable characters unencoded). <hr> @section util_list wt list -List the tables and files in the database. +List the tables in the database. -By default, the \c list command prints out the tables and files stored in -the database. If a URI is specified as an argument, only information about +By default, the \c list command prints out the tables stored in the +database. If a URI is specified as an argument, only information about that data source is printed. @subsection util_list_synopsis Synopsis @@ -158,16 +158,16 @@ value is printed. 
<hr> @section util_load wt load -Load a table or file from dump output. +Load a table from dump output. The \c load command reads the standard input for data and loads it into -a table or file, creating the table or file if it does not yet exist. -The data should be the format produced by the \c dump command; see -@ref dump_formats for details. +a table, creating the table if it does not yet exist. The data should +be the format produced by the \c dump command; see @ref dump_formats for +details. -By default, if the table or file already exists, data in the file or -table will be overwritten by the new data (use the \c -n option to -make an attempt to overwrite existing data return an error). +By default, if the table already exists, data in the table will be +overwritten by the new data (use the \c -n option to make an attempt to +overwrite existing data return an error). @subsection util_load_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> @@ -182,8 +182,8 @@ number keys. The \c -a option is only applicable when loading into a column store. @par <code>-f</code> -By default, the \c load command reads from the standard input; the \c --f option reads the input from the specified file. +By default, the \c load command reads from the standard input; the \c -f +option reads the input from the specified file. @par <code>-j</code> Load input in the JSON (<a href="http://www.json.org">JavaScript Object @@ -196,7 +196,7 @@ load command to fail if there's an attempt to overwrite already existing data. @par <code>-r</code> -By default, the \c load command uses the table or file name taken from the +By default, the \c load command uses the table name taken from the input; the \c -r option renames the data source. 
Additionally, \c uri and \c configuration pairs may be specified to the @@ -227,24 +227,23 @@ table:xxx block_allocation=first table:xxx prefix_compress=false <hr> @section util_loadtext wt loadtext -Load text into a table or file. +Load text into a table. The \c loadtext command reads the standard input for text and loads it -into a table or file. The input data should be printable characters, -with newline delimiters for each key or value. +into a table. The input data should be printable characters, with +newline delimiters for each key or value. -The \c loadtext command does not create the file if it does not yet +The \c loadtext command does not create the object if it does not yet exist. -In the case of inserting values into a column-store table or file, each -value is appended to the table or file; in the case of inserting values -into a row-store table or file, lines are handled in pairs, where the -first line is the key and the second line is the value. If the -row-store table or file already exists, data in the table or file will -be overwritten by the new data. +In the case of inserting values into a column-store table, each value +is appended to the table; in the case of inserting values into a +row-store table, lines are handled in pairs, where the first line is the +key and the second line is the value. If the row-store table already +exists, data in the table will be overwritten by the new data. @subsection util_loadtext_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input] uri</code> @subsection util_loadtext_options Options The following are command-specific options for the \c loadtext command: @@ -275,7 +274,7 @@ to the default string format. <hr> @section util_read wt read -Read records from a table or file. +Read records from a table. 
The \c read command prints out the records associated with the specified keys from the specified data source. The data source must be configured @@ -291,9 +290,9 @@ The \c read command has no command-specific options. <hr> @section util_rename wt rename -Rename a table or file. +Rename a table. -The \c rename command renames the specified table or file. +The \c rename command renames the specified table. @subsection util_rename_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> @@ -303,11 +302,11 @@ The \c rename command has no command-specific options. <hr> @section util_salvage wt salvage -Recover data from a corrupted file. +Recover data from a corrupted table. The \c salvage command salvages the specified data source, discarding any -data that cannot be recovered. Underlying files are re-written in -place, overwriting the original file contents. +data that cannot be recovered. Underlying files are re-written in place, +overwriting the original file contents. @subsection util_salvage_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> @@ -316,9 +315,9 @@ place, overwriting the original file contents. The following are command-specific options for the \c salvage command: @par <code>-F</code> -By default, salvage will refuse to salvage files that fail basic tests -(for example, files that don't appear to be in a WiredTiger format). -The \c -F option forces the salvage of the file, regardless. +By default, salvage will refuse to salvage tables that fail basic tests +(for example, tables that don't appear to be in a WiredTiger format). +The \c -F option forces the salvage of the table, regardless. <hr> @section util_stat wt stat @@ -339,11 +338,11 @@ Include only "fast" statistics in the output (equivalent to passing <hr> @section util_upgrade wt upgrade -Upgrade a table or file. +Upgrade a table. 
-The \c upgrade command upgrades the specified table or file, exiting -success if the data source is up-to-date, and failure if the data source -cannot be upgraded. +The \c upgrade command upgrades the specified table, exiting success if +the data source is up-to-date, and failure if the data source cannot be +upgraded. @subsection util_upgrade_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> @@ -353,11 +352,10 @@ The \c upgrade command has no command-specific options. <hr> @section util_verify wt verify -Check the structural integrity of a table or file. +Check the structural integrity of a table. -The \c verify command verifies the specified table or file, exiting -success if the data source is correct, and failure if the data source is -corrupted. +The \c verify command verifies the specified table, exiting success if +the data source is correct, and failure if the data source is corrupted. @subsection util_verify_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> @@ -367,7 +365,7 @@ The \c verify command has no command-specific options. <hr> @section util_write wt write -Write records to a table or file. +Write records to a table. The \c write command stores records into the specified data source. 
The data source must be configured with string or record number keys and diff --git a/src/docs/data-sources.dox b/src/docs/data-sources.dox index d09d1cbc1b8..7f1879e0ffe 100644 --- a/src/docs/data-sources.dox +++ b/src/docs/data-sources.dox @@ -38,7 +38,7 @@ cursor types that give access to data managed by WiredTiger: key=<code>string</code>\, value=<code>string</code>\,<br> see @ref metadata for details} @row{<tt>statistics:[\<data source URI\>]</tt>, - database or data source statistics cursor, + database, data source or join statistics cursor, key=<code>int id</code>\,<br> value=<code>(string description\, string value\, uint64_t value)</code>\,<br> @@ -106,7 +106,9 @@ WiredTiger database as well as statistics for individual data sources. The statistics are at two levels: per-database and per-individual data source. Database-wide statistics are retrieved with the \c "statistics:" URI; individual data source statistics are available by specifying -\c "statistics:<data source URI>". +\c "statistics:<data source URI>". Additionally, statistics about a +join cursor can be retrieved by specifying \c "statistics:join" and +supplying the join cursor as an argument in the WT_SESSION::open_cursor call. The statistic key is an integer from the list of keys in @ref_single statistics_keys "Statistics Keys". Statistics cursors return @@ -127,7 +129,11 @@ The following is an example of printing statistics about a table: @snippet ex_stat.c statistics table function -Both examples can use a common display routine that iterates through the +The following is an example of printing statistics about a join cursor: + +@snippet ex_stat.c statistics join cursor function + +These three examples can use a common display routine that iterates through the statistics until the cursor returns the end of the list. 
@snippet ex_stat.c statistics display function diff --git a/src/docs/error-handling.dox b/src/docs/error-handling.dox index d1291e38ff0..d91a126ee21 100644 --- a/src/docs/error-handling.dox +++ b/src/docs/error-handling.dox @@ -55,14 +55,32 @@ This error is generated when wiredtiger_open is configured to return an error if @if IGNORE_BUILT_BY_API_ERR_END @endif -The ::wiredtiger_strerror function returns the standard message -associated with any WiredTiger, ISO C99, or POSIX 1003.1-2001 function: +@section error_translation Translating errors + +The WT_SESSION::strerror and ::wiredtiger_strerror functions return the +standard text message associated with any WiredTiger, ISO C, or POSIX +standard API. + +@snippet ex_all.c Display an error thread safe @snippet ex_all.c Display an error +Note that ::wiredtiger_strerror is not thread-safe. + @m_if{c} +@section error_handling_event Error handling using the WT_EVENT_HANDLER + More complex error handling can be configured by passing an implementation of WT_EVENT_HANDLER to ::wiredtiger_open or WT_CONNECTION::open_session. + +For example, both informational and error messages might be passed to an +application-specific logging function that added a timestamp and logged +the message to a file, and error messages might additionally be output to +the \c stderr file stream. 
+ +@snippet ex_event_handler.c Function event_handler +@snippet ex_event_handler.c Configure event_handler + @m_endif */ diff --git a/src/docs/license.dox b/src/docs/license.dox index febced2c6af..d7814d04fd6 100644 --- a/src/docs/license.dox +++ b/src/docs/license.dox @@ -2,16 +2,16 @@ The complete WiredTiger software package is Open Source software: you are welcome to modify and redistribute it under the terms of -<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html"> -<b>version 2</b></a> or -<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html"> -<b>version 3</b></a> of the -<b>GNU General Public License</b></a> +<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">version 2</a> +or +<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">version 3</a> +of the +<b>GNU General Public License</b> as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -<b>GNU General Public License</b></a> for details. +<b>GNU General Public License</b> for details. Additionally, portions of the WiredTiger distribution are distributed under the terms of the @@ -31,10 +31,10 @@ those described above, or for technical support for this software, please contact MongoDB, Inc. at <a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. -@section license_library 3rd party software included in the WiredTiger library +@section license_library 3rd party software always included in the WiredTiger library Every build of the WiredTiger library binary includes the following 3rd -party software, distributed under their license terms. Redistribution +party software, distributed under separate license terms. Redistribution of the WiredTiger library should comply with these copyrights. <table> @@ -46,14 +46,26 @@ of the WiredTiger library should comply with these copyrights. 
@row{\c src/support/hash_fnv.c, Authors, Public Domain} </table> +@section license_crc32-vpmsum 3rd party software optionally included in the WiredTiger library: PPC64 + +PPC64 and PPC64LE builds of the WiredTiger library binary include additional +3rd party software, distributed under separate license terms. Redistribution +of the WiredTiger library PPC64 and PPC64LE builds should comply with these +copyrights. + +<table> +@hrow{Distribution Files, Copyright Holder, License} +@row{\c src/support/power8/*, Anton Blanchard, <a href="http://opensource.org/licenses/Apache-2.0">Apache License\, Version 2.0</a> or the <a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">GNU General Public License\, version 2 or later</a>} +</table> + @section license_leveldb 3rd party software optionally included in the WiredTiger library: LevelDB If the \c --enable-leveldb configuration option is specified when configuring the WiredTiger build, additional 3rd party software is -included in the WiredTiger LevelDB library binary, distributed under -their license terms. Redistribution of the WiredTiger library built -with the \c --enable-leveldb configuration option should comply with -these copyrights. +included in the WiredTiger library binary, distributed under separate +license terms. Redistribution of the WiredTiger library built with the +\c --enable-leveldb configuration option should comply with these +copyrights. <table> @hrow{Distribution Files, Copyright Holder, License} diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 5d79edd660b..f717f4ed1fe 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -30,6 +30,7 @@ each of which is ordered by one or more columns. 
<h2>Programming notes</h2> - @subpage threads - @subpage namespace +- @subpage readonly @m_if{c} - @subpage signals @m_endif diff --git a/src/docs/readonly.dox b/src/docs/readonly.dox new file mode 100644 index 00000000000..ad4a94a73f1 --- /dev/null +++ b/src/docs/readonly.dox @@ -0,0 +1,55 @@ +/*! @m_page{{c,java},readonly,Database read-only mode} + +WiredTiger supports read-only mode databases. When a database is opened +in read-only mode, all modifications are disabled on the WT_CONNECTION +handle, any sessions opened in that connection and any cursors opened +in any of those sessions. For example, all cursor or session handle +methods that modify the database will instead return errors. + +When a database is opened in read-only mode, the database directory and +content must already exist and have been shut down cleanly. + +@section readonly_config Database read-only configuration considerations + +The \c readonly configuration affects other configuration settings. +Where a default setting contradicts read-only operation, WiredTiger +defaults are overridden to perform in a read-only mode. For example, LSM +tree merges are turned off when LSM trees are configured, and log file +archiving is disabled when logging is configured. + +Where a user-configured setting contradicts read-only operation, WiredTiger +will return an error. For example, zero-filling +log files is not allowed in read-only mode, and attempting to configure +it will return an error. + +@section readonly_recovery Readonly configuration and recovery + +Because recovery modifies the database, recovery cannot be done in +read-only mode. A ::wiredtiger_open call to open a database in read-only +mode will fail if the database was not cleanly shut down and recovery is +required. 
+ +@section readonly_logging Readonly configuration and logging + +If logging is enabled on the database when opened in read-only mode, log +file archiving and log file pre-allocation are disabled and the log files +will not be modified in any way. + +@section readonly_lsm Readonly configuration and LSM trees + +If LSM trees are in use, read-only mode turns off all modification. +Internal LSM operations such as merging, creating new chunks, creating +bloom filters and dropping old chunks are disabled. + +@section readonly_handles Readonly configuration and multiple database handles + +One unusual effect of read-only operations is the potential for multiple +read-only database handles open on the same database at the same time. +WiredTiger prevents multiple connection handles by writing a lock file, +and this locking is done even in read-only mode. However, if the lock +file cannot be written, opening in read-only mode is still allowed to +proceed. For that reason, multiple read-only connection handles could +be open at the same time. Normal locking occurs if the lock file can be +written in read-only mode, preventing multiple database connections. 
+ +*/ diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 80597302cbb..efc306568cd 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -7,6 +7,7 @@ Atomicity BLOBs CFLAGS CPPFLAGS +CRC Cheng Christoph Collet's @@ -64,6 +65,7 @@ NOTFOUND NUMA NoSQL OPTYPE +PPC PRELOAD README Rebalance @@ -151,6 +153,7 @@ control's copydoc cpp crashless +crc cursortype customerABC cv @@ -377,6 +380,7 @@ rVv rdbms rdlock readlock +readonly realclean realloc realloc'd @@ -419,6 +423,7 @@ src ssd startsync statlog +stderr str strerror strftime @@ -475,6 +480,7 @@ valuefmt vec versa vm +vpmsum warmup whitespace wiredtiger diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox index 453da34c51a..0a29e351e4e 100644 --- a/src/docs/statistics.dox +++ b/src/docs/statistics.dox @@ -79,6 +79,15 @@ or logged: @snippet ex_all.c Statistics clear configuration +The following example opens a statistics cursor on an open join cursor: + +@snippet ex_schema.c Statistics cursor join cursor + +The statistics gathered will be organized by reference cursors participating +in the join (see WT_SESSION::join); the uri of each reference cursor appears +as a prefix in the description field returned as a value by the statistics +cursor. + @section statistics_log Statistics logging WiredTiger will optionally log database statistics into a file when the diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index e4d85003a1e..8b3d61e4c19 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -2,27 +2,34 @@ @section version_271 Upgrading to Version 2.7.1 <dl> +<dt>LSM metadata</dt> +<dd> +There is a change to the format of LSM metadata in this release to fix bugs +in dump / load of tables of type LSM. Tables created with the old LSM metadata +format will be upgraded automatically, but once updated to the new version +<b>are no longer compatible with older releases of WiredTiger</b>. 
+</dd> + <dt>Column-store bulk-load cursors</dt> <dd> -Historically, bulk-load of a column-store object ignored any key set in -the cursor and automatically assigned each inserted row the next -sequential record number for its key. In the 2.7.1 release, column-store -objects match row-store behavior and require the cursor key be set -before an insert. (This also allows allows sparse tables to be created -in column-store objects, any skipped records are created as -already-deleted rows.) To match the previous behavior, specify the -\c append configuration string when opening the column-store bulk-load -cursor; this causes the cursor's key to be ignored and each inserted row -will be assigned the next record number. +Historically, bulk-load of a column-store object ignored any key set in the +cursor and automatically assigned each inserted row the next sequential +record number for its key. In the 2.7.1 release, column-store objects match +row-store behavior and require the cursor key be set before an insert. +(This allows sparse tables to be created in column-store objects, any +skipped records are created as already-deleted rows.) To match the previous +behavior, specify the \c append configuration string when opening the +column-store bulk-load cursor; this causes the cursor's key to be ignored +and each inserted row will be assigned the next record number. </dd> <dt>Change to WT_SESSION::truncate with URI</dt> <dd> If using the WT_SESSION::truncate API with a file: URI for a full table -truncate, underlying algorithmic changes result in some visible differences. -This call can now return WT_ROLLBACK. Applications should be prepared to -handle this error. This method no longer requires exclusive access to the -table. Also the underlying disk space may not be immediately +truncate, underlying algorithmic changes result in some visible +differences. This call can now return WT_ROLLBACK. Applications should be +prepared to handle this error. 
This method no longer requires exclusive +access to the table. Also the underlying disk space may not be immediately reclaimed when the call returns. The performance of this API may differ from earlier releases. </dd> @@ -34,6 +41,14 @@ from the WiredTiger release; remaining compression engines include LZ4, snappy and zlib. </dd> +<dt>Change to named checkpoints with bulk loads</dt> +<dd> +Previous versions of WiredTiger created empty named checkpoints in files +being bulk-loaded. In this release, checkpoint skips files being +bulk-loaded, so they do not get named checkpoints that complete during the +bulk load. +</dd> + </dl><hr> @section version_270 Upgrading to Version 2.7.0 diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 1f0d1533ac4..6d8dcab8f65 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -212,6 +212,10 @@ insert operations generate random content for the value @par read_range (unsigned int, default=0) scan a range of keys after each search +@par readonly (boolean, default=false) +reopen the connection between populate and workload phases in readonly +mode. Requires reopen_connection turned on (default). Requires that +read be the only workload specified @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) @@ -247,14 +251,19 @@ threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed -configuration values are 'count', 'throttle', 'reads', 'inserts', -'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are -also behavior modifiers, supported modifiers are 'ops_per_txn' +configuration values are 'count', 'throttle', 'update_delta', 'reads', +'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. 
+There are also behavior modifiers, supported modifiers are +'ops_per_txn' @par transaction_config (string, default=) transaction configuration string, relevant when populate_opts_per_txn is nonzero @par table_name (string, default=test) table name +@par value_sz_max (unsigned int, default=1000) +maximum value size when delta updates are present. Default disabled +@par value_sz_min (unsigned int, default=1) +minimum value size when delta updates are present. Default disabled @par value_sz (unsigned int, default=100) value size @par verbose (unsigned int, default=1) diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 641864a8baa..ca98b1bd62a 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -18,13 +18,12 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; - bool evict_reset; /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); + WT_RET(__wt_evict_file_exclusive_on(session)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session, true); @@ -98,8 +97,7 @@ err: /* On error, clear any left-over tree walk. 
*/ session, next_ref, WT_READ_NO_EVICT)); } - if (evict_reset) - __wt_evict_file_exclusive_off(session); + __wt_evict_file_exclusive_off(session); return (ret); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 0536a06bc22..50a00787f35 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -159,7 +159,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) bytes_max / WT_MEGABYTE)); } - return (__wt_cond_signal(session, cache->evict_cond)); + return (__wt_cond_auto_signal(session, cache->evict_cond)); } /* @@ -175,8 +175,8 @@ __evict_server(void *arg) WT_SESSION_IMPL *session; #ifdef HAVE_DIAGNOSTIC struct timespec now, stuck_ts; - uint64_t pages_evicted = 0; #endif + uint64_t pages_evicted = 0; u_int spins; session = arg; @@ -219,11 +219,11 @@ __evict_server(void *arg) /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; -#ifdef HAVE_DIAGNOSTIC pages_evicted = 0; } else if (pages_evicted != cache->pages_evict) { - WT_ERR(__wt_epoch(session, &stuck_ts)); pages_evicted = cache->pages_evict; +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_epoch(session, &stuck_ts)); } else { /* After being stuck for 5 minutes, give up. */ WT_ERR(__wt_epoch(session, &now)); @@ -238,7 +238,8 @@ __evict_server(void *arg) WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. */ - WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); + WT_ERR(__wt_cond_auto_wait( + session, cache->evict_cond, pages_evicted != 0)); WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking")); } @@ -477,6 +478,7 @@ __evict_update_work(WT_SESSION_IMPL *session) conn = S2C(session); cache = conn->cache; + WT_STAT_FAST_CONN_SET(session, cache_eviction_aggressive_set, 0); /* Clear previous state. 
*/ cache->state = 0; @@ -534,8 +536,11 @@ __evict_update_work(WT_SESSION_IMPL *session) return (false); -done: if (F_ISSET(cache, WT_CACHE_STUCK)) +done: if (F_ISSET(cache, WT_CACHE_STUCK)) { + WT_STAT_FAST_CONN_SET(session, + cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + } return (true); } @@ -594,8 +599,11 @@ __evict_pass(WT_SESSION_IMPL *session) if (!__evict_update_work(session)) break; - if (loop > 10) + if (loop > 10) { + WT_STAT_FAST_CONN_SET(session, + cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + } /* * Start a worker if we have capacity and we haven't reached @@ -713,12 +721,32 @@ __evict_clear_walks(WT_SESSION_IMPL *session) } /* - * __evict_request_walk_clear -- + * __evict_clear_all_walks -- + * Clear the eviction walk points for all files a session is waiting on. + */ +static int +__evict_clear_all_walks(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + + conn = S2C(session); + + TAILQ_FOREACH(dhandle, &conn->dhqh, q) + if (WT_PREFIX_MATCH(dhandle->name, "file:")) + WT_WITH_DHANDLE(session, + dhandle, WT_TRET(__evict_clear_walk(session))); + return (ret); +} + +/* + * __evict_request_clear_walk -- * Request that the eviction server clear the tree's current eviction * point. */ static int -__evict_request_walk_clear(WT_SESSION_IMPL *session) +__evict_request_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -746,32 +774,12 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) } /* - * __evict_clear_all_walks -- - * Clear the eviction walk points for all files a session is waiting on. 
- */ -static int -__evict_clear_all_walks(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - - conn = S2C(session); - - TAILQ_FOREACH(dhandle, &conn->dhqh, q) - if (WT_PREFIX_MATCH(dhandle->name, "file:")) - WT_WITH_DHANDLE(session, - dhandle, WT_TRET(__evict_clear_walk(session))); - return (ret); -} - -/* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. */ int -__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) +__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -779,33 +787,39 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) WT_EVICT_ENTRY *evict; u_int i, elem; - *evict_resetp = false; - btree = S2BT(session); cache = S2C(session)->cache; - /* If the file wasn't evictable, there's no work to do. */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + /* + * Hold the walk lock to set the no-eviction flag. + * + * The no-eviction flag can be set permanently, in which case we never + * increment the no-eviction count. + */ + __wt_spin_lock(session, &cache->evict_walk_lock); + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + if (btree->evict_disabled != 0) + ++btree->evict_disabled; + __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); + } + ++btree->evict_disabled; /* - * Hold the walk lock to set the "no eviction" flag: no new pages from - * the file will be queued for eviction after this point. + * Ensure no new pages from the file will be queued for eviction after + * this point. */ - __wt_spin_lock(session, &cache->evict_walk_lock); F_SET(btree, WT_BTREE_NO_EVICTION); - __wt_spin_unlock(session, &cache->evict_walk_lock); + WT_FULL_BARRIER(); /* Clear any existing LRU eviction walk for the file. */ - WT_ERR(__evict_request_walk_clear(session)); - - /* Hold the evict lock to remove any queued pages from this file. 
*/ - __wt_spin_lock(session, &cache->evict_lock); + WT_ERR(__evict_request_clear_walk(session)); /* * The eviction candidate list might reference pages from the file, - * clear it. + * clear it. Hold the evict lock to remove queued pages from a file. */ + __wt_spin_lock(session, &cache->evict_lock); elem = cache->evict_max; for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++) if (evict->btree == btree) @@ -819,10 +833,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) while (btree->evict_busy > 0) __wt_yield(); - *evict_resetp = true; - return (0); - -err: F_CLR(btree, WT_BTREE_NO_EVICTION); + if (0) { +err: --btree->evict_disabled; + F_CLR(btree, WT_BTREE_NO_EVICTION); + } + __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); } @@ -834,12 +849,28 @@ void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) { WT_BTREE *btree; + WT_CACHE *cache; btree = S2BT(session); + cache = S2C(session)->cache; - WT_ASSERT(session, btree->evict_ref == NULL); + /* + * We have seen subtle bugs with multiple threads racing to turn + * eviction on/off. Make races more likely in diagnostic builds. + */ + WT_DIAGNOSTIC_YIELD; - F_CLR(btree, WT_BTREE_NO_EVICTION); + WT_ASSERT(session, + btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); + + /* + * The no-eviction flag can be set permanently, in which case we never + * increment the no-eviction count. 
+ */ + __wt_spin_lock(session, &cache->evict_walk_lock); + if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) + F_CLR(btree, WT_BTREE_NO_EVICTION); + __wt_spin_unlock(session, &cache->evict_walk_lock); } /* @@ -869,7 +900,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; - uint64_t cutoff; + uint64_t cutoff, read_gen_oldest; uint32_t candidates, entries; cache = S2C(session)->cache; @@ -910,34 +941,62 @@ __evict_lru_walk(WT_SESSION_IMPL *session) return (0); } - WT_ASSERT(session, cache->evict_queue[0].ref != NULL); - - /* Track the oldest read generation we have in the queue. */ - cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen; - + /* Decide how many of the candidates we're going to try and evict. */ if (FLD_ISSET(cache->state, - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { /* * Take all candidates if we only gathered pages with an oldest * read generation set. */ cache->evict_candidates = entries; - else { - /* Find the bottom 25% of read generations. */ - cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) + - __evict_read_gen(&cache->evict_queue[entries - 1])) / 4; + } else { /* - * Don't take less than 10% or more than 50% of entries, - * regardless. That said, if there is only one entry, which is - * normal when populating an empty file, don't exclude it. + * Find the oldest read generation we have in the queue, used + * to set the initial value for pages read into the system. + * The queue is sorted, find the first "normal" generation. 
*/ - for (candidates = 1 + entries / 10; - candidates < entries / 2; - candidates++) - if (__evict_read_gen( - &cache->evict_queue[candidates]) > cutoff) + read_gen_oldest = WT_READGEN_OLDEST; + for (candidates = 0; candidates < entries; ++candidates) { + read_gen_oldest = + __evict_read_gen(&cache->evict_queue[candidates]); + if (read_gen_oldest != WT_READGEN_OLDEST) break; - cache->evict_candidates = candidates; + } + + /* + * Take all candidates if we only gathered pages with an oldest + * read generation set. + * + * We normally never take more than 50% of the entries; if 50% + * of the entries were at the oldest read generation, take them. + */ + if (read_gen_oldest == WT_READGEN_OLDEST) + cache->evict_candidates = entries; + else if (candidates >= entries / 2) + cache->evict_candidates = candidates; + else { + /* Save the calculated oldest generation. */ + cache->read_gen_oldest = read_gen_oldest; + + /* Find the bottom 25% of read generations. */ + cutoff = + (3 * read_gen_oldest + __evict_read_gen( + &cache->evict_queue[entries - 1])) / 4; + + /* + * Don't take less than 10% or more than 50% of entries, + * regardless. That said, if there is only one entry, + * which is normal when populating an empty file, don't + * exclude it. + */ + for (candidates = 1 + entries / 10; + candidates < entries / 2; + candidates++) + if (__evict_read_gen( + &cache->evict_queue[candidates]) > cutoff) + break; + cache->evict_candidates = candidates; + } } cache->evict_current = cache->evict_queue; @@ -1106,23 +1165,27 @@ retry: while (slot < max_entries && ret == 0) { __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; - __wt_spin_lock(session, &cache->evict_walk_lock); - /* - * Re-check the "no eviction" flag -- it is used to enforce - * exclusive access when a handle is being closed. + * Re-check the "no eviction" flag, used to enforce exclusive + * access when a handle is being closed. If not set, remember + * the file to visit first, next loop. 
+ * + * Only try to acquire the lock and simply continue if we fail; + * the lock is held while the thread turning off eviction clears + * the tree's current eviction point, and part of the process is + * waiting on this thread to acknowledge that action. */ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - /* Remember the file to visit first, next loop. */ - cache->evict_file_next = dhandle; - - WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_file(session, &slot)); - WT_ASSERT(session, session->split_gen == 0); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && + !__wt_spin_trylock(session, &cache->evict_walk_lock)) { + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + cache->evict_file_next = dhandle; + WT_WITH_DHANDLE(session, dhandle, + ret = __evict_walk_file(session, &slot)); + WT_ASSERT(session, session->split_gen == 0); + } + __wt_spin_unlock(session, &cache->evict_walk_lock); } - __wt_spin_unlock(session, &cache->evict_walk_lock); - /* * If we didn't find any candidates in the file, skip it next * time. @@ -1209,7 +1272,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) uint64_t pages_walked; uint32_t walk_flags; int internal_pages, restarts; - bool enough, modified, would_split; + bool enough, modified; conn = S2C(session); btree = S2BT(session); @@ -1265,9 +1328,22 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; + /* + * It's possible (but unlikely) to visit a page without a read + * generation, if we race with the read instantiating the page. + * Ignore those pages, but set the page's read generation here + * to ensure a bug doesn't somehow leave a page without a read + * generation. + */ + if (page->read_gen == WT_READGEN_NOTSET) { + __wt_cache_read_gen_new(session, page); + continue; + } + /* Pages we no longer need (clean or dirty), are found money. 
*/ if (__wt_page_is_empty(page) || - F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + page->read_gen == WT_READGEN_OLDEST) goto fast; /* Skip clean pages if appropriate. */ @@ -1280,25 +1356,17 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) * eviction, skip anything that isn't marked. */ if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && - page->memory_footprint < btree->splitmempage && - page->read_gen != WT_READGEN_OLDEST) + page->memory_footprint < btree->splitmempage) continue; /* Limit internal pages to 50% unless we get aggressive. */ if (WT_PAGE_IS_INTERNAL(page) && - ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && - !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE) && + internal_pages >= (int)(evict - start) / 2) continue; - /* - * If this page has never been considered for eviction, set its - * read generation to somewhere in the middle of the LRU list. - */ - if (page->read_gen == WT_READGEN_NOTSET) - page->read_gen = __wt_cache_read_gen_new(session); - fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, ref, &would_split)) + if (!__wt_page_can_evict(session, ref, NULL)) continue; /* @@ -1332,6 +1400,9 @@ fast: /* If the page can't be evicted, give up. */ __evict_init_candidate(session, evict, ref); ++evict; + if (WT_PAGE_IS_INTERNAL(page)) + ++internal_pages; + WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" PRIu64, page, page->memory_footprint)); } @@ -1392,8 +1463,9 @@ __evict_get_ref( } /* - * The eviction server only tries to evict half of the pages before - * looking for more. + * Only evict half of the pages before looking for more. The remainder + * are left to eviction workers (if configured), or application threads + * if necessary. 
*/ candidates = cache->evict_candidates; if (is_server && candidates > 1) @@ -1452,7 +1524,6 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *page; WT_REF *ref; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); @@ -1481,9 +1552,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) * the page and some other thread may have evicted it by the time we * look at it. */ - page = ref->page; - if (page->read_gen != WT_READGEN_OLDEST) - page->read_gen = __wt_cache_read_gen_bump(session); + __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false)); diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 72c07eaa05d..f0d4752cc83 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -471,8 +471,7 @@ __evict_review( LF_SET(WT_EVICT_IN_MEMORY | WT_EVICT_UPDATE_RESTORE); else if (page->read_gen == WT_READGEN_OLDEST) LF_SET(WT_EVICT_UPDATE_RESTORE); - else if (F_ISSET(session, WT_SESSION_INTERNAL) && - F_ISSET(S2C(session)->cache, WT_CACHE_STUCK)) + else if (F_ISSET(S2C(session)->cache, WT_CACHE_STUCK)) LF_SET(WT_EVICT_LOOKASIDE); } diff --git a/src/include/btmem.h b/src/include/btmem.h index ee495c52fc8..7cdf2bef43a 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -598,9 +598,14 @@ struct __wt_page { * read generation is incremented by the eviction server each time it * becomes active. To avoid incrementing a page's read generation too * frequently, it is set to a future point. + * + * Because low read generation values have special meaning, and there + * are places where we manipulate the value, use an initial value well + * outside of the special range. 
*/ #define WT_READGEN_NOTSET 0 #define WT_READGEN_OLDEST 1 +#define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; diff --git a/src/include/btree.h b/src/include/btree.h index 703de0f2fc6..fd921677751 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -129,10 +129,11 @@ struct __wt_btree { uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ - WT_REF *evict_ref; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages */ - u_int evict_walk_period; /* Skip this many LRU walks */ - u_int evict_walk_skips; /* Number of walks skipped */ + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_skips; /* Number of walks skipped */ + u_int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ enum { diff --git a/src/include/btree.i b/src/include/btree.i index b4b4d7f25a2..6df7f87073f 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1149,7 +1149,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) * parent frees the backing blocks for any no-longer-used overflow keys, * which will corrupt the checkpoint's block management. */ - if (btree->checkpointing && + if (btree->checkpointing != WT_CKPT_OFF && F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); diff --git a/src/include/cache.h b/src/include/cache.h index a3961d6043e..9184a2fe6ed 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -75,9 +75,9 @@ struct __wt_cache { /* * Read information. 
*/ - uint64_t read_gen; /* Page read generation (LRU) */ - uint64_t read_gen_oldest; /* The oldest read generation that - eviction knows about */ + uint64_t read_gen; /* Current page read generation */ + uint64_t read_gen_oldest; /* Oldest read generation the eviction + * server saw in its last queue load */ /* * Eviction thread information. diff --git a/src/include/cache.i b/src/include/cache.i index ee13eee84c5..8cf7555e716 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -28,34 +28,43 @@ __wt_cache_read_gen_incr(WT_SESSION_IMPL *session) /* * __wt_cache_read_gen_bump -- - * Get the read generation to keep a page in memory. + * Update the page's read generation. */ -static inline uint64_t -__wt_cache_read_gen_bump(WT_SESSION_IMPL *session) +static inline void +__wt_cache_read_gen_bump(WT_SESSION_IMPL *session, WT_PAGE *page) { + /* Ignore pages set for forcible eviction. */ + if (page->read_gen == WT_READGEN_OLDEST) + return; + + /* Ignore pages already in the future. */ + if (page->read_gen > __wt_cache_read_gen(session)) + return; + /* - * We return read-generations from the future (where "the future" is - * measured by increments of the global read generation). The reason - * is because when acquiring a new hazard pointer for a page, we can - * check its read generation, and if the read generation isn't less - * than the current global generation, we don't bother updating the - * page. In other words, the goal is to avoid some number of updates - * immediately after each update we have to make. + * We set read-generations in the future (where "the future" is measured + * by increments of the global read generation). The reason is because + * when acquiring a new hazard pointer for a page, we can check its read + * generation, and if the read generation isn't less than the current + * global generation, we don't bother updating the page. 
In other + * words, the goal is to avoid some number of updates immediately after + * each update we have to make. */ - return (__wt_cache_read_gen(session) + WT_READGEN_STEP); + page->read_gen = __wt_cache_read_gen(session) + WT_READGEN_STEP; } /* * __wt_cache_read_gen_new -- * Get the read generation for a new page in memory. */ -static inline uint64_t -__wt_cache_read_gen_new(WT_SESSION_IMPL *session) +static inline void +__wt_cache_read_gen_new(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; cache = S2C(session)->cache; - return (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2; + page->read_gen = + (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2; } /* @@ -119,12 +128,11 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (0); /* - * LSM sets the no-eviction flag when holding the LSM tree lock, - * in that case, or when holding the schema lock, we don't want to - * highjack the thread for eviction. + * LSM sets the no-eviction flag when holding the LSM tree lock, in that + * case, or when holding the schema lock, we don't want to highjack the + * thread for eviction. */ - if (F_ISSET(session, - WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) + if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); @@ -224,11 +232,11 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) return (0); /* - * Threads operating on trees that cannot be evicted are ignored, - * mostly because they're not contributing to the problem. + * Threads operating on cache-resident trees are ignored because they're + * not contributing to the problem. */ btree = S2BT_SAFE(session); - if (btree != NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree != NULL && F_ISSET(btree, WT_BTREE_IN_MEMORY)) return (0); /* Check if eviction is needed. 
*/ diff --git a/src/include/column.i b/src/include/column.i index 9f3e2101f6f..d64e68420a5 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -11,13 +11,13 @@ * Search a column-store insert list for the next larger record. */ static inline WT_INSERT * -__col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. */ - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path check for targets past the end of the skiplist. */ @@ -29,7 +29,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * go as far as possible at each level before stepping down to the next. */ ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno >= WT_INSERT_RECNO(*insp)) { ins = *insp; /* GTE: keep going at this level */ insp = &(*insp)->next[i]; @@ -50,7 +50,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * such a record exists before searching. */ if (ins == NULL) - ins = WT_SKIP_FIRST(inshead); + ins = WT_SKIP_FIRST(ins_head); while (recno >= WT_INSERT_RECNO(ins)) ins = WT_SKIP_NEXT(ins); return (ins); @@ -61,13 +61,13 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for the next smaller record. */ static inline WT_INSERT * -__col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_lt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. */ - if ((ins = WT_SKIP_FIRST(inshead)) == NULL) + if ((ins = WT_SKIP_FIRST(ins_head)) == NULL) return (NULL); /* Fast path check for targets before the skiplist. 
*/ @@ -78,7 +78,7 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno > WT_INSERT_RECNO(*insp)) { ins = *insp; /* GT: keep going at this level */ insp = &(*insp)->next[i]; @@ -95,14 +95,14 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for an exact match. */ static inline WT_INSERT * -__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT **insp, *ret_ins; uint64_t ins_recno; int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path the check for values at the end of the skiplist. */ @@ -115,7 +115,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if (*insp == NULL) { --i; --insp; @@ -143,7 +143,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list, creating a skiplist stack as we go. 
*/ static inline WT_INSERT * -__col_insert_search(WT_INSERT_HEAD *inshead, +__col_insert_search(WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno) { WT_INSERT **insp, *ret_ins; @@ -151,15 +151,15 @@ __col_insert_search(WT_INSERT_HEAD *inshead, int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path appends. */ if (recno >= WT_INSERT_RECNO(ret_ins)) { for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { ins_stack[i] = (i == 0) ? &ret_ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? + &ins_head->tail[i]->next[i] : &ins_head->head[i]; next_stack[i] = NULL; } return (ret_ins); @@ -169,7 +169,7 @@ __col_insert_search(WT_INSERT_HEAD *inshead, * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. 
*/ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if ((ret_ins = *insp) == NULL) { next_stack[i] = NULL; ins_stack[i--] = insp--; diff --git a/src/include/config.h b/src/include/config.h index e63db0e76cf..48a255134af 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -85,13 +85,15 @@ struct __wt_config_parser_impl { #define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33 #define WT_CONFIG_ENTRY_WT_SESSION_verify 34 #define WT_CONFIG_ENTRY_colgroup_meta 35 -#define WT_CONFIG_ENTRY_file_meta 36 -#define WT_CONFIG_ENTRY_index_meta 37 -#define WT_CONFIG_ENTRY_table_meta 38 -#define WT_CONFIG_ENTRY_wiredtiger_open 39 -#define WT_CONFIG_ENTRY_wiredtiger_open_all 40 -#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 41 -#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 42 +#define WT_CONFIG_ENTRY_file_config 36 +#define WT_CONFIG_ENTRY_file_meta 37 +#define WT_CONFIG_ENTRY_index_meta 38 +#define WT_CONFIG_ENTRY_lsm_meta 39 +#define WT_CONFIG_ENTRY_table_meta 40 +#define WT_CONFIG_ENTRY_wiredtiger_open 41 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 42 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 43 +#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 44 /* * configuration section: END * DO NOT EDIT: automatically built by dist/flags.py. 
diff --git a/src/include/connection.h b/src/include/connection.h index 88797e83ad6..2255056fcf6 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -175,6 +175,7 @@ struct __wt_connection_impl { WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ + WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ @@ -298,9 +299,10 @@ struct __wt_connection_impl { #define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ #define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ #define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ -#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */ -#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ -#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ +#define WT_CONN_STAT_JSON 0x08 /* output JSON format */ +#define WT_CONN_STAT_NONE 0x10 /* don't gather statistics */ +#define WT_CONN_STAT_ON_CLOSE 0x20 /* output statistics on close */ +#define WT_CONN_STAT_SIZE 0x40 /* "size" statistics configured */ uint32_t stat_flags; /* Connection statistics */ diff --git a/src/include/cursor.h b/src/include/cursor.h index 7f7b5dceb79..4b35daf106e 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -31,22 +31,22 @@ NULL, /* uri */ \ NULL, /* key_format */ \ NULL, /* value_format */ \ - (int (*)(WT_CURSOR *, ...))(get_key), \ - (int (*)(WT_CURSOR *, ...))(get_value), \ - (void (*)(WT_CURSOR *, ...))(set_key), \ - (void (*)(WT_CURSOR *, ...))(set_value), \ - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \ - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(equals), \ + get_key, \ + get_value, \ + set_key, \ + set_value, \ + compare, \ + equals, \ next, \ prev, \ reset, \ search, \ - 
(int (*)(WT_CURSOR *, int *))(search_near), \ + search_near, \ insert, \ update, \ remove, \ close, \ - (int (*)(WT_CURSOR *, const char *))(reconfigure), \ + reconfigure, \ { NULL, NULL }, /* TAILQ_ENTRY q */ \ 0, /* recno key */ \ { 0 }, /* recno raw buffer */ \ @@ -213,10 +213,11 @@ struct __wt_cursor_btree { #define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor (e.g. on a checkpoint) */ #define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ +#define WT_CBT_VAR_ONPAGE_MATCH 0x40 /* Var-store: on-page recno match */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_SEARCH_SMALLEST) + WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH) uint8_t flags; }; @@ -287,9 +288,12 @@ struct __wt_cursor_join_iter { WT_SESSION_IMPL *session; WT_CURSOR_JOIN *cjoin; WT_CURSOR_JOIN_ENTRY *entry; - WT_CURSOR *cursor; - WT_ITEM *curkey; - bool advance; + WT_CURSOR *cursor; /* has null projection */ + WT_CURSOR *main; /* main table with projection */ + WT_ITEM *curkey; /* primary key */ + WT_ITEM idxkey; + bool positioned; + bool isequal; /* advancing means we're done */ }; struct __wt_cursor_join_endpoint { @@ -302,14 +306,18 @@ struct __wt_cursor_join_endpoint { #define WT_CURJOIN_END_GT 0x04 /* include values > cursor */ #define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ) #define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ) -#define WT_CURJOIN_END_OWN_KEY 0x08 /* must free key's data */ +#define WT_CURJOIN_END_OWN_CURSOR 0x08 /* must close cursor */ uint8_t flags; /* range for this endpoint */ }; +#define WT_CURJOIN_END_RANGE(endp) \ + ((endp)->flags & \ + (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT)) struct __wt_cursor_join_entry { WT_INDEX *index; WT_CURSOR *main; /* raw main table cursor */ WT_BLOOM *bloom; /* Bloom filter handle */ + char *repack_format; /* target format for repack */ uint32_t bloom_bit_count; /* bits 
per item in bloom */ uint32_t bloom_hash_count; /* hash functions in bloom */ uint64_t count; /* approx number of matches */ diff --git a/src/include/extern.h b/src/include/extern.h index 1999ff6b732..48c52d4a109 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -168,7 +168,7 @@ extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); -extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op); +extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok); @@ -190,7 +190,7 @@ extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int s extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); -extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); +extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); @@ -252,9 +252,7 @@ extern int 
__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags); -extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, bool apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]); extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool force); extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); @@ -278,7 +276,6 @@ extern int __wt_sweep_create(WT_SESSION_IMPL *session); extern int __wt_sweep_destroy(WT_SESSION_IMPL *session); extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_backup_file_remove(WT_SESSION_IMPL *session); -extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, bool *skip); extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool bitmap, bool skip_sort_check); extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_curds_open( 
WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp); @@ -300,12 +297,20 @@ extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const c extern ssize_t __wt_json_strlen(const char *src, size_t srclen); extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen); extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_schema_create_final( WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret); extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst); extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp); -extern int __wt_cursor_notsup(WT_CURSOR *cursor); extern int __wt_cursor_noop(WT_CURSOR *cursor); +extern int __wt_cursor_notsup(WT_CURSOR *cursor); +extern int __wt_cursor_get_value_notsup(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_key_notsup(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_value_notsup(WT_CURSOR *cursor, ...); +extern int __wt_cursor_compare_notsup(WT_CURSOR *a, WT_CURSOR *b, int *cmpp); +extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp); +extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact); +extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor); extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...); @@ -337,7 +342,7 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); 
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); -extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp); +extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); @@ -360,7 +365,7 @@ extern int __wt_log_open(WT_SESSION_IMPL *session); extern int __wt_log_close(WT_SESSION_IMPL *session); extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep); extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie); -extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry); +extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work); extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); extern int __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags); @@ -441,7 +446,7 @@ extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args); -extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_meta_apply_all(WT_SESSION_IMPL *session, int 
(*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]); extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt); extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep); extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname); @@ -481,7 +486,9 @@ extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **va extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value); extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); +extern int __wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); +extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp); extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg); @@ -490,6 +497,7 @@ extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, bool fail, void *sym_ret); extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh); extern int __wt_errno(void); +extern int __wt_map_error_rdonly(int error); extern const char *__wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen); extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, bool *existp); extern void 
__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh); @@ -552,8 +560,18 @@ extern int __wt_struct_confchk(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v); extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...); extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...); extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...); -extern int __wt_struct_unpack_size(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, size_t *resultp); -extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp); +extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf); +extern int __wt_ext_pack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp); +extern int __wt_ext_unpack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp); +extern int __wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp); +extern int __wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item); +extern int __wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i); +extern int __wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s); +extern int __wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u); +extern int __wt_ext_unpack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item); +extern int __wt_ext_unpack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t *ip); +extern int __wt_ext_unpack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char **sp); +extern int __wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, 
WT_PACK_STREAM *ps, uint64_t *up); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); @@ -572,7 +590,6 @@ extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); -extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf); @@ -612,6 +629,7 @@ extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const ch extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str); extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len); extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags); +extern int __wt_session_notsup(WT_SESSION *wt_session); extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_session_release_resources(WT_SESSION_IMPL 
*session); @@ -621,8 +639,8 @@ extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const ch extern int __wt_session_range_truncate(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *start, WT_CURSOR *stop); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); -extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); +extern int __wt_session_compact_readonly( WT_SESSION *wt_session, const char *uri, const char *config); extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, bool *is_deadp); extern int __wt_session_release_btree(WT_SESSION_IMPL *session); extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags); @@ -632,6 +650,11 @@ extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *ch extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]); extern uint32_t __wt_cksum(const void *chunk, size_t len); extern void __wt_cksum_init(void); +extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp); +extern int __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); +extern int __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled); +extern int __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress); +extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, 
WT_ITEM *out); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep); @@ -731,7 +754,7 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session); extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len); -extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/include/flags.h b/src/include/flags.h index 24fae4abccd..a6f42a9938f 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -12,13 +12,14 @@ #define WT_CONN_LOG_SERVER_RUN 0x00000080 #define WT_CONN_LSM_MERGE 0x00000100 #define WT_CONN_PANIC 0x00000200 -#define WT_CONN_SERVER_ASYNC 0x00000400 -#define WT_CONN_SERVER_CHECKPOINT 0x00000800 -#define WT_CONN_SERVER_LSM 0x00001000 -#define WT_CONN_SERVER_RUN 0x00002000 -#define WT_CONN_SERVER_STATISTICS 0x00004000 -#define WT_CONN_SERVER_SWEEP 0x00008000 -#define WT_CONN_WAS_BACKUP 0x00010000 +#define WT_CONN_READONLY 0x00000400 +#define WT_CONN_SERVER_ASYNC 0x00000800 +#define WT_CONN_SERVER_CHECKPOINT 0x00001000 +#define WT_CONN_SERVER_LSM 0x00002000 +#define WT_CONN_SERVER_RUN 0x00004000 +#define WT_CONN_SERVER_STATISTICS 0x00008000 +#define WT_CONN_SERVER_SWEEP 0x00010000 +#define WT_CONN_WAS_BACKUP 0x00020000 #define WT_EVICTING 0x00000001 #define WT_EVICT_IN_MEMORY 0x00000002 #define WT_EVICT_LOOKASIDE 0x00000004 @@ -55,20 +56,21 @@ #define 
WT_SESSION_INTERNAL 0x00000004 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 #define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010 -#define WT_SESSION_LOCKED_SCHEMA 0x00000020 -#define WT_SESSION_LOCKED_SLOT 0x00000040 -#define WT_SESSION_LOCKED_TABLE 0x00000080 -#define WT_SESSION_LOCKED_TURTLE 0x00000100 -#define WT_SESSION_LOCK_NO_WAIT 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_SCHEMA 0x00000040 +#define WT_SESSION_LOCKED_SLOT 0x00000080 +#define WT_SESSION_LOCKED_TABLE 0x00000100 +#define WT_SESSION_LOCKED_TURTLE 0x00000200 +#define WT_SESSION_LOCK_NO_WAIT 0x00000400 +#define WT_SESSION_LOGGING_INMEM 0x00000800 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 +#define WT_SESSION_NO_CACHE 0x00002000 +#define WT_SESSION_NO_DATA_HANDLES 0x00004000 +#define WT_SESSION_NO_EVICTION 0x00008000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 +#define WT_SESSION_SERVER_ASYNC 0x00080000 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/include/gcc.h b/src/include/gcc.h index 6ccc0de3c03..ce6afdd6e9c 100644 --- a/src/include/gcc.h +++ b/src/include/gcc.h @@ -6,6 +6,7 @@ * See the file LICENSE for redistribution information. */ +#define WT_PTRDIFFT_FMT "td" /* ptrdiff_t format string */ #define WT_SIZET_FMT "zu" /* size_t format string */ /* Add GCC-specific attributes to types and function declarations. 
*/ diff --git a/src/include/lint.h b/src/include/lint.h index f8b17022968..1b64186cbab 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -6,6 +6,7 @@ * See the file LICENSE for redistribution information. */ +#define WT_PTRDIFFT_FMT "td" /* ptrdiff_t format string */ #define WT_SIZET_FMT "zu" /* size_t format string */ #define WT_COMPILER_TYPE_ALIGN(x) diff --git a/src/include/lsm.h b/src/include/lsm.h index 7cb3ccc895d..444073087df 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -179,7 +179,7 @@ struct __wt_lsm_tree { int collator_owned; uint32_t refcnt; /* Number of users of the tree */ - uint8_t exclusive; /* Tree is locked exclusively */ + WT_SESSION_IMPL *excl_session; /* Session has exclusive lock */ #define LSM_TREE_MAX_QUEUE 100 uint32_t queue_ref; @@ -215,7 +215,7 @@ struct __wt_lsm_tree { size_t chunk_alloc; /* Space allocated for chunks */ uint32_t nchunks; /* Number of active chunks */ uint32_t last; /* Last allocated ID */ - int modified; /* Have there been updates? */ + bool modified; /* Have there been updates? */ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ size_t old_alloc; /* Space allocated for old chunks */ @@ -242,13 +242,18 @@ struct __wt_lsm_tree { int64_t lsm_lookup_no_bloom; int64_t lsm_merge_throttle; -#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ -#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */ -#define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */ -#define WT_LSM_TREE_MERGES 0x08 /* Tree should run merges */ -#define WT_LSM_TREE_NEED_SWITCH 0x10 /* New chunk needs creating */ -#define WT_LSM_TREE_OPEN 0x20 /* The tree is open */ -#define WT_LSM_TREE_THROTTLE 0x40 /* Throttle updates */ + /* + * The tree is open for business. This used to be a flag, but it is + * susceptible to races. 
+ */ + bool active; + +#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x01 /* Timer for merge aggression */ +#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */ +#define WT_LSM_TREE_MERGES 0x04 /* Tree should run merges */ +#define WT_LSM_TREE_NEED_SWITCH 0x08 /* New chunk needs creating */ +#define WT_LSM_TREE_OPEN 0x10 /* The tree is open */ +#define WT_LSM_TREE_THROTTLE 0x20 /* Throttle updates */ uint32_t flags; }; diff --git a/src/include/meta.h b/src/include/meta.h index d61022c0c44..ac0f5fedac4 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -21,6 +21,7 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ +#define WT_METAFILE "WiredTiger.wt" /* Metadata table */ #define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ #define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ diff --git a/src/include/misc.h b/src/include/misc.h index 5dadb1b1484..07d52c61eac 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -198,13 +198,9 @@ /* Check if a string matches a prefix. */ #define WT_PREFIX_MATCH(str, pfx) \ - (((const char *)str)[0] == ((const char *)pfx)[0] && \ + (((const char *)(str))[0] == ((const char *)pfx)[0] && \ strncmp((str), (pfx), strlen(pfx)) == 0) -/* Check if a non-nul-terminated string matches a prefix. */ -#define WT_PREFIX_MATCH_LEN(str, len, pfx) \ - ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx)) - /* Check if a string matches a prefix, and move past it. */ #define WT_PREFIX_SKIP(str, pfx) \ (WT_PREFIX_MATCH(str, pfx) ? 
((str) += strlen(pfx), 1) : 0) diff --git a/src/include/msvc.h b/src/include/msvc.h index 99260a44875..d5be5bd8c60 100644 --- a/src/include/msvc.h +++ b/src/include/msvc.h @@ -13,6 +13,7 @@ #define inline __inline +#define WT_PTRDIFFT_FMT "Id" /* ptrdiff_t format string */ #define WT_SIZET_FMT "Iu" /* size_t format string */ /* diff --git a/src/include/mutex.h b/src/include/mutex.h index f798bfb3ece..04679884930 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -20,6 +20,13 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ + /* + * The following fields are only used for automatically adjusting + * condition variables. They could be in a separate structure. + */ + uint64_t min_wait; /* Minimum wait duration */ + uint64_t max_wait; /* Maximum wait duration */ + uint64_t prev_wait; /* Wait duration used last time */ }; /* diff --git a/src/include/packing.i b/src/include/packing.i index 784a55ef2ae..35b2ddc43db 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -677,8 +677,8 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, if (fmt[0] != '\0' && fmt[1] == '\0') { pv.type = fmt[0]; - if ((ret = __unpack_read(session, &pv, &p, size)) == 0) - WT_UNPACK_PUT(session, pv, ap); + WT_RET(__unpack_read(session, &pv, &p, size)); + WT_UNPACK_PUT(session, pv, ap); return (0); } diff --git a/src/include/schema.h b/src/include/schema.h index a51030870c1..f93c596e2ca 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -133,6 +133,14 @@ struct __wt_table { &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) /* + * WT_WITH_METADATA_LOCK -- + * Acquire the metadata lock, perform an operation, drop the lock. + */ +#define WT_WITH_METADATA_LOCK(session, ret, op) \ + WT_WITH_LOCK(session, ret, \ + &S2C(session)->metadata_lock, WT_SESSION_LOCKED_METADATA, op) + +/* * WT_WITH_SCHEMA_LOCK -- * Acquire the schema lock, perform an operation, drop the lock. 
* Check that we are not already holding some other lock: the schema lock @@ -166,6 +174,8 @@ struct __wt_table { */ #define WT_WITHOUT_LOCKS(session, op) do { \ WT_CONNECTION_IMPL *__conn = S2C(session); \ + bool __checkpoint_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ bool __handle_locked = \ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ bool __table_locked = \ @@ -184,7 +194,15 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ __wt_spin_unlock(session, &__conn->schema_lock); \ } \ + if (__checkpoint_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_CHECKPOINT); \ + __wt_spin_unlock(session, &__conn->checkpoint_lock); \ + } \ op; \ + if (__checkpoint_locked) { \ + __wt_spin_lock(session, &__conn->checkpoint_lock); \ + F_SET(session, WT_SESSION_LOCKED_CHECKPOINT); \ + } \ if (__schema_locked) { \ __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ diff --git a/src/include/session.h b/src/include/session.h index 5c3291230b4..7fdb7fc2548 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -126,14 +126,24 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *block_manager; /* Block-manager support */ int (*block_manager_cleanup)(WT_SESSION_IMPL *); - /* Checkpoint support */ - struct { - WT_DATA_HANDLE *dhandle; - const char *name; - } *ckpt_handle; /* Handle list */ + /* Checkpoint handles */ + WT_DATA_HANDLE **ckpt_handle; /* Handle list */ u_int ckpt_handle_next; /* Next empty slot */ size_t ckpt_handle_allocated; /* Bytes allocated */ + /* + * Operations acting on handles. + * + * The preferred pattern is to gather all of the required handles at + * the beginning of an operation, then drop any other locks, perform + * the operation, then release the handles. This cannot be easily + * merged with the list of checkpoint handles because some operations + * (such as compact) do checkpoints internally. 
+ */ + WT_DATA_HANDLE **op_handle; /* Handle list */ + u_int op_handle_next; /* Next empty slot */ + size_t op_handle_allocated; /* Bytes allocated */ + void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); diff --git a/src/include/stat.h b/src/include/stat.h index 51d2fa332e7..f9170dc1a79 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -227,12 +227,22 @@ __wt_stats_clear(void *stats_arg, int slot) */ #define WT_CONNECTION_STATS_BASE 1000 struct __wt_connection_stats { - int64_t async_alloc_race; - int64_t async_alloc_view; + int64_t lsm_work_queue_app; + int64_t lsm_work_queue_manager; + int64_t lsm_rows_merged; + int64_t lsm_checkpoint_throttle; + int64_t lsm_merge_throttle; + int64_t lsm_work_queue_switch; + int64_t lsm_work_units_discarded; + int64_t lsm_work_units_done; + int64_t lsm_work_units_created; + int64_t lsm_work_queue_max; int64_t async_cur_queue; + int64_t async_max_queue; + int64_t async_alloc_race; int64_t async_flush; + int64_t async_alloc_view; int64_t async_full; - int64_t async_max_queue; int64_t async_nowork; int64_t async_op_alloc; int64_t async_op_compact; @@ -240,55 +250,66 @@ struct __wt_connection_stats { int64_t async_op_remove; int64_t async_op_search; int64_t async_op_update; - int64_t block_byte_map_read; - int64_t block_byte_read; - int64_t block_byte_write; - int64_t block_map_read; int64_t block_preload; int64_t block_read; int64_t block_write; - int64_t cache_bytes_dirty; - int64_t cache_bytes_internal; + int64_t block_byte_read; + int64_t block_byte_write; + int64_t block_map_read; + int64_t block_byte_map_read; int64_t cache_bytes_inuse; - int64_t cache_bytes_leaf; - int64_t cache_bytes_max; - int64_t cache_bytes_overflow; int64_t cache_bytes_read; int64_t cache_bytes_write; - int64_t cache_eviction_app; int64_t cache_eviction_checkpoint; - int64_t cache_eviction_clean; - int64_t cache_eviction_deepen; - int64_t cache_eviction_dirty; - int64_t cache_eviction_fail; - int64_t 
cache_eviction_force; - int64_t cache_eviction_force_delete; - int64_t cache_eviction_force_fail; - int64_t cache_eviction_hazard; - int64_t cache_eviction_internal; - int64_t cache_eviction_maximum_page_size; + int64_t cache_eviction_aggressive_set; int64_t cache_eviction_queue_empty; int64_t cache_eviction_queue_not_empty; int64_t cache_eviction_server_evicting; int64_t cache_eviction_server_not_evicting; int64_t cache_eviction_slow; - int64_t cache_eviction_split_internal; - int64_t cache_eviction_split_leaf; - int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; - int64_t cache_inmem_split; + int64_t cache_eviction_force_fail; + int64_t cache_eviction_hazard; int64_t cache_inmem_splittable; + int64_t cache_inmem_split; + int64_t cache_eviction_internal; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_lookaside_insert; int64_t cache_lookaside_remove; - int64_t cache_overhead; - int64_t cache_pages_dirty; + int64_t cache_bytes_max; + int64_t cache_eviction_maximum_page_size; + int64_t cache_eviction_dirty; + int64_t cache_eviction_deepen; + int64_t cache_write_lookaside; int64_t cache_pages_inuse; + int64_t cache_eviction_force; + int64_t cache_eviction_force_delete; + int64_t cache_eviction_app; int64_t cache_read; int64_t cache_read_lookaside; + int64_t cache_eviction_fail; + int64_t cache_eviction_walk; int64_t cache_write; - int64_t cache_write_lookaside; int64_t cache_write_restore; + int64_t cache_overhead; + int64_t cache_bytes_internal; + int64_t cache_bytes_leaf; + int64_t cache_bytes_overflow; + int64_t cache_bytes_dirty; + int64_t cache_pages_dirty; + int64_t cache_eviction_clean; + int64_t cond_auto_wait_reset; + int64_t cond_auto_wait; + int64_t file_open; + int64_t memory_allocation; + int64_t memory_free; + int64_t memory_grow; int64_t cond_wait; + int64_t rwlock_read; + int64_t rwlock_write; + int64_t read_io; + int64_t write_io; int64_t cursor_create; int64_t cursor_insert; int64_t 
cursor_next; @@ -298,96 +319,81 @@ struct __wt_connection_stats { int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; - int64_t cursor_truncate; int64_t cursor_update; + int64_t cursor_truncate; int64_t dh_conn_handle_count; - int64_t dh_session_handles; - int64_t dh_session_sweeps; - int64_t dh_sweep_close; int64_t dh_sweep_ref; + int64_t dh_sweep_close; int64_t dh_sweep_remove; int64_t dh_sweep_tod; int64_t dh_sweeps; - int64_t file_open; - int64_t log_buffer_size; + int64_t dh_session_handles; + int64_t dh_session_sweeps; + int64_t log_slot_switch_busy; + int64_t log_slot_closes; + int64_t log_slot_races; + int64_t log_slot_transitions; + int64_t log_slot_joins; + int64_t log_slot_unbuffered; int64_t log_bytes_payload; int64_t log_bytes_written; - int64_t log_close_yields; - int64_t log_compress_len; - int64_t log_compress_mem; - int64_t log_compress_small; - int64_t log_compress_write_fails; - int64_t log_compress_writes; + int64_t log_zero_fills; int64_t log_flush; + int64_t log_force_write; + int64_t log_force_write_skip; + int64_t log_compress_writes; + int64_t log_compress_write_fails; + int64_t log_compress_small; + int64_t log_release_write_lsn; + int64_t log_scans; + int64_t log_scan_rereads; + int64_t log_write_lsn; + int64_t log_write_lsn_skip; + int64_t log_sync; + int64_t log_sync_dir; + int64_t log_writes; + int64_t log_slot_consolidated; int64_t log_max_filesize; - int64_t log_prealloc_files; int64_t log_prealloc_max; int64_t log_prealloc_missed; + int64_t log_prealloc_files; int64_t log_prealloc_used; - int64_t log_release_write_lsn; int64_t log_scan_records; - int64_t log_scan_rereads; - int64_t log_scans; - int64_t log_slot_closes; + int64_t log_compress_mem; + int64_t log_buffer_size; + int64_t log_compress_len; int64_t log_slot_coalesced; - int64_t log_slot_consolidated; - int64_t log_slot_joins; - int64_t log_slot_races; - int64_t log_slot_switch_busy; - int64_t log_slot_transitions; - int64_t log_slot_unbuffered; - 
int64_t log_sync; - int64_t log_sync_dir; - int64_t log_write_lsn; - int64_t log_writes; - int64_t log_zero_fills; - int64_t lsm_checkpoint_throttle; - int64_t lsm_merge_throttle; - int64_t lsm_rows_merged; - int64_t lsm_work_queue_app; - int64_t lsm_work_queue_manager; - int64_t lsm_work_queue_max; - int64_t lsm_work_queue_switch; - int64_t lsm_work_units_created; - int64_t lsm_work_units_discarded; - int64_t lsm_work_units_done; - int64_t memory_allocation; - int64_t memory_free; - int64_t memory_grow; - int64_t page_busy_blocked; - int64_t page_forcible_evict_blocked; - int64_t page_locked_blocked; - int64_t page_read_blocked; - int64_t page_sleep; - int64_t read_io; - int64_t rec_page_delete; + int64_t log_close_yields; int64_t rec_page_delete_fast; int64_t rec_pages; int64_t rec_pages_eviction; + int64_t rec_page_delete; int64_t rec_split_stashed_bytes; int64_t rec_split_stashed_objects; - int64_t rwlock_read; - int64_t rwlock_write; int64_t session_cursor_open; int64_t session_open; + int64_t page_busy_blocked; + int64_t page_forcible_evict_blocked; + int64_t page_locked_blocked; + int64_t page_read_blocked; + int64_t page_sleep; + int64_t txn_snapshots_created; + int64_t txn_snapshots_dropped; int64_t txn_begin; - int64_t txn_checkpoint; - int64_t txn_checkpoint_generation; int64_t txn_checkpoint_running; + int64_t txn_checkpoint_generation; int64_t txn_checkpoint_time_max; int64_t txn_checkpoint_time_min; int64_t txn_checkpoint_time_recent; int64_t txn_checkpoint_time_total; - int64_t txn_commit; + int64_t txn_checkpoint; int64_t txn_fail_cache; - int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_range; + int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_snapshot_range; - int64_t txn_rollback; - int64_t txn_snapshots_created; - int64_t txn_snapshots_dropped; int64_t txn_sync; - int64_t write_io; + int64_t txn_commit; + int64_t txn_rollback; }; /* @@ -395,102 +401,102 @@ struct __wt_connection_stats { */ #define WT_DSRC_STATS_BASE 2000 struct 
__wt_dsrc_stats { - int64_t allocation_size; - int64_t block_alloc; - int64_t block_checkpoint_size; - int64_t block_extension; - int64_t block_free; - int64_t block_magic; - int64_t block_major; - int64_t block_minor; - int64_t block_reuse_bytes; - int64_t block_size; - int64_t bloom_count; int64_t bloom_false_positive; int64_t bloom_hit; int64_t bloom_miss; int64_t bloom_page_evict; int64_t bloom_page_read; + int64_t bloom_count; + int64_t lsm_chunk_count; + int64_t lsm_generation_max; + int64_t lsm_lookup_no_bloom; + int64_t lsm_checkpoint_throttle; + int64_t lsm_merge_throttle; int64_t bloom_size; + int64_t block_extension; + int64_t block_alloc; + int64_t block_free; + int64_t block_checkpoint_size; + int64_t allocation_size; + int64_t block_reuse_bytes; + int64_t block_magic; + int64_t block_major; + int64_t block_size; + int64_t block_minor; int64_t btree_checkpoint_generation; - int64_t btree_column_deleted; int64_t btree_column_fix; int64_t btree_column_internal; int64_t btree_column_rle; + int64_t btree_column_deleted; int64_t btree_column_variable; - int64_t btree_compact_rewrite; - int64_t btree_entries; int64_t btree_fixed_len; - int64_t btree_maximum_depth; int64_t btree_maxintlkey; int64_t btree_maxintlpage; int64_t btree_maxleafkey; int64_t btree_maxleafpage; int64_t btree_maxleafvalue; + int64_t btree_maximum_depth; + int64_t btree_entries; int64_t btree_overflow; + int64_t btree_compact_rewrite; int64_t btree_row_internal; int64_t btree_row_leaf; int64_t cache_bytes_read; int64_t cache_bytes_write; int64_t cache_eviction_checkpoint; - int64_t cache_eviction_clean; - int64_t cache_eviction_deepen; - int64_t cache_eviction_dirty; int64_t cache_eviction_fail; int64_t cache_eviction_hazard; + int64_t cache_inmem_splittable; + int64_t cache_inmem_split; int64_t cache_eviction_internal; int64_t cache_eviction_split_internal; int64_t cache_eviction_split_leaf; - int64_t cache_inmem_split; - int64_t cache_inmem_splittable; + int64_t cache_eviction_dirty; 
+ int64_t cache_read_overflow; int64_t cache_overflow_value; + int64_t cache_eviction_deepen; + int64_t cache_write_lookaside; int64_t cache_read; int64_t cache_read_lookaside; - int64_t cache_read_overflow; int64_t cache_write; - int64_t cache_write_lookaside; int64_t cache_write_restore; - int64_t compress_raw_fail; - int64_t compress_raw_fail_temporary; - int64_t compress_raw_ok; + int64_t cache_eviction_clean; int64_t compress_read; int64_t compress_write; int64_t compress_write_fail; int64_t compress_write_too_small; - int64_t cursor_create; - int64_t cursor_insert; + int64_t compress_raw_fail_temporary; + int64_t compress_raw_fail; + int64_t compress_raw_ok; int64_t cursor_insert_bulk; + int64_t cursor_create; int64_t cursor_insert_bytes; + int64_t cursor_remove_bytes; + int64_t cursor_update_bytes; + int64_t cursor_insert; int64_t cursor_next; int64_t cursor_prev; int64_t cursor_remove; - int64_t cursor_remove_bytes; int64_t cursor_reset; int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; int64_t cursor_truncate; int64_t cursor_update; - int64_t cursor_update_bytes; - int64_t lsm_checkpoint_throttle; - int64_t lsm_chunk_count; - int64_t lsm_generation_max; - int64_t lsm_lookup_no_bloom; - int64_t lsm_merge_throttle; int64_t rec_dictionary; + int64_t rec_page_delete_fast; + int64_t rec_suffix_compression; int64_t rec_multiblock_internal; - int64_t rec_multiblock_leaf; - int64_t rec_multiblock_max; int64_t rec_overflow_key_internal; + int64_t rec_prefix_compression; + int64_t rec_multiblock_leaf; int64_t rec_overflow_key_leaf; + int64_t rec_multiblock_max; int64_t rec_overflow_value; - int64_t rec_page_delete; - int64_t rec_page_delete_fast; int64_t rec_page_match; int64_t rec_pages; int64_t rec_pages_eviction; - int64_t rec_prefix_compression; - int64_t rec_suffix_compression; + int64_t rec_page_delete; int64_t session_compact; int64_t session_cursor_open; int64_t txn_update_conflict; diff --git a/src/include/txn.i b/src/include/txn.i 
index 46f2ff3e5f1..40e2a6175d6 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -266,6 +266,8 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) } F_SET(txn, WT_TXN_RUNNING); + if (F_ISSET(S2C(session), WT_CONN_READONLY)) + F_SET(txn, WT_TXN_READONLY); return (false); } diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 767c176b53f..1e263f22880 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -828,7 +828,8 @@ struct __wt_session { * @snippet ex_all.c Display an error thread safe * * @param session the session handle - * @param error a return value from a WiredTiger function + * @param error a return value from a WiredTiger, ISO C, or POSIX + * standard API * @returns a string representation of the error */ const char *__F(strerror)(WT_SESSION *session, int error); @@ -873,7 +874,7 @@ struct __wt_session { * updates). See @ref data_sources for more information. * <br> * @copydoc doc_cursor_types - * @param to_dup a cursor to duplicate + * @param to_dup a cursor to duplicate or gather statistics on * @configstart{WT_SESSION.open_cursor, see dist/api_data.py} * @config{append, append the value as a new record\, creating a new * record number key; valid only for cursors with record number keys., a @@ -1409,7 +1410,7 @@ struct __wt_session { * if <code>NULL</code>, the truncate continues to the end of the * object * @configempty{WT_SESSION.truncate, see dist/api_data.py} - * @ebusy_errors + * @errors */ int __F(truncate)(WT_SESSION *session, const char *name, @@ -1893,8 +1894,10 @@ struct __wt_connection { * information. 
Enabling the statistics log server uses a session from * the configured session_max., a set of related configuration options * defined below.} - * @config{ on_close, log - * statistics on database close., a boolean flag; default \c false.} + * @config{ json, encode + * statistics in JSON format., a boolean flag; default \c false.} + * @config{ on_close, log statistics on database + * close., a boolean flag; default \c false.} * @config{ path, the pathname to a file into * which the log records are written\, may contain ISO C standard * strftime conversion specifications. If the value is not an absolute @@ -1908,7 +1911,8 @@ struct __wt_connection { * empty.} * @config{ timestamp, a timestamp * prepended to each log record\, may contain strftime conversion - * specifications., a string; default \c "%b %d %H:%M:%S".} + * specifications\, when \c json is configured\, defaults to \c + * "%FT%Y.000Z"., a string; default \c "%b %d %H:%M:%S".} * @config{ wait, seconds to wait between each * write of the log records; setting this value above 0 configures * statistics logging., an integer between 0 and 100000; default \c 0.} @@ -1982,7 +1986,8 @@ struct __wt_connection { * * @param connection the connection handle * @param errhandler An error handler. If <code>NULL</code>, the - * connection's error handler is used + * connection's error handler is used. See @ref error_handling_event + * for more information. * @configstart{WT_CONNECTION.open_session, see dist/api_data.py} * @config{isolation, the default isolation level for operations in this * session., a string\, chosen from the following options: \c @@ -2143,7 +2148,8 @@ struct __wt_connection { * @param home The path to the database home directory. See @ref home * for more information. * @param errhandler An error handler. If <code>NULL</code>, a builtin error - * handler is installed that writes error messages to stderr + * handler is installed that writes error messages to stderr. 
See + * @ref error_handling_event for more information. * @configstart{wiredtiger_open, see dist/api_data.py} * @config{async = (, asynchronous operations configuration options., a set of * related configuration options defined below.} @@ -2326,6 +2332,9 @@ struct __wt_connection { * start an RPC server for primary processes and use RPC for secondary * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default * \c false.} + * @config{readonly, open connection in read-only mode. The database must + * exist. All methods that may modify a database are disabled. See @ref + * readonly for more information., a boolean flag; default \c false.} * @config{session_max, maximum expected number of sessions (including server * threads)., an integer greater than or equal to 1; default \c 100.} * @config{shared_cache = (, shared cache configuration options. A database @@ -2363,23 +2372,26 @@ struct __wt_connection { * maintain\, to a file. See @ref statistics for more information. Enabling * the statistics log server uses a session from the configured session_max., a * set of related configuration options defined below.} - * @config{ on_close, log statistics on database close., - * a boolean flag; default \c false.} - * @config{ path, the - * pathname to a file into which the log records are written\, may contain ISO C - * standard strftime conversion specifications. If the value is not an absolute - * path name\, the file is created relative to the database home., a string; - * default \c "WiredTigerStat.%d.%H".} - * @config{ sources, - * if non-empty\, include statistics for the list of data source URIs\, if they - * are open at the time of the statistics logging. 
The list may include URIs + * @config{ json, encode statistics in JSON format., a + * boolean flag; default \c false.} + * @config{ on_close, + * log statistics on database close., a boolean flag; default \c false.} + * @config{ path, the pathname to a file into which the + * log records are written\, may contain ISO C standard strftime conversion + * specifications. If the value is not an absolute path name\, the file is + * created relative to the database home., a string; default \c + * "WiredTigerStat.%d.%H".} + * @config{ sources, if + * non-empty\, include statistics for the list of data source URIs\, if they are + * open at the time of the statistics logging. The list may include URIs * matching a single data source ("table:mytable")\, or a URI matching all data * sources of a particular type ("table:")., a list of strings; default empty.} * @config{ timestamp, a timestamp prepended to each log - * record\, may contain strftime conversion specifications., a string; default - * \c "%b %d %H:%M:%S".} - * @config{ wait, seconds to wait - * between each write of the log records; setting this value above 0 configures + * record\, may contain strftime conversion specifications\, when \c json is + * configured\, defaults to \c "%FT%Y.000Z"., a string; default \c "%b %d + * %H:%M:%S".} + * @config{ wait, seconds to wait between + * each write of the log records; setting this value above 0 configures * statistics logging., an integer between 0 and 100000; default \c 0.} * @config{ * ),,} @@ -2431,11 +2443,12 @@ int wiredtiger_open(const char *home, WT_CONNECTION **connectionp); /*! - * Return information about a WiredTiger error as a string, not thread-safe. + * Return information about a WiredTiger error as a string (see + * WT_SESSION::strerror for a thread-safe API). 
* * @snippet ex_all.c Display an error * - * @param error a return value from a WiredTiger call + * @param error a return value from a WiredTiger, ISO C, or POSIX standard API * @returns a string representation of the error */ const char *wiredtiger_strerror(int error); @@ -2474,7 +2487,7 @@ struct __wt_async_callback { struct __wt_event_handler { /*! * Callback to handle error messages; by default, error messages are - * written to the stderr stream. + * written to the stderr stream. See @ref error_handling. * * Errors that require the application to exit and restart will have * their \c error value set to \c WT_PANIC. The application can exit @@ -2488,8 +2501,9 @@ struct __wt_event_handler { * @param session the WiredTiger session handle in use when the error * was generated. The handle may have been created by the application * or automatically by WiredTiger. - * @param error a WiredTiger, C99 or POSIX error code, which can - * be converted to a string using ::wiredtiger_strerror + * @param error a return value from a WiredTiger, ISO C, or + * POSIX standard API, which can be converted to a string using + * WT_SESSION::strerror * @param message an error string */ int (*handle_error)(WT_EVENT_HANDLER *handler, @@ -2497,7 +2511,7 @@ struct __wt_event_handler { /*! * Callback to handle informational messages; by default, informational - * messages are written to the stdout stream. + * messages are written to the stdout stream. See @ref error_handling. * * Message handler returns are not ignored: if the handler returns * non-zero, the error may cause the WiredTiger function posting the @@ -2513,7 +2527,7 @@ struct __wt_event_handler { /*! * Callback to handle progress messages; by default, no progress - * messages are written. + * messages are written. See @ref error_handling. 
* * Progress handler returns are not ignored: if the handler returns * non-zero, the error may cause the WiredTiger function posting the @@ -2998,6 +3012,10 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); */ #define WT_CACHE_FULL -31807 /*! @endcond */ +/*! @cond internal */ +/*! Permission denied (internal). */ +#define WT_PERM_DENIED -31808 +/*! @endcond */ /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. @@ -3688,329 +3706,341 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * keys. See @ref data_statistics for more information. * @{ */ -/*! async: number of allocation state races */ -#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000 -/*! async: number of operation slots viewed for allocation */ -#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001 +/*! LSM: application work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1000 +/*! LSM: merge work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1001 +/*! LSM: rows merged in an LSM tree */ +#define WT_STAT_CONN_LSM_ROWS_MERGED 1002 +/*! LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1003 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1004 +/*! LSM: switch work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1005 +/*! LSM: tree maintenance operations discarded */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1006 +/*! LSM: tree maintenance operations executed */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1007 +/*! LSM: tree maintenance operations scheduled */ +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1008 +/*! LSM: tree queue hit maximum */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1009 /*! async: current work queue length */ -#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002 +#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1010 +/*! async: maximum work queue length */ +#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1011 +/*! 
async: number of allocation state races */ +#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1012 /*! async: number of flush calls */ -#define WT_STAT_CONN_ASYNC_FLUSH 1003 +#define WT_STAT_CONN_ASYNC_FLUSH 1013 +/*! async: number of operation slots viewed for allocation */ +#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1014 /*! async: number of times operation allocation failed */ -#define WT_STAT_CONN_ASYNC_FULL 1004 -/*! async: maximum work queue length */ -#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005 +#define WT_STAT_CONN_ASYNC_FULL 1015 /*! async: number of times worker found no work */ -#define WT_STAT_CONN_ASYNC_NOWORK 1006 +#define WT_STAT_CONN_ASYNC_NOWORK 1016 /*! async: total allocations */ -#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007 +#define WT_STAT_CONN_ASYNC_OP_ALLOC 1017 /*! async: total compact calls */ -#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008 +#define WT_STAT_CONN_ASYNC_OP_COMPACT 1018 /*! async: total insert calls */ -#define WT_STAT_CONN_ASYNC_OP_INSERT 1009 +#define WT_STAT_CONN_ASYNC_OP_INSERT 1019 /*! async: total remove calls */ -#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010 +#define WT_STAT_CONN_ASYNC_OP_REMOVE 1020 /*! async: total search calls */ -#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011 +#define WT_STAT_CONN_ASYNC_OP_SEARCH 1021 /*! async: total update calls */ -#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012 -/*! block-manager: mapped bytes read */ -#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013 -/*! block-manager: bytes read */ -#define WT_STAT_CONN_BLOCK_BYTE_READ 1014 -/*! block-manager: bytes written */ -#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015 -/*! block-manager: mapped blocks read */ -#define WT_STAT_CONN_BLOCK_MAP_READ 1016 +#define WT_STAT_CONN_ASYNC_OP_UPDATE 1022 /*! block-manager: blocks pre-loaded */ -#define WT_STAT_CONN_BLOCK_PRELOAD 1017 +#define WT_STAT_CONN_BLOCK_PRELOAD 1023 /*! block-manager: blocks read */ -#define WT_STAT_CONN_BLOCK_READ 1018 +#define WT_STAT_CONN_BLOCK_READ 1024 /*! 
block-manager: blocks written */ -#define WT_STAT_CONN_BLOCK_WRITE 1019 -/*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020 -/*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1021 +#define WT_STAT_CONN_BLOCK_WRITE 1025 +/*! block-manager: bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_READ 1026 +/*! block-manager: bytes written */ +#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1027 +/*! block-manager: mapped blocks read */ +#define WT_STAT_CONN_BLOCK_MAP_READ 1028 +/*! block-manager: mapped bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1029 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1022 -/*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1023 -/*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1024 -/*! cache: tracked bytes belonging to overflow pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1025 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1030 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1026 +#define WT_STAT_CONN_CACHE_BYTES_READ 1031 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1027 -/*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1028 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1032 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1029 -/*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1030 -/*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1031 -/*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1032 -/*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1033 -/*! 
cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1034 -/*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1035 -/*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1036 -/*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1037 -/*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1038 -/*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1039 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1033 +/*! cache: eviction currently operating in aggressive mode */ +#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1034 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1040 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1035 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1041 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1036 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1042 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1037 /*! cache: eviction server populating queue, but not evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1038 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044 -/*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045 -/*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046 -/*! 
cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1039 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048 -/*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1040 +/*! cache: failed eviction of pages that exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1041 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1042 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1043 +/*! cache: in-memory page splits */ +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044 +/*! cache: internal pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1045 +/*! cache: internal pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1046 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1047 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1048 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052 -/*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1053 -/*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1049 +/*! cache: maximum bytes configured */ +#define WT_STAT_CONN_CACHE_BYTES_MAX 1050 +/*! cache: maximum page size at eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1051 +/*! cache: modified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1052 +/*! cache: page split during eviction deepened the tree */ +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1053 +/*! 
cache: page written requiring lookaside records */ +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1054 /*! cache: pages currently held in the cache */ #define WT_STAT_CONN_CACHE_PAGES_INUSE 1055 +/*! cache: pages evicted because they exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1056 +/*! cache: pages evicted because they had chains of deleted items */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1057 +/*! cache: pages evicted by application threads */ +#define WT_STAT_CONN_CACHE_EVICTION_APP 1058 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1056 +#define WT_STAT_CONN_CACHE_READ 1059 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1060 +/*! cache: pages selected for eviction unable to be evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1061 +/*! cache: pages walked for eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1062 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1058 -/*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059 +#define WT_STAT_CONN_CACHE_WRITE 1063 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1064 +/*! cache: percentage overhead */ +#define WT_STAT_CONN_CACHE_OVERHEAD 1065 +/*! cache: tracked bytes belonging to internal pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1066 +/*! cache: tracked bytes belonging to leaf pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1067 +/*! cache: tracked bytes belonging to overflow pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1068 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1069 +/*! cache: tracked dirty pages in the cache */ +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1070 +/*! 
cache: unmodified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1071 +/*! connection: auto adjusting condition resets */ +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1072 +/*! connection: auto adjusting condition wait calls */ +#define WT_STAT_CONN_COND_AUTO_WAIT 1073 +/*! connection: files currently open */ +#define WT_STAT_CONN_FILE_OPEN 1074 +/*! connection: memory allocations */ +#define WT_STAT_CONN_MEMORY_ALLOCATION 1075 +/*! connection: memory frees */ +#define WT_STAT_CONN_MEMORY_FREE 1076 +/*! connection: memory re-allocations */ +#define WT_STAT_CONN_MEMORY_GROW 1077 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1061 +#define WT_STAT_CONN_COND_WAIT 1078 +/*! connection: pthread mutex shared lock read-lock calls */ +#define WT_STAT_CONN_RWLOCK_READ 1079 +/*! connection: pthread mutex shared lock write-lock calls */ +#define WT_STAT_CONN_RWLOCK_WRITE 1080 +/*! connection: total read I/Os */ +#define WT_STAT_CONN_READ_IO 1081 +/*! connection: total write I/Os */ +#define WT_STAT_CONN_WRITE_IO 1082 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1062 +#define WT_STAT_CONN_CURSOR_CREATE 1083 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1063 +#define WT_STAT_CONN_CURSOR_INSERT 1084 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1064 +#define WT_STAT_CONN_CURSOR_NEXT 1085 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1065 +#define WT_STAT_CONN_CURSOR_PREV 1086 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1066 +#define WT_STAT_CONN_CURSOR_REMOVE 1087 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1067 +#define WT_STAT_CONN_CURSOR_RESET 1088 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1068 +#define WT_STAT_CONN_CURSOR_RESTART 1089 /*! 
cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1069 +#define WT_STAT_CONN_CURSOR_SEARCH 1090 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070 -/*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1071 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1091 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1072 +#define WT_STAT_CONN_CURSOR_UPDATE 1092 +/*! cursor: truncate calls */ +#define WT_STAT_CONN_CURSOR_TRUNCATE 1093 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1073 -/*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1074 -/*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1075 -/*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1076 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1094 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1077 +#define WT_STAT_CONN_DH_SWEEP_REF 1095 +/*! data-handle: connection sweep dhandles closed */ +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1096 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1078 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1097 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1079 +#define WT_STAT_CONN_DH_SWEEP_TOD 1098 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1080 -/*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1081 -/*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1082 +#define WT_STAT_CONN_DH_SWEEPS 1099 +/*! data-handle: session dhandles swept */ +#define WT_STAT_CONN_DH_SESSION_HANDLES 1100 +/*! data-handle: session sweep attempts */ +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1101 +/*! 
log: busy returns attempting to switch slots */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102 +/*! log: consolidated slot closures */ +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1103 +/*! log: consolidated slot join races */ +#define WT_STAT_CONN_LOG_SLOT_RACES 1104 +/*! log: consolidated slot join transitions */ +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105 +/*! log: consolidated slot joins */ +#define WT_STAT_CONN_LOG_SLOT_JOINS 1106 +/*! log: consolidated slot unbuffered writes */ +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1107 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1083 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1108 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1084 -/*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1085 -/*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1086 -/*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1087 -/*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1088 -/*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1089 -/*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1090 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1109 +/*! log: log files manually zero-filled */ +#define WT_STAT_CONN_LOG_ZERO_FILLS 1110 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1091 +#define WT_STAT_CONN_LOG_FLUSH 1111 +/*! log: log force write operations */ +#define WT_STAT_CONN_LOG_FORCE_WRITE 1112 +/*! log: log force write operations skipped */ +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1113 +/*! log: log records compressed */ +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1114 +/*! log: log records not compressed */ +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1115 +/*! log: log records too small to compress */ +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1116 +/*! 
log: log release advances write LSN */ +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1117 +/*! log: log scan operations */ +#define WT_STAT_CONN_LOG_SCANS 1118 +/*! log: log scan records requiring two reads */ +#define WT_STAT_CONN_LOG_SCAN_REREADS 1119 +/*! log: log server thread advances write LSN */ +#define WT_STAT_CONN_LOG_WRITE_LSN 1120 +/*! log: log server thread write LSN walk skipped */ +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1121 +/*! log: log sync operations */ +#define WT_STAT_CONN_LOG_SYNC 1122 +/*! log: log sync_dir operations */ +#define WT_STAT_CONN_LOG_SYNC_DIR 1123 +/*! log: log write operations */ +#define WT_STAT_CONN_LOG_WRITES 1124 +/*! log: logging bytes consolidated */ +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1125 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1092 -/*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1093 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1126 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1094 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1127 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1095 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1128 +/*! log: pre-allocated log files prepared */ +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1129 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1096 -/*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1097 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1130 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1098 -/*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1099 -/*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1100 -/*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1101 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1131 +/*! 
log: total in-memory size of compressed records */ +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1132 +/*! log: total log buffer size */ +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1133 +/*! log: total size of compressed records */ +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1134 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1102 -/*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1103 -/*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1104 -/*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1105 -/*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1106 -/*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1107 -/*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1108 -/*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1109 -/*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1110 -/*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1111 -/*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1112 -/*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1113 -/*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1114 -/*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1115 -/*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1116 -/*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1117 -/*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1118 -/*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1119 -/*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1120 -/*! 
LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1121 -/*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1122 -/*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1123 -/*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1124 -/*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1125 -/*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1126 -/*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1127 -/*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1128 -/*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1129 -/*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1130 -/*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1131 -/*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1132 -/*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1133 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1135 +/*! log: yields waiting for previous log file close */ +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1136 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1134 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1137 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1135 +#define WT_STAT_CONN_REC_PAGES 1138 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1136 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1139 +/*! reconciliation: pages deleted */ +#define WT_STAT_CONN_REC_PAGE_DELETE 1140 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1137 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1141 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1138 -/*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1139 -/*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1140 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1142 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1141 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1143 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1142 +#define WT_STAT_CONN_SESSION_OPEN 1144 +/*! thread-yield: page acquire busy blocked */ +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1145 +/*! thread-yield: page acquire eviction blocked */ +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1146 +/*! thread-yield: page acquire locked blocked */ +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1147 +/*! thread-yield: page acquire read blocked */ +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1148 +/*! thread-yield: page acquire time sleeping (usecs) */ +#define WT_STAT_CONN_PAGE_SLEEP 1149 +/*! transaction: number of named snapshots created */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1150 +/*! transaction: number of named snapshots dropped */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1151 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1143 -/*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1144 -/*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1145 +#define WT_STAT_CONN_TXN_BEGIN 1152 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1146 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1153 +/*! transaction: transaction checkpoint generation */ +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1154 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1147 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1155 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1148 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1156 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1149 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1157 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1150 -/*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1151 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1158 +/*! transaction: transaction checkpoints */ +#define WT_STAT_CONN_TXN_CHECKPOINT 1159 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1152 -/*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1153 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1160 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1154 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1161 +/*! transaction: transaction range of IDs currently pinned by a checkpoint */ +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1162 /*! transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1155 -/*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1156 -/*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1157 -/*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1158 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1163 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1159 -/*! 
connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1160 +#define WT_STAT_CONN_TXN_SYNC 1164 +/*! transaction: transactions committed */ +#define WT_STAT_CONN_TXN_COMMIT 1165 +/*! transaction: transactions rolled back */ +#define WT_STAT_CONN_TXN_ROLLBACK 1166 /*! * @} @@ -4018,200 +4048,200 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * @anchor statistics_dsrc * @{ */ -/*! block-manager: file allocation unit size */ -#define WT_STAT_DSRC_ALLOCATION_SIZE 2000 -/*! block-manager: blocks allocated */ -#define WT_STAT_DSRC_BLOCK_ALLOC 2001 -/*! block-manager: checkpoint size */ -#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002 -/*! block-manager: allocations requiring file extension */ -#define WT_STAT_DSRC_BLOCK_EXTENSION 2003 -/*! block-manager: blocks freed */ -#define WT_STAT_DSRC_BLOCK_FREE 2004 -/*! block-manager: file magic number */ -#define WT_STAT_DSRC_BLOCK_MAGIC 2005 -/*! block-manager: file major version number */ -#define WT_STAT_DSRC_BLOCK_MAJOR 2006 -/*! block-manager: minor version number */ -#define WT_STAT_DSRC_BLOCK_MINOR 2007 -/*! block-manager: file bytes available for reuse */ -#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008 -/*! block-manager: file size in bytes */ -#define WT_STAT_DSRC_BLOCK_SIZE 2009 -/*! LSM: bloom filters in the LSM tree */ -#define WT_STAT_DSRC_BLOOM_COUNT 2010 /*! LSM: bloom filter false positives */ -#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011 +#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2000 /*! LSM: bloom filter hits */ -#define WT_STAT_DSRC_BLOOM_HIT 2012 +#define WT_STAT_DSRC_BLOOM_HIT 2001 /*! LSM: bloom filter misses */ -#define WT_STAT_DSRC_BLOOM_MISS 2013 +#define WT_STAT_DSRC_BLOOM_MISS 2002 /*! LSM: bloom filter pages evicted from cache */ -#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014 +#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2003 /*! LSM: bloom filter pages read into cache */ -#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015 +#define WT_STAT_DSRC_BLOOM_PAGE_READ 2004 +/*! 
LSM: bloom filters in the LSM tree */ +#define WT_STAT_DSRC_BLOOM_COUNT 2005 +/*! LSM: chunks in the LSM tree */ +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2006 +/*! LSM: highest merge generation in the LSM tree */ +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2007 +/*! LSM: queries that could have benefited from a Bloom filter that did + * not exist */ +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2008 +/*! LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2009 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2010 /*! LSM: total size of bloom filters */ -#define WT_STAT_DSRC_BLOOM_SIZE 2016 +#define WT_STAT_DSRC_BLOOM_SIZE 2011 +/*! block-manager: allocations requiring file extension */ +#define WT_STAT_DSRC_BLOCK_EXTENSION 2012 +/*! block-manager: blocks allocated */ +#define WT_STAT_DSRC_BLOCK_ALLOC 2013 +/*! block-manager: blocks freed */ +#define WT_STAT_DSRC_BLOCK_FREE 2014 +/*! block-manager: checkpoint size */ +#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2015 +/*! block-manager: file allocation unit size */ +#define WT_STAT_DSRC_ALLOCATION_SIZE 2016 +/*! block-manager: file bytes available for reuse */ +#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2017 +/*! block-manager: file magic number */ +#define WT_STAT_DSRC_BLOCK_MAGIC 2018 +/*! block-manager: file major version number */ +#define WT_STAT_DSRC_BLOCK_MAJOR 2019 +/*! block-manager: file size in bytes */ +#define WT_STAT_DSRC_BLOCK_SIZE 2020 +/*! block-manager: minor version number */ +#define WT_STAT_DSRC_BLOCK_MINOR 2021 /*! btree: btree checkpoint generation */ -#define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2017 -/*! btree: column-store variable-size deleted values */ -#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2018 +#define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2022 /*! btree: column-store fixed-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019 +#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2023 /*! 
btree: column-store internal pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020 +#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2024 /*! btree: column-store variable-size RLE encoded values */ -#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021 +#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2025 +/*! btree: column-store variable-size deleted values */ +#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2026 /*! btree: column-store variable-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022 -/*! btree: pages rewritten by compaction */ -#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023 -/*! btree: number of key/value pairs */ -#define WT_STAT_DSRC_BTREE_ENTRIES 2024 +#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2027 /*! btree: fixed-record size */ -#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025 -/*! btree: maximum tree depth */ -#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026 +#define WT_STAT_DSRC_BTREE_FIXED_LEN 2028 /*! btree: maximum internal page key size */ -#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027 +#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2029 /*! btree: maximum internal page size */ -#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028 +#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2030 /*! btree: maximum leaf page key size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029 +#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2031 /*! btree: maximum leaf page size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030 +#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2032 /*! btree: maximum leaf page value size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031 +#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2033 +/*! btree: maximum tree depth */ +#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2034 +/*! btree: number of key/value pairs */ +#define WT_STAT_DSRC_BTREE_ENTRIES 2035 /*! btree: overflow pages */ -#define WT_STAT_DSRC_BTREE_OVERFLOW 2032 +#define WT_STAT_DSRC_BTREE_OVERFLOW 2036 +/*! btree: pages rewritten by compaction */ +#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2037 /*! 
btree: row-store internal pages */ -#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033 +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2038 /*! btree: row-store leaf pages */ -#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034 +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2039 /*! cache: bytes read into cache */ -#define WT_STAT_DSRC_CACHE_BYTES_READ 2035 +#define WT_STAT_DSRC_CACHE_BYTES_READ 2040 /*! cache: bytes written from cache */ -#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036 +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2041 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037 -/*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038 -/*! cache: page split during eviction deepened the tree */ -#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039 -/*! cache: modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040 +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2042 /*! cache: data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2043 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2044 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2045 +/*! cache: in-memory page splits */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 /*! cache: internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2047 /*! cache: internal pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2048 /*! cache: leaf pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045 -/*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 -/*! 
cache: in-memory page passed criteria to be split */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2049 +/*! cache: modified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2050 +/*! cache: overflow pages read into cache */ +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2052 +/*! cache: page split during eviction deepened the tree */ +#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2053 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2054 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2049 +#define WT_STAT_DSRC_CACHE_READ 2055 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050 -/*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2056 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2052 -/*! cache: page written requiring lookaside records */ -#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053 +#define WT_STAT_DSRC_CACHE_WRITE 2057 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054 -/*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055 -/*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056 -/*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057 +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2058 +/*! cache: unmodified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2059 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2058 +#define WT_STAT_DSRC_COMPRESS_READ 2060 /*! 
compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2059 +#define WT_STAT_DSRC_COMPRESS_WRITE 2061 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2062 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061 -/*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2062 -/*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2063 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2063 +/*! compression: raw compression call failed, additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2064 +/*! compression: raw compression call failed, no additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2065 +/*! compression: raw compression call succeeded */ +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2066 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2067 +/*! cursor: create calls */ +#define WT_STAT_DSRC_CURSOR_CREATE 2068 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2069 +/*! cursor: cursor-remove key bytes removed */ +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2070 +/*! cursor: cursor-update value bytes updated */ +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2071 +/*! cursor: insert calls */ +#define WT_STAT_DSRC_CURSOR_INSERT 2072 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2066 +#define WT_STAT_DSRC_CURSOR_NEXT 2073 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2067 +#define WT_STAT_DSRC_CURSOR_PREV 2074 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2068 -/*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069 +#define WT_STAT_DSRC_CURSOR_REMOVE 2075 /*! 
cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2070 +#define WT_STAT_DSRC_CURSOR_RESET 2076 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2071 +#define WT_STAT_DSRC_CURSOR_RESTART 2077 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2072 +#define WT_STAT_DSRC_CURSOR_SEARCH 2078 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2079 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2074 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2080 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2075 -/*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2076 -/*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2077 -/*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2078 -/*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2079 -/*! LSM: queries that could have benefited from a Bloom filter that did - * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2080 -/*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2081 +#define WT_STAT_DSRC_CURSOR_UPDATE 2081 /*! reconciliation: dictionary matches */ #define WT_STAT_DSRC_REC_DICTIONARY 2082 +/*! reconciliation: fast-path pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2083 +/*! reconciliation: internal page key bytes discarded using suffix + * compression */ +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2084 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2083 -/*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2084 -/*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2085 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2085 /*! 
reconciliation: internal-page overflow keys */ #define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086 +/*! reconciliation: leaf page key bytes discarded using prefix compression */ +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2087 +/*! reconciliation: leaf page multi-block writes */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2088 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2087 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2089 +/*! reconciliation: maximum blocks required for a page */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2090 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2088 -/*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2089 -/*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2090 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2091 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2091 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2092 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2092 +#define WT_STAT_DSRC_REC_PAGES 2093 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2093 -/*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2094 -/*! reconciliation: internal page key bytes discarded using suffix - * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2095 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2094 +/*! reconciliation: pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE 2095 /*! session: object compaction */ #define WT_STAT_DSRC_SESSION_COMPACT 2096 /*! 
session: open cursor count */ diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h index 0db876b56f3..7d97d97dcf5 100644 --- a/src/include/wiredtiger_ext.h +++ b/src/include/wiredtiger_ext.h @@ -268,8 +268,9 @@ struct __wt_extension_api { WT_SESSION *session, const char *key, const char *value); /*! - * Pack a structure into a buffer. - * See ::wiredtiger_struct_pack for details. + * Pack a structure into a buffer. Deprecated in favor of stream + * based pack and unpack API. See WT_EXTENSION_API::pack_start for + * details. * * @param wt_api the extension handle * @param session the session handle @@ -282,8 +283,8 @@ struct __wt_extension_api { void *buffer, size_t size, const char *format, ...); /*! - * Calculate the size required to pack a structure. - * See ::wiredtiger_struct_size for details. + * Calculate the size required to pack a structure. Deprecated in + * favor of stream based pack and unpack API. * * @param wt_api the extension handle * @param session the session handle @@ -296,8 +297,9 @@ struct __wt_extension_api { size_t *sizep, const char *format, ...); /*! - * Unpack a structure from a buffer. - * See ::wiredtiger_struct_unpack for details. + * Unpack a structure from a buffer. Deprecated in favor of stream + * based pack and unpack API. See WT_EXTENSION_API::unpack_start for + * details. * * @param wt_api the extension handle * @param session the session handle @@ -309,6 +311,130 @@ struct __wt_extension_api { int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session, const void *buffer, size_t size, const char *format, ...); + /* + * Streaming pack/unpack API. + */ + /*! + * Start a packing operation into a buffer. + * See ::wiredtiger_pack_start for details. 
+ * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory to hold the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ + int (*pack_start)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *format, + void *buffer, size_t size, WT_PACK_STREAM **psp); + + /*! + * Start an unpacking operation from a buffer. + * See ::wiredtiger_unpack_start for details. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory holding the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ + int (*unpack_start)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp); + + /*! + * Close a packing stream. + * + * @param ps the packing stream handle + * @param[out] usedp the number of bytes in the buffer used by the + * stream + * @errors + */ + int (*pack_close)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, size_t *usedp); + + /*! + * Pack an item into a packing stream. + * + * @param ps the packing stream handle + * @param item an item to pack + * @errors + */ + int (*pack_item)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item); + + /*! + * Pack a signed integer into a packing stream. + * + * @param ps the packing stream handle + * @param i a signed integer to pack + * @errors + */ + int (*pack_int)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t i); + + /*! + * Pack a string into a packing stream. + * + * @param ps the packing stream handle + * @param s a string to pack + * @errors + */ + int (*pack_str)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char *s); + + /*! + * Pack an unsigned integer into a packing stream. 
+ * + * @param ps the packing stream handle + * @param u an unsigned integer to pack + * @errors + */ + int (*pack_uint)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, uint64_t u); + + /*! + * Unpack an item from a packing stream. + * + * @param ps the packing stream handle + * @param item an item to unpack + * @errors + */ + int (*unpack_item)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item); + + /*! + * Unpack a signed integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] ip the unpacked signed integer + * @errors + */ + int (*unpack_int)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t *ip); + + /*! + * Unpack a string from a packing stream. + * + * @param ps the packing stream handle + * @param[out] sp the unpacked string + * @errors + */ + int (*unpack_str)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char **sp); + + /*! + * Unpack an unsigned integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] up the unpacked unsigned integer + * @errors + */ + int (*unpack_uint)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, uint64_t *up); + /*! * Return the current transaction ID. 
* diff --git a/src/log/log.c b/src/log/log.c index ce2d7191491..e41073299a8 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -29,7 +29,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - WT_RET(__wt_cond_signal(session, conn->log_cond)); + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); } @@ -46,7 +46,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; - WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_force_write(session, 1, NULL)); WT_RET(__wt_log_wrlsn(session, NULL)); if (start) *lsn = log->write_start_lsn; @@ -118,9 +118,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ if (log->sync_dir_lsn.l.file < min_lsn->l.file) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync directory %s to LSN %d/%lu", - log->log_dir_fh->name, - min_lsn->l.file, min_lsn->l.offset)); + "log_force_sync: sync directory %s to LSN %" PRIu32 + "/%" PRIu32, + log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh)); log->sync_dir_lsn = *min_lsn; WT_STAT_FAST_CONN_INCR(session, log_sync_dir); @@ -130,7 +130,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync %s to LSN %d/%lu", + "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log->log_fh->name, min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_fsync(session, log->log_fh)); log->sync_lsn = *min_lsn; @@ -273,7 +273,7 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session, * These may be files needed by backup. Force the current slot * to get written to the file. 
*/ - WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_force_write(session, 1, NULL)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); /* Filter out any files that are below the checkpoint LSN. */ @@ -697,7 +697,7 @@ __log_openfile(WT_SESSION_IMPL *session, WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build " " only supports major/minor versions up to %d/%d, " - " and the file is version %d/%d", + " and the file is version %" PRIu16 "/%" PRIu16, WT_LOG_MAJOR_VERSION, WT_LOG_MINOR_VERSION, desc->majorv, desc->minorv); } @@ -824,7 +824,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) if (create_log) { WT_STAT_FAST_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - WT_RET(__wt_cond_signal( + WT_RET(__wt_cond_auto_signal( session, conn->log_cond)); } } @@ -1088,28 +1088,36 @@ __wt_log_open(WT_SESSION_IMPL *session) WT_RET(__wt_open(session, conn->log_path, false, false, WT_FILE_TYPE_DIRECTORY, &log->log_dir_fh)); } - /* - * Clean up any old interim pre-allocated files. - * We clean up these files because settings have changed upon reboot - * and we want those settings to take effect right away. - */ - WT_ERR(__log_get_files(session, - WT_LOG_TMPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum)); - } - __wt_log_files_free(session, logfiles, logcount); - logfiles = NULL; - logcount = 0; - WT_ERR(__log_get_files(session, - WT_LOG_PREPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum)); + + if (!F_ISSET(conn, WT_CONN_READONLY)) { + /* + * Clean up any old interim pre-allocated files. 
We clean + * up these files because settings have changed upon reboot + * and we want those settings to take effect right away. + */ + WT_ERR(__log_get_files(session, + WT_LOG_TMPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove( + session, WT_LOG_TMPNAME, lognum)); + } + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; + logcount = 0; + WT_ERR(__log_get_files(session, + WT_LOG_PREPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove( + session, WT_LOG_PREPNAME, lognum)); + } + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; } - __wt_log_files_free(session, logfiles, logcount); - logfiles = NULL; + /* * Now look at the log files and set our LSNs. */ @@ -1121,7 +1129,8 @@ __wt_log_open(WT_SESSION_IMPL *session) } log->fileid = lastlog; WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_open: first log %d last log %d", firstlog, lastlog)); + "log_open: first log %" PRIu32 " last log %" PRIu32, + firstlog, lastlog)); if (firstlog == UINT32_MAX) { WT_ASSERT(session, logcount == 0); WT_INIT_LSN(&log->first_lsn); @@ -1132,9 +1141,11 @@ __wt_log_open(WT_SESSION_IMPL *session) * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ - WT_WITH_SLOT_LOCK(session, log, ret, - ret = __log_newfile(session, true, NULL)); - WT_ERR(ret); + if (!F_ISSET(conn, WT_CONN_READONLY)) { + WT_WITH_SLOT_LOCK(session, log, ret, + ret = __log_newfile(session, true, NULL)); + WT_ERR(ret); + } /* If we found log files, save the new state. 
*/ if (logcount > 0) { @@ -1163,20 +1174,24 @@ __wt_log_close(WT_SESSION_IMPL *session) if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name)); - WT_RET(__wt_fsync(session, log->log_close_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_close_fh)); WT_RET(__wt_close(session, &log->log_close_fh)); } if (log->log_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name)); - WT_RET(__wt_fsync(session, log->log_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_fh)); WT_RET(__wt_close(session, &log->log_fh)); log->log_fh = NULL; } if (log->log_dir_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name)); - WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET( + __wt_directory_sync_fh(session, log->log_dir_fh)); WT_RET(__wt_close(session, &log->log_dir_fh)); log->log_dir_fh = NULL; } @@ -1237,10 +1252,8 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole) } } -err: if (buf != NULL) - __wt_free(session, buf); - if (zerobuf != NULL) - __wt_free(session, zerobuf); +err: __wt_free(session, buf); + __wt_free(session, zerobuf); return (ret); } @@ -1324,7 +1337,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_ERR(__wt_cond_auto_signal(session, conn->log_wrlsn_cond)); if (++yield_count < WT_THOUSAND) __wt_yield(); else @@ -1381,7 +1394,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LOG, - 
"log_release: sync directory %s to LSN %u/%lu", + "log_release: sync directory %s to LSN %" PRIu32 + "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset)); WT_ERR(__wt_directory_sync_fh( @@ -1396,7 +1410,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_release: sync log %s to LSN %u/%lu", + "log_release: sync log %s to LSN %" PRIu32 + "/%" PRIu32, log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset)); WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -1463,7 +1478,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_RET(__wt_verbose(session, WT_VERB_LOG, - "__wt_log_scan truncating to %u/%u", + "__wt_log_scan truncating to %" PRIu32 "/%" PRIu32, log->trunc_lsn.l.file, log->trunc_lsn.l.offset)); if (log != NULL) { @@ -1744,14 +1759,25 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); * Wrapper function that takes the lock. */ int -__wt_log_force_write(WT_SESSION_IMPL *session, bool retry) +__wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { WT_LOG *log; WT_MYSLOT myslot; + uint32_t joined; log = S2C(session)->log; memset(&myslot, 0, sizeof(myslot)); + WT_STAT_FAST_CONN_INCR(session, log_force_write); + if (did_work != NULL) + *did_work = true; myslot.slot = log->active_slot; + joined = WT_LOG_SLOT_JOINED(log->active_slot->slot_state); + if (joined == 0) { + WT_STAT_FAST_CONN_INCR(session, log_force_write_skip); + if (did_work != NULL) + *did_work = false; + return (0); + } return (__wt_log_slot_switch(session, &myslot, retry, true)); } @@ -1984,10 +2010,10 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. 
*/ if (conn->log_cond != NULL) { - WT_ERR(__wt_cond_signal(session, conn->log_cond)); + WT_ERR(__wt_cond_auto_signal(session, conn->log_cond)); __wt_yield(); } else - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); } if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ @@ -2114,7 +2140,7 @@ __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) WT_RET(__wt_log_flush_lsn(session, &lsn, false)); WT_RET(__wt_verbose(session, WT_VERB_LOG, - "log_flush: flags %d LSN %u/%lu", + "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32, flags, lsn.l.file, lsn.l.offset)); /* * If the user wants write-no-sync, there is nothing more to do. diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 2844516e78f..570d1c9ce48 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -253,7 +253,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* * If we didn't find any free slots signal the worker thread. */ - (void)__wt_cond_signal(session, conn->log_wrlsn_cond); + (void)__wt_cond_auto_signal(session, conn->log_wrlsn_cond); __wt_yield(); } /* NOTREACHED */ diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index f76b2bfd9ac..e023b2b407e 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1501,22 +1501,22 @@ __wt_clsm_open(WT_SESSION_IMPL *session, { WT_CONFIG_ITEM cval; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __clsm_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __clsm_next, /* next */ - __clsm_prev, /* prev */ - __clsm_reset, /* reset */ - __clsm_search, /* search */ - __clsm_search_near, /* search-near */ - __clsm_insert, /* insert */ - __clsm_update, /* update */ - __clsm_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - __wt_clsm_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* 
get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __clsm_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __clsm_next, /* next */ + __clsm_prev, /* prev */ + __clsm_reset, /* reset */ + __clsm_search, /* search */ + __clsm_search_near, /* search-near */ + __clsm_insert, /* insert */ + __clsm_update, /* update */ + __clsm_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __wt_clsm_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LSM *clsm; WT_DECL_RET; @@ -1556,7 +1556,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_ERR(ret); /* Make sure we have exclusive access if and only if we want it */ - WT_ASSERT(session, !bulk || lsm_tree->exclusive); + WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL); WT_ERR(__wt_calloc_one(session, &clsm)); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index dac8d987328..943a5894ab3 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -212,6 +212,10 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) conn = S2C(session); manager = &conn->lsm_manager; + if (F_ISSET(conn, WT_CONN_READONLY)) { + manager->lsm_workers = 0; + return (0); + } /* * We need at least a manager, a switch thread and a generic * worker. @@ -284,6 +288,8 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager = &conn->lsm_manager; removed = 0; + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + manager->lsm_workers == 0); if (manager->lsm_workers > 0) { /* * Stop the main LSM manager thread first. @@ -384,7 +390,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) continue; WT_ERR(__wt_epoch(session, &now)); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 
0 : @@ -427,8 +433,10 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); WT_ERR(__wt_verbose(session, WT_VERB_LSM_MANAGER, - "MGR %s: queue %d mod %d nchunks %d" - " flags 0x%x aggressive %d pushms %" PRIu64 + "MGR %s: queue %" PRIu32 " mod %d " + "nchunks %" PRIu32 + " flags %#" PRIx32 " aggressive %" PRIu32 + " pushms %" PRIu64 " fillms %" PRIu64, lsm_tree->name, lsm_tree->queue_ref, lsm_tree->modified, lsm_tree->nchunks, @@ -616,6 +624,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, manager = &S2C(session)->lsm_manager; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); /* * Don't add merges or bloom filter creates if merges * or bloom filters are disabled in the tree. @@ -641,7 +650,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, * is checked. */ (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + if (!lsm_tree->active) { (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1); return (0); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 29325066da7..6d907284546 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -60,10 +60,11 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { struct timespec now; uint64_t msec_since_last_merge, msec_to_create_merge; - u_int new_aggressive; + uint32_t new_aggressive; new_aggressive = 0; + WT_ASSERT(session, lsm_tree->merge_min != 0); /* * If the tree is open read-only or we are compacting, be very * aggressive. 
Otherwise, we can spend a long time waiting for merges @@ -124,8 +125,9 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (new_aggressive > lsm_tree->merge_aggressiveness) { WT_RET(__wt_verbose(session, WT_VERB_LSM, - "LSM merge %s got aggressive (old %u new %u), " - "merge_min %d, %u / %" PRIu64, + "LSM merge %s got aggressive " + "(old %" PRIu32 " new %" PRIu32 "), " + "merge_min %u, %" PRIu64 " / %" PRIu64, lsm_tree->name, lsm_tree->merge_aggressiveness, new_aggressive, lsm_tree->merge_min, msec_since_last_merge, lsm_tree->chunk_fill_ms)); @@ -410,7 +412,8 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) start_chunk, end_chunk, dest_id, record_count, generation)); for (verb = start_chunk; verb <= end_chunk; verb++) WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Merging %s: Chunk[%u] id %u, gen: %" PRIu32 + "Merging %s: Chunk[%u] id %" PRIu32 + ", gen: %" PRIu32 ", size: %" PRIu64 ", records: %" PRIu64, lsm_tree->name, verb, lsm_tree->chunk[verb]->id, lsm_tree->chunk[verb]->generation, @@ -460,7 +463,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) WT_ERR(EINTR); WT_STAT_FAST_CONN_INCRV(session, diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index d76b2a48aa7..e19e2cd0126 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -9,17 +9,17 @@ #include "wt_internal.h" /* - * __wt_lsm_meta_read -- - * Read the metadata for an LSM tree. + * __lsm_meta_read_v0 -- + * Read v0 of LSM metadata. 
*/ -int -__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +static int +__lsm_meta_read_v0( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf) { WT_CONFIG cparser, lparser; WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata; WT_DECL_RET; WT_LSM_CHUNK *chunk; - char *lsmconfig; u_int nchunks; chunk = NULL; /* -Wconditional-uninitialized */ @@ -28,8 +28,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) F_SET(lsm_tree, WT_LSM_TREE_MERGES); - WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig)); - WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); + WT_ERR(__wt_config_init(session, &cparser, lsmconf)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->key_format); @@ -48,7 +47,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * from the file configuration. */ WT_ERR(__wt_config_getones( - session, lsmconfig, "file_config", &fileconf)); + session, lsmconf, "file_config", &fileconf)); WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_subgets( session, &fileconf, "app_metadata", &metadata)); @@ -160,16 +159,292 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ } WT_ERR_NOTFOUND_OK(ret); +err: return (ret); +} + +/* + * __lsm_meta_read_v1 -- + * Read v1 of LSM metadata. 
+ */ +static int +__lsm_meta_read_v1( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf) +{ + WT_CONFIG lparser; + WT_CONFIG_ITEM cv, lk, lv, metadata; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + const char *file_cfg[] = { + WT_CONFIG_BASE(session, file_config), NULL, NULL, NULL }; + char *fileconf; + u_int nchunks; + + chunk = NULL; /* -Wconditional-uninitialized */ + + WT_ERR(__wt_config_getones(session, lsmconf, "key_format", &cv)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format)); + WT_ERR(__wt_config_getones(session, lsmconf, "value_format", &cv)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format)); + + WT_ERR(__wt_config_getones(session, lsmconf, "collator", &cv)); + if (cv.len != 0 && !WT_STRING_MATCH("none", cv.str, cv.len)) { + /* Extract the application-supplied metadata (if any). */ + WT_CLEAR(metadata); + WT_ERR_NOTFOUND_OK(__wt_config_getones( + session, lsmconf, "app_metadata", &metadata)); + WT_ERR(__wt_collator_config(session, lsm_tree->name, + &cv, &metadata, + &lsm_tree->collator, &lsm_tree->collator_owned)); + WT_ERR(__wt_strndup(session, + cv.str, cv.len, &lsm_tree->collator_name)); + } + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.auto_throttle", &cv)); + if (cv.val) + F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); + else + F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom", &cv)); + FLD_SET(lsm_tree->bloom, + (cv.val == 0 ? 
WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_oldest", &cv)); + if (cv.val != 0) + FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); + + if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) + WT_ERR_MSG(session, EINVAL, + "Bloom filters can only be created on newest and oldest " + "chunks if bloom filters are enabled"); + + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.bloom_bit_count", &cv)); + lsm_tree->bloom_bit_count = (uint32_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_config", &cv)); + /* Don't include the brackets. */ + if (cv.type == WT_CONFIG_ITEM_STRUCT) { + cv.str++; + cv.len -= 2; + } + WT_ERR(__wt_config_check(session, + WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->bloom_config)); + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.bloom_hash_count", &cv)); + lsm_tree->bloom_hash_count = (uint32_t)cv.val; + + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.chunk_count_limit", &cv)); + lsm_tree->chunk_count_limit = (uint32_t)cv.val; + if (cv.val == 0) + F_SET(lsm_tree, WT_LSM_TREE_MERGES); + else + F_CLR(lsm_tree, WT_LSM_TREE_MERGES); + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_max", &cv)); + lsm_tree->chunk_max = (uint64_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_size", &cv)); + lsm_tree->chunk_size = (uint64_t)cv.val; + + if (lsm_tree->chunk_size > lsm_tree->chunk_max) + WT_ERR_MSG(session, EINVAL, + "Chunk size (chunk_size) must be smaller than or equal to " + "the maximum chunk size (chunk_max)"); + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_max", &cv)); + lsm_tree->merge_max = (uint32_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_min", &cv)); + lsm_tree->merge_min = (uint32_t)cv.val; + + if (lsm_tree->merge_min > lsm_tree->merge_max) + WT_ERR_MSG(session, EINVAL, 
+ "LSM merge_min must be less than or equal to merge_max"); + + WT_ERR(__wt_config_getones(session, lsmconf, "last", &cv)); + lsm_tree->last = (u_int)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "chunks", &cv)); + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if (WT_STRING_MATCH("id", lk.str, lk.len)) { + WT_ERR(__wt_realloc_def(session, + &lsm_tree->chunk_alloc, + nchunks + 1, &lsm_tree->chunk)); + WT_ERR(__wt_calloc_one(session, &chunk)); + lsm_tree->chunk[nchunks++] = chunk; + chunk->id = (uint32_t)lv.val; + WT_ERR(__wt_lsm_tree_chunk_name(session, + lsm_tree, chunk->id, &chunk->uri)); + F_SET(chunk, + WT_LSM_CHUNK_ONDISK | + WT_LSM_CHUNK_STABLE); + } else if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { + WT_ERR(__wt_lsm_tree_bloom_name( + session, lsm_tree, chunk->id, &chunk->bloom_uri)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + continue; + } else if (WT_STRING_MATCH("chunk_size", lk.str, lk.len)) { + chunk->size = (uint64_t)lv.val; + continue; + } else if (WT_STRING_MATCH("count", lk.str, lk.len)) { + chunk->count = (uint64_t)lv.val; + continue; + } else if (WT_STRING_MATCH("generation", lk.str, lk.len)) { + chunk->generation = (uint32_t)lv.val; + continue; + } + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nchunks = nchunks; + + WT_ERR(__wt_config_getones(session, lsmconf, "old_chunks", &cv)); + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { + WT_ERR(__wt_strndup(session, + lv.str, lv.len, &chunk->bloom_uri)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + continue; + } + WT_ERR(__wt_realloc_def(session, + &lsm_tree->old_alloc, nchunks + 1, + &lsm_tree->old_chunks)); + WT_ERR(__wt_calloc_one(session, &chunk)); + lsm_tree->old_chunks[nchunks++] = chunk; + WT_ERR(__wt_strndup(session, + lk.str, lk.len, &chunk->uri)); + F_SET(chunk, 
WT_LSM_CHUNK_ONDISK); + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nold_chunks = nchunks; + + /* + * Set up the config for each chunk. + * + * Make the memory_page_max double the chunk size, so application + * threads don't immediately try to force evict the chunk when the + * worker thread clears the NO_EVICTION flag. + */ + file_cfg[1] = lsmconf; + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "key_format=u,value_format=u,memory_page_max=%" PRIu64, + 2 * lsm_tree->chunk_max)); + file_cfg[2] = buf->data; + WT_ERR(__wt_config_collapse(session, file_cfg, &fileconf)); + lsm_tree->file_config = fileconf; + + /* + * Ignore any other values: the metadata entry might have been + * created by a future release, with unknown options. + */ +err: __wt_scr_free(session, &buf); + return (ret); +} + +/* + * __lsm_meta_upgrade_v1 -- + * Upgrade to v1 of LSM metadata. + */ +static int +__lsm_meta_upgrade_v1(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + const char *new_cfg[] = { + WT_CONFIG_BASE(session, lsm_meta), NULL, NULL, NULL }; + + /* Include the custom config that used to be embedded in file_config. */ + new_cfg[1] = lsm_tree->file_config; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "key_format=%s,value_format=%s", + lsm_tree->key_format, lsm_tree->value_format)); + + WT_ERR(__wt_buf_catfmt(session, buf, ",collator=%s", + lsm_tree->collator_name != NULL ? 
lsm_tree->collator_name : "")); + + WT_ERR(__wt_buf_catfmt(session, buf, ",lsm=(")); + + WT_ERR(__wt_buf_catfmt(session, buf, "auto_throttle=%d", + F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE))); + + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom=%d", + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED))); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_oldest=%d", + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_bit_count=%" PRIu32, + lsm_tree->bloom_bit_count)); + if (lsm_tree->bloom_config != NULL && + strlen(lsm_tree->bloom_config) > 0) + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config=(%s)", + lsm_tree->bloom_config)); + else + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config=")); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_hash_count=%" PRIu32, + lsm_tree->bloom_hash_count)); + + WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_count_limit=%" PRIu32, + lsm_tree->chunk_count_limit)); + WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_max=%" PRIu64, + lsm_tree->chunk_max)); + WT_ERR(__wt_buf_catfmt(session, buf, ",merge_max=%" PRIu32, + lsm_tree->merge_max)); + WT_ERR(__wt_buf_catfmt(session, buf, ",merge_min=%" PRIu32, + lsm_tree->merge_min)); + + WT_ERR(__wt_buf_catfmt(session, buf, ")")); + + new_cfg[2] = buf->data; + WT_ERR(__wt_config_merge(session, new_cfg, NULL, &lsm_tree->config)); + +err: __wt_scr_free(session, &buf); + return (ret); +} +/* + * __wt_lsm_meta_read -- + * Read the metadata for an LSM tree. + */ +int +__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + char *lsmconf; + bool upgrade; + + /* LSM trees inherit the merge setting from the connection. 
*/ + if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) + F_SET(lsm_tree, WT_LSM_TREE_MERGES); + + WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconf)); + upgrade = false; + ret = __wt_config_getones(session, lsmconf, "file_config", &cval); + if (ret == 0) { + ret = __lsm_meta_read_v0(session, lsm_tree, lsmconf); + __wt_free(session, lsmconf); + WT_RET(ret); + upgrade = true; + } else if (ret == WT_NOTFOUND) { + lsm_tree->config = lsmconf; + ret = 0; + WT_RET(__lsm_meta_read_v1(session, lsm_tree, lsmconf)); + } /* - * If the default merge_min was not overridden, calculate it now. We - * do this here so that trees created before merge_min was added get a - * sane value. + * If the default merge_min was not overridden, calculate it now. */ if (lsm_tree->merge_min < 2) lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2); - -err: __wt_free(session, lsmconfig); + /* + * If needed, upgrade the configuration. We need to do this after + * we have fixed the merge_min value. + */ + if (upgrade) + WT_RET(__lsm_meta_upgrade_v1(session, lsm_tree)); return (ret); } @@ -184,32 +459,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_DECL_RET; WT_LSM_CHUNK *chunk; u_int i; + const char *new_cfg[] = { NULL, NULL, NULL }; + char *new_metadata; bool first; + new_metadata = NULL; + WT_RET(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_buf_fmt(session, buf, - "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)", - lsm_tree->key_format, lsm_tree->value_format, - lsm_tree->bloom_config, lsm_tree->file_config)); - if (lsm_tree->collator_name != NULL) - WT_ERR(__wt_buf_catfmt( - session, buf, ",collator=%s", lsm_tree->collator_name)); WT_ERR(__wt_buf_catfmt(session, buf, - ",last=%" PRIu32 - ",chunk_count_limit=%" PRIu32 - ",chunk_max=%" PRIu64 - ",chunk_size=%" PRIu64 - ",auto_throttle=%" PRIu32 - ",merge_max=%" PRIu32 - ",merge_min=%" PRIu32 - ",bloom=%" PRIu32 - ",bloom_bit_count=%" PRIu32 - ",bloom_hash_count=%" PRIu32, - 
lsm_tree->last, lsm_tree->chunk_count_limit, - lsm_tree->chunk_max, lsm_tree->chunk_size, - F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0, - lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom, - lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count)); + ",last=%" PRIu32, lsm_tree->last)); WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=[")); for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; @@ -243,9 +501,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) session, buf, ",bloom=\"%s\"", chunk->bloom_uri)); } WT_ERR(__wt_buf_catfmt(session, buf, "]")); - ret = __wt_metadata_update(session, lsm_tree->name, buf->data); + + /* Update the existing configuration with the new values. */ + new_cfg[0] = lsm_tree->config; + new_cfg[1] = buf->data; + WT_ERR(__wt_config_collapse(session, new_cfg, &new_metadata)); + ret = __wt_metadata_update(session, lsm_tree->name, new_metadata); WT_ERR(ret); err: __wt_scr_free(session, &buf); + __wt_free(session, new_metadata); return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index ab18e41a2f5..cb1ddf22f84 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -27,6 +27,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) WT_UNUSED(final); /* Only used in diagnostic builds */ + WT_ASSERT(session, !lsm_tree->active); /* * The work unit queue should be empty, but it's worth checking * since work units use a different locking scheme to regular tree @@ -85,19 +86,27 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) * Close an LSM tree structure. */ static int -__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) { WT_DECL_RET; int i; - /* Stop any active merges. */ - F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE); + /* + * Stop any new work units being added. 
The barrier is necessary + * because we rely on the state change being visible before checking + * the tree queue state. + */ + lsm_tree->active = false; + WT_READ_BARRIER(); /* - * Wait for all LSM operations and work units that were in flight to - * finish. + * Wait for all LSM operations to drain. If WiredTiger is shutting + * down also wait for the tree reference count to go to zero, otherwise + * we know a user is holding a reference to the tree, so exclusive + * access is not available. */ - for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) { + for (i = 0; + lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1); ++i) { /* * Remove any work units from the manager queues. Do this step * repeatedly in case a work unit was in the process of being @@ -114,11 +123,14 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (i % WT_THOUSAND == 0) { WT_WITHOUT_LOCKS(session, ret = __wt_lsm_manager_clear_tree(session, lsm_tree)); - WT_RET(ret); + WT_ERR(ret); } __wt_yield(); } return (0); + +err: lsm_tree->active = true; + return (ret); } /* @@ -142,7 +154,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session) * is unconditional. 
*/ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); - WT_TRET(__lsm_tree_close(session, lsm_tree)); + WT_TRET(__lsm_tree_close(session, lsm_tree, true)); WT_TRET(__lsm_tree_discard(session, lsm_tree, true)); } @@ -157,9 +169,12 @@ static int __lsm_tree_set_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *uri) { - if (lsm_tree->name != NULL) - __wt_free(session, lsm_tree->name); - WT_RET(__wt_strdup(session, uri, &lsm_tree->name)); + void *p; + + WT_RET(__wt_strdup(session, uri, &p)); + + __wt_free(session, lsm_tree->name); + lsm_tree->name = p; lsm_tree->filename = lsm_tree->name + strlen("lsm:"); return (0); } @@ -306,15 +321,15 @@ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config) { - WT_CONFIG_ITEM cval; - WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_create), config, NULL }; - char *tmpconfig; + { WT_CONFIG_BASE(session, lsm_meta), config, NULL }; + const char *metadata; - /* If the tree is open, it already exists. */ + metadata = NULL; + + /* If the tree can be opened, it already exists. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); if (ret == 0) { @@ -323,139 +338,22 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, } WT_RET_NOTFOUND_OK(ret); - /* - * If the tree has metadata, it already exists. - * - * !!! - * Use a local variable: we don't care what the existing configuration - * is, but we don't want to overwrite the real config. - */ - if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { - __wt_free(session, tmpconfig); - return (exclusive ? EEXIST : 0); + if (!F_ISSET(S2C(session), WT_CONN_READONLY)) { + WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata)); + WT_ERR(__wt_metadata_insert(session, uri, metadata)); } - WT_RET_NOTFOUND_OK(ret); - - /* In-memory configurations don't make sense for LSM. 
*/ - if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - WT_RET_MSG(session, EINVAL, - "LSM trees not supported by in-memory configurations"); - - WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); - if (WT_STRING_MATCH("r", cval.str, cval.len)) - WT_RET_MSG(session, EINVAL, - "LSM trees cannot be configured as column stores"); - - WT_RET(__wt_calloc_one(session, &lsm_tree)); - - WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); - - WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->key_format)); - WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->value_format)); - - WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->collator_name)); - - WT_ERR(__wt_config_gets(session, cfg, "cache_resident", &cval)); - if (cval.val != 0) - WT_ERR_MSG(session, EINVAL, - "The cache_resident flag is not compatible with LSM"); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); - if (cval.val) - F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); - else - F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); - FLD_SET(lsm_tree->bloom, - (cval.val == 0 ? 
WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); - if (cval.val != 0) - FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); - - if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && - FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) - WT_ERR_MSG(session, EINVAL, - "Bloom filters can only be created on newest and oldest " - "chunks if bloom filters are enabled"); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); - if (cval.type == WT_CONFIG_ITEM_STRUCT) { - cval.str++; - cval.len -= 2; - } - WT_ERR(__wt_config_check(session, - WT_CONFIG_REF(session, WT_SESSION_create), cval.str, cval.len)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->bloom_config)); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); - lsm_tree->bloom_bit_count = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); - lsm_tree->bloom_hash_count = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_count_limit", &cval)); - lsm_tree->chunk_count_limit = (uint32_t)cval.val; - if (cval.val == 0) - F_SET(lsm_tree, WT_LSM_TREE_MERGES); - else - F_CLR(lsm_tree, WT_LSM_TREE_MERGES); - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); - lsm_tree->chunk_max = (uint64_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); - lsm_tree->chunk_size = (uint64_t)cval.val; - if (lsm_tree->chunk_size > lsm_tree->chunk_max) - WT_ERR_MSG(session, EINVAL, - "Chunk size (chunk_size) must be smaller than or equal to " - "the maximum chunk size (chunk_max)"); - WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); - lsm_tree->merge_max = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval)); - lsm_tree->merge_min = (uint32_t)cval.val; - if (lsm_tree->merge_min > lsm_tree->merge_max) - WT_ERR_MSG(session, EINVAL, - "LSM merge_min must be less than or equal to merge_max"); - 
- /* - * Set up the config for each chunk. - * - * Make the memory_page_max double the chunk size, so application - * threads don't immediately try to force evict the chunk when the - * worker thread clears the NO_EVICTION flag. - */ - WT_ERR(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_buf_fmt(session, buf, - "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64, - config, 2 * lsm_tree->chunk_max)); - WT_ERR(__wt_strndup( - session, buf->data, buf->size, &lsm_tree->file_config)); - - /* Create the first chunk and flush the metadata. */ - WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); - - /* Discard our partially populated handle. */ - ret = __lsm_tree_discard(session, lsm_tree, false); - lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ - if (ret == 0) - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __lsm_tree_open(session, uri, true, &lsm_tree)); + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); - if (0) { -err: WT_TRET(__lsm_tree_discard(session, lsm_tree, false)); - } - __wt_scr_free(session, &buf); +err: __wt_free(session, metadata); return (ret); } @@ -477,27 +375,26 @@ __lsm_tree_find(WT_SESSION_IMPL *session, /* See if the tree is already open. */ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) if (strcmp(uri, lsm_tree->name) == 0) { - /* - * Short circuit if the handle is already held - * exclusively or exclusive access is requested and - * there are references held. - */ - if ((exclusive && lsm_tree->refcnt > 0) || - lsm_tree->exclusive) - return (EBUSY); - if (exclusive) { /* * Make sure we win the race to switch on the * exclusive flag. 
*/ - if (!__wt_atomic_cas8( - &lsm_tree->exclusive, 0, 1)) + if (!__wt_atomic_cas_ptr( + &lsm_tree->excl_session, NULL, session)) return (EBUSY); - /* Make sure there are no readers */ - if (!__wt_atomic_cas32( - &lsm_tree->refcnt, 0, 1)) { - lsm_tree->exclusive = 0; + + /* + * Drain the work queue before checking for + * open cursors - otherwise we can generate + * spurious busy returns. + */ + (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); + if (__lsm_tree_close( + session, lsm_tree, false) != 0 || + lsm_tree->refcnt != 1) { + __wt_lsm_tree_release( + session, lsm_tree); return (EBUSY); } } else { @@ -507,11 +404,11 @@ __lsm_tree_find(WT_SESSION_IMPL *session, * We got a reference, check if an exclusive * lock beat us to it. */ - if (lsm_tree->exclusive) { + if (lsm_tree->excl_session != NULL) { WT_ASSERT(session, lsm_tree->refcnt > 0); - (void)__wt_atomic_sub32( - &lsm_tree->refcnt, 1); + __wt_lsm_tree_release( + session, lsm_tree); return (EBUSY); } } @@ -603,7 +500,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, * with getting handles exclusive. */ lsm_tree->refcnt = 1; - lsm_tree->exclusive = exclusive ? 1 : 0; + lsm_tree->excl_session = exclusive ? session : NULL; lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ @@ -611,7 +508,9 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Now the tree is setup, make it visible to others. */ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); - F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN); + if (!exclusive) + lsm_tree->active = true; + F_SET(lsm_tree, WT_LSM_TREE_OPEN); *treep = lsm_tree; @@ -638,7 +537,7 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, ret = __lsm_tree_open(session, uri, exclusive, treep); WT_ASSERT(session, ret != 0 || - (exclusive ? 1 : 0) == (*treep)->exclusive); + (*treep)->excl_session == (exclusive ? 
session : NULL)); return (ret); } @@ -650,8 +549,11 @@ void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_ASSERT(session, lsm_tree->refcnt > 0); - if (lsm_tree->exclusive) - lsm_tree->exclusive = 0; + if (lsm_tree->excl_session == session) { + /* We cleared the active flag when getting exclusive access. */ + lsm_tree->active = true; + lsm_tree->excl_session = NULL; + } (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1); } @@ -868,7 +770,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH); ++lsm_tree->dsk_gen; - lsm_tree->modified = 1; + lsm_tree->modified = true; /* * Set the switch transaction in the previous chunk unless this is @@ -964,9 +866,7 @@ __wt_lsm_tree_drop( WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); - - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); + WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); @@ -995,6 +895,7 @@ __wt_lsm_tree_drop( WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); ret = __wt_metadata_remove(session, name); + WT_ASSERT(session, !lsm_tree->active); err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); WT_WITH_HANDLE_LIST_LOCK(session, @@ -1027,9 +928,6 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); WT_RET(ret); - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); - /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; @@ -1067,8 +965,8 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); - if (old != NULL) - __wt_free(session, old); + __wt_free(session, old); + /* * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. 
@@ -1102,9 +1000,6 @@ __wt_lsm_tree_truncate( ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); - /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; @@ -1308,8 +1203,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Compact force flush %s flags 0x%" PRIx32 - " chunk %u flags 0x%" - PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags)); + " chunk %" PRIu32 " flags 0x%" PRIx32, + name, lsm_tree->flags, chunk->id, chunk->flags)); flushing = true; /* * Make sure the in-memory chunk gets flushed do not push a @@ -1331,7 +1226,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) } /* Wait for the work unit queues to drain. */ - while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + while (lsm_tree->active) { /* * The flush flag is cleared when the chunk has been flushed. * Continue to push forced flushes until the chunk is on disk. @@ -1342,7 +1237,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Compact flush done %s chunk %u. " + "Compact flush done %s chunk %" PRIu32 ". 
" "Start compacting progress %" PRIu64, name, chunk->id, lsm_tree->merge_progressing)); @@ -1353,7 +1248,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) progress = lsm_tree->merge_progressing; } else { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Compact flush retry %s chunk %u", + "Compact flush retry %s chunk %" PRIu32, name, chunk->id)); WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, @@ -1413,7 +1308,6 @@ err: __wt_lsm_tree_release(session, lsm_tree); return (ret); - } /* @@ -1455,8 +1349,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, continue; WT_ERR(__wt_schema_worker(session, chunk->uri, file_func, name_func, cfg, open_flags)); - if (name_func == __wt_backup_list_uri_append && - F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_schema_worker(session, chunk->bloom_uri, file_func, name_func, cfg, open_flags)); } diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index d5d81df6785..87771e2cb6c 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -29,7 +29,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session, cookie->nchunks = 0; WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) return (__wt_lsm_tree_readunlock(session, lsm_tree)); /* Take a copy of the current state of the LSM tree. */ @@ -72,14 +72,14 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk; - u_int i; + uint32_t i; *chunkp = NULL; chunk = evict_chunk = flush_chunk = NULL; WT_ASSERT(session, lsm_tree->queue_ref > 0); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0) + if (!lsm_tree->active || lsm_tree->nchunks == 0) return (__wt_lsm_tree_readunlock(session, lsm_tree)); /* Search for a chunk to evict and/or a chunk to flush. 
*/ @@ -118,7 +118,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Flush%s: return chunk %u of %u: %s", + "Flush%s: return chunk %" PRIu32 " of %" PRIu32 ": %s", force ? " w/ force" : "", i, lsm_tree->nchunks, chunk->uri)); @@ -322,7 +322,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; - ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); + ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } @@ -334,11 +334,17 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. + * + * Ensure that we don't race with a running checkpoint: the checkpoint + * lock protects against us racing with an application checkpoint in + * this chunk. Don't wait for it, though: checkpoints can take a long + * time, and our checkpoint operation should be very quick. 
*/ WT_ERR(__wt_meta_track_on(session)); - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker( + session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index 7562cb1cae3..0874da8db13 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -20,7 +20,7 @@ int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) { WT_RET(__wt_verbose(session, WT_VERB_LSM_MANAGER, - "Start LSM worker %d type 0x%x", args->id, args->type)); + "Start LSM worker %u type %#" PRIx32, args->id, args->type)); return (__wt_thread_create(session, &args->tid, __lsm_worker, args)); } @@ -59,9 +59,8 @@ __lsm_worker_general_op( */ if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Flush%s chunk %d %s", - force ? " w/ force" : "", - chunk->id, chunk->uri)); + "Flush%s chunk %" PRIu32 " %s", + force ? " w/ force" : "", chunk->id, chunk->uri)); ret = __wt_lsm_checkpoint_chunk( session, entry->lsm_tree, chunk); WT_ASSERT(session, chunk->refcnt > 0); @@ -140,7 +139,7 @@ __lsm_worker(void *arg) if (ret == WT_NOTFOUND) { F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING); ret = 0; - } else if (ret == EBUSY) + } else if (ret == EBUSY || ret == EINTR) ret = 0; /* Paranoia: clear session state. 
*/ @@ -164,7 +163,7 @@ __lsm_worker(void *arg) if (ret != 0) { err: __wt_lsm_manager_free_work_unit(session, entry); WT_PANIC_MSG(session, ret, - "Error in LSM worker thread %d", cookie->id); + "Error in LSM worker thread %u", cookie->id); } return (WT_THREAD_RET_VALUE); } diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index 92766213b33..fb483c21dd9 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -15,39 +15,41 @@ */ static inline int __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_DECL_RET; const char *uri; - int cmp; + bool skip; - cursor->set_key(cursor, "file:"); - if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) - ret = cursor->next(cursor); - for (; ret == 0; ret = cursor->next(cursor)) { + while ((ret = cursor->next(cursor)) == 0) { WT_RET(cursor->get_key(cursor, &uri)); - if (!WT_PREFIX_MATCH(uri, "file:")) - break; if (strcmp(uri, WT_METAFILE_URI) == 0) continue; + skip = false; + if (name_func != NULL) + WT_RET(name_func(session, uri, &skip)); + + if (file_func == NULL || skip || !WT_PREFIX_MATCH(uri, "file:")) + continue; + /* * We need to pull the handle into the session handle cache * and make sure it's referenced to stop other internal code * dropping the handle (e.g in LSM when cleaning up obsolete * chunks). Holding the metadata lock isn't enough. 
*/ - ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock( - session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single( - session, uri, NULL, func, cfg); + if ((ret = __wt_session_get_btree( + session, uri, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock( + session, false)); + else + WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } WT_RET_NOTFOUND_OK(ret); @@ -56,20 +58,22 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, } /* - * __wt_meta_btree_apply -- + * __wt_meta_apply_all -- * Apply a function to all files listed in the metadata, apart from the * metadata file. */ int -__wt_meta_btree_apply(WT_SESSION_IMPL *session, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +__wt_meta_apply_all(WT_SESSION_IMPL *session, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_CURSOR *cursor; WT_DECL_RET; WT_RET(__wt_metadata_cursor(session, &cursor)); - WT_SAVE_DHANDLE(session, - ret = __meta_btree_apply(session, cursor, func, cfg)); + WT_SAVE_DHANDLE(session, ret = + __meta_btree_apply(session, cursor, file_func, name_func, cfg)); WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index df4cd2cb4d6..0a864432daf 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -212,8 +212,7 @@ __ckpt_last_name( if (found && a.val < found) continue; - if (*namep != NULL) - __wt_free(session, *namep); + __wt_free(session, *namep); WT_ERR(__wt_strndup(session, k.str, k.len, namep)); found = a.val; } @@ 
-221,7 +220,7 @@ __ckpt_last_name( ret = WT_NOTFOUND; if (0) { -err: __wt_free(session, namep); +err: __wt_free(session, *namep); } return (ret); } diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 61cc009c983..e5f2727b5b6 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -67,18 +67,16 @@ __wt_metadata_cursor_open( btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; /* - * Set special flags for the metadata file: eviction (the metadata file - * is in-memory and never evicted), logging (the metadata file is always - * logged if possible). + * Special settings for metadata: skew eviction so metadata almost + * always stays in cache and make sure metadata is logged if possible. * - * Test flags before setting them so updates can't race in subsequent - * opens (the first update is safe because it's single-threaded from + * Test before setting so updates can't race in subsequent opens (the + * first update is safe because it's single-threaded from * wiredtiger_open). */ - if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) - F_SET(btree, WT_BTREE_IN_MEMORY); - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) - F_SET(btree, WT_BTREE_NO_EVICTION); + if (btree->evict_priority == 0) + WT_WITH_BTREE(session, btree, + __wt_evict_priority_set(session, WT_EVICT_INT_SKEW)); if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) F_CLR(btree, WT_BTREE_NO_LOGGING); diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c index 1baab2deae1..a73b7e09d37 100644 --- a/src/meta/meta_track.c +++ b/src/meta/meta_track.c @@ -284,11 +284,12 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) * should be included in the checkpoint. 
*/ ckpt_session->txn.id = session->txn.id; - F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA); - WT_WITH_DHANDLE(ckpt_session, - WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(ckpt_session, NULL)); - F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA); + F_SET(ckpt_session, WT_SESSION_LOCKED_METADATA); + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(ckpt_session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(ckpt_session, NULL))); + F_CLR(ckpt_session, WT_SESSION_LOCKED_METADATA); ckpt_session->txn.id = WT_TXN_NONE; WT_RET(ret); WT_WITH_DHANDLE(session, diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 7182bb0fe5f..471bb65cac0 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -113,8 +113,9 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) WT_DECL_RET; uint32_t allocsize; bool exist; - const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL }; - const char *key; + const char *filecfg[] = { + WT_CONFIG_BASE(session, file_meta), NULL, NULL }; + const char *key, *value; /* * If a file was being bulk-loaded during the hot backup, it will appear @@ -135,6 +136,8 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) * If the file doesn't exist, assume it's a bulk-loaded file; * retrieve the allocation size and re-create the file. */ + WT_ERR(cursor->get_value(cursor, &value)); + filecfg[1] = value; WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); WT_ERR(__wt_block_manager_create(session, key, allocsize)); @@ -153,10 +156,11 @@ int __wt_turtle_init(WT_SESSION_IMPL *session) { WT_DECL_RET; - bool exist, exist_incr; + bool exist_backup, exist_incr, exist_turtle, load; char *metaconf; metaconf = NULL; + load = false; /* * Discard any turtle setup file left-over from previous runs. This @@ -179,13 +183,29 @@ __wt_turtle_init(WT_SESSION_IMPL *session) * done. 
*/ WT_RET(__wt_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr)); - WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist)); - if (exist) { + WT_RET(__wt_exist(session, WT_METADATA_BACKUP, &exist_backup)); + WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist_turtle)); + if (exist_turtle) { if (exist_incr) WT_RET_MSG(session, EINVAL, "Incremental backup after running recovery " "is not allowed."); - } else { + /* + * If we have a backup file and metadata and turtle files, + * we want to recreate the metadata from the backup. + */ + if (exist_backup) { + WT_RET(__wt_msg(session, "Both %s and %s exist. " + "Recreating metadata from backup.", + WT_METADATA_TURTLE, WT_METADATA_BACKUP)); + WT_RET(__wt_remove_if_exists(session, WT_METAFILE)); + WT_RET(__wt_remove_if_exists( + session, WT_METADATA_TURTLE)); + load = true; + } + } else + load = true; + if (load) { if (exist_incr) F_SET(S2C(session), WT_CONN_WAS_BACKUP); diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c index 3876f9a1afe..cfc7b80450e 100644 --- a/src/os_posix/os_alloc.c +++ b/src/os_posix/os_alloc.c @@ -18,22 +18,13 @@ #include <gperftools/tcmalloc.h> #define calloc tc_calloc +#define malloc tc_malloc #define realloc tc_realloc #define posix_memalign tc_posix_memalign #define free tc_free #endif /* - * There's no malloc interface, WiredTiger never calls malloc. - * - * The problem is an application might allocate memory, write secret stuff in - * it, free the memory, then WiredTiger allocates the memory and uses it for a - * file page or log record, then writes it to disk, without having overwritten - * it fully. That results in the secret stuff being protected by WiredTiger's - * permission mechanisms, potentially inappropriate for the secret stuff. - */ - -/* * __wt_calloc -- * ANSI calloc function. */ @@ -67,12 +58,46 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) } /* - * __wt_realloc -- - * ANSI realloc function. 
+ * __wt_malloc -- + * ANSI malloc function. */ int -__wt_realloc(WT_SESSION_IMPL *session, - size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +__wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp) +{ + void *p; + + /* + * Defensive: if our caller doesn't handle errors correctly, ensure a + * free won't fail. + */ + *(void **)retp = NULL; + + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. + */ + WT_ASSERT(session, bytes_to_allocate != 0); + + if (session != NULL) + WT_STAT_FAST_CONN_INCR(session, memory_allocation); + + if ((p = malloc(bytes_to_allocate)) == NULL) + WT_RET_MSG(session, __wt_errno(), + "memory allocation of %" WT_SIZET_FMT " bytes failed", + bytes_to_allocate); + + *(void **)retp = p; + return (0); +} + +/* + * __realloc_func -- + * ANSI realloc function. + */ +static int +__realloc_func(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, bool clear_memory, + void *retp) { void *p; size_t bytes_allocated; @@ -107,15 +132,12 @@ __wt_realloc(WT_SESSION_IMPL *session, bytes_to_allocate); /* - * Clear the allocated memory -- an application might: allocate memory, - * write secret stuff into it, free the memory, then we re-allocate the - * memory and use it for a file page or log record, and then write it to - * disk. That would result in the secret stuff being protected by the - * WiredTiger permission mechanisms, potentially inappropriate for the - * secret stuff. + * Clear the allocated memory, parts of WiredTiger depend on allocated + * memory being cleared. */ - memset((uint8_t *) - p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); + if (clear_memory) + memset((uint8_t *)p + bytes_allocated, + 0, bytes_to_allocate - bytes_allocated); /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) @@ -126,9 +148,33 @@ __wt_realloc(WT_SESSION_IMPL *session, } /* + * __wt_realloc -- + * WiredTiger's realloc API. 
+ */ +int +__wt_realloc(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +{ + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, true, retp)); +} + +/* + * __wt_realloc_noclear -- + * WiredTiger's realloc API, not clearing allocated memory. + */ +int +__wt_realloc_noclear(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +{ + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, false, retp)); +} + +/* * __wt_realloc_aligned -- * ANSI realloc function that aligns to buffer boundaries, configured with - * the "buffer_alignment" key to wiredtiger_open. + * the "buffer_alignment" key to wiredtiger_open. */ int __wt_realloc_aligned(WT_SESSION_IMPL *session, @@ -184,10 +230,6 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, __wt_free(session, p); p = newp; - /* Clear the allocated memory (see above). */ - memset((uint8_t *)p + bytes_allocated, 0, - bytes_to_allocate - bytes_allocated); - /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) *bytes_allocated_ret = bytes_to_allocate; @@ -200,11 +242,11 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, * If there is no posix_memalign function, or no alignment configured, * fall back to realloc. * - * Windows note: Visual C CRT memalign does not match Posix behavior - * and would also double each allocation so it is bad for memory use + * Windows note: Visual C CRT memalign does not match POSIX behavior + * and would also double each allocation so it is bad for memory use. 
*/ - return (__wt_realloc( - session, bytes_allocated_ret, bytes_to_allocate, retp)); + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, false, retp)); } /* @@ -221,13 +263,14 @@ __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp) return (0); } - WT_RET(__wt_calloc(session, len + 1, 1, &p)); + WT_RET(__wt_malloc(session, len + 1, &p)); /* * Don't change this to strncpy, we rely on this function to duplicate * "strings" that contain nul bytes. */ memcpy(p, str, len); + ((uint8_t *)p)[len] = '\0'; *(void **)retp = p; return (0); diff --git a/src/os_posix/os_errno.c b/src/os_posix/os_errno.c index a58ae88447e..a0f1202c6ef 100644 --- a/src/os_posix/os_errno.c +++ b/src/os_posix/os_errno.c @@ -23,6 +23,22 @@ __wt_errno(void) } /* + * __wt_map_error_rdonly -- + * Map an error into a WiredTiger error code specific for + * read-only operation which intercepts based on certain types + * of failures. + */ +int +__wt_map_error_rdonly(int error) +{ + if (error == ENOENT) + return (WT_NOTFOUND); + else if (error == EACCES) + return (WT_PERM_DENIED); + return (error); +} + +/* * __wt_strerror -- * POSIX implementation of WT_SESSION.strerror and wiredtiger_strerror. */ diff --git a/src/os_posix/os_fallocate.c b/src/os_posix/os_fallocate.c index 9d160afd179..bf20a99bdef 100644 --- a/src/os_posix/os_fallocate.c +++ b/src/os_posix/os_fallocate.c @@ -115,6 +115,7 @@ __wt_fallocate( { WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); switch (fh->fallocate_available) { /* * Check for already configured handles and make the configured call. 
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c index f5afddc557b..0bd0359338b 100644 --- a/src/os_posix/os_fsync.c +++ b/src/os_posix/os_fsync.c @@ -60,6 +60,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) #ifdef __linux__ WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); if ((ret = __wt_handle_sync(fh->fd)) == 0) return (0); WT_RET_MSG(session, ret, "%s: fsync", fh->name); @@ -108,6 +109,7 @@ __wt_directory_sync(WT_SESSION_IMPL *session, const char *path) if (ret != 0) WT_RET_MSG(session, ret, "%s: open", path); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); if ((ret = __wt_handle_sync(fd)) != 0) WT_ERR_MSG(session, ret, "%s: fsync", path); @@ -134,6 +136,9 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); if ((ret = __wt_handle_sync(fh->fd)) == 0) return (0); WT_RET_MSG(session, ret, "%s fsync error", fh->name); @@ -149,6 +154,7 @@ __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) #ifdef HAVE_SYNC_FILE_RANGE WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name)); diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c index 2af90512f26..94d6cba3bf5 100644 --- a/src/os_posix/os_ftruncate.c +++ b/src/os_posix/os_ftruncate.c @@ -17,6 +17,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) { WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret); if (ret == 0) return (0); diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index b085676c53b..219b26c2fa1 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -73,7 +73,16 @@ __wt_open(WT_SESSION_IMPL *session, 
goto setupfh; } - f = O_RDWR; + /* + * If this is a read-only connection, open all files read-only + * except the lock file. + */ + if (F_ISSET(conn, WT_CONN_READONLY) && + !WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))) + f = O_RDONLY; + else + f = O_RDWR; #ifdef O_BINARY /* Windows clones: we always want to treat the file as a binary. */ f |= O_BINARY; @@ -94,6 +103,9 @@ __wt_open(WT_SESSION_IMPL *session, #endif if (ok_create) { + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); f |= O_CREAT; if (exclusive) f |= O_EXCL; diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c index bc244c12e46..eb2e37fdc38 100644 --- a/src/os_posix/os_remove.c +++ b/src/os_posix/os_remove.c @@ -21,6 +21,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) uint64_t bucket; conn = S2C(session); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); fh = NULL; bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; diff --git a/src/os_posix/os_rename.c b/src/os_posix/os_rename.c index 301190305c4..8ec4ee3aa23 100644 --- a/src/os_posix/os_rename.c +++ b/src/os_posix/os_rename.c @@ -21,6 +21,7 @@ __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); from_path = to_path = NULL; WT_RET(__wt_filename(session, from, &from_path)); diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c index 8733bfe0f53..3d49fa7e712 100644 --- a/src/os_posix/os_rw.c +++ b/src/os_posix/os_rw.c @@ -65,6 +65,9 @@ __wt_write(WT_SESSION_IMPL *session, "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, len, (uintmax_t)offset)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); /* Assert direct I/O is aligned and a 
multiple of the alignment. */ WT_ASSERT(session, !fh->direct_io || diff --git a/src/os_posix/os_stdio.c b/src/os_posix/os_stdio.c index 7ab107eda1e..65a0f40a659 100644 --- a/src/os_posix/os_stdio.c +++ b/src/os_posix/os_stdio.c @@ -46,8 +46,7 @@ __wt_fopen(WT_SESSION_IMPL *session, if (*fpp == NULL) ret = __wt_errno(); - if (pathbuf != NULL) - __wt_free(session, pathbuf); + __wt_free(session, pathbuf); if (ret == 0) return (0); diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c index 6a9daf8443f..590fcdc9d44 100644 --- a/src/os_win/os_errno.c +++ b/src/os_win/os_errno.c @@ -17,11 +17,13 @@ static const int windows_error_offset = -29000; * Windows errors are from 0 - 15999 according to the documentation */ static DWORD -__wt_map_error_to_windows_error(int error) { - /* Ensure we do not exceed the error range - Also validate he do not get any COM errors - (which are negative integers) - */ +__wt_map_error_to_windows_error(int error) +{ + /* + * Ensure we do not exceed the error range + * Also validate we do not get any COM errors + * (which are negative integers) + */ WT_ASSERT(NULL, error < 0); return (error + -(windows_error_offset)); @@ -32,11 +34,28 @@ __wt_map_error_to_windows_error(int error) { * Return a positive integer, a decoded Windows error */ static int -__wt_map_windows_error_to_error(DWORD winerr) { +__wt_map_windows_error_to_error(DWORD winerr) +{ return (winerr + windows_error_offset); } /* + * __wt_map_error_rdonly -- + * Map an error into a WiredTiger error code specific for + * read-only operation which intercepts based on certain types + * of failures. + */ +int +__wt_map_error_rdonly(int winerr) +{ + if (winerr == ERROR_FILE_NOT_FOUND) + return (WT_NOTFOUND); + else if (winerr == ERROR_ACCESS_DENIED) + return (WT_PERM_DENIED); + return (winerr); +} + +/* * __wt_errno -- * Return errno, or WT_ERROR if errno not set. 
*/ diff --git a/src/os_win/os_fallocate.c b/src/os_win/os_fallocate.c index cdc7a1c46ee..a324687ca73 100644 --- a/src/os_win/os_fallocate.c +++ b/src/os_win/os_fallocate.c @@ -35,6 +35,7 @@ int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); WT_UNUSED(offset); diff --git a/src/os_win/os_fsync.c b/src/os_win/os_fsync.c index 913b7ca5a4e..c196fc6c06a 100644 --- a/src/os_win/os_fsync.c +++ b/src/os_win/os_fsync.c @@ -15,6 +15,7 @@ int __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); return (0); @@ -27,6 +28,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) int __wt_directory_sync(WT_SESSION_IMPL *session, const char *path) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(path); return (0); @@ -44,6 +46,9 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers", fh->name)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE) WT_RET_MSG(session, __wt_errno(), "%s FlushFileBuffers error", fh->name); @@ -58,6 +63,7 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); diff --git a/src/os_win/os_ftruncate.c b/src/os_win/os_ftruncate.c index 0c11b5509b7..88fcf9542c1 100644 --- a/src/os_win/os_ftruncate.c +++ b/src/os_win/os_ftruncate.c @@ -18,6 +18,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) WT_DECL_RET; LARGE_INTEGER largeint; + WT_ASSERT(session, !F_ISSET(S2C(session), 
WT_CONN_READONLY)); largeint.QuadPart = len; if ((ret = SetFilePointerEx( diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c index 3ec53daf001..f10582c5bd1 100644 --- a/src/os_win/os_open.c +++ b/src/os_win/os_open.c @@ -58,7 +58,17 @@ __wt_open(WT_SESSION_IMPL *session, WT_RET(__wt_filename(session, name, &path)); - share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + /* + * If this is a read-only connection, open all files read-only + * except the lock file. + */ + if (F_ISSET(conn, WT_CONN_READONLY) && + !WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))) + share_mode = FILE_SHARE_READ; + else + share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + /* * Security: * The application may spawn a new process, and we don't want another @@ -72,6 +82,9 @@ __wt_open(WT_SESSION_IMPL *session, dwCreationDisposition = 0; if (ok_create) { + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); dwCreationDisposition = CREATE_NEW; if (exclusive) dwCreationDisposition = CREATE_ALWAYS; diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c index 5682a25d7f2..84f1dd86674 100644 --- a/src/os_win/os_remove.c +++ b/src/os_win/os_remove.c @@ -21,6 +21,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) uint64_t bucket; conn = S2C(session); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); fh = NULL; bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; diff --git a/src/os_win/os_rename.c b/src/os_win/os_rename.c index 829ab1d16e9..b4be2dba24c 100644 --- a/src/os_win/os_rename.c +++ b/src/os_win/os_rename.c @@ -22,6 +22,7 @@ __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); from_path = to_path = NULL; WT_RET(__wt_filename(session, from, &from_path)); diff --git a/src/os_win/os_rw.c 
b/src/os_win/os_rw.c index 49f011001a4..a9537a648f9 100644 --- a/src/os_win/os_rw.c +++ b/src/os_win/os_rw.c @@ -74,6 +74,9 @@ __wt_write(WT_SESSION_IMPL *session, "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, len, (uintmax_t)offset)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !fh->direct_io || diff --git a/src/packing/pack_impl.c b/src/packing/pack_impl.c index 0e3ed44ba6a..5dbb0f33842 100644 --- a/src/packing/pack_impl.c +++ b/src/packing/pack_impl.c @@ -107,36 +107,6 @@ __wt_struct_unpack(WT_SESSION_IMPL *session, } /* - * __wt_struct_unpack_size -- - * Determine the packed size of a buffer matching the format. - */ -int -__wt_struct_unpack_size(WT_SESSION_IMPL *session, - const void *buffer, size_t size, const char *fmt, size_t *resultp) -{ - WT_DECL_PACK_VALUE(pv); - WT_DECL_RET; - WT_PACK pack; - const uint8_t *p, *end; - - p = buffer; - end = p + size; - - WT_RET(__pack_init(session, &pack, fmt)); - while ((ret = __pack_next(&pack, &pv)) == 0) - WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); - - /* Be paranoid - __pack_write should never overflow. */ - WT_ASSERT(session, p <= end); - - if (ret != WT_NOTFOUND) - return (ret); - - *resultp = WT_PTRDIFF(p, buffer); - return (0); -} - -/* * __wt_struct_repack -- * Return the subset of the packed buffer that represents part of * the format. 
If the result is not contiguous in the existing @@ -144,70 +114,43 @@ __wt_struct_unpack_size(WT_SESSION_IMPL *session, */ int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, - const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp) + const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf) { WT_DECL_PACK_VALUE(pvin); WT_DECL_PACK_VALUE(pvout); WT_DECL_RET; WT_PACK packin, packout; const uint8_t *before, *end, *p; - uint8_t *pout; - size_t len; const void *start; start = NULL; p = inbuf->data; end = p + inbuf->size; - /* - * Handle this non-contiguous case: 'U' -> 'u' at the end of the buf. - * The former case has the size embedded before the item, the latter - * does not. - */ - if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' && - strlen(infmt) > len && infmt[len - 1] == 'U') { - WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp)); - pout = *reallocp; - } else - pout = NULL; - - WT_ERR(__pack_init(session, &packout, outfmt)); - WT_ERR(__pack_init(session, &packin, infmt)); + WT_RET(__pack_init(session, &packout, outfmt)); + WT_RET(__pack_init(session, &packin, infmt)); /* Outfmt should complete before infmt */ while ((ret = __pack_next(&packout, &pvout)) == 0) { if (p >= end) - WT_ERR(EINVAL); - WT_ERR(__pack_next(&packin, &pvin)); + WT_RET(EINVAL); + if (pvout.type == 'x' && pvout.size == 0 && pvout.havesize) + continue; + WT_RET(__pack_next(&packin, &pvin)); before = p; - WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p))); - if (pvout.type != pvin.type) { - if (pvout.type == 'u' && pvin.type == 'U') { - /* Skip the prefixed size, we don't need it */ - WT_ERR(__wt_struct_unpack_size(session, before, - (size_t)(end - before), "I", &len)); - before += len; - } else - WT_ERR(ENOTSUP); - } - if (pout != NULL) { - memcpy(pout, before, WT_PTRDIFF(p, before)); - pout += p - before; - } else if (start == NULL) + WT_RET(__unpack_read(session, &pvin, &p, (size_t)(end - p))); + if (pvout.type != pvin.type) + 
WT_RET(ENOTSUP); + if (start == NULL) start = before; } - WT_ERR_NOTFOUND_OK(ret); + WT_RET_NOTFOUND_OK(ret); /* Be paranoid - __pack_write should never overflow. */ WT_ASSERT(session, p <= end); - if (pout != NULL) { - outbuf->data = *reallocp; - outbuf->size = WT_PTRDIFF(pout, *reallocp); - } else { - outbuf->data = start; - outbuf->size = WT_PTRDIFF(p, start); - } + outbuf->data = start; + outbuf->size = WT_PTRDIFF(p, start); -err: return (ret); + return (0); } diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c index 98da5b405c3..1393eb9a9c1 100644 --- a/src/packing/pack_stream.c +++ b/src/packing/pack_stream.c @@ -65,8 +65,7 @@ wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp) if (usedp != NULL) *usedp = WT_PTRDIFF(ps->p, ps->start); - if (ps != NULL) - __wt_free(ps->pack.session, ps); + __wt_free(ps->pack.session, ps); return (0); } @@ -327,3 +326,139 @@ wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up) } return (0); } + +/* + * __wt_ext_pack_start -- + * WT_EXTENSION.pack_start method. 
+ */ +int +__wt_ext_pack_start(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *format, + void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if (wt_session == NULL) + wt_session = (WT_SESSION *)conn->default_session; + return (wiredtiger_pack_start(wt_session, format, buffer, size, psp)); +} + +/* + * __wt_ext_unpack_start -- + * WT_EXTENSION.unpack_start + */ +int +__wt_ext_unpack_start(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if (wt_session == NULL) + wt_session = (WT_SESSION *)conn->default_session; + return (wiredtiger_unpack_start(wt_session, format, buffer, size, psp)); +} + +/* + * __wt_ext_pack_close -- + * WT_EXTENSION.pack_close + */ +int +__wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_close(ps, usedp)); +} + +/* + * __wt_ext_pack_item -- + * WT_EXTENSION.pack_item + */ +int +__wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_item(ps, item)); +} + +/* + * __wt_ext_pack_int -- + * WT_EXTENSION.pack_int + */ +int +__wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_int(ps, i)); +} + +/* + * __wt_ext_pack_str -- + * WT_EXTENSION.pack_str + */ +int +__wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_str(ps, s)); +} + +/* + * __wt_ext_pack_uint -- + * WT_EXTENSION.pack_uint + */ +int +__wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_uint(ps, u)); +} + +/* + * __wt_ext_unpack_item -- + * WT_EXTENSION.unpack_item + */ 
+int +__wt_ext_unpack_item(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_item(ps, item)); +} + +/* + * __wt_ext_unpack_int -- + * WT_EXTENSION.unpack_int + */ +int +__wt_ext_unpack_int(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t *ip) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_int(ps, ip)); +} + +/* + * __wt_ext_unpack_str -- + * WT_EXTENSION.unpack_str + */ +int +__wt_ext_unpack_str(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char **sp) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_str(ps, sp)); +} + +/* + * __wt_ext_unpack_uint -- + * WT_EXTENSION.unpack_uint + */ +int +__wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_uint(ps, up)); +} diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index c25d7b5e493..a69f335c9b3 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -363,6 +363,17 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. + */ + WT_RET(__wt_fair_lock(session, &page->page_lock)); + + /* * Check that transaction time always moves forward for a given page. * If this check fails, reconciliation can free something that a future * reconciliation will need. 
@@ -376,17 +387,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, session, ref, flags, salvage, &session->reconcile)); r = session->reconcile; - /* - * Reconciliation locks the page for three reasons: - * Reconciliation reads the lists of page updates, obsolete updates - * cannot be discarded while reconciliation is in progress; - * The compaction process reads page modification information, which - * reconciliation modifies; - * In-memory splits: reconciliation of an internal page cannot handle - * a child page splitting during the reconciliation. - */ - WT_RET(__wt_fair_lock(session, &page->page_lock)); - /* Reconcile the page. */ switch (page->type) { case WT_PAGE_COL_FIX: @@ -1313,7 +1313,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } while (0) typedef enum { - WT_CHILD_IGNORE, /* Deleted child: ignore */ + WT_CHILD_IGNORE, /* Ignored child */ WT_CHILD_MODIFIED, /* Modified child */ WT_CHILD_ORIGINAL, /* Original child */ WT_CHILD_PROXY /* Deleted child: proxy */ @@ -1450,16 +1450,15 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * This function is called when walking an internal page to decide how - * to handle child pages referenced by the internal page, specifically - * if the child page is to be merged into its parent. + * to handle child pages referenced by the internal page. * * Internal pages are reconciled for two reasons: first, when evicting * an internal page, second by the checkpoint code when writing internal - * pages. During eviction, the subtree is locked down so all pages - * should be in the WT_REF_DISK or WT_REF_LOCKED state. During - * checkpoint, any eviction that might affect our review of an internal - * page is prohibited, however, as the subtree is not reserved for our - * exclusive use, there are other page states that must be considered. + * pages. During eviction, all pages should be in the WT_REF_DISK or + * WT_REF_DELETED state. 
During checkpoint, eviction that might affect + * review of an internal page is prohibited, however, as the subtree is + * not reserved for our exclusive use, there are other page states that + * must be considered. */ for (;; __wt_yield()) switch (r->tested_ref_state = ref->state) { @@ -1488,15 +1487,14 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Locked. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page was selected by - * the eviction server for eviction). + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, the child is being @@ -1514,24 +1512,21 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * In memory. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page belongs to a file - * being discarded from the cache during close). + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, acquire a hazard pointer * so the child isn't evicted, it's an in-memory case. 
* - * This call cannot return split/restart, eviction of - * pages that split into their parent is shutout during - * checkpoint, all splits in process will have completed - * before we walk any pages for checkpoint. + * This call cannot return split/restart, we have a lock + * on the parent which prevents a child page split. */ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1548,29 +1543,31 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Being read, not modified by definition. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused normally eviction to fail, and exclusive - * eviction shouldn't ever see pages being read. + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } goto done; case WT_REF_SPLIT: /* * The page was split out from under us. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused eviction to fail. + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. * * We should never be here during checkpoint, dirty page * eviction is shutout during checkpoint, all splits in * process will have completed before we walk any pages * for checkpoint. */ - WT_ASSERT(session, ref->state != WT_REF_SPLIT); - /* FALLTHROUGH */ + WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); + return (EBUSY); WT_ILLEGAL_VALUE(session); } @@ -1581,11 +1578,21 @@ in_memory: * modify structure has been instantiated. If the modify structure * exists and the page has actually been modified, set that state. 
* If that's not the case, we would normally use the original cell's - * disk address as our reference, but, if we're forced to instantiate - * a deleted child page and it's never modified, we end up here with - * a page that has a modify structure, no modifications, and no disk - * address. Ignore those pages, they're not modified and there is no - * reason to write the cell. + * disk address as our reference, however there are two special cases, + * both flagged by a missing block address. + * + * First, if forced to instantiate a deleted child page and it's never + * modified, we end up here with a page that has a modify structure, no + * modifications, and no disk address. Ignore those pages, they're not + * modified and there is no reason to write the cell. + * + * Second, insert splits are permitted during checkpoint. When doing the + * final checkpoint pass, we first walk the internal page's page-index + * and write out any dirty pages we find, then we write out the internal + * page in post-order traversal. If we found the split page in the first + * step, it will have an address; if we didn't find the split page in + * the first step, it won't have an address and we ignore it, it's not + * part of the checkpoint. */ mod = ref->page->modify; if (mod != NULL && mod->rec_result != 0) @@ -1953,12 +1960,21 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); /* - * Clear the disk page's header and block-manager space, set the page - * type (the type doesn't change, and setting it later would require - * additional code in a few different places). + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? 
+ corrected_page_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). */ dsk = r->disk_image.mem; - memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree)); dsk->type = page->type; /* @@ -3019,13 +3035,13 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * The data isn't laid out on a page boundary or nul padded; copy it to * a clean, aligned, padded buffer before writing it. * - * Allocate a scratch buffer to hold the new disk image. Copy the - * WT_PAGE_HEADER header onto the scratch buffer, most of the header - * information remains unchanged between the pages. + * Allocate a scratch buffer to hold the new disk image. Copy the disk + * page's header and block-manager space into the scratch buffer, most + * of the header information remains unchanged between the pages. */ WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_SIZE); + memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); /* * For each split chunk we've created, update the disk image and copy @@ -3808,7 +3824,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: - /* Deleted child we don't have to write. */ + /* Ignored child. */ WT_CHILD_RELEASE_ERR(session, hazard, ref); continue; @@ -3977,7 +3993,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) 
* * The record number recorded during the split is the @@ -3999,8 +4015,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) } else { WT_RET( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; recno = WT_INSERT_RECNO(ins); } for (;;) { @@ -4536,22 +4550,25 @@ compare: /* * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) * + * Assert the recorded record number is past the end of + * the page. + * * The record number recorded during the split is the * first key on the split page, that is, one larger than * the last key on this page, we have to decrement it. */ if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB) break; + WT_ASSERT(session, n >= src_recno); n -= 1; + upd = NULL; } else { WT_ERR( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; n = WT_INSERT_RECNO(ins); } while (src_recno <= n) { @@ -4734,10 +4751,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: /* - * Deleted child we don't have to write. + * Ignored child. * - * Overflow keys referencing discarded pages are no - * longer useful, schedule them for discard. Don't + * Overflow keys referencing pages we're not writing are + * no longer useful, schedule them for discard. Don't * worry about instantiation, internal page keys are * always instantiated. Don't worry about reuse, * reusing this key in this reconciliation is unlikely. 
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 9b3b76b62de..756f1fdcc6c 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -9,22 +9,6 @@ #include "wt_internal.h" /* - * __wt_schema_create_strip -- - * Discard any configuration information from a schema entry that is not - * applicable to an session.create call, here for the wt dump command utility, - * which only wants to dump the schema information needed for load. - */ -int -__wt_schema_create_strip(WT_SESSION_IMPL *session, - const char *v1, const char *v2, char **value_ret) -{ - const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_create), v1, v2, NULL }; - - return (__wt_config_collapse(session, cfg, value_ret)); -} - -/* * __wt_direct_io_size_check -- * Return a size from the configuration, complaining if it's insufficient * for direct I/O. diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index 49318f80959..e7ce4e42498 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -109,8 +109,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) err: __wt_scr_free(session, &buf); __wt_schema_destroy_colgroup(session, &colgroup); - if (cgconfig != NULL) - __wt_free(session, cgconfig); + __wt_free(session, cgconfig); return (ret); } diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c index 612a2d2d192..12a1aa9c22f 100644 --- a/src/schema/schema_plan.c +++ b/src/schema/schema_plan.c @@ -212,7 +212,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, WT_ASSERT(session, !value_only || coltype == WT_PROJ_VALUE); WT_RET(__wt_buf_catfmt( - session, plan, "%d%c", cg, coltype)); + session, plan, "%u%c", cg, coltype)); /* * Set the current column group and column @@ -226,7 +226,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, if (current_col < col) { if (col - current_col > 1) WT_RET(__wt_buf_catfmt(session, - plan, "%d", col - current_col)); + plan, "%u", col - current_col)); 
WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP)); } @@ -375,8 +375,8 @@ __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, pv.type = 'u'; if (pv.havesize) - WT_RET(__wt_buf_catfmt( - session, format, "%d%c", (int)pv.size, pv.type)); + WT_RET(__wt_buf_catfmt(session, + format, "%" PRIu32 "%c", pv.size, pv.type)); else WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type)); } while (have_next); @@ -399,8 +399,8 @@ __wt_struct_truncate(WT_SESSION_IMPL *session, while (ncols-- > 0) { WT_RET(__pack_next(&pack, &pv)); if (pv.havesize) - WT_RET(__wt_buf_catfmt( - session, format, "%d%c", (int)pv.size, pv.type)); + WT_RET(__wt_buf_catfmt(session, + format, "%" PRIu32 "%c", pv.size, pv.type)); else WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type)); } diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index e7752b60ca4..d9a798b6ed8 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -131,22 +131,19 @@ int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop) { - WT_CURSOR *cursor; WT_DATA_SOURCE *dsrc; WT_DECL_RET; const char *uri; - cursor = (start != NULL) ? 
start : stop; - uri = cursor->internal_uri; + uri = start->internal_uri; if (WT_PREFIX_MATCH(uri, "file:")) { - if (start != NULL) - WT_CURSOR_NEEDKEY(start); + WT_CURSOR_NEEDKEY(start); if (stop != NULL) WT_CURSOR_NEEDKEY(stop); - WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree, + WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)start)->btree, ret = __wt_btcur_range_truncate( - (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop)); + (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop)); } else if (WT_PREFIX_MATCH(uri, "table:")) ret = __wt_table_range_truncate( (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index b5ee3bb7f7d..52be76bb7a5 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -55,18 +55,11 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(ret); } - if ((ret = __wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)) == 0) { - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) { - WT_ASSERT(session, !FLD_ISSET( - open_flags, WT_DHANDLE_EXCLUSIVE)); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single_ckpt( - session, uri, file_func, cfg)); - } + WT_ERR(__wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)); + WT_SAVE_DHANDLE(session, + ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { @@ -133,7 +126,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_checkpoint) ; - else if (file_func == __wt_checkpoint_list) + else if (file_func == __wt_checkpoint_get_handles) ; else if (file_func == __wt_checkpoint_sync) ; diff --git a/src/session/session_api.c b/src/session/session_api.c index c03b5fdc044..bb496494234 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c 
@@ -13,6 +13,20 @@ static int __session_snapshot(WT_SESSION *, const char *); static int __session_rollback_transaction(WT_SESSION *, const char *); /* + * __wt_session_notsup -- + * Unsupported session method. + */ +int +__wt_session_notsup(WT_SESSION *wt_session) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + WT_RET_MSG(session, ENOTSUP, "Unsupported session method"); +} + +/* * __wt_session_reset_cursors -- * Reset all open cursors. */ @@ -26,7 +40,8 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) /* Stop when there are no positioned cursors. */ if (session->ncursors == 0) break; - WT_TRET(cursor->reset(cursor)); + if (!F_ISSET(cursor, WT_CURSTD_JOINED)) + WT_TRET(cursor->reset(cursor)); /* Optionally, free the cursor buffers */ if (free_buffers) { __wt_buf_free(session, &cursor->key); @@ -478,10 +493,13 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) /* * We can't disallow type entirely, a configuration string might * innocently include it, for example, a dump/load pair. If the - * URI type prefix and the type are the same, let it go. + * underlying type is "file", it's OK ("file" is the underlying + * type for every type); if the URI type prefix and the type are + * the same, let it go. */ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && + !WT_STRING_MATCH("file", cval.str, cval.len) && (strncmp(uri, cval.str, cval.len) != 0 || uri[cval.len] != ':')) WT_ERR_MSG(session, EINVAL, @@ -495,6 +513,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_create_readonly -- + * WT_SESSION->create method; readonly version. + */ +static int +__session_create_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_log_flush -- * WT_SESSION->log_flush method. 
*/ @@ -532,6 +564,18 @@ err: API_END_RET(session, ret); } /* + * __session_log_flush_readonly -- + * WT_SESSION->log_flush method; readonly version. + */ +static int +__session_log_flush_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_log_printf -- * WT_SESSION->log_printf method. */ @@ -554,6 +598,19 @@ err: API_END_RET(session, ret); } /* + * __session_log_printf_readonly -- + * WT_SESSION->log_printf method; readonly version. + */ +static int +__session_log_printf_readonly(WT_SESSION *wt_session, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3))) +{ + WT_UNUSED(fmt); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_rebalance -- * WT_SESSION->rebalance method. */ @@ -567,9 +624,6 @@ __session_rebalance(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, rebalance, config, cfg); - if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - WT_ERR(ENOTSUP); - /* Block out checkpoints to avoid spurious EBUSY errors. */ WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, @@ -580,6 +634,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_rebalance_readonly -- + * WT_SESSION->rebalance method; readonly version. + */ +static int +__session_rebalance_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_rename -- * WT_SESSION->rename method. 
*/ @@ -597,14 +665,30 @@ __session_rename(WT_SESSION *wt_session, WT_ERR(__wt_str_name_check(session, uri)); WT_ERR(__wt_str_name_check(session, newuri)); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_rename(session, uri, newuri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_rename_readonly -- + * WT_SESSION->rename method; readonly version. + */ +static int +__session_rename_readonly(WT_SESSION *wt_session, + const char *uri, const char *newuri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(newuri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_reset -- * WT_SESSION->reset method. */ @@ -646,9 +730,10 @@ __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) if (!lock_wait) F_SET(session, WT_SESSION_LOCK_NO_WAIT); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_drop(session, uri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); if (!lock_wait) F_CLR(session, WT_SESSION_LOCK_NO_WAIT); @@ -679,6 +764,20 @@ err: /* Note: drop operations cannot be unrolled (yet?). */ } /* + * __session_drop_readonly -- + * WT_SESSION->drop method; readonly version. + */ +static int +__session_drop_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_join -- * WT_SESSION->join method. */ @@ -823,6 +922,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_salvage_readonly -- + * WT_SESSION->salvage method; readonly version. 
+ */ +static int +__session_salvage_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __wt_session_range_truncate -- * Session handling of a range truncate. */ @@ -1004,6 +1117,22 @@ err: TXN_API_END_RETRY(session, ret, 0); } /* + * __session_truncate_readonly -- + * WT_SESSION->truncate method; readonly version. + */ +static int +__session_truncate_readonly(WT_SESSION *wt_session, + const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(start); + WT_UNUSED(stop); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_upgrade -- * WT_SESSION->upgrade method. */ @@ -1026,6 +1155,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_upgrade_readonly -- + * WT_SESSION->upgrade method; readonly version. + */ +static int +__session_upgrade_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_verify -- * WT_SESSION->verify method. */ @@ -1247,6 +1390,18 @@ err: API_END_RET(session, ret); } /* + * __session_transaction_sync_readonly -- + * WT_SESSION->transaction_sync method; readonly version. + */ +static int +__session_transaction_sync_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_checkpoint -- * WT_SESSION->checkpoint method. */ @@ -1295,6 +1450,18 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_checkpoint_readonly -- + * WT_SESSION->checkpoint method; readonly version. + */ +static int +__session_checkpoint_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_snapshot -- * WT_SESSION->snapshot method. 
*/ @@ -1380,6 +1547,33 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_snapshot, __session_transaction_pinned_range, __session_transaction_sync + }, stds_readonly = { + NULL, + NULL, + __session_close, + __session_reconfigure, + __session_strerror, + __session_open_cursor, + __session_create_readonly, + __wt_session_compact_readonly, + __session_drop_readonly, + __session_join, + __session_log_flush_readonly, + __session_log_printf_readonly, + __session_rebalance_readonly, + __session_rename_readonly, + __session_reset, + __session_salvage_readonly, + __session_truncate_readonly, + __session_upgrade_readonly, + __session_verify, + __session_begin_transaction, + __session_commit_transaction, + __session_rollback_transaction, + __session_checkpoint_readonly, + __session_snapshot, + __session_transaction_pinned_range, + __session_transaction_sync_readonly }; WT_DECL_RET; WT_SESSION_IMPL *session, *session_ret; @@ -1407,7 +1601,7 @@ __open_session(WT_CONNECTION_IMPL *conn, if (i == conn->session_size) WT_ERR_MSG(session, ENOMEM, "only configured to support %" PRIu32 " sessions" - " (including %d additional internal sessions)", + " (including %" PRIu32 " additional internal sessions)", conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* @@ -1419,7 +1613,8 @@ __open_session(WT_CONNECTION_IMPL *conn, conn->session_cnt = i + 1; session_ret->id = i; - session_ret->iface = stds; + session_ret->iface = + F_ISSET(conn, WT_CONN_READONLY) ? stds_readonly : stds; session_ret->iface.connection = &conn->iface; WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 5abccbd1366..2a53ad58f52 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -97,13 +97,13 @@ */ /* - * __wt_compact_uri_analyze -- + * __compact_uri_analyze -- * Extract information relevant to deciding what work compact needs to * do from a URI that is part of a table schema. 
* Called via the schema_worker function. */ -int -__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) +static int +__compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) { /* * Add references to schema URI objects to the list of objects to be @@ -120,6 +120,61 @@ __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) } /* + * __compact_start -- + * Start object compaction. + */ +static int +__compact_start(WT_SESSION_IMPL *session) +{ + WT_BM *bm; + + bm = S2BT(session)->bm; + return (bm->compact_start(bm, session)); +} + +/* + * __compact_end -- + * End object compaction. + */ +static int +__compact_end(WT_SESSION_IMPL *session) +{ + WT_BM *bm; + + bm = S2BT(session)->bm; + return (bm->compact_end(bm, session)); +} + +/* + * __compact_handle_append -- + * Gather a file handle to be compacted. + * Called via the schema_worker function. + */ +static int +__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DECL_RET; + + WT_UNUSED(cfg); + + /* Make sure there is space for the next entry. */ + WT_RET(__wt_realloc_def(session, &session->op_handle_allocated, + session->op_handle_next + 1, &session->op_handle)); + + WT_RET(__wt_session_get_btree( + session, session->dhandle->name, NULL, NULL, 0)); + + /* Set compact active on the handle. */ + if ((ret = __compact_start(session)) != 0) { + WT_TRET(__wt_session_release_btree(session)); + return (ret); + } + + session->op_handle[session->op_handle_next++] = session->dhandle; + return (0); +} + +/* * __session_compact_check_timeout -- * Check if the timeout has been exceeded. */ @@ -143,21 +198,25 @@ __session_compact_check_timeout( * Function to alternate between checkpoints and compaction calls. 
*/ static int -__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +__compact_file(WT_SESSION_IMPL *session, const char *cfg[]) { struct timespec start_time; + WT_DATA_HANDLE *dhandle; WT_DECL_ITEM(t); WT_DECL_RET; int i; const char *checkpoint_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_checkpoint), NULL, NULL }; + dhandle = session->dhandle; + /* * Force the checkpoint: we don't want to skip it because the work we * need to have done is done in the underlying block manager. */ WT_ERR(__wt_scr_alloc(session, 128, &t)); - WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + WT_ERR(__wt_buf_fmt( + session, t, "target=(\"%s\"),force=1", dhandle->name)); checkpoint_cfg[1] = t->data; WT_ERR(__wt_epoch(session, &start_time)); @@ -173,9 +232,8 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); session->compact_state = WT_COMPACT_RUNNING; - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, uri, __wt_compact, NULL, cfg, 0)); + WT_WITH_DHANDLE(session, dhandle, + ret = __wt_compact(session, cfg)); WT_ERR(ret); if (session->compact_state != WT_COMPACT_SUCCESS) break; @@ -193,6 +251,7 @@ err: session->compact_state = WT_COMPACT_NONE; /* * __wt_session_compact -- + * WT_SESSION.compact method. */ int __wt_session_compact( @@ -203,6 +262,7 @@ __wt_session_compact( WT_DECL_RET; WT_SESSION_IMPL *session; WT_TXN *txn; + u_int i; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); @@ -227,10 +287,10 @@ __wt_session_compact( WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); session->compact->max_time = (uint64_t)cval.val; - /* Find the types of data sources are being compacted. */ + /* Find the types of data sources being compacted. 
*/ WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); + ret = __wt_schema_worker(session, uri, + __compact_handle_append, __compact_uri_analyze, cfg, 0)); WT_ERR(ret); if (session->compact->lsm_count != 0) @@ -247,11 +307,25 @@ __wt_session_compact( WT_ERR_MSG(session, EINVAL, " File compaction not permitted in a transaction"); - WT_ERR(__compact_file(session, uri, cfg)); + for (i = 0; i < session->op_handle_next; ++i) { + WT_WITH_DHANDLE(session, session->op_handle[i], + ret = __compact_file(session, cfg)); + WT_ERR(ret); + } } err: session->compact = NULL; + for (i = 0; i < session->op_handle_next; ++i) { + WT_WITH_DHANDLE(session, session->op_handle[i], + WT_TRET(__compact_end(session))); + WT_WITH_DHANDLE(session, session->op_handle[i], + WT_TRET(__wt_session_release_btree(session))); + } + + __wt_free(session, session->op_handle); + session->op_handle_allocated = session->op_handle_next = 0; + /* * Release common session resources (for example, checkpoint may acquire * significant reconciliation structures/memory). @@ -260,3 +334,17 @@ err: session->compact = NULL; API_END_RET_NOTFOUND_MAP(session, ret); } + +/* + * __wt_session_compact_readonly -- + * WT_SESSION.compact method; readonly version. + */ +int +__wt_session_compact_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 1ee3342442c..ddf4d3dfa33 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -15,24 +15,21 @@ static int __session_dhandle_sweep(WT_SESSION_IMPL *); * Add a handle to the session's cache. 
*/ static int -__session_add_dhandle( - WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep) +__session_add_dhandle(WT_SESSION_IMPL *session) { WT_DATA_HANDLE_CACHE *dhandle_cache; uint64_t bucket; + /* Allocate a handle cache entry. */ WT_RET(__wt_calloc_one(session, &dhandle_cache)); + dhandle_cache->dhandle = session->dhandle; bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q); TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq); - if (dhandle_cachep != NULL) - *dhandle_cachep = dhandle_cache; - - /* Sweep the handle list to remove any dead handles. */ - return (__session_dhandle_sweep(session)); + return (0); } /* @@ -450,14 +447,23 @@ __session_get_dhandle( return (0); } + /* Sweep the handle list to remove any dead handles. */ + WT_RET(__session_dhandle_sweep(session)); + /* * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = __session_find_shared_dhandle(session, uri, checkpoint)); - if (ret == 0) - ret = __session_add_dhandle(session, NULL); + WT_RET(ret); + + /* + * Fixup the reference count on failure (we incremented the reference + * count while holding the handle-list lock). + */ + if ((ret = __session_add_dhandle(session)) != 0) + (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); return (ret); } @@ -571,7 +577,7 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) * files, since changes to the underlying file are visible to the in * memory pages. 
*/ - WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); /* * We lock checkpoint handles that we are overwriting, so the handle diff --git a/src/support/cksum.c b/src/support/cksum.c index c2982c40015..0b086753406 100644 --- a/src/support/cksum.c +++ b/src/support/cksum.c @@ -1260,6 +1260,23 @@ __wt_cksum_hw(const void *chunk, size_t len) } #endif +#if defined(__powerpc64__) + +unsigned int crc32_vpmsum(unsigned int crc, const unsigned char *p, + unsigned long len); + +/* + * __wt_cksum_hw -- + * Return a checksum for a chunk of memory, computed in hardware + * using 8 byte steps. + */ +static uint32_t +__wt_cksum_hw(const void *chunk, size_t len) +{ + return crc32_vpmsum(0, chunk, len); +} +#endif + /* * __wt_cksum -- * Return a checksum for a chunk of memory using the fastest method @@ -1302,6 +1319,8 @@ __wt_cksum_init(void) __wt_cksum_func = __wt_cksum_hw; else __wt_cksum_func = __wt_cksum_sw; +#elif defined(__powerpc64__) + __wt_cksum_func = __wt_cksum_hw; #else __wt_cksum_func = __wt_cksum_sw; #endif diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c new file mode 100644 index 00000000000..ec95622f333 --- /dev/null +++ b/src/support/cond_auto.c @@ -0,0 +1,136 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. 
We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "wt_internal.h" + +/* + * This is an implementation of condition variables that automatically adjust + * the wait time depending on whether the wake is resulting in useful work. + */ + +/* + * __wt_cond_auto_alloc -- + * Allocate and initialize an automatically adjusting condition variable. + */ +int +__wt_cond_auto_alloc( + WT_SESSION_IMPL *session, const char *name, + bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + + WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + cond = *condp; + + cond->min_wait = min; + cond->max_wait = max; + cond->prev_wait = min; + + return (0); +} + +/* + * __wt_cond_auto_signal -- + * Signal a condition variable. + */ +int +__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) +{ + + WT_ASSERT(session, cond->min_wait != 0); + return (__wt_cond_signal(session, cond)); +} + +/* + * __wt_cond_auto_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it before the time + * out period expires, let the caller know. + * TODO: Can this version of the API be removed, now that we have the + * auto adjusting condition variables? 
+ */ +int +__wt_cond_auto_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +{ + uint64_t delta; + + /* + * Catch cases where this function is called with a condition variable + * that was initialized non-auto. + */ + WT_ASSERT(session, cond->min_wait != 0); + + WT_STAT_FAST_CONN_INCR(session, cond_auto_wait); + if (progress) + cond->prev_wait = cond->min_wait; + else { + delta = WT_MAX(1, (cond->max_wait - cond->min_wait) / 10); + cond->prev_wait = WT_MIN( + cond->max_wait, cond->prev_wait + delta); + } + + WT_RET(__wt_cond_wait_signal( + session, cond, cond->prev_wait, signalled)); + + if (progress || *signalled) + WT_STAT_FAST_CONN_INCR(session, cond_auto_wait_reset); + if (*signalled) + cond->prev_wait = cond->min_wait; + + return (0); +} + +/* + * __wt_cond_auto_wait -- + * Wait on a mutex, optionally timing out. If we get it before the time + * out period expires, let the caller know. + */ +int +__wt_cond_auto_wait( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +{ + bool signalled; + + /* + * Call the signal version so the wait period is reset if the + * condition is woken explicitly. + */ + WT_RET(__wt_cond_auto_wait_signal(session, cond, progress, &signalled)); + + return (0); +} + +/* + * __wt_cond_auto_destroy -- + * Destroy a condition variable. 
+ */ +int +__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) +{ + return (__wt_cond_destroy(session, condp)); +} diff --git a/src/support/huffman.c b/src/support/huffman.c index edd0bc9f648..1e1aaeab5b5 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -492,11 +492,12 @@ __wt_huffman_open(WT_SESSION_IMPL *session, uint8_t symbol; uint32_t weighted_length; - printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: " - "codes %u# * %uB + code2symbol %u# * %uB\n", + printf("leaf depth %" PRIu16 "..%" PRIu16 + ", memory use: codes %u# * %" WT_SIZET_FMT + "B + code2symbol %u# * %" WT_SIZET_FMT "B\n", huffman->min_depth, huffman->max_depth, - huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE), - 1U << huffman->max_depth, (u_int)sizeof(uint16_t)); + huffman->numSymbols, sizeof(WT_HUFFMAN_CODE), + 1U << huffman->max_depth, sizeof(uint16_t)); /* * measure quality of computed Huffman codes, for different max bit diff --git a/src/support/power8/LICENSE.TXT b/src/support/power8/LICENSE.TXT new file mode 100644 index 00000000000..2f4bb91f574 --- /dev/null +++ b/src/support/power8/LICENSE.TXT @@ -0,0 +1,476 @@ +Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + +crc32-vpmsum is free software; you can redistribute it and/or +modify it under the terms of either: + + a) the GNU General Public License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version., or + b) the Apache License, Version 2.0 + + + + + + + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + + + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/src/support/power8/README.md b/src/support/power8/README.md new file mode 100644 index 00000000000..3e2976650cd --- /dev/null +++ b/src/support/power8/README.md @@ -0,0 +1,208 @@ +crc32-vpmsum +============ + +A set of examples for accelerating CRC32 calculations using the vector +polynomial multiply sum (vpmsum) instructions introduced in POWER8. These +instructions implement byte, halfword, word and doubleword carryless +multiply/add. + +Performance +----------- + +An implementation of slice-by-8, one of the fastest lookup table methods +is included so we can compare performance against it. Testing 5000000 +iterations of a CRC of 32 kB of data (to keep it L1 cache contained): + +``` +# time slice_by_8_bench 32768 5000000 +122.220 seconds + +# time crc32_bench 32768 5000000 +2.937 seconds +``` + +The vpmsum accelerated CRC is just over 41x faster. + +This test was run on a 4.1 GHz POWER8, so the algorithm sustains about +52 GiB/sec or 13.6 bytes/cycle. The theoretical limit is 16 bytes/cycle +since we can execute a maximum of one vpmsum instruction per cycle. + +In another test, a version was added to the kernel and btrfs write +performance was shown to be 3.8x faster. The test was done to a ramdisk +to mitigate any I/O induced variability. + +Quick start +----------- + +- Modify CRC and OPTIONS in the Makefile. There are examples for the two most + common crc32s. 
+ +- Type make to create the constants (crc32_constants.h) + +- Import the code into your application (crc32.S crc32_wrapper.c + crc32_constants.h ppc-opcode.h) and call the CRC: + +``` +unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len); +``` + +CRC background +-------------- + +For a good background on CRCs, check out: + +http://www.ross.net/crc/download/crc_v3.txt + +A few key points: + +- A CRC is the remainder after dividing a message by the CRC polynomial, + ie M mod CRC_POLY +- multiply/divide is carryless +- add/subtract is an xor +- n (where n is the order of the CRC) bits of zeroes are appended to the + end of the message. + +One more important piece of information - a CRC is a linear function, so: + +``` + CRC(A xor B) = CRC(A) xor CRC(B) + + CRC(A . B) = CRC(A) . CRC(B) (remember this is carryless multiply) +``` + +If we take 64 bits of data, represented by two 32 bit chunks (AAAAAAAA +and BBBBBBBB): + +``` +CRC(AAAAAAAABBBBBBBB) + = CRC(AAAAAAAA00000000 xor BBBBBBBB) + = CRC(AAAAAAAA00000000) xor CRC(BBBBBBBB) +``` + +If we operate on AAAAAAAA: + +``` +CRC(AAAAAAAA00000000) + = CRC(AAAAAAAA . 100000000) + = CRC(AAAAAAAA) . CRC(100000000) +``` + +And CRC(100000000) is a constant which we can pre-calculate: + +``` +CRC(100000000) + = 100000000 mod CRC_POLY + = 2^32 mod CRC_POLY +``` + +Finally we can add our modified AAAAAAAA to BBBBBBBB: + +``` +CRC(AAAAAAAABBBBBBBB) + = ((2^32 mod CRC_POLY) . CRC(AAAAAAAA)) xor CRC(BBBBBBBB) +``` + +In other words, with the right constants pre-calculated we can shift the +input data around and we can also calculate the CRC in as many parallel +chunks as we want. + +No matter how much shifting we do, the final result will be 64 bits of +data (63 actually, because there is no carry into the top bit). To reduce +it further we need another trick, and that is Barrett reduction: + +http://en.wikipedia.org/wiki/Barrett_reduction + +Barrett reduction is a method of calculating a mod n.
The idea is to +calculate q, the multiple of our polynomial that we need to subtract. By +doing the computation 2x bits higher (ie 64 bits) and shifting the +result back down 2x bits, we round down to the nearest multiple. + +``` + k = 32 + m = floor((4^k)/n) = floor((4^32)/n) + n = CRC polynomial + a = 64 bits of data + + q = floor(ma/(2^64)) + result = a - qn +``` + +An example in the floating point domain makes it clearer how this works: + +``` +a mod n = a - floor(am) * n +``` + +Let's use it to calculate 22 mod 10: + +``` + a = 22 + n = 10 + m = 1/n = 1/10 = 0.1 + +22 mod 10 + = 22 - floor(22*0.1) * 10 + = 22 - 2 * 10 + = 22 - 20 + = 2 +``` + +There is one more issue left - bit reflection. Some CRCs are defined to +operate on the least significant bit first (eg CRC32c). Let's look at +how this would get laid out in a register, and let's simplify it to just +two bytes (vs a 16 byte VMX register): + + [ 8..15 ] [ 0..7 ] + +Notice how the bits and bytes are out of order. Since we are doing +multi word multiplication on these values we need them to both be +in order. + +The simplest way to fix this is to reflect the bits in each byte: + + [ 15..8 ] [ 7..0 ] + +However shuffling bits in a byte is expensive on most CPUs. It is +however relatively cheap to shuffle bytes around. What if we load +the bytes in reverse: + + [ 0..7 ] [ 8..15 ] + +Now the bits and bytes are in order, except the least significant bit +of the register is now on the left and the most significant bit is on the +right. We operate as if the register is reflected, which normally we +cannot do. The reason we get away with this is our multiplies are carryless +and our addition and subtraction is xor, so our operations never create +carries. + +The only trick is we have to shift the result of multiplies left one +because the high bit of the multiply is always 0, and we want that high bit +on the right not the left.
+ +Implementation +-------------- + +The vpmsum instructions on POWER8 have a 6 cycle latency and we can +execute one every cycle. In light of this the main loop has 8 parallel +streams which consume 8 x 16 B each iteration. At the completion of this +loop we have taken 32 kB of data and reduced it to 8 x 16 B (128 B). + +The next step is to take this 128 B and reduce it to 8 B. At this stage +we also add 32 bits of 0 to the end. + +We then apply Barrett reduction to get our CRC. + +Examples +-------- +- barrett_reduction: An example of Barrett reduction + +- final_fold: Starting with 128 bits, add 32 bits of zeros and reduce it to + 64 bits, then apply Barrett reduction + +- final_fold2: A second method of reduction + +Acknowledgements +---------------- + +Thanks to Michael Gschwind, Jeff Derby, Lorena Pesantez and Stewart Smith +for their ideas and assistance. diff --git a/src/support/power8/crc32.S b/src/support/power8/crc32.S new file mode 100644 index 00000000000..c0b81143f07 --- /dev/null +++ b/src/support/power8/crc32.S @@ -0,0 +1,771 @@ +#if defined(__powerpc64__) +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. 
+ * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <ppc-asm.h> +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32_constants.h" + + .text + /* BYTESWAP_DATA is defined for big endian with REFLECT or little endian without REFLECT; it decides whether VPERM below expands to a vperm or to nothing */ +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + /* r25-r31 are named after the byte offsets (16..112) that the function loads into them with li on entry */ +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) + * + * Folds len bytes at p into crc with the VPMSUMD/VPMSUMW macros and a final + * Barrett reduction. The code reads the initial crc from r3, the data + * pointer from r4 and the length from r5, and leaves the result in r3. + * + * A length of 0 returns the crc unchanged (.Lzero) and lengths below 256 + * take the .Lshort path. Longer inputs are consumed in blocks of at most + * MAX_SIZE bytes, 128 bytes per loop iteration, followed by a tail counted + * in 16 byte chunks. + * + * NOTE(review): all data is fetched with lvx and the tail is counted in + * whole 16 byte chunks, so callers presumably pass a 16 byte aligned p and + * a len that is a multiple of 16 - confirm against the callers. + */ +FUNC_START(__crc32_vpmsum) /* r25-r31 are stored below r1 here (r1 itself is not changed) and reloaded at .Lout */ + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 /* r0 = 0: v16-v23 hold no data yet; it is set to 1 before branching back to 1: */ + + /* Enough room for saving 10 non volatile VMX registers (v20-v29), placed below the 56 bytes used for r25-r31 and reloaded at .Lout */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 /* keep the incoming crc; .Lzero returns it when len is 0 */ + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 
+ /* build mask_32bit and mask_64bit by shifting the v0 value in behind zeroes */ + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + /* inputs shorter than 256 bytes are handled at .Lshort */ + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 /* r6 = len with the low 7 bits cleared, i.e. whole 128 byte groups only */ + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 /* r7 = bytes for this block (at most MAX_SIZE), r6 = bytes left after it */ + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 /* r8 = MAX_SIZE/8 - 16 * (128 byte groups in this block) */ + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + * (r0 is 1 in that case: it is set just before the bne 1b.)
+ */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 /* NOTE(review): writes r2 back unchanged; presumably an instruction scheduling nop, here and in every later use - confirm */ + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + * + * const1 (loaded at the top of an iteration) and const2 (loaded in the + * previous iteration, or in the second warm up pass) are read from the + * same r3 value, so the const2 and const1 multiplies within one + * iteration use the same table entry.
+ */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. 
+ */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + /* r0 = 1 marks v16-v23 as already loaded. addi has no record form here, so bne acts on the cmpdi r6,0 result: go back to 1: while bytes remain in r6. NOTE(review): the extra 128 added to r6 presumably accounts for the 128 bytes carried over in v16-v23 - confirm */ + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes): one unrolled step per 16 byte chunk, each bdz leaving for 1: once the chunk count in ctr is used up */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 +
VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + /* v0 holds the combined value to reduce; the .Lshort path also branches here */ +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. 
+ */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32 bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of v0 */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + +.Lout: /* common exit: reload v20-v29 and r25-r31 from the slots written on entry. The vector reloads index with off16..off112 (r25-r31), so they come before the ld instructions that overwrite those registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: /* reached from the bdz at 2: - multiply the data already in v16-v23 by the next constant without loading more, then join the second cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: /* len below 256 (from the blt after cmpdi r5,256); a len of 0 goes to .Lzero */ + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? + * NOTE(review): for a len below 16 this count is 0 - confirm that callers + * never pass such a length to this path.
*/ + srdi r7,r5,4 + mtctr r7 + /* v19 and v20 are the two accumulators filled by the .Lv15 - .Lv0 chain below */ + vxor v19,v19,v19 + vxor v20,v20,v20 + /* one unrolled step per 16 byte chunk: load data and its constant, multiply into vN, and leave via bdz for .LvN when the chunk count in ctr is used up. NOTE(review): VPERM on this path passes the constant register as second source, unlike the main path; looks harmless if the byteswap control only selects bytes of the first source - confirm */ + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + /* chunks 8-15 reuse the same offsets against pointers advanced by 128 */ + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + /* fall-through chain: entering at .LvN xors vN down to v0 into the accumulators, odd registers into v19 and even registers into v20 */ +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor 
v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: /* len == 0: hand back the crc saved in r10 at entry */ + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) +#endif diff --git a/src/support/power8/crc32_constants.h b/src/support/power8/crc32_constants.h new file mode 100644 index 00000000000..02c471d1c56 --- /dev/null +++ b/src/support/power8/crc32_constants.h @@ -0,0 +1,901 @@ +#define CRC 0x1edc6f41 +#define CRC_XOR +#define REFLECT + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,}; + +#endif +#else +#define MAX_SIZE 32768 +.constants: + + /* Reduce 262144 bits (MAX_SIZE bytes) to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 
+ + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + 
.octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod 
p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 
0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` 
<< 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 
0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` 
<< 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 
0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` 
<< 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod 
p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod 
p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod 
p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod 
p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 
0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div 
p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/support/power8/crc32_wrapper.c b/src/support/power8/crc32_wrapper.c new file mode 100644 index 00000000000..34ac4150338 --- /dev/null +++ b/src/support/power8/crc32_wrapper.c @@ -0,0 +1,66 @@ +#if defined(__powerpc64__) +#define CRC_TABLE +#include "crc32_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#else +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); + return crc; +} +#endif + +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len); + +unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len) +{ + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, p, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} +#endif diff --git a/src/support/power8/ppc-opcode.h b/src/support/power8/ppc-opcode.h new file mode 100644 index 00000000000..b63feea60a0 --- /dev/null +++ b/src/support/power8/ppc-opcode.h @@ -0,0 +1,23 @@ +#ifndef __OPCODES_H +#define __OPCODES_H + +#define __PPC_RA(a) (((a) & 0x1f) << 16) +#define __PPC_RB(b) (((b) & 0x1f) << 11) +#define 
__PPC_XA(a) ((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3)) +#define __PPC_XB(b) ((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4)) +#define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0) + +#endif diff --git a/src/support/scratch.c b/src/support/scratch.c index 94020ba2621..aea98dc49ef 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -45,7 +45,7 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_RET(__wt_realloc_aligned( session, &buf->memsize, size, &buf->mem)); else - WT_RET(__wt_realloc( + WT_RET(__wt_realloc_noclear( session, &buf->memsize, size, &buf->mem)); } diff --git a/src/support/stat.c b/src/support/stat.c index 7a615131628..2a826eda962 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -3,102 +3,102 @@ #include "wt_internal.h" static const char * const __stats_dsrc_desc[] = { - "block-manager: file allocation unit size", - "block-manager: blocks allocated", - "block-manager: checkpoint size", - "block-manager: allocations requiring file extension", - "block-manager: blocks freed", - "block-manager: file magic number", - "block-manager: file major version number", - "block-manager: minor version number", - "block-manager: file bytes available for reuse", - "block-manager: file size in bytes", - "LSM: bloom filters in the LSM tree", "LSM: bloom filter false positives", "LSM: bloom filter hits", "LSM: bloom filter 
misses", "LSM: bloom filter pages evicted from cache", "LSM: bloom filter pages read into cache", + "LSM: bloom filters in the LSM tree", + "LSM: chunks in the LSM tree", + "LSM: highest merge generation in the LSM tree", + "LSM: queries that could have benefited from a Bloom filter that did not exist", + "LSM: sleep for LSM checkpoint throttle", + "LSM: sleep for LSM merge throttle", "LSM: total size of bloom filters", + "block-manager: allocations requiring file extension", + "block-manager: blocks allocated", + "block-manager: blocks freed", + "block-manager: checkpoint size", + "block-manager: file allocation unit size", + "block-manager: file bytes available for reuse", + "block-manager: file magic number", + "block-manager: file major version number", + "block-manager: file size in bytes", + "block-manager: minor version number", "btree: btree checkpoint generation", - "btree: column-store variable-size deleted values", "btree: column-store fixed-size leaf pages", "btree: column-store internal pages", "btree: column-store variable-size RLE encoded values", + "btree: column-store variable-size deleted values", "btree: column-store variable-size leaf pages", - "btree: pages rewritten by compaction", - "btree: number of key/value pairs", "btree: fixed-record size", - "btree: maximum tree depth", "btree: maximum internal page key size", "btree: maximum internal page size", "btree: maximum leaf page key size", "btree: maximum leaf page size", "btree: maximum leaf page value size", + "btree: maximum tree depth", + "btree: number of key/value pairs", "btree: overflow pages", + "btree: pages rewritten by compaction", "btree: row-store internal pages", "btree: row-store leaf pages", "cache: bytes read into cache", "cache: bytes written from cache", "cache: checkpoint blocked page eviction", - "cache: unmodified pages evicted", - "cache: page split during eviction deepened the tree", - "cache: modified pages evicted", "cache: data source pages selected for eviction 
unable to be evicted", "cache: hazard pointer blocked page eviction", + "cache: in-memory page passed criteria to be split", + "cache: in-memory page splits", "cache: internal pages evicted", "cache: internal pages split during eviction", "cache: leaf pages split during eviction", - "cache: in-memory page splits", - "cache: in-memory page passed criteria to be split", + "cache: modified pages evicted", + "cache: overflow pages read into cache", "cache: overflow values cached in memory", + "cache: page split during eviction deepened the tree", + "cache: page written requiring lookaside records", "cache: pages read into cache", "cache: pages read into cache requiring lookaside entries", - "cache: overflow pages read into cache", "cache: pages written from cache", - "cache: page written requiring lookaside records", "cache: pages written requiring in-memory restoration", - "compression: raw compression call failed, no additional data available", - "compression: raw compression call failed, additional data available", - "compression: raw compression call succeeded", + "cache: unmodified pages evicted", "compression: compressed pages read", "compression: compressed pages written", "compression: page written failed to compress", "compression: page written was too small to compress", - "cursor: create calls", - "cursor: insert calls", + "compression: raw compression call failed, additional data available", + "compression: raw compression call failed, no additional data available", + "compression: raw compression call succeeded", "cursor: bulk-loaded cursor-insert calls", + "cursor: create calls", "cursor: cursor-insert key and value bytes inserted", + "cursor: cursor-remove key bytes removed", + "cursor: cursor-update value bytes updated", + "cursor: insert calls", "cursor: next calls", "cursor: prev calls", "cursor: remove calls", - "cursor: cursor-remove key bytes removed", "cursor: reset calls", "cursor: restarted searches", "cursor: search calls", "cursor: search near 
calls", "cursor: truncate calls", "cursor: update calls", - "cursor: cursor-update value bytes updated", - "LSM: sleep for LSM checkpoint throttle", - "LSM: chunks in the LSM tree", - "LSM: highest merge generation in the LSM tree", - "LSM: queries that could have benefited from a Bloom filter that did not exist", - "LSM: sleep for LSM merge throttle", "reconciliation: dictionary matches", + "reconciliation: fast-path pages deleted", + "reconciliation: internal page key bytes discarded using suffix compression", "reconciliation: internal page multi-block writes", - "reconciliation: leaf page multi-block writes", - "reconciliation: maximum blocks required for a page", "reconciliation: internal-page overflow keys", + "reconciliation: leaf page key bytes discarded using prefix compression", + "reconciliation: leaf page multi-block writes", "reconciliation: leaf-page overflow keys", + "reconciliation: maximum blocks required for a page", "reconciliation: overflow values written", - "reconciliation: pages deleted", - "reconciliation: fast-path pages deleted", "reconciliation: page checksum matches", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", - "reconciliation: leaf page key bytes discarded using prefix compression", - "reconciliation: internal page key bytes discarded using suffix compression", + "reconciliation: pages deleted", "session: object compaction", "session: open cursor count", "transaction: update conflicts", @@ -132,6 +132,18 @@ __wt_stat_dsrc_init(WT_DATA_HANDLE *handle) void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) { + stats->bloom_false_positive = 0; + stats->bloom_hit = 0; + stats->bloom_miss = 0; + stats->bloom_page_evict = 0; + stats->bloom_page_read = 0; + stats->bloom_count = 0; + stats->lsm_chunk_count = 0; + stats->lsm_generation_max = 0; + stats->lsm_lookup_no_bloom = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + stats->bloom_size = 0; 
stats->block_extension = 0; stats->block_alloc = 0; stats->block_free = 0; @@ -145,9 +157,9 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) /* not clearing btree_checkpoint_generation */ stats->btree_column_fix = 0; stats->btree_column_internal = 0; + stats->btree_column_rle = 0; stats->btree_column_deleted = 0; stats->btree_column_variable = 0; - stats->btree_column_rle = 0; stats->btree_fixed_len = 0; stats->btree_maxintlkey = 0; stats->btree_maxintlpage = 0; @@ -202,18 +214,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cursor_search_near = 0; stats->cursor_truncate = 0; stats->cursor_update = 0; - stats->bloom_false_positive = 0; - stats->bloom_hit = 0; - stats->bloom_miss = 0; - stats->bloom_page_evict = 0; - stats->bloom_page_read = 0; - stats->bloom_count = 0; - stats->lsm_chunk_count = 0; - stats->lsm_generation_max = 0; - stats->lsm_lookup_no_bloom = 0; - stats->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = 0; - stats->bloom_size = 0; stats->rec_dictionary = 0; stats->rec_page_delete_fast = 0; stats->rec_suffix_compression = 0; @@ -246,6 +246,19 @@ void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to) { + to->bloom_false_positive += from->bloom_false_positive; + to->bloom_hit += from->bloom_hit; + to->bloom_miss += from->bloom_miss; + to->bloom_page_evict += from->bloom_page_evict; + to->bloom_page_read += from->bloom_page_read; + to->bloom_count += from->bloom_count; + to->lsm_chunk_count += from->lsm_chunk_count; + if (from->lsm_generation_max > to->lsm_generation_max) + to->lsm_generation_max = from->lsm_generation_max; + to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom; + to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle; + to->lsm_merge_throttle += from->lsm_merge_throttle; + to->bloom_size += from->bloom_size; to->block_extension += from->block_extension; to->block_alloc += from->block_alloc; to->block_free += from->block_free; @@ -263,9 +276,9 @@ __wt_stat_dsrc_aggregate_single( 
to->btree_checkpoint_generation += from->btree_checkpoint_generation; to->btree_column_fix += from->btree_column_fix; to->btree_column_internal += from->btree_column_internal; + to->btree_column_rle += from->btree_column_rle; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; - to->btree_column_rle += from->btree_column_rle; if (from->btree_fixed_len > to->btree_fixed_len) to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) @@ -328,19 +341,6 @@ __wt_stat_dsrc_aggregate_single( to->cursor_search_near += from->cursor_search_near; to->cursor_truncate += from->cursor_truncate; to->cursor_update += from->cursor_update; - to->bloom_false_positive += from->bloom_false_positive; - to->bloom_hit += from->bloom_hit; - to->bloom_miss += from->bloom_miss; - to->bloom_page_evict += from->bloom_page_evict; - to->bloom_page_read += from->bloom_page_read; - to->bloom_count += from->bloom_count; - to->lsm_chunk_count += from->lsm_chunk_count; - if (from->lsm_generation_max > to->lsm_generation_max) - to->lsm_generation_max = from->lsm_generation_max; - to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom; - to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle; - to->lsm_merge_throttle += from->lsm_merge_throttle; - to->bloom_size += from->bloom_size; to->rec_dictionary += from->rec_dictionary; to->rec_page_delete_fast += from->rec_page_delete_fast; to->rec_suffix_compression += from->rec_suffix_compression; @@ -367,6 +367,21 @@ __wt_stat_dsrc_aggregate( { int64_t v; + to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); + to->bloom_hit += WT_STAT_READ(from, bloom_hit); + to->bloom_miss += WT_STAT_READ(from, bloom_miss); + to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict); + to->bloom_page_read += WT_STAT_READ(from, bloom_page_read); + to->bloom_count += WT_STAT_READ(from, bloom_count); + to->lsm_chunk_count += WT_STAT_READ(from, 
lsm_chunk_count); + if ((v = WT_STAT_READ(from, lsm_generation_max)) > + to->lsm_generation_max) + to->lsm_generation_max = v; + to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->bloom_size += WT_STAT_READ(from, bloom_size); to->block_extension += WT_STAT_READ(from, block_extension); to->block_alloc += WT_STAT_READ(from, block_alloc); to->block_free += WT_STAT_READ(from, block_free); @@ -387,10 +402,10 @@ __wt_stat_dsrc_aggregate( to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); to->btree_column_internal += WT_STAT_READ(from, btree_column_internal); + to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted); to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); - to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len) to->btree_fixed_len = v; if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) @@ -467,21 +482,6 @@ __wt_stat_dsrc_aggregate( to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); to->cursor_truncate += WT_STAT_READ(from, cursor_truncate); to->cursor_update += WT_STAT_READ(from, cursor_update); - to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); - to->bloom_hit += WT_STAT_READ(from, bloom_hit); - to->bloom_miss += WT_STAT_READ(from, bloom_miss); - to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict); - to->bloom_page_read += WT_STAT_READ(from, bloom_page_read); - to->bloom_count += WT_STAT_READ(from, bloom_count); - to->lsm_chunk_count += WT_STAT_READ(from, lsm_chunk_count); - if ((v = WT_STAT_READ(from, lsm_generation_max)) > - to->lsm_generation_max) - to->lsm_generation_max = v; - to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom); 
- to->lsm_checkpoint_throttle += - WT_STAT_READ(from, lsm_checkpoint_throttle); - to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); - to->bloom_size += WT_STAT_READ(from, bloom_size); to->rec_dictionary += WT_STAT_READ(from, rec_dictionary); to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_suffix_compression += @@ -509,12 +509,22 @@ __wt_stat_dsrc_aggregate( } static const char * const __stats_connection_desc[] = { - "async: number of allocation state races", - "async: number of operation slots viewed for allocation", + "LSM: application work units currently queued", + "LSM: merge work units currently queued", + "LSM: rows merged in an LSM tree", + "LSM: sleep for LSM checkpoint throttle", + "LSM: sleep for LSM merge throttle", + "LSM: switch work units currently queued", + "LSM: tree maintenance operations discarded", + "LSM: tree maintenance operations executed", + "LSM: tree maintenance operations scheduled", + "LSM: tree queue hit maximum", "async: current work queue length", + "async: maximum work queue length", + "async: number of allocation state races", "async: number of flush calls", + "async: number of operation slots viewed for allocation", "async: number of times operation allocation failed", - "async: maximum work queue length", "async: number of times worker found no work", "async: total allocations", "async: total compact calls", @@ -522,55 +532,66 @@ static const char * const __stats_connection_desc[] = { "async: total remove calls", "async: total search calls", "async: total update calls", - "block-manager: mapped bytes read", - "block-manager: bytes read", - "block-manager: bytes written", - "block-manager: mapped blocks read", "block-manager: blocks pre-loaded", "block-manager: blocks read", "block-manager: blocks written", - "cache: tracked dirty bytes in the cache", - "cache: tracked bytes belonging to internal pages in the cache", + "block-manager: bytes read", + "block-manager: bytes written", + 
"block-manager: mapped blocks read", + "block-manager: mapped bytes read", "cache: bytes currently in the cache", - "cache: tracked bytes belonging to leaf pages in the cache", - "cache: maximum bytes configured", - "cache: tracked bytes belonging to overflow pages in the cache", "cache: bytes read into cache", "cache: bytes written from cache", - "cache: pages evicted by application threads", "cache: checkpoint blocked page eviction", - "cache: unmodified pages evicted", - "cache: page split during eviction deepened the tree", - "cache: modified pages evicted", - "cache: pages selected for eviction unable to be evicted", - "cache: pages evicted because they exceeded the in-memory maximum", - "cache: pages evicted because they had chains of deleted items", - "cache: failed eviction of pages that exceeded the in-memory maximum", - "cache: hazard pointer blocked page eviction", - "cache: internal pages evicted", - "cache: maximum page size at eviction", + "cache: eviction currently operating in aggressive mode", "cache: eviction server candidate queue empty when topping up", "cache: eviction server candidate queue not empty when topping up", "cache: eviction server evicting pages", "cache: eviction server populating queue, but not evicting pages", "cache: eviction server unable to reach eviction goal", - "cache: internal pages split during eviction", - "cache: leaf pages split during eviction", - "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", - "cache: in-memory page splits", + "cache: failed eviction of pages that exceeded the in-memory maximum", + "cache: hazard pointer blocked page eviction", "cache: in-memory page passed criteria to be split", + "cache: in-memory page splits", + "cache: internal pages evicted", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: lookaside table insert calls", "cache: lookaside table remove calls", - "cache: percentage overhead", - "cache: 
tracked dirty pages in the cache", + "cache: maximum bytes configured", + "cache: maximum page size at eviction", + "cache: modified pages evicted", + "cache: page split during eviction deepened the tree", + "cache: page written requiring lookaside records", "cache: pages currently held in the cache", + "cache: pages evicted because they exceeded the in-memory maximum", + "cache: pages evicted because they had chains of deleted items", + "cache: pages evicted by application threads", "cache: pages read into cache", "cache: pages read into cache requiring lookaside entries", + "cache: pages selected for eviction unable to be evicted", + "cache: pages walked for eviction", "cache: pages written from cache", - "cache: page written requiring lookaside records", "cache: pages written requiring in-memory restoration", + "cache: percentage overhead", + "cache: tracked bytes belonging to internal pages in the cache", + "cache: tracked bytes belonging to leaf pages in the cache", + "cache: tracked bytes belonging to overflow pages in the cache", + "cache: tracked dirty bytes in the cache", + "cache: tracked dirty pages in the cache", + "cache: unmodified pages evicted", + "connection: auto adjusting condition resets", + "connection: auto adjusting condition wait calls", + "connection: files currently open", + "connection: memory allocations", + "connection: memory frees", + "connection: memory re-allocations", "connection: pthread mutex condition wait calls", + "connection: pthread mutex shared lock read-lock calls", + "connection: pthread mutex shared lock write-lock calls", + "connection: total read I/Os", + "connection: total write I/Os", "cursor: cursor create calls", "cursor: cursor insert calls", "cursor: cursor next calls", @@ -580,96 +601,81 @@ static const char * const __stats_connection_desc[] = { "cursor: cursor restarted searches", "cursor: cursor search calls", "cursor: cursor search near calls", - "cursor: truncate calls", "cursor: cursor update calls", + 
"cursor: truncate calls", "data-handle: connection data handles currently active", - "data-handle: session dhandles swept", - "data-handle: session sweep attempts", - "data-handle: connection sweep dhandles closed", "data-handle: connection sweep candidate became referenced", + "data-handle: connection sweep dhandles closed", "data-handle: connection sweep dhandles removed from hash list", "data-handle: connection sweep time-of-death sets", "data-handle: connection sweeps", - "connection: files currently open", - "log: total log buffer size", + "data-handle: session dhandles swept", + "data-handle: session sweep attempts", + "log: busy returns attempting to switch slots", + "log: consolidated slot closures", + "log: consolidated slot join races", + "log: consolidated slot join transitions", + "log: consolidated slot joins", + "log: consolidated slot unbuffered writes", "log: log bytes of payload data", "log: log bytes written", - "log: yields waiting for previous log file close", - "log: total size of compressed records", - "log: total in-memory size of compressed records", - "log: log records too small to compress", - "log: log records not compressed", - "log: log records compressed", + "log: log files manually zero-filled", "log: log flush operations", + "log: log force write operations", + "log: log force write operations skipped", + "log: log records compressed", + "log: log records not compressed", + "log: log records too small to compress", + "log: log release advances write LSN", + "log: log scan operations", + "log: log scan records requiring two reads", + "log: log server thread advances write LSN", + "log: log server thread write LSN walk skipped", + "log: log sync operations", + "log: log sync_dir operations", + "log: log write operations", + "log: logging bytes consolidated", "log: maximum log file size", - "log: pre-allocated log files prepared", "log: number of pre-allocated log files to create", "log: pre-allocated log files not ready and missed", + 
"log: pre-allocated log files prepared", "log: pre-allocated log files used", - "log: log release advances write LSN", "log: records processed by log scan", - "log: log scan records requiring two reads", - "log: log scan operations", - "log: consolidated slot closures", + "log: total in-memory size of compressed records", + "log: total log buffer size", + "log: total size of compressed records", "log: written slots coalesced", - "log: logging bytes consolidated", - "log: consolidated slot joins", - "log: consolidated slot join races", - "log: busy returns attempting to switch slots", - "log: consolidated slot join transitions", - "log: consolidated slot unbuffered writes", - "log: log sync operations", - "log: log sync_dir operations", - "log: log server thread advances write LSN", - "log: log write operations", - "log: log files manually zero-filled", - "LSM: sleep for LSM checkpoint throttle", - "LSM: sleep for LSM merge throttle", - "LSM: rows merged in an LSM tree", - "LSM: application work units currently queued", - "LSM: merge work units currently queued", - "LSM: tree queue hit maximum", - "LSM: switch work units currently queued", - "LSM: tree maintenance operations scheduled", - "LSM: tree maintenance operations discarded", - "LSM: tree maintenance operations executed", - "connection: memory allocations", - "connection: memory frees", - "connection: memory re-allocations", - "thread-yield: page acquire busy blocked", - "thread-yield: page acquire eviction blocked", - "thread-yield: page acquire locked blocked", - "thread-yield: page acquire read blocked", - "thread-yield: page acquire time sleeping (usecs)", - "connection: total read I/Os", - "reconciliation: pages deleted", + "log: yields waiting for previous log file close", "reconciliation: fast-path pages deleted", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", + "reconciliation: pages deleted", "reconciliation: split bytes currently awaiting 
free", "reconciliation: split objects currently awaiting free", - "connection: pthread mutex shared lock read-lock calls", - "connection: pthread mutex shared lock write-lock calls", "session: open cursor count", "session: open session count", + "thread-yield: page acquire busy blocked", + "thread-yield: page acquire eviction blocked", + "thread-yield: page acquire locked blocked", + "thread-yield: page acquire read blocked", + "thread-yield: page acquire time sleeping (usecs)", + "transaction: number of named snapshots created", + "transaction: number of named snapshots dropped", "transaction: transaction begins", - "transaction: transaction checkpoints", - "transaction: transaction checkpoint generation", "transaction: transaction checkpoint currently running", + "transaction: transaction checkpoint generation", "transaction: transaction checkpoint max time (msecs)", "transaction: transaction checkpoint min time (msecs)", "transaction: transaction checkpoint most recent time (msecs)", "transaction: transaction checkpoint total time (msecs)", - "transaction: transactions committed", + "transaction: transaction checkpoints", "transaction: transaction failures due to cache overflow", - "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned", + "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned by named snapshots", - "transaction: transactions rolled back", - "transaction: number of named snapshots created", - "transaction: number of named snapshots dropped", "transaction: transaction sync calls", - "connection: total write I/Os", + "transaction: transactions committed", + "transaction: transactions rolled back", }; int @@ -700,6 +706,16 @@ __wt_stat_connection_init(WT_CONNECTION_IMPL *handle) void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) { + /* not clearing lsm_work_queue_app */ + /* not 
clearing lsm_work_queue_manager */ + stats->lsm_rows_merged = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + /* not clearing lsm_work_queue_switch */ + stats->lsm_work_units_discarded = 0; + stats->lsm_work_units_done = 0; + stats->lsm_work_units_created = 0; + stats->lsm_work_queue_max = 0; stats->async_cur_queue = 0; /* not clearing async_max_queue */ stats->async_alloc_race = 0; @@ -724,6 +740,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_bytes_read = 0; stats->cache_bytes_write = 0; stats->cache_eviction_checkpoint = 0; + /* not clearing cache_eviction_aggressive_set */ stats->cache_eviction_queue_empty = 0; stats->cache_eviction_queue_not_empty = 0; stats->cache_eviction_server_evicting = 0; @@ -761,6 +778,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing cache_bytes_dirty */ /* not clearing cache_pages_dirty */ stats->cache_eviction_clean = 0; + stats->cond_auto_wait_reset = 0; + stats->cond_auto_wait = 0; /* not clearing file_open */ stats->memory_allocation = 0; stats->memory_free = 0; @@ -799,6 +818,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_bytes_written = 0; stats->log_zero_fills = 0; stats->log_flush = 0; + stats->log_force_write = 0; + stats->log_force_write_skip = 0; stats->log_compress_writes = 0; stats->log_compress_write_fails = 0; stats->log_compress_small = 0; @@ -806,6 +827,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_scans = 0; stats->log_scan_rereads = 0; stats->log_write_lsn = 0; + stats->log_write_lsn_skip = 0; stats->log_sync = 0; stats->log_sync_dir = 0; stats->log_writes = 0; @@ -821,16 +843,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_compress_len = 0; stats->log_slot_coalesced = 0; stats->log_close_yields = 0; - /* not clearing lsm_work_queue_app */ - /* not clearing lsm_work_queue_manager */ - stats->lsm_rows_merged = 0; - 
stats->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = 0; - /* not clearing lsm_work_queue_switch */ - stats->lsm_work_units_discarded = 0; - stats->lsm_work_units_done = 0; - stats->lsm_work_units_created = 0; - stats->lsm_work_queue_max = 0; stats->rec_page_delete_fast = 0; stats->rec_pages = 0; stats->rec_pages_eviction = 0; @@ -876,6 +888,21 @@ void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to) { + to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app); + to->lsm_work_queue_manager += + WT_STAT_READ(from, lsm_work_queue_manager); + to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->lsm_work_queue_switch += + WT_STAT_READ(from, lsm_work_queue_switch); + to->lsm_work_units_discarded += + WT_STAT_READ(from, lsm_work_units_discarded); + to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done); + to->lsm_work_units_created += + WT_STAT_READ(from, lsm_work_units_created); + to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); to->async_cur_queue += WT_STAT_READ(from, async_cur_queue); to->async_max_queue += WT_STAT_READ(from, async_max_queue); to->async_alloc_race += WT_STAT_READ(from, async_alloc_race); @@ -901,6 +928,8 @@ __wt_stat_connection_aggregate( to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write); to->cache_eviction_checkpoint += WT_STAT_READ(from, cache_eviction_checkpoint); + to->cache_eviction_aggressive_set += + WT_STAT_READ(from, cache_eviction_aggressive_set); to->cache_eviction_queue_empty += WT_STAT_READ(from, cache_eviction_queue_empty); to->cache_eviction_queue_not_empty += @@ -955,6 +984,8 @@ __wt_stat_connection_aggregate( to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty); to->cache_eviction_clean += 
WT_STAT_READ(from, cache_eviction_clean); + to->cond_auto_wait_reset += WT_STAT_READ(from, cond_auto_wait_reset); + to->cond_auto_wait += WT_STAT_READ(from, cond_auto_wait); to->file_open += WT_STAT_READ(from, file_open); to->memory_allocation += WT_STAT_READ(from, memory_allocation); to->memory_free += WT_STAT_READ(from, memory_free); @@ -993,6 +1024,8 @@ __wt_stat_connection_aggregate( to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); to->log_zero_fills += WT_STAT_READ(from, log_zero_fills); to->log_flush += WT_STAT_READ(from, log_flush); + to->log_force_write += WT_STAT_READ(from, log_force_write); + to->log_force_write_skip += WT_STAT_READ(from, log_force_write_skip); to->log_compress_writes += WT_STAT_READ(from, log_compress_writes); to->log_compress_write_fails += WT_STAT_READ(from, log_compress_write_fails); @@ -1002,6 +1035,7 @@ __wt_stat_connection_aggregate( to->log_scans += WT_STAT_READ(from, log_scans); to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads); to->log_write_lsn += WT_STAT_READ(from, log_write_lsn); + to->log_write_lsn_skip += WT_STAT_READ(from, log_write_lsn_skip); to->log_sync += WT_STAT_READ(from, log_sync); to->log_sync_dir += WT_STAT_READ(from, log_sync_dir); to->log_writes += WT_STAT_READ(from, log_writes); @@ -1018,21 +1052,6 @@ __wt_stat_connection_aggregate( to->log_compress_len += WT_STAT_READ(from, log_compress_len); to->log_slot_coalesced += WT_STAT_READ(from, log_slot_coalesced); to->log_close_yields += WT_STAT_READ(from, log_close_yields); - to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app); - to->lsm_work_queue_manager += - WT_STAT_READ(from, lsm_work_queue_manager); - to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged); - to->lsm_checkpoint_throttle += - WT_STAT_READ(from, lsm_checkpoint_throttle); - to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); - to->lsm_work_queue_switch += - WT_STAT_READ(from, lsm_work_queue_switch); - to->lsm_work_units_discarded += - 
WT_STAT_READ(from, lsm_work_units_discarded); - to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done); - to->lsm_work_units_created += - WT_STAT_READ(from, lsm_work_units_created); - to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_pages += WT_STAT_READ(from, rec_pages); to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); diff --git a/src/txn/txn.c b/src/txn/txn.c index e8fd8c0c119..7a768a8fe20 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -344,7 +344,7 @@ retry: current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 - " pinned in session %d [%s]" + " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 6a2c1eef826..1eebc9e9d04 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -8,6 +8,10 @@ #include "wt_internal.h" +static int __checkpoint_lock_tree( + WT_SESSION_IMPL *, bool, bool, const char *[]); +static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); + /* * __wt_checkpoint_name_ok -- * Complain if the checkpoint name isn't acceptable. @@ -155,8 +159,8 @@ __checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], ckpt_closed = cval.len != 0; } WT_ERR(ckpt_closed ? - __wt_meta_btree_apply(session, op, cfg) : - __wt_conn_btree_apply(session, false, NULL, op, cfg)); + __wt_meta_apply_all(session, op, NULL, cfg) : + __wt_conn_btree_apply(session, NULL, op, NULL, cfg)); } if (fullp != NULL) @@ -179,14 +183,8 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], /* If we have already locked the handles, apply the operation. 
*/ for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle != NULL) - WT_WITH_DHANDLE(session, - session->ckpt_handle[i].dhandle, - ret = (*op)(session, cfg)); - else - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single(session, - session->ckpt_handle[i].name, NULL, op, cfg)); + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + ret = (*op)(session, cfg)); WT_RET(ret); } @@ -230,11 +228,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) } /* - * __wt_checkpoint_list -- + * __wt_checkpoint_get_handles -- * Get a list of handles to flush. */ int -__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; const char *name; @@ -257,15 +255,18 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) name = session->dhandle->name; session->dhandle = NULL; - /* Record busy file names, we'll deal with them in the checkpoint. */ - if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) == 0) - session->ckpt_handle[session->ckpt_handle_next++].dhandle = - session->dhandle; - else if (ret == EBUSY) - ret = __wt_strdup(session, name, - &session->ckpt_handle[session->ckpt_handle_next++].name); + if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 
0 : ret); - return (ret); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, true, true, cfg)); + if (ret != 0) { + WT_TRET(__wt_session_release_btree(session)); + return (ret); + } + + session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; + return (0); } /* @@ -277,7 +278,7 @@ __checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[]) { WT_UNUSED(cfg); - return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES)); + return (__wt_cache_op(session, WT_SYNC_WRITE_LEAVES)); } /* @@ -381,15 +382,20 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Configure logging only if doing a full checkpoint. */ logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); + /* Keep track of handles acquired for locking. */ + WT_ERR(__wt_meta_track_on(session)); + tracking = true; + /* * Get a list of handles we want to flush; this may pull closed objects * into the session cache, but we're going to do that eventually anyway. */ + WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, ret, WT_WITH_TABLE_LOCK(session, ret, WT_WITH_HANDLE_LIST_LOCK(session, ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_list, NULL)))); + session, cfg, __wt_checkpoint_get_handles, NULL)))); WT_ERR(ret); /* @@ -418,15 +424,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * completion. Do it after flushing the pages to give the * asynchronous flush as much time as possible before we wait. */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - - /* Acquire the schema lock. */ - F_SET(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_lock(session, &conn->schema_lock); - - WT_ERR(__wt_meta_track_on(session)); - tracking = true; + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); /* Tell logging that we are about to start a database checkpoint. 
*/ if (full && logging) @@ -440,6 +438,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_epoch(session, &start)); /* + * Start the checkpoint for real. + * * Bump the global checkpoint generation, used to figure out whether * checkpoint has visited a tree. There is no need for this to be * atomic: it is only written while holding the checkpoint lock. @@ -503,7 +503,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); /* * Clear the dhandle so the visibility check doesn't get confused about @@ -522,8 +522,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); WT_ERR(__checkpoint_verbose_track(session, "sync completed", &verb_timer)); @@ -543,16 +542,25 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Recovery relies on the checkpoint LSN in the metadata only being * updated by full checkpoints so only checkpoint the metadata for * full or non-logged checkpoints. + * + * This is very similar to __wt_meta_track_off, ideally they would be + * merged. */ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; /* Disable metadata tracking during the metadata checkpoint. 
*/ saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(session, cfg))); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(session, cfg)); - session->meta_track_next = saved_meta_next; + ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); WT_ERR(__checkpoint_verbose_track(session, @@ -610,23 +618,13 @@ err: /* WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); } - for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle == NULL) { - __wt_free(session, session->ckpt_handle[i].name); - continue; - } - WT_WITH_DHANDLE(session, session->ckpt_handle[i].dhandle, + for (i = 0; i < session->ckpt_handle_next; ++i) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); - } __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; - if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { - F_CLR(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_unlock(session, &conn->schema_lock); - } - session->isolation = txn->isolation = saved_isolation; return (ret); } @@ -768,14 +766,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) } /* - * __checkpoint_worker -- - * Checkpoint a tree. + * __checkpoint_lock_tree -- + * Acquire the locks required to checkpoint a tree. 
*/ static int -__checkpoint_worker(WT_SESSION_IMPL *session, - const char *cfg[], bool is_checkpoint, bool need_tracking) +__checkpoint_lock_tree(WT_SESSION_IMPL *session, + bool is_checkpoint, bool need_tracking, const char *cfg[]) { - WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONFIG dropconf; @@ -783,19 +780,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - WT_LSN ckptlsn; - int deleted, was_modified; - bool fake_ckpt, force, hot_backup_locked; - const char *name; char *name_alloc; + const char *name; + bool hot_backup_locked; btree = S2BT(session); - bm = btree->bm; conn = S2C(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; - was_modified = btree->modified; - fake_ckpt = hot_backup_locked = false; + hot_backup_locked = false; name_alloc = NULL; /* @@ -814,15 +807,6 @@ __checkpoint_worker(WT_SESSION_IMPL *session, WT_ASSERT(session, !need_tracking || WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session)); - /* - * Set the checkpoint LSN to the maximum LSN so that if logging is - * disabled, recovery will never roll old changes forward over the - * non-logged changes in this checkpoint. If logging is enabled, a - * real checkpoint LSN will be assigned later for this checkpoint and - * overwrite this. - */ - WT_MAX_LSN(&ckptlsn); - /* Get the list of checkpoints for this file. */ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); @@ -873,74 +857,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session, /* Drop checkpoints with the same name as the one we're taking. */ __drop(ckptbase, name, strlen(name)); - /* - * Check for clean objects not requiring a checkpoint. - * - * If we're closing a handle, and the object is clean, we can skip the - * checkpoint, whatever checkpoints we have are sufficient. 
(We might - * not have any checkpoints if the object was never modified, and that's - * OK: the object creation code doesn't mark the tree modified so we can - * skip newly created trees here.) - * - * If the application repeatedly checkpoints an object (imagine hourly - * checkpoints using the same explicit or internal name), there's no - * reason to repeat the checkpoint for clean objects. The test is if - * the only checkpoint we're deleting is the last one in the list and - * it has the same name as the checkpoint we're about to take, skip the - * work. (We can't skip checkpoints that delete more than the last - * checkpoint because deleting those checkpoints might free up space in - * the file.) This means an application toggling between two (or more) - * checkpoint names will repeatedly take empty checkpoints, but that's - * not likely enough to make detection worthwhile. - * - * Checkpoint read-only objects otherwise: the application must be able - * to open the checkpoint in a cursor after taking any checkpoint, which - * means it must exist. - */ - force = false; - F_CLR(btree, WT_BTREE_SKIP_CKPT); - if (!btree->modified && cfg != NULL) { - ret = __wt_config_gets(session, cfg, "force", &cval); - if (ret != 0 && ret != WT_NOTFOUND) - WT_ERR(ret); - if (ret == 0 && cval.val != 0) - force = true; - } - if (!btree->modified && !force) { - if (!is_checkpoint) - goto nockpt; - - deleted = 0; - WT_CKPT_FOREACH(ckptbase, ckpt) - if (F_ISSET(ckpt, WT_CKPT_DELETE)) - ++deleted; - /* - * Complicated test: if the last checkpoint in the object has - * the same name as the checkpoint we're taking (correcting for - * internal checkpoint names with their generational suffix - * numbers), we can skip the checkpoint, there's nothing to do. - * The exception is if we're deleting two or more checkpoints: - * then we may save space. 
- */ - if (ckpt > ckptbase && - (strcmp(name, (ckpt - 1)->name) == 0 || - (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && - WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && - deleted < 2) { -nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); - WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, - btree->checkpoint_gen); - goto done; - } - } - /* Add a new checkpoint entry at the end of the list. */ WT_CKPT_FOREACH(ckptbase, ckpt) ; WT_ERR(__wt_strdup(session, name, &ckpt->name)); + /* + * We are now done with the local use of the name. Free the local + * allocation, if needed. + */ + __wt_free(session, name_alloc); F_SET(ckpt, WT_CKPT_ADD); /* @@ -1021,32 +946,128 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); * copy instead of forcing checkpoints on clean objects to associate * names with checkpoints. */ - if (is_checkpoint) - switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) { - case 0: - break; - case WT_BTREE_BULK: - /* - * The only checkpoints a bulk-loaded file should have - * are fake ones we created without the underlying block - * manager. I'm leaving this code here because it's a - * cheap test and a nasty race. 
- */ - WT_CKPT_FOREACH(ckptbase, ckpt) - if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE)) - WT_ERR_MSG(session, ret, - "block-manager checkpoint found " - "for a bulk-loaded file"); - fake_ckpt = true; - goto fake; - case WT_BTREE_REBALANCE: - case WT_BTREE_SALVAGE: - case WT_BTREE_UPGRADE: - case WT_BTREE_VERIFY: - WT_ERR_MSG(session, EINVAL, - "checkpoints are blocked during rebalance, " - "salvage, upgrade or verify operations"); + WT_ASSERT(session, + !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); + + hot_backup_locked = false; + WT_ERR(__wt_readunlock(session, conn->hot_backup_lock)); + + WT_ASSERT(session, btree->ckpt == NULL); + btree->ckpt = ckptbase; + + return (0); + +err: if (hot_backup_locked) + WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); + + __wt_meta_ckptlist_free(session, ckptbase); + __wt_free(session, name_alloc); + + return (ret); +} + +/* + * __checkpoint_tree -- + * Checkpoint a single tree. + * Assumes all necessary locks have been acquired by the caller. + */ +static int +__checkpoint_tree( + WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[]) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_LSN ckptlsn; + const char *name; + int deleted, was_modified; + bool fake_ckpt, force; + + btree = S2BT(session); + bm = btree->bm; + ckptbase = btree->ckpt; + conn = S2C(session); + dhandle = session->dhandle; + fake_ckpt = false; + was_modified = btree->modified; + + /* + * Set the checkpoint LSN to the maximum LSN so that if logging is + * disabled, recovery will never roll old changes forward over the + * non-logged changes in this checkpoint. If logging is enabled, a + * real checkpoint LSN will be assigned for this checkpoint and + * overwrite this. + */ + WT_MAX_LSN(&ckptlsn); + + /* + * Check for clean objects not requiring a checkpoint. 
+ * + * If we're closing a handle, and the object is clean, we can skip the + * checkpoint, whatever checkpoints we have are sufficient. (We might + * not have any checkpoints if the object was never modified, and that's + * OK: the object creation code doesn't mark the tree modified so we can + * skip newly created trees here.) + * + * If the application repeatedly checkpoints an object (imagine hourly + * checkpoints using the same explicit or internal name), there's no + * reason to repeat the checkpoint for clean objects. The test is if + * the only checkpoint we're deleting is the last one in the list and + * it has the same name as the checkpoint we're about to take, skip the + * work. (We can't skip checkpoints that delete more than the last + * checkpoint because deleting those checkpoints might free up space in + * the file.) This means an application toggling between two (or more) + * checkpoint names will repeatedly take empty checkpoints, but that's + * not likely enough to make detection worthwhile. + * + * Checkpoint read-only objects otherwise: the application must be able + * to open the checkpoint in a cursor after taking any checkpoint, which + * means it must exist. + */ + force = false; + F_CLR(btree, WT_BTREE_SKIP_CKPT); + if (!btree->modified && cfg != NULL) { + ret = __wt_config_gets(session, cfg, "force", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_ERR(ret); + if (ret == 0 && cval.val != 0) + force = true; + } + if (!btree->modified && !force) { + if (!is_checkpoint) + goto nockpt; + + deleted = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + ++deleted; + /* + * Complicated test: if the tree is clean and last two + * checkpoints have the same name (correcting for internal + * checkpoint names with their generational suffix numbers), we + * can skip the checkpoint, there's nothing to do. The + * exception is if we're deleting two or more checkpoints: then + * we may save space. 
+ */ + name = (ckpt - 1)->name; + if (ckpt > ckptbase + 1 && deleted < 2 && + (strcmp(name, (ckpt - 2)->name) == 0 || + (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && + WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) { +nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, + btree->checkpoint_gen); + ret = 0; + goto err; } + } /* * If an object has never been used (in other words, if it could become @@ -1100,9 +1121,9 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); /* Flush the file from the cache, creating the checkpoint. */ if (is_checkpoint) - WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT)); + WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT)); else - WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE)); + WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE)); /* * All blocks being written have been written; set the object's write @@ -1134,9 +1155,8 @@ fake: /* * sync the file here or we could roll forward the metadata in * recovery and open a checkpoint that isn't yet durable. */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC) && - (WT_IS_METADATA(session, dhandle) || - !F_ISSET(&session->txn, WT_TXN_RUNNING))) + if (WT_IS_METADATA(session, dhandle) || + !F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); WT_ERR(__wt_meta_ckptlist_set( @@ -1161,7 +1181,6 @@ fake: /* WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_STOP, NULL)); -done: err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. 
@@ -1169,29 +1188,42 @@ err: /* if (ret != 0 && !btree->modified && was_modified) btree->modified = 1; - if (hot_backup_locked) - WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); - __wt_meta_ckptlist_free(session, ckptbase); - __wt_free(session, name_alloc); + btree->ckpt = NULL; return (ret); } /* + * __checkpoint_tree_helper -- + * Checkpoint a tree (suitable for use in *_apply functions). + */ +static int +__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) +{ + return (__checkpoint_tree(session, true, cfg)); +} + +/* * __wt_checkpoint -- * Checkpoint a file. */ int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_DECL_RET; + /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); - /* Should be holding the schema lock. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + /* We must hold the metadata lock if checkpointing the metadata. */ + WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) || + F_ISSET(session, WT_SESSION_LOCKED_METADATA)); - return (__checkpoint_worker(session, cfg, true, true)); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, true, true, cfg)); + WT_RET(ret); + return (__checkpoint_tree(session, true, cfg)); } /* @@ -1210,8 +1242,9 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); - /* Should have an underlying block manager reference. */ - WT_ASSERT(session, bm != NULL); + /* Unnecessary if checkpoint_sync has been configured "off". 
*/ + if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) + return (0); return (bm->sync(bm, session, false)); } @@ -1240,7 +1273,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) - return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + return (__wt_cache_op(session, WT_SYNC_DISCARD)); /* * If closing an unmodified file, check that no update is required @@ -1249,21 +1282,13 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (!btree->modified && !bulk) { __wt_txn_update_oldest(session, true); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? - __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY); + __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY); } /* - * We should already have the schema lock unless we're finishing a bulk - * load -- the only other paths to closing files (sweep and LSM) have - * already checked for read-only trees. - */ - WT_ASSERT(session, - final || bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - - /* * Turn on metadata tracking if: * - The session is not already doing metadata tracking. - * - The file was bulk loaded. + * - The file was not bulk loaded. * - The close is not during connection close. 
*/ need_tracking = !WT_META_TRACKING(session) && !bulk && !final; @@ -1271,10 +1296,14 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (need_tracking) WT_RET(__wt_meta_track_on(session)); - WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking)); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, false, need_tracking, NULL)); + WT_ASSERT(session, ret == 0); + if (ret == 0) + ret = __checkpoint_tree(session, false, NULL); if (need_tracking) - WT_RET(__wt_meta_track_off(session, true, ret != 0)); + WT_TRET(__wt_meta_track_off(session, true, ret != 0)); return (ret); } diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index e6bd8a8d755..1ea4dba1152 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -88,11 +88,11 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, * Helper to a cursor if this operation is to be applied during recovery. */ #define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \ - WT_ERR(__recovery_cursor( \ - (session), (r), (lsnp), (fileid), false, (cp))); \ - WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY, \ - "%s op %d to file %d at LSN %u/%u", \ - (cursor == NULL) ? "Skipping" : "Applying", \ + WT_ERR(__recovery_cursor(session, r, lsnp, fileid, false, cp)); \ + WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, \ + "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \ + "/%" PRIu32, \ + cursor == NULL ? 
"Skipping" : "Applying", \ optype, fileid, lsnp->l.file, lsnp->l.offset)); \ if (cursor == NULL) \ break @@ -334,7 +334,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) r->files[fileid].ckpt_lsn = lsn; WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY, - "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu32 ")", + "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file, lsn.l.offset)); return (0); @@ -449,6 +449,18 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ if (!was_backup) { r.metadata_only = true; + /* + * If this is a read-only connection, check if the checkpoint + * LSN in the metadata file is up to date, indicating a clean + * shutdown. + */ + if (F_ISSET(conn, WT_CONN_READONLY)) { + WT_ERR(__wt_log_needs_recovery( + session, &metafile->ckpt_lsn, &needs_rec)); + if (needs_rec) + WT_ERR_MSG(session, WT_RUN_RECOVERY, + "Read-only database needs recovery"); + } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); @@ -484,7 +496,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ r.metadata_only = false; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, - "Main recovery loop: starting at %u/%u", + "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* @@ -492,8 +504,17 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * return an error if the user does not want automatic * recovery. 
*/ - if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) + if (needs_rec && + (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || + F_ISSET(conn, WT_CONN_READONLY))) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR_MSG(session, WT_RUN_RECOVERY, + "Read-only database needs recovery"); WT_ERR(WT_RUN_RECOVERY); + } + + if (F_ISSET(conn, WT_CONN_READONLY)) + goto done; /* * Recovery can touch more data than fits in cache, so it relies on @@ -504,7 +525,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session) eviction_started = true; /* - * Always run recovery even if it was a clean shutdown. + * Always run recovery even if it was a clean shutdown only if + * this is not a read-only connection. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index ca761a52d8a..aedd9168fbd 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -22,10 +22,10 @@ static int dump_prefix(WT_SESSION *, bool); static int dump_record(WT_CURSOR *, bool, bool); static int dump_suffix(WT_SESSION *); static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *); -static int dump_table_config_type( +static int dump_table_config_complex( WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, const char *); static int dup_json_string(const char *, char **); -static int print_config(WT_SESSION *, const char *, const char *, const char *); +static int print_config(WT_SESSION *, const char *, char *[]); static int usage(void); int @@ -150,9 +150,9 @@ dump_config(WT_SESSION *session, const char *uri, bool hex) /* Open a metadata cursor. 
*/ if ((ret = session->open_cursor( - session, "metadata:create", NULL, NULL, &cursor)) != 0) { + session, "metadata:", NULL, NULL, &cursor)) != 0) { fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, - "metadata:create", session->strerror(session, ret)); + "metadata:", session->strerror(session, ret)); return (1); } /* @@ -352,12 +352,23 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) static int dump_json_table_config(WT_SESSION *session, const char *uri) { + WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_DECL_RET; + size_t len; int tret; - char *value; + const char *name, *value; + char *p; + + p = NULL; + + /* Get the table name. */ + if ((name = strchr(uri, ':')) == NULL) { + fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); + return (1); + } + ++name; - /* Dump the config. */ /* Open a metadata cursor. */ if ((ret = session->open_cursor( session, "metadata:create", NULL, NULL, &cursor)) != 0) { @@ -368,12 +379,41 @@ dump_json_table_config(WT_SESSION *session, const char *uri) } /* - * Search for the object itself, to make sure it - * exists, and get its config string. This where we - * find out a table object doesn't exist, use a simple - * error message. + * Search for the object itself, just to make sure it exists, we don't + * want to output a header if the user entered the wrong name. This is + * where we find out a table doesn't exist, use a simple error message. + * + * Workaround for WiredTiger "simple" table handling. Simple tables + * have column-group entries, but they aren't listed in the metadata's + * table entry. Figure out if it's a simple table and in that case, + * retrieve the column-group entry and use the value from its "source" + * file. 
*/ - cursor->set_key(cursor, uri); + if (WT_PREFIX_MATCH(uri, "table:")) { + len = strlen("colgroup:") + strlen(name) + 1; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "colgroup:%s", name); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) == 0) { + if ((ret = cursor->get_value(cursor, &value)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((ret = __wt_config_getones( + (WT_SESSION_IMPL *)session, + value, "source", &cval)) != 0) + return (util_err( + session, ret, "%s: source entry", p)); + free(p); + len = cval.len + 10; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); + cursor->set_key(cursor, p); + } else + cursor->set_key(cursor, uri); + } else + cursor->set_key(cursor, uri); + if ((ret = cursor->search(cursor)) == 0) { if ((ret = cursor->get_value(cursor, &value)) != 0) ret = util_cerr(cursor, "get_value", ret); @@ -381,8 +421,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri) session, cursor, uri, value) != 0) ret = 1; } else if (ret == WT_NOTFOUND) - ret = util_err( - session, 0, "%s: No such object exists", uri); + ret = util_err(session, 0, "%s: No such object exists", uri); else ret = util_err(session, ret, "%s", uri); @@ -392,6 +431,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri) ret = tret; } + free(p); return (ret); } @@ -414,10 +454,17 @@ dump_json_table_end(WT_SESSION *session) static int dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri) { + WT_CONFIG_ITEM cval; WT_CURSOR *srch; WT_DECL_RET; + size_t len; int tret; - const char *key, *name, *value; + bool complex_table; + const char *name, *v; + char *p, **cfg, *_cfg[4] = {NULL, NULL, NULL, NULL}; + + p = NULL; + cfg = &_cfg[3]; /* Get the table name. 
*/ if ((name = strchr(uri, ':')) == NULL) { @@ -427,59 +474,111 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri) ++name; /* - * Dump out the config information: first, dump the uri entry itself - * (requires a lookup). + * Dump out the config information: first, dump the uri entry itself, + * it overrides all subsequent configurations. */ cursor->set_key(cursor, uri); if ((ret = cursor->search(cursor)) != 0) return (util_cerr(cursor, "search", ret)); - if ((ret = cursor->get_key(cursor, &key)) != 0) - return (util_cerr(cursor, "get_key", ret)); - if ((ret = cursor->get_value(cursor, &value)) != 0) + if ((ret = cursor->get_value(cursor, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); - if (print_config(session, key, value, NULL) != 0) - return (1); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* - * The underlying table configuration function needs a second cursor: - * open one before calling it, it makes error handling hugely simpler. + * Workaround for WiredTiger "simple" table handling. Simple tables + * have column-group entries, but they aren't listed in the metadata's + * table entry, and the name is different from other column-groups. + * Figure out if it's a simple table and in that case, retrieve the + * column-group's configuration value and the column-group's "source" + * entry, where the column-group entry overrides the source's. 
*/ - if ((ret = - session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0) - return (util_cerr(cursor, "open_cursor", ret)); + complex_table = false; + if (WT_PREFIX_MATCH(uri, "table:")) { + len = strlen("colgroup:") + strlen(name) + 1; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "colgroup:%s", name); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) == 0) { + if ((ret = cursor->get_value(cursor, &v)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); + if ((ret =__wt_config_getones( + (WT_SESSION_IMPL *)session, + *cfg, "source", &cval)) != 0) + return (util_err( + session, ret, "%s: source entry", p)); + free(p); + len = cval.len + 10; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) != 0) + return (util_cerr(cursor, "search", ret)); + if ((ret = cursor->get_value(cursor, &v)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); + } else + complex_table = true; + } - if ((ret = dump_table_config_type( - session, cursor, srch, name, "colgroup:")) == 0) - ret = dump_table_config_type( - session, cursor, srch, name, "index:"); + if (print_config(session, uri, cfg) != 0) + return (1); - if ((tret = srch->close(srch)) != 0) { - tret = util_cerr(cursor, "close", tret); - if (ret == 0) - ret = tret; + if (complex_table) { + /* + * The underlying table configuration function needs a second + * cursor: open one before calling it, it makes error handling + * hugely simpler. 
+ */ + if ((ret = session->open_cursor( + session, "metadata:", NULL, NULL, &srch)) != 0) + return (util_cerr(cursor, "open_cursor", ret)); + + if ((ret = dump_table_config_complex( + session, cursor, srch, name, "colgroup:")) == 0) + ret = dump_table_config_complex( + session, cursor, srch, name, "index:"); + + if ((tret = srch->close(srch)) != 0) { + tret = util_cerr(cursor, "close", tret); + if (ret == 0) + ret = tret; + } } + free(p); + free(_cfg[0]); + free(_cfg[1]); + free(_cfg[2]); return (ret); } /* - * dump_table_config_type -- + * dump_table_config_complex -- * Dump the column groups or indices for a table. */ static int -dump_table_config_type(WT_SESSION *session, +dump_table_config_complex(WT_SESSION *session, WT_CURSOR *cursor, WT_CURSOR *srch, const char *name, const char *entry) { WT_CONFIG_ITEM cval; WT_DECL_RET; - const char *key, *skip, *value, *value_source; + const char *key; + size_t len; int exact; - char *p; + const char *v; + char *p, *cfg[3] = {NULL, NULL, NULL}; /* * Search the file looking for column group and index key/value pairs: * for each one, look up the related source information and append it - * to the base record. + * to the base record, where the column group and index configuration + * overrides the source configuration. */ cursor->set_key(cursor, entry); if ((ret = cursor->search_near(cursor, &exact)) != 0) { @@ -497,27 +596,32 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) if (!WT_PREFIX_MATCH(key, entry)) return (0); - /* Check for a table name match. */ - skip = key + strlen(entry); - if (strncmp( - skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':') + /* + * Check for a table name match. This test will match "simple" + * table column-groups as well as the more complex ones, but + * the previous version of the test was wrong and we're only + * in this function in the case of complex tables. + */ + if (!WT_PREFIX_MATCH(key + strlen(entry), name)) continue; /* Get the value. 
*/ - if ((ret = cursor->get_value(cursor, &value)) != 0) + if ((ret = cursor->get_value(cursor, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); + if ((cfg[1] = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* Crack it and get the underlying source. */ if ((ret = __wt_config_getones( - (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0) + (WT_SESSION_IMPL *)session, cfg[1], "source", &cval)) != 0) return ( util_err(session, ret, "%s: source entry", key)); /* Nul-terminate the source entry. */ - if ((p = malloc(cval.len + 10)) == NULL) + len = cval.len + 10; + if ((p = malloc(len)) == NULL) return (util_err(session, errno, NULL)); - (void)strncpy(p, cval.str, cval.len); - p[cval.len] = '\0'; + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); srch->set_key(srch, p); if ((ret = srch->search(srch)) != 0) ret = util_err(session, ret, "%s: %s", key, p); @@ -526,16 +630,22 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) return (1); /* Get the source's value. */ - if ((ret = srch->get_value(srch, &value_source)) != 0) + if ((ret = srch->get_value(srch, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); + if ((cfg[0] = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* * The dumped configuration string is the original key plus the - * source's configuration. + * source's configuration, where the values of the original key + * override any source configurations of the same name. */ - if (print_config(session, key, value, value_source) != 0) + if (print_config(session, key, cfg) != 0) return (util_err(session, EIO, NULL)); } + free(cfg[0]); + free(cfg[1]); + if (ret == 0 || ret == WT_NOTFOUND) return (0); return (util_cerr(cursor, "next", ret)); @@ -649,27 +759,21 @@ dup_json_string(const char *str, char **result) * Output a key/value URI pair by combining v1 and v2. 
*/ static int -print_config(WT_SESSION *session, - const char *key, const char *v1, const char *v2) +print_config(WT_SESSION *session, const char *key, char *cfg[]) { WT_DECL_RET; char *value_ret; - const char *cfg[] = { v1, v2, NULL }; /* - * The underlying call will stop if the first string is NULL -- check - * here and swap in that case. + * We have all of the object configuration, but don't have the default + * session.create configuration. Have the underlying library add in the + * defaults and collapse it all into one load configuration string. */ - if (cfg[0] == NULL) { - cfg[0] = cfg[1]; - cfg[1] = NULL; - } - - if ((ret = __wt_config_collapse( + if ((ret = __wt_schema_create_final( (WT_SESSION_IMPL *)session, cfg, &value_ret)) != 0) return (util_err(session, ret, NULL)); ret = printf("%s\n%s\n", key, value_ret); - free((char *)value_ret); + free(value_ret); if (ret < 0) return (util_err(session, EIO, NULL)); return (0); diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 04fc8d1c371..f95bc7faaf9 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -55,6 +55,8 @@ void usage(void); extern char *__wt_optarg; extern int __wt_optind; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { @@ -129,11 +131,9 @@ setup(void) "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,%s", g.progname, g.c_cache, g.config_open == NULL ? "" : g.config_open); - if ((ret = wiredtiger_open(NULL, NULL, config, &conn)) != 0) - testutil_die(ret, "wiredtiger_open"); + testutil_check(wiredtiger_open(NULL, NULL, config, &conn)); - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - testutil_die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); g.wt_conn = conn; g.wt_session = session; @@ -153,39 +153,35 @@ run(void) /* Use the internal session handle to access private APIs. 
*/ sess = (WT_SESSION_IMPL *)g.wt_session; - if ((ret = __wt_bloom_create( - sess, uri, NULL, g.c_ops, g.c_factor, g.c_k, &bloomp)) != 0) - testutil_die(ret, "__wt_bloom_create"); + testutil_check(__wt_bloom_create( + sess, uri, NULL, g.c_ops, g.c_factor, g.c_k, &bloomp)); item.size = g.c_key_max; for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; if ((ret = __wt_bloom_insert(bloomp, &item)) != 0) - testutil_die(ret, "__wt_bloom_insert: %d", i); + testutil_die(ret, "__wt_bloom_insert: %" PRIu32, i); } - if ((ret = __wt_bloom_finalize(bloomp)) != 0) - testutil_die(ret, "__wt_bloom_finalize"); + testutil_check(__wt_bloom_finalize(bloomp)); for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; if ((ret = __wt_bloom_get(bloomp, &item)) != 0) { - fprintf(stderr, "get failed at record: %d\n", i); + fprintf(stderr, + "get failed at record: %" PRIu32 "\n", i); testutil_die(ret, "__wt_bloom_get"); } } - if ((ret = __wt_bloom_close(bloomp)) != 0) - testutil_die(ret, "__wt_bloom_close"); - - if ((ret = g.wt_session->checkpoint(g.wt_session, NULL)) != 0) - testutil_die(ret, "WT_SESSION.checkpoint"); - if ((ret = __wt_bloom_open( - sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0) - testutil_die(ret, "__wt_bloom_open"); + testutil_check(__wt_bloom_close(bloomp)); + + testutil_check(g.wt_session->checkpoint(g.wt_session, NULL)); + testutil_check(__wt_bloom_open( + sess, uri, g.c_factor, g.c_k, NULL, &bloomp)); + for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; - if ((ret = __wt_bloom_get(bloomp, &item)) != 0) - testutil_die(ret, "__wt_bloom_get"); + testutil_check(__wt_bloom_get(bloomp, &item)); } /* @@ -194,33 +190,34 @@ run(void) */ item.size = g.c_key_max + 10; item.data = calloc(item.size, 1); + if (item.data == NULL) + testutil_die(ENOMEM, "value buffer malloc"); memset((void *)item.data, 'a', item.size); for (i = 0, fp = 0; i < g.c_ops; i++) { ((uint8_t *)item.data)[i % item.size] = 'a' + ((uint8_t)rand() % 26); if ((ret = __wt_bloom_get(bloomp, 
&item)) == 0) ++fp; + if (ret != 0 && ret != WT_NOTFOUND) + testutil_die(ret, "__wt_bloom_get"); } free((void *)item.data); - printf("Out of %d ops, got %d false positives, %.4f%%\n", + printf( + "Out of %" PRIu32 " ops, got %" PRIu32 " false positives, %.4f%%\n", g.c_ops, fp, 100.0 * fp/g.c_ops); - if ((ret = __wt_bloom_drop(bloomp, NULL)) != 0) - testutil_die(ret, "__wt_bloom_drop"); + testutil_check(__wt_bloom_drop(bloomp, NULL)); } void cleanup(void) { uint32_t i; - int ret; for (i = 0; i < g.c_ops; i++) free(g.entries[i]); free(g.entries); - if ((ret = g.wt_session->close(g.wt_session, NULL)) != 0) - testutil_die(ret, "WT_SESSION.close"); - if ((g.wt_conn->close(g.wt_conn, NULL)) != 0) - testutil_die(ret, "WT_CONNECTION.close"); + testutil_check(g.wt_session->close(g.wt_session, NULL)); + testutil_check(g.wt_conn->close(g.wt_conn, NULL)); } /* diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c index 1914ad0188a..c5524b3c63e 100644 --- a/test/checkpoint/test_checkpoint.c +++ b/test/checkpoint/test_checkpoint.c @@ -41,6 +41,8 @@ static int wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { @@ -134,7 +136,7 @@ main(int argc, char *argv[]) printf("%s: process %" PRIu64 "\n", g.progname, (uint64_t)getpid()); for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) { - printf(" %d: %u workers, %u tables\n", + printf(" %d: %d workers, %d tables\n", cnt, g.nworkers, g.ntables); (void)cleanup(); /* Clean up previous runs */ diff --git a/test/cursor_order/Makefile.am b/test/cursor_order/Makefile.am new file mode 100644 index 00000000000..c0c0ed639bf --- /dev/null +++ b/test/cursor_order/Makefile.am @@ -0,0 +1,13 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/test/utility + +noinst_PROGRAMS = cursor_order +cursor_order_LDADD = $(top_builddir)/libwiredtiger.la + +cursor_order_SOURCES = cursor_order_file.c 
cursor_order_ops.c cursor_order.c +cursor_order_LDFLAGS = -static + +TESTS = $(noinst_PROGRAMS) + +clean-local: + rm -rf WiredTiger* wt.* *.core __stats diff --git a/test/cursor_order/cursor_order.c b/test/cursor_order/cursor_order.c new file mode 100644 index 00000000000..d8cfc0c1421 --- /dev/null +++ b/test/cursor_order/cursor_order.c @@ -0,0 +1,307 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cursor_order.h" + +static char home[512]; /* Program working dir */ +static char *progname; /* Program name */ +static FILE *logfp; /* Log file */ + +static int handle_error(WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); +static int handle_message(WT_EVENT_HANDLER *, WT_SESSION *, const char *); +static void onint(int); +static void shutdown(void); +static int usage(void); +static void wt_connect(SHARED_CONFIG *, char *); +static void wt_shutdown(SHARED_CONFIG *); + +extern int __wt_optind; +extern char *__wt_optarg; + +void (*custom_die)(void) = NULL; + +int +main(int argc, char *argv[]) +{ + SHARED_CONFIG _cfg, *cfg; + int ch, cnt, runs; + char *config_open, *working_dir; + + if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) + progname = argv[0]; + else + ++progname; + + cfg = &_cfg; + config_open = NULL; + working_dir = NULL; + runs = 1; + + /* + * Explicitly initialize the shared configuration object before + * parsing command line options. + */ + cfg->append_inserters = 1; + cfg->conn = NULL; + cfg->ftype = ROW; + cfg->max_nops = 1000000; + cfg->multiple_files = false; + cfg->nkeys = 1000; + cfg->reverse_scanners = 5; + cfg->reverse_scan_ops = 10; + cfg->thread_finish = false; + cfg->vary_nops = false; + + while ((ch = __wt_getopt( + progname, argc, argv, "C:Fk:h:l:n:R:r:t:vw:W:")) != EOF) + switch (ch) { + case 'C': /* wiredtiger_open config */ + config_open = __wt_optarg; + break; + case 'F': /* multiple files */ + cfg->multiple_files = true; + break; + case 'h': + working_dir = __wt_optarg; + break; + case 'k': /* rows */ + cfg->nkeys = (uint64_t)atol(__wt_optarg); + break; + case 'l': /* log */ + if ((logfp = fopen(__wt_optarg, "w")) == NULL) { + fprintf(stderr, + "%s: %s\n", __wt_optarg, strerror(errno)); + return (EXIT_FAILURE); + } + break; + case 'n': /* operations */ + cfg->max_nops = (uint64_t)atol(__wt_optarg); + break; + case 'R': + cfg->reverse_scanners = (uint64_t)atol(__wt_optarg); + break; + case 'r': /* runs */ + 
runs = atoi(__wt_optarg); + break; + case 't': + switch (__wt_optarg[0]) { + case 'f': + cfg->ftype = FIX; + break; + case 'r': + cfg->ftype = ROW; + break; + case 'v': + cfg->ftype = VAR; + break; + default: + return (usage()); + } + break; + case 'v': /* vary operation count */ + cfg->vary_nops = true; + break; + case 'w': + cfg->reverse_scan_ops = (uint64_t)atol(__wt_optarg); + break; + case 'W': + cfg->append_inserters = (uint64_t)atol(__wt_optarg); + break; + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + if (argc != 0) + return (usage()); + + testutil_work_dir_from_path(home, 512, working_dir); + + if (cfg->vary_nops && !cfg->multiple_files) { + fprintf(stderr, + "Variable op counts only supported with multiple tables\n"); + return (usage()); + } + + /* Clean up on signal. */ + (void)signal(SIGINT, onint); + + printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid()); + for (cnt = 1; runs == 0 || cnt <= runs; ++cnt) { + printf( + " %d: %" PRIu64 + " reverse scanners, %" PRIu64 " writers\n", + cnt, cfg->reverse_scanners, cfg->append_inserters); + + shutdown(); /* Clean up previous runs */ + + wt_connect(cfg, config_open); /* WiredTiger connection */ + + if (ops_start(cfg)) + return (EXIT_FAILURE); + + wt_shutdown(cfg); /* WiredTiger shut down */ + } + return (0); +} + +/* + * wt_connect -- + * Configure the WiredTiger connection. + */ +static void +wt_connect(SHARED_CONFIG *cfg, char *config_open) +{ + static WT_EVENT_HANDLER event_handler = { + handle_error, + handle_message, + NULL, + NULL /* Close handler. */ + }; + int ret; + char config[512]; + size_t print_count; + + testutil_clean_work_dir(home); + testutil_make_work_dir(home); + + print_count = (size_t)snprintf(config, sizeof(config), + "create,statistics=(all),error_prefix=\"%s\",%s%s", + progname, + config_open == NULL ? "" : ",", + config_open == NULL ? 
"" : config_open); + + if (print_count >= sizeof(config)) + testutil_die(EINVAL, "Config string too long"); + + if ((ret = wiredtiger_open( + home, &event_handler, config, &cfg->conn)) != 0) + testutil_die(ret, "wiredtiger_open"); +} + +/* + * wt_shutdown -- + * Flush the file to disk and shut down the WiredTiger connection. + */ +static void +wt_shutdown(SHARED_CONFIG *cfg) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = session->checkpoint(session, NULL)) != 0) + testutil_die(ret, "session.checkpoint"); + + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "conn.close"); +} + +/* + * shutdown -- + * Clean up from previous runs. + */ +static void +shutdown(void) +{ + testutil_clean_work_dir(home); +} + +static int +handle_error(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *errmsg) +{ + (void)(handler); + (void)(session); + (void)(error); + + return (fprintf(stderr, "%s\n", errmsg) < 0 ? -1 : 0); +} + +static int +handle_message(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message) +{ + (void)(handler); + (void)(session); + + if (logfp != NULL) + return (fprintf(logfp, "%s\n", message) < 0 ? -1 : 0); + + return (printf("%s\n", message) < 0 ? -1 : 0); +} + +/* + * onint -- + * Interrupt signal handler. + */ +static void +onint(int signo) +{ + (void)(signo); + + shutdown(); + + fprintf(stderr, "\n"); + exit(EXIT_FAILURE); +} + +/* + * usage -- + * Display usage statement and exit failure. 
+ */ +static int +usage(void) +{ + fprintf(stderr, + "usage: %s " + "[-FLv] [-C wiredtiger-config] [-k keys] [-l log]\n\t" + "[-n ops] [-R reverse_scanners] [-r runs] [-t f|r|v] " + "[-W append_inserters]\n", + progname); + fprintf(stderr, "%s", + "\t-C specify wiredtiger_open configuration arguments\n" + "\t-F create a file per thread\n" + "\t-k set number of keys to load\n" + "\t-L log print per operation\n" + "\t-l specify a log file\n" + "\t-n set number of operations each thread does\n" + "\t-R set number of reverse scanner threads\n" + "\t-r set number of runs (0 for continuous)\n" + "\t-t set a file type (fix | row | var)\n" + "\t-v do a different number of operations on different tables\n" + "\t-w set number of items to walk in a reverse scan\n" + "\t-W set number of threads doing append inserts\n"); + return (EXIT_FAILURE); +} diff --git a/test/cursor_order/cursor_order.h b/test/cursor_order/cursor_order.h new file mode 100644 index 00000000000..dd49fce124b --- /dev/null +++ b/test/cursor_order/cursor_order.h @@ -0,0 +1,54 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <signal.h> + +#include "test_util.i" + +#define FNAME "file:cursor_order.%03d" /* File name */ + +typedef enum { FIX, ROW, VAR } __ftype; /* File type */ + +typedef struct { + uint64_t append_inserters; /* Number of append threads */ + WT_CONNECTION *conn; /* WiredTiger connection */ + __ftype ftype; + uint64_t key_range; /* Current key range */ + uint64_t max_nops; /* Operations per thread */ + bool multiple_files; /* File per thread */ + uint64_t nkeys; /* Keys to load */ + uint64_t reverse_scanners; /* Number of scan threads */ + uint64_t reverse_scan_ops; /* Keys to visit per scan */ + bool thread_finish; /* Signal to finish run. */ + bool vary_nops; /* Operations per thread */ + +} SHARED_CONFIG; + +void load(SHARED_CONFIG *, const char *); +int ops_start(SHARED_CONFIG *); +void verify(SHARED_CONFIG *, const char *); diff --git a/test/cursor_order/cursor_order_file.c b/test/cursor_order/cursor_order_file.c new file mode 100644 index 00000000000..5dc7194b5fb --- /dev/null +++ b/test/cursor_order/cursor_order_file.c @@ -0,0 +1,132 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. 
+ * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cursor_order.h" + +static void +file_create(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + char *p, *end, config[128]; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + p = config; + end = config + sizeof(config); + p += snprintf(p, (size_t)(end - p), + "key_format=%s," + "internal_page_max=%d," + "split_deepen_min_child=200," + "leaf_page_max=%d,", + cfg->ftype == ROW ? 
"S" : "r", 16 * 1024, 128 * 1024); + if (cfg->ftype == FIX) + (void)snprintf(p, (size_t)(end - p), ",value_format=3t"); + + if ((ret = session->create(session, name, config)) != 0) + if (ret != EEXIST) + testutil_die(ret, "session.create"); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} + +void +load(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_ITEM *value, _value; + WT_SESSION *session; + char keybuf[64], valuebuf[64]; + int64_t keyno; + int ret; + + conn = cfg->conn; + + file_create(cfg, name); + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = + session->open_cursor(session, name, NULL, "bulk", &cursor)) != 0) + testutil_die(ret, "cursor.open"); + + value = &_value; + for (keyno = 1; keyno <= (int64_t)cfg->nkeys; ++keyno) { + if (cfg->ftype == ROW) { + snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + cursor->set_key(cursor, keybuf); + } else + cursor->set_key(cursor, (uint32_t)keyno); + value->data = valuebuf; + if (cfg->ftype == FIX) + cursor->set_value(cursor, 0x01); + else { + value->size = (uint32_t)snprintf( + valuebuf, sizeof(valuebuf), "%37u", (u_int)keyno); + cursor->set_value(cursor, value); + } + if ((ret = cursor->insert(cursor)) != 0) + testutil_die(ret, "cursor.insert"); + } + + /* Setup the starting key range for the workload phase. 
*/ + cfg->key_range = cfg->nkeys; + if ((ret = cursor->close(cursor)) != 0) + testutil_die(ret, "cursor.close"); + if ((ret = session->checkpoint(session, NULL)) != 0) + testutil_die(ret, "session.checkpoint"); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} + +void +verify(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = session->verify(session, name, NULL)) != 0) + testutil_die(ret, "session.create"); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} diff --git a/test/cursor_order/cursor_order_ops.c b/test/cursor_order/cursor_order_ops.c new file mode 100644 index 00000000000..d44505ab2f3 --- /dev/null +++ b/test/cursor_order/cursor_order_ops.c @@ -0,0 +1,370 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cursor_order.h" + +static void *append_insert(void *); +static void print_stats(SHARED_CONFIG *); +static void *reverse_scan(void *); + +typedef struct { + char *name; /* object name */ + uint64_t nops; /* Thread op count */ + + WT_RAND_STATE rnd; /* RNG */ + + int append_insert; /* cursor.insert */ + int reverse_scans; /* cursor.prev sequences */ + SHARED_CONFIG *cfg; +} INFO; + +static INFO *run_info; + +int +ops_start(SHARED_CONFIG *cfg) +{ + struct timeval start, stop; + double seconds; + pthread_t *tids; + uint64_t i, name_index, offset, total_nops; + int ret; + void *thread_ret; + + tids = NULL; /* Keep GCC 4.1 happy. */ + total_nops = 0; + + /* Create per-thread structures. */ + if ((run_info = calloc( + (size_t)(cfg->reverse_scanners + cfg->append_inserters), + sizeof(*run_info))) == NULL) + testutil_die(errno, "calloc"); + + if ((tids = calloc( + (size_t)(cfg->reverse_scanners + cfg->append_inserters), + sizeof(*tids))) == NULL) + testutil_die(errno, "calloc"); + + /* Create the files and load the initial records. 
*/ + for (i = 0; i < cfg->append_inserters; ++i) { + run_info[i].cfg = cfg; + if (i == 0 || cfg->multiple_files) { + if ((run_info[i].name = malloc(64)) == NULL) + testutil_die(errno, "malloc"); + snprintf(run_info[i].name, 64, FNAME, (int)i); + + /* Vary by orders of magnitude */ + if (cfg->vary_nops) + run_info[i].nops = + WT_MAX(1000, cfg->max_nops >> i); + load(cfg, run_info[i].name); + } else + run_info[i].name = run_info[0].name; + + /* Setup op count if not varying ops. */ + if (run_info[i].nops == 0) + run_info[i].nops = cfg->max_nops; + total_nops += run_info[i].nops; + } + + /* Setup the reverse scanner configurations */ + for (i = 0; i < cfg->reverse_scanners; ++i) { + offset = i + cfg->append_inserters; + run_info[offset].cfg = cfg; + if (cfg->multiple_files) { + if ((run_info[offset].name = malloc(64)) == NULL) + testutil_die(errno, "malloc"); + /* Have reverse scans read from tables with writes. */ + name_index = i % cfg->append_inserters; + snprintf( + run_info[offset].name, 64, FNAME, (int)name_index); + + /* Vary by orders of magnitude */ + if (cfg->vary_nops) + run_info[offset].nops = + WT_MAX(1000, cfg->max_nops >> name_index); + } else + run_info[offset].name = run_info[0].name; + + /* Setup op count if not varying ops. */ + if (run_info[offset].nops == 0) + run_info[offset].nops = cfg->max_nops; + total_nops += run_info[offset].nops; + } + + (void)gettimeofday(&start, NULL); + + /* Create threads. */ + for (i = 0; i < cfg->reverse_scanners; ++i) + if ((ret = pthread_create( + &tids[i], NULL, reverse_scan, (void *)(uintptr_t)i)) != 0) + testutil_die(ret, "pthread_create"); + for (; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + if ((ret = pthread_create( + &tids[i], NULL, append_insert, (void *)(uintptr_t)i)) != 0) + testutil_die(ret, "pthread_create"); + } + + /* Wait for the threads. 
*/ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) + (void)pthread_join(tids[i], &thread_ret); + + (void)gettimeofday(&stop, NULL); + seconds = (stop.tv_sec - start.tv_sec) + + (stop.tv_usec - start.tv_usec) * 1e-6; + fprintf(stderr, "timer: %.2lf seconds (%d ops/second)\n", + seconds, (int)(((cfg->reverse_scanners + cfg->append_inserters) * + total_nops) / seconds)); + + /* Verify the files. */ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + verify(cfg, run_info[i].name); + if (!cfg->multiple_files) + break; + } + + /* Output run statistics. */ + print_stats(cfg); + + /* Free allocated memory. */ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + free(run_info[i].name); + if (!cfg->multiple_files) + break; + } + + free(run_info); + free(tids); + + return (0); +} + +/* + * reverse_scan_op -- + * Walk a cursor back from the end of the file. + */ +static inline void +reverse_scan_op( + SHARED_CONFIG *cfg, WT_SESSION *session, WT_CURSOR *cursor, INFO *s) +{ + uint64_t i, initial_key_range, prev_key, this_key; + int ret; + char *strkey; + + WT_UNUSED(session); + WT_UNUSED(s); + + /* Make GCC 4.1 happy */ + prev_key = this_key = 0; + + /* Reset the cursor */ + if ((ret = cursor->reset(cursor)) != 0) + testutil_die(ret, "cursor.reset"); + + /* Save the key range. 
*/ + initial_key_range = cfg->key_range - cfg->append_inserters; + + for (i = 0; i < cfg->reverse_scan_ops; i++) { + if ((ret = cursor->prev(cursor)) != 0) { + if (ret == WT_NOTFOUND) + break; + testutil_die(ret, "cursor.prev"); + } + + if (cfg->ftype == ROW) { + if ((ret = cursor->get_key(cursor, &strkey)) != 0) + testutil_die(ret, "cursor.get_key"); + this_key = (uint64_t)atol(strkey); + } else + if ((ret = cursor->get_key( + cursor, (uint64_t *)&this_key)) != 0) + testutil_die(ret, "cursor.get_key"); + + if (i == 0 && this_key < initial_key_range) + testutil_die(ret, + "cursor scan start range wrong first prev %" PRIu64 + " initial range: %" PRIu64, + this_key, initial_key_range); + if (i != 0 && this_key >= prev_key) + testutil_die(ret, + "cursor scan out of order this: %" PRIu64 + " prev: %" PRIu64, + this_key, prev_key); + prev_key = this_key; + } +} + +/* + * reverse_scan -- + * Reader thread start function. + */ +static void * +reverse_scan(void *arg) +{ + INFO *s; + SHARED_CONFIG *cfg; + WT_CURSOR *cursor; + WT_SESSION *session; + uintmax_t id; + uint64_t i; + int ret; + char tid[128]; + + id = (uintmax_t)arg; + s = &run_info[id]; + cfg = s->cfg; + __wt_thread_id(tid, sizeof(tid)); + __wt_random_init(&s->rnd); + + printf(" reverse scan thread %2" PRIuMAX + " starting: tid: %s, file: %s\n", + id, tid, s->name); + + __wt_yield(); /* Get all the threads created. 
*/ + + if ((ret = cfg->conn->open_session( + cfg->conn, NULL, "isolation=snapshot", &session)) != 0) + testutil_die(ret, "conn.open_session"); + if ((ret = session->open_cursor( + session, s->name, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "session.open_cursor"); + for (i = 0; i < s->nops && !cfg->thread_finish; + ++i, ++s->reverse_scans, __wt_yield()) + reverse_scan_op(cfg, session, cursor, s); + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); + + printf(" reverse scan thread %2" PRIuMAX + " stopping: tid: %s, file: %s\n", + id, tid, s->name); + + /* Notify all other threads to finish once the first thread is done */ + cfg->thread_finish = true; + + return (NULL); +} + +/* + * append_insert_op -- + * Write operation. + */ +static inline void +append_insert_op( + SHARED_CONFIG *cfg, WT_SESSION *session, WT_CURSOR *cursor, INFO *s) +{ + WT_ITEM *value, _value; + uint64_t keyno; + int ret; + char keybuf[64], valuebuf[64]; + + WT_UNUSED(session); + + value = &_value; + + keyno = __wt_atomic_add64(&cfg->key_range, 1); + if (cfg->ftype == ROW) { + snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + cursor->set_key(cursor, keybuf); + } else + cursor->set_key(cursor, (uint32_t)keyno); + + ++s->append_insert; + value->data = valuebuf; + if (cfg->ftype == FIX) + cursor->set_value(cursor, 0x10); + else { + value->size = (uint32_t)snprintf( + valuebuf, sizeof(valuebuf), "XXX %37u", (u_int)keyno); + cursor->set_value(cursor, value); + } + if ((ret = cursor->insert(cursor)) != 0) + testutil_die(ret, "cursor.insert"); +} + +/* + * append_insert -- + * Writer thread start function. 
+ */ +static void * +append_insert(void *arg) +{ + INFO *s; + SHARED_CONFIG *cfg; + WT_CURSOR *cursor; + WT_SESSION *session; + uintmax_t id; + uint64_t i; + int ret; + char tid[128]; + + id = (uintmax_t)arg; + s = &run_info[id]; + cfg = s->cfg; + __wt_thread_id(tid, sizeof(tid)); + __wt_random_init(&s->rnd); + + printf("write thread %2" PRIuMAX " starting: tid: %s, file: %s\n", + id, tid, s->name); + + __wt_yield(); /* Get all the threads created. */ + + if ((ret = cfg->conn->open_session( + cfg->conn, NULL, "isolation=snapshot", &session)) != 0) + testutil_die(ret, "conn.open_session"); + if ((ret = session->open_cursor( + session, s->name, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "session.open_cursor"); + for (i = 0; i < s->nops && !cfg->thread_finish; ++i, __wt_yield()) + append_insert_op(cfg, session, cursor, s); + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); + + printf("write thread %2" PRIuMAX " stopping: tid: %s, file: %s\n", + id, tid, s->name); + + /* Notify all other threads to finish once the first thread is done */ + cfg->thread_finish = true; + + return (NULL); +} + +/* + * print_stats -- + * Display reverse scan/writer thread stats. + */ +static void +print_stats(SHARED_CONFIG *cfg) +{ + INFO *s; + uint64_t id, total_threads; + + total_threads = cfg->reverse_scanners + cfg->append_inserters; + s = run_info; + for (id = 0; id < total_threads; ++id, ++s) + printf("%3d: reverse scans %6d, append inserts %6d\n", + (int)id, (int)s->reverse_scans, (int)s->append_insert); +} diff --git a/test/fops/file.c b/test/fops/file.c index 4cd92e7b590..ea15f1ee80d 100644 --- a/test/fops/file.c +++ b/test/fops/file.c @@ -147,7 +147,7 @@ obj_create_unique(int force) /* Generate a unique object name. 
*/ if ((ret = pthread_rwlock_wrlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_wrlock single"); - (void)snprintf(new_uri, sizeof(new_uri), "%s.%d", uri, ++uid); + (void)snprintf(new_uri, sizeof(new_uri), "%s.%u", uri, ++uid); if ((ret = pthread_rwlock_unlock(&single)) != 0) testutil_die(ret, "pthread_rwlock_unlock single"); diff --git a/test/fops/fops.c b/test/fops/fops.c index fbc9d9c6048..3333ff16858 100644 --- a/test/fops/fops.c +++ b/test/fops/fops.c @@ -109,7 +109,7 @@ fop(void *arg) __wt_random_init(&rnd); for (i = 0; i < nops; ++i, __wt_yield()) - switch (__wt_random(&rnd) % 9) { + switch (__wt_random(&rnd) % 10) { case 0: ++s->bulk; obj_bulk(); diff --git a/test/fops/t.c b/test/fops/t.c index 0881c23d7d4..24994404c7c 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -50,6 +50,8 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { diff --git a/test/format/backup.c b/test/format/backup.c index 748494bf841..2b1463bd0e3 100644 --- a/test/format/backup.c +++ b/test/format/backup.c @@ -37,20 +37,18 @@ check_copy(void) { WT_CONNECTION *conn; WT_SESSION *session; - int ret; wts_open(g.home_backup, 0, &conn); - if ((ret = conn->open_session( - conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session: %s", g.home_backup); + testutil_checkfmt( + conn->open_session(conn, NULL, NULL, &session), + "%s", g.home_backup); - ret = session->verify(session, g.uri, NULL); - if (ret != 0) - die(ret, "session.verify: %s: %s", g.home_backup, g.uri); + testutil_checkfmt( + session->verify(session, g.uri, NULL), + "%s: %s", g.home_backup, g.uri); - if ((ret = conn->close(conn, NULL)) != 0) - die(ret, "connection.close: %s", g.home_backup); + testutil_checkfmt(conn->close(conn, NULL), "%s", g.home_backup); } /* @@ -62,14 +60,19 @@ copy_file(const char *name) { size_t len; char *cmd; - int ret; len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20; 
cmd = dmalloc(len); (void)snprintf(cmd, len, "cp %s/%s %s/%s", g.home, name, g.home_backup, name); - if ((ret = system(cmd)) != 0) - die(ret, "backup copy: %s", cmd); + testutil_checkfmt(system(cmd), "backup copy: %s", cmd); + free(cmd); + + len = strlen(g.home) + strlen(g.home_backup2) + strlen(name) * 2 + 20; + cmd = dmalloc(len); + (void)snprintf(cmd, len, + "cp %s/%s %s/%s", g.home, name, g.home_backup2, name); + testutil_checkfmt(system(cmd), "backup copy: %s", cmd); free(cmd); } @@ -96,8 +99,7 @@ backup(void *arg) return (NULL); /* Open a session. */ - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); /* * Perform a backup at somewhere under 10 seconds (so we get at @@ -113,12 +115,12 @@ backup(void *arg) break; /* Lock out named checkpoints */ - if ((ret = pthread_rwlock_wrlock(&g.backup_lock)) != 0) - die(ret, "pthread_rwlock_wrlock: backup lock"); + testutil_check(pthread_rwlock_wrlock(&g.backup_lock)); /* Re-create the backup directory. 
*/ - if ((ret = system(g.home_backup_init)) != 0) - die(ret, "backup directory creation failed"); + testutil_checkfmt( + system(g.home_backup_init), + "%s", "backup directory creation failed"); /* * open_cursor can return EBUSY if a metadata operation is @@ -128,26 +130,21 @@ backup(void *arg) "backup:", NULL, NULL, &backup_cursor)) == EBUSY) sleep(1); if (ret != 0) - die(ret, "session.open_cursor: backup"); + testutil_die(ret, "session.open_cursor: backup"); while ((ret = backup_cursor->next(backup_cursor)) == 0) { - if ((ret = - backup_cursor->get_key(backup_cursor, &key)) != 0) - die(ret, "cursor.get_key"); + testutil_check( + backup_cursor->get_key(backup_cursor, &key)); copy_file(key); } - if ((ret = backup_cursor->close(backup_cursor)) != 0) - die(ret, "cursor.close"); - - if ((ret = pthread_rwlock_unlock(&g.backup_lock)) != 0) - die(ret, "pthread_rwlock_unlock: backup lock"); + testutil_check(backup_cursor->close(backup_cursor)); + testutil_check(pthread_rwlock_unlock(&g.backup_lock)); check_copy(); } - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); return (NULL); } diff --git a/test/format/bdb.c b/test/format/bdb.c index d7b4bca62f2..823fc8ff888 100644 --- a/test/format/bdb.c +++ b/test/format/bdb.c @@ -128,7 +128,7 @@ bdb_np(int next, if ((ret = dbc->get(dbc, &key, &value, next ? DB_NEXT : DB_PREV)) != 0) { if (ret != DB_NOTFOUND) - die(ret, "dbc.get: %s: {%.*s}", + testutil_die(ret, "dbc.get: %s: {%.*s}", next ? 
"DB_NEXT" : "DB_PREV", (int)key.size, (char *)key.data); *notfoundp = 1; @@ -154,7 +154,7 @@ bdb_read(uint64_t keyno, void *valuep, size_t *valuesizep, int *notfoundp) *notfoundp = 0; if ((ret = dbc->get(dbc, &key, &value, DB_SET)) != 0) { if (ret != DB_NOTFOUND) - die(ret, "dbc.get: DB_SET: {%.*s}", + testutil_die(ret, "dbc.get: DB_SET: {%.*s}", (int)key.size, (char *)key.data); *notfoundp = 1; } else { @@ -178,7 +178,7 @@ bdb_update(const void *arg_key, size_t arg_key_size, *notfoundp = 0; if ((ret = dbc->put(dbc, &key, &value, DB_KEYFIRST)) != 0) { if (ret != DB_NOTFOUND) { - die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}", + testutil_die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}", (int)key.size, (char *)key.data, (int)value.size, (char *)value.data); } @@ -204,7 +204,7 @@ bdb_remove(uint64_t keyno, int *notfoundp) if ((ret = dbc->del(dbc, 0)) != 0) { if (ret != DB_NOTFOUND) - die(ret, "dbc.del: {%.*s}", + testutil_die(ret, "dbc.del: {%.*s}", (int)key.size, (char *)key.data); *notfoundp = 1; } diff --git a/test/format/bulk.c b/test/format/bulk.c index 28189e25b65..64b005d294f 100644 --- a/test/format/bulk.c +++ b/test/format/bulk.c @@ -37,13 +37,11 @@ wts_load(void) WT_SESSION *session; uint8_t *keybuf, *valbuf; bool is_bulk; - int ret; conn = g.wts_conn; keybuf = valbuf = NULL; - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, @@ -61,9 +59,8 @@ wts_load(void) if (g.c_reverse) is_bulk = false; - if ((ret = session->open_cursor(session, g.uri, NULL, - is_bulk ? "bulk,append" : NULL, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(session->open_cursor(session, g.uri, NULL, + is_bulk ? "bulk,append" : NULL, &cursor)); /* Set up the key/value buffers. 
*/ key_gen_setup(&keybuf); @@ -120,8 +117,7 @@ wts_load(void) break; } - if ((ret = cursor->insert(cursor)) != 0) - die(ret, "cursor.insert"); + testutil_check(cursor->insert(cursor)); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) @@ -129,15 +125,13 @@ wts_load(void) #endif } - if ((ret = cursor->close(cursor)) != 0) - die(ret, "cursor.close"); + testutil_check(cursor->close(cursor)); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, "=============== bulk load stop ==============="); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); free(keybuf); free(valbuf); diff --git a/test/format/compact.c b/test/format/compact.c index fdfa597e07e..a75ee4f2adf 100644 --- a/test/format/compact.c +++ b/test/format/compact.c @@ -48,8 +48,7 @@ compact(void *arg) /* Open a session. */ conn = g.wts_conn; - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); /* * Perform compaction at somewhere under 15 seconds (so we get at @@ -66,11 +65,10 @@ compact(void *arg) if ((ret = session->compact( session, g.uri, NULL)) != 0 && ret != WT_ROLLBACK) - die(ret, "session.compact"); + testutil_die(ret, "session.compact"); } - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); return (NULL); } diff --git a/test/format/config.c b/test/format/config.c index d431546f254..042316d8344 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -138,9 +138,10 @@ config_setup(void) /* Required shared libraries. 
*/ if (DATASOURCE("helium") && access(HELIUM_PATH, R_OK) != 0) - die(errno, "Levyx/helium shared library: %s", HELIUM_PATH); + testutil_die(errno, + "Levyx/helium shared library: %s", HELIUM_PATH); if (DATASOURCE("kvsbdb") && access(KVS_BDB_PATH, R_OK) != 0) - die(errno, "kvsbdb shared library: %s", KVS_BDB_PATH); + testutil_die(errno, "kvsbdb shared library: %s", KVS_BDB_PATH); /* Some data-sources don't support user-specified collations. */ if (DATASOURCE("helium") || DATASOURCE("kvsbdb")) @@ -199,14 +200,15 @@ config_setup(void) if (!config_is_perm("key_max") && g.c_key_max < g.c_key_min) g.c_key_max = g.c_key_min; if (g.c_key_min > g.c_key_max) - die(EINVAL, "key_min may not be larger than key_max"); + testutil_die(EINVAL, "key_min may not be larger than key_max"); if (!config_is_perm("value_min") && g.c_value_min > g.c_value_max) g.c_value_min = g.c_value_max; if (!config_is_perm("value_max") && g.c_value_max < g.c_value_min) g.c_value_max = g.c_value_min; if (g.c_value_min > g.c_value_max) - die(EINVAL, "value_min may not be larger than value_max"); + testutil_die(EINVAL, + "value_min may not be larger than value_max"); /* Reset the key count. 
*/ g.key_cnt = 0; @@ -412,7 +414,7 @@ config_lrt(void) */ if (g.type == FIX) { if (g.c_long_running_txn && config_is_perm("long_running_txn")) - die(EINVAL, + testutil_die(EINVAL, "long_running_txn not supported with fixed-length " "column store"); g.c_long_running_txn = 0; @@ -453,7 +455,7 @@ config_print(int error_display) fp = stdout; else if ((fp = fopen(g.home_config, "w")) == NULL) - die(errno, "fopen: %s", g.home_config); + testutil_die(errno, "fopen: %s", g.home_config); fprintf(fp, "############################################\n"); fprintf(fp, "# RUN PARAMETERS\n"); @@ -487,7 +489,7 @@ config_file(const char *name) char *p, buf[256]; if ((fp = fopen(name, "r")) == NULL) - die(errno, "fopen: %s", name); + testutil_die(errno, "fopen: %s", name); while (fgets(buf, sizeof(buf), fp) != NULL) { for (p = buf; *p != '\0' && *p != '\n'; ++p) ; @@ -582,7 +584,7 @@ config_single(const char *s, int perm) *cp->vstr = strdup(ep); } if (*cp->vstr == NULL) - die(errno, "malloc"); + testutil_die(errno, "malloc"); return; } @@ -625,7 +627,7 @@ config_map_file_type(const char *s, u_int *vp) strcmp(s, "row-store") == 0) *vp = ROW; else - die(EINVAL, "illegal file type configuration: %s", s); + testutil_die(EINVAL, "illegal file type configuration: %s", s); } /* @@ -642,7 +644,7 @@ config_map_checksum(const char *s, u_int *vp) else if (strcmp(s, "uncompressed") == 0) *vp = CHECKSUM_UNCOMPRESSED; else - die(EINVAL, "illegal checksum configuration: %s", s); + testutil_die(EINVAL, "illegal checksum configuration: %s", s); } /* @@ -667,7 +669,8 @@ config_map_compression(const char *s, u_int *vp) else if (strcmp(s, "zlib-noraw") == 0) *vp = COMPRESS_ZLIB_NO_RAW; else - die(EINVAL, "illegal compression configuration: %s", s); + testutil_die(EINVAL, + "illegal compression configuration: %s", s); } /* @@ -682,7 +685,7 @@ config_map_encryption(const char *s, u_int *vp) else if (strcmp(s, "rotn-7") == 0) *vp = ENCRYPT_ROTN_7; else - die(EINVAL, "illegal encryption configuration: %s", 
s); + testutil_die(EINVAL, "illegal encryption configuration: %s", s); } /* @@ -701,7 +704,7 @@ config_map_isolation(const char *s, u_int *vp) else if (strcmp(s, "snapshot") == 0) *vp = ISOLATION_SNAPSHOT; else - die(EINVAL, "illegal isolation configuration: %s", s); + testutil_die(EINVAL, "illegal isolation configuration: %s", s); } /* diff --git a/test/format/config.h b/test/format/config.h index d8b11b005d4..a17614bc044 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -246,6 +246,10 @@ static CONFIG c[] = { "minimum gain before prefix compression is used", 0x0, 0, 8, 256, &g.c_prefix_compression_min, NULL }, + { "quiet", + "quiet run (same as -q)", + C_IGNORE|C_BOOL, 0, 0, 0, &g.c_quiet, NULL }, + { "repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0, 90, 90, &g.c_repeat_data_pct, NULL }, diff --git a/test/format/format.h b/test/format/format.h index 41c9de3dd30..a129c5395fd 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -109,6 +109,7 @@ typedef struct { char *home; /* Home directory */ char *home_backup; /* Hot-backup directory */ + char *home_backup2; /* Saved Hot-backup directory */ char *home_backup_init; /* Initialize backup command */ char *home_bdb; /* BDB directory */ char *home_config; /* Run CONFIG file path */ @@ -142,7 +143,6 @@ typedef struct { FILE *logfp; /* Log file */ int replay; /* Replaying a run. */ - int track; /* Track progress */ int workers_finished; /* Operations completed */ pthread_rwlock_t backup_lock; /* Hot backup running */ @@ -210,6 +210,7 @@ typedef struct { uint32_t c_merge_max; uint32_t c_mmap; uint32_t c_ops; + uint32_t c_quiet; uint32_t c_prefix_compression; uint32_t c_prefix_compression_min; uint32_t c_repeat_data_pct; @@ -334,12 +335,6 @@ void wts_salvage(void); void wts_stats(void); void wts_verify(const char *); -void die(int, const char *, ...) 
-#if defined(__GNUC__) -__attribute__((__noreturn__)) -#endif -; - /* * mmrand -- * Return a random value between a min/max pair. diff --git a/test/format/lrt.c b/test/format/lrt.c index b7392829d30..451d2f4fa3c 100644 --- a/test/format/lrt.c +++ b/test/format/lrt.c @@ -60,11 +60,9 @@ lrt(void *arg) /* Open a session and cursor. */ conn = g.wts_conn; - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); - if ((ret = session->open_cursor( - session, g.uri, NULL, NULL, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + testutil_check(session->open_cursor( + session, g.uri, NULL, NULL, &cursor)); for (pinned = 0;;) { if (pinned) { @@ -73,7 +71,8 @@ lrt(void *arg) &key, saved_keyno, 1)) == WT_ROLLBACK) ; if (ret != 0) - die(ret, "read_row %" PRIu64, saved_keyno); + testutil_die(ret, + "read_row %" PRIu64, saved_keyno); /* Compare the previous value with the current one. */ if (g.type == FIX) { @@ -83,21 +82,19 @@ lrt(void *arg) } else ret = cursor->get_value(cursor, &value); if (ret != 0) - die(ret, + testutil_die(ret, "cursor.get_value: %" PRIu64, saved_keyno); if (buf_size != value.size || memcmp(buf, value.data, value.size) != 0) - die(0, "mismatched start/stop values"); + testutil_die(0, "mismatched start/stop values"); /* End the transaction. */ - if ((ret = - session->commit_transaction(session, NULL)) != 0) - die(ret, "session.commit_transaction"); + testutil_check( + session->commit_transaction(session, NULL)); /* Reset the cursor, releasing our pin. */ - if ((ret = cursor->reset(cursor)) != 0) - die(ret, "cursor.reset"); + testutil_check(cursor->reset(cursor)); pinned = 0; } else { /* @@ -106,9 +103,8 @@ lrt(void *arg) * positioned. As soon as the cursor loses its position * a new snapshot will be allocated. 
*/ - if ((ret = session->begin_transaction( - session, "isolation=snapshot")) != 0) - die(ret, "session.begin_transaction"); + testutil_check(session->begin_transaction( + session, "isolation=snapshot")); /* Read a record at the end of the table. */ do { @@ -120,7 +116,8 @@ lrt(void *arg) ; } while (ret == WT_NOTFOUND); if (ret != 0) - die(ret, "read_row %" PRIu64, saved_keyno); + testutil_die(ret, + "read_row %" PRIu64, saved_keyno); /* Copy the cursor's value. */ if (g.type == FIX) { @@ -130,11 +127,11 @@ lrt(void *arg) } else ret = cursor->get_value(cursor, &value); if (ret != 0) - die(ret, + testutil_die(ret, "cursor.get_value: %" PRIu64, saved_keyno); if (buf_len < value.size && (buf = realloc(buf, buf_len = value.size)) == NULL) - die(errno, "malloc"); + testutil_die(errno, "malloc"); memcpy(buf, value.data, buf_size = value.size); /* @@ -149,7 +146,7 @@ lrt(void *arg) ; } while (ret == WT_NOTFOUND); if (ret != 0) - die(ret, "read_row %" PRIu64, keyno); + testutil_die(ret, "read_row %" PRIu64, keyno); pinned = 1; } @@ -166,8 +163,7 @@ lrt(void *arg) break; } - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); free(keybuf); free(buf); diff --git a/test/format/ops.c b/test/format/ops.c index 36d56df1505..5d66f4d5391 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -56,7 +56,7 @@ wts_ops(int lastrun) pthread_t backup_tid, compact_tid, lrt_tid; int64_t fourths, thread_ops; uint32_t i; - int ret, running; + int running; conn = g.wts_conn; @@ -97,36 +97,32 @@ wts_ops(int lastrun) /* Open a session. */ if (g.logging != 0) { - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); (void)g.wt_api->msg_printf(g.wt_api, session, "=============== thread ops start ==============="); } /* Create thread structure; start the worker threads. 
*/ if ((tinfo = calloc((size_t)g.c_threads, sizeof(*tinfo))) == NULL) - die(errno, "calloc"); + testutil_die(errno, "calloc"); for (i = 0; i < g.c_threads; ++i) { tinfo[i].id = (int)i + 1; tinfo[i].state = TINFO_RUNNING; - if ((ret = - pthread_create(&tinfo[i].tid, NULL, ops, &tinfo[i])) != 0) - die(ret, "pthread_create"); + testutil_check( + pthread_create(&tinfo[i].tid, NULL, ops, &tinfo[i])); } /* * If a multi-threaded run, start optional backup, compaction and * long-running reader threads. */ - if (g.c_backups && - (ret = pthread_create(&backup_tid, NULL, backup, NULL)) != 0) - die(ret, "pthread_create: backup"); - if (g.c_compact && - (ret = pthread_create(&compact_tid, NULL, compact, NULL)) != 0) - die(ret, "pthread_create: compaction"); - if (!SINGLETHREADED && g.c_long_running_txn && - (ret = pthread_create(&lrt_tid, NULL, lrt, NULL)) != 0) - die(ret, "pthread_create: long-running reader"); + if (g.c_backups) + testutil_check(pthread_create(&backup_tid, NULL, backup, NULL)); + if (g.c_compact) + testutil_check( + pthread_create(&compact_tid, NULL, compact, NULL)); + if (!SINGLETHREADED && g.c_long_running_txn) + testutil_check(pthread_create(&lrt_tid, NULL, lrt, NULL)); /* Spin on the threads, calculating the totals. 
*/ for (;;) { @@ -192,8 +188,7 @@ wts_ops(int lastrun) if (g.logging != 0) { (void)g.wt_api->msg_printf(g.wt_api, session, "=============== thread ops stop ==============="); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); } } @@ -234,7 +229,7 @@ ops(void *arg) uint32_t op; uint8_t *keybuf, *valbuf; u_int np; - int ckpt_available, dir, insert, intxn, notfound, readonly, ret; + int ckpt_available, dir, insert, intxn, notfound, readonly; char *ckpt_config, ckpt_name[64]; tinfo = arg; @@ -269,9 +264,8 @@ ops(void *arg) */ if (intxn && (tinfo->ops == ckpt_op || tinfo->ops == session_op)) { - if ((ret = session->commit_transaction( - session, NULL)) != 0) - die(ret, "session.commit_transaction"); + testutil_check( + session->commit_transaction(session, NULL)); ++tinfo->commit; intxn = 0; } @@ -279,13 +273,11 @@ ops(void *arg) /* Open up a new session and cursors. */ if (tinfo->ops == session_op || session == NULL || cursor == NULL) { - if (session != NULL && - (ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + if (session != NULL) + testutil_check(session->close(session, NULL)); - if ((ret = conn->open_session(conn, NULL, - ops_session_config(&tinfo->rnd), &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, + ops_session_config(&tinfo->rnd), &session)); /* * 10% of the time, perform some read-only operations @@ -300,9 +292,8 @@ ops(void *arg) */ if (!SINGLETHREADED && !DATASOURCE("lsm") && ckpt_available && mmrand(&tinfo->rnd, 1, 10) == 1) { - if ((ret = session->open_cursor(session, - g.uri, NULL, ckpt_name, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(session->open_cursor(session, + g.uri, NULL, ckpt_name, &cursor)); /* Pick the next session/cursor close/open. 
*/ session_op += 250; @@ -323,13 +314,12 @@ ops(void *arg) * want to have to specify the record number, * which requires an append configuration. */ - if ((ret = session->open_cursor(session, g.uri, - NULL, "overwrite", &cursor)) != 0) - die(ret, "session.open_cursor"); - if ((g.type == FIX || g.type == VAR) && - (ret = session->open_cursor(session, g.uri, - NULL, "append", &cursor_insert)) != 0) - die(ret, "session.open_cursor"); + testutil_check(session->open_cursor(session, + g.uri, NULL, "overwrite", &cursor)); + if (g.type == FIX || g.type == VAR) + testutil_check(session->open_cursor( + session, g.uri, + NULL, "append", &cursor_insert)); /* Pick the next session/cursor close/open. */ session_op += mmrand(&tinfo->rnd, 100, 5000); @@ -358,21 +348,17 @@ ops(void *arg) } /* Named checkpoints lock out backups */ - if (ckpt_config != NULL && - (ret = pthread_rwlock_wrlock(&g.backup_lock)) != 0) - die(ret, - "pthread_rwlock_wrlock: backup lock"); - - if ((ret = - session->checkpoint(session, ckpt_config)) != 0) - die(ret, "session.checkpoint%s%s", - ckpt_config == NULL ? "" : ": ", - ckpt_config == NULL ? "" : ckpt_config); - - if (ckpt_config != NULL && - (ret = pthread_rwlock_unlock(&g.backup_lock)) != 0) - die(ret, - "pthread_rwlock_wrlock: backup lock"); + if (ckpt_config != NULL) + testutil_check( + pthread_rwlock_wrlock(&g.backup_lock)); + + testutil_checkfmt( + session->checkpoint(session, ckpt_config), + "%s", ckpt_config == NULL ? "" : ckpt_config); + + if (ckpt_config != NULL) + testutil_check( + pthread_rwlock_unlock(&g.backup_lock)); /* Rephrase the checkpoint name for cursor open. */ if (ckpt_config == NULL) @@ -393,8 +379,7 @@ ops(void *arg) * have to do the reset outside of a transaction. */ if (tinfo->ops > reset_op && !intxn) { - if ((ret = session->reset(session)) != 0) - die(ret, "session.reset"); + testutil_check(session->reset(session)); /* Pick the next reset operation. 
*/ reset_op += mmrand(&tinfo->rnd, 20000, 50000); @@ -406,9 +391,8 @@ ops(void *arg) */ if (!SINGLETHREADED && !intxn && mmrand(&tinfo->rnd, 1, 10) >= 8) { - if ((ret = - session->begin_transaction(session, NULL)) != 0) - die(ret, "session.begin_transaction"); + testutil_check( + session->begin_transaction(session, NULL)); intxn = 1; } @@ -466,9 +450,8 @@ ops(void *arg) if (col_insert(tinfo, cursor_insert, &key, &value, &keyno)) goto deadlock; - if ((ret = - cursor_insert->reset(cursor_insert)) != 0) - die(ret, "cursor.reset"); + testutil_check( + cursor_insert->reset(cursor_insert)); insert = 1; break; @@ -518,8 +501,7 @@ skip_insert: if (col_update(tinfo, goto deadlock; /* Reset the cursor: there is no reason to keep pages pinned. */ - if ((ret = cursor->reset(cursor)) != 0) - die(ret, "cursor.reset"); + testutil_check(cursor->reset(cursor)); /* * If we're in the transaction, commit 40% of the time and @@ -528,9 +510,8 @@ skip_insert: if (col_update(tinfo, if (intxn) switch (mmrand(&tinfo->rnd, 1, 10)) { case 1: case 2: case 3: case 4: /* 40% */ - if ((ret = session->commit_transaction( - session, NULL)) != 0) - die(ret, "session.commit_transaction"); + testutil_check(session->commit_transaction( + session, NULL)); ++tinfo->commit; intxn = 0; break; @@ -538,10 +519,8 @@ skip_insert: if (col_update(tinfo, if (0) { deadlock: ++tinfo->deadlock; } - if ((ret = session->rollback_transaction( - session, NULL)) != 0) - die(ret, - "session.rollback_transaction"); + testutil_check(session->rollback_transaction( + session, NULL)); ++tinfo->rollback; intxn = 0; break; @@ -550,8 +529,8 @@ deadlock: ++tinfo->deadlock; } } - if (session != NULL && (ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + if (session != NULL) + testutil_check(session->close(session, NULL)); free(keybuf); free(valbuf); @@ -573,7 +552,6 @@ wts_read_scan(void) WT_SESSION *session; uint64_t cnt, last_cnt; uint8_t *keybuf; - int ret; conn = g.wts_conn; @@ -581,12 +559,10 @@ 
wts_read_scan(void) key_gen_setup(&keybuf); /* Open a session and cursor pair. */ - if ((ret = conn->open_session( - conn, NULL, ops_session_config(NULL), &session)) != 0) - die(ret, "connection.open_session"); - if ((ret = session->open_cursor( - session, g.uri, NULL, NULL, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(conn->open_session( + conn, NULL, ops_session_config(NULL), &session)); + testutil_check(session->open_cursor( + session, g.uri, NULL, NULL, &cursor)); /* Check a random subset of the records using the key. */ for (last_cnt = cnt = 0; cnt < g.key_cnt;) { @@ -599,12 +575,11 @@ wts_read_scan(void) } key.data = keybuf; - if ((ret = read_row(cursor, &key, cnt, 0)) != 0) - die(ret, "read_scan"); + testutil_checkfmt( + read_row(cursor, &key, cnt, 0), "%s", "read_scan"); } - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); free(keybuf); } @@ -666,7 +641,7 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err) return (WT_NOTFOUND); break; default: - die(ret, "read_row: read row %" PRIu64, keyno); + testutil_die(ret, "read_row: read row %" PRIu64, keyno); } #ifdef HAVE_BERKELEY_DB @@ -703,7 +678,7 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err) "read_row: value mismatch %" PRIu64 ":\n", keyno); print_item("bdb", &bdb_value); print_item(" wt", &value); - die(0, NULL); + testutil_die(0, NULL); } } #endif @@ -748,7 +723,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp) break; } if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "%s", which); + testutil_die(ret, "%s", which); *notfoundp = (ret == WT_NOTFOUND); #ifdef HAVE_BERKELEY_DB @@ -777,7 +752,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp) fprintf(stderr, "nextprev: %s key mismatch:\n", which); print_item("bdb-key", &bdb_key); print_item(" wt-key", &key); - die(0, NULL); + testutil_die(0, NULL); } } else { if (keyno != 
(uint64_t)atoll(bdb_key.data)) { @@ -787,7 +762,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp) "nextprev: %s key mismatch: %.*s != %" PRIu64 "\n", which, (int)bdb_key.size, (char *)bdb_key.data, keyno); - die(0, NULL); + testutil_die(0, NULL); } } if (value.size != bdb_value.size || @@ -795,7 +770,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp) fprintf(stderr, "nextprev: %s value mismatch:\n", which); print_item("bdb-value", &bdb_value); print_item(" wt-value", &value); - die(0, NULL); + testutil_die(0, NULL); } if (g.logging == LOG_OPS) @@ -851,7 +826,8 @@ row_update(TINFO *tinfo, if (ret == WT_ROLLBACK) return (WT_ROLLBACK); if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "row_update: update row %" PRIu64 " by key", keyno); + testutil_die(ret, + "row_update: update row %" PRIu64 " by key", keyno); #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) @@ -905,7 +881,7 @@ col_update(TINFO *tinfo, if (ret == WT_ROLLBACK) return (WT_ROLLBACK); if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "col_update: %" PRIu64, keyno); + testutil_die(ret, "col_update: %" PRIu64, keyno); #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) @@ -937,7 +913,7 @@ table_append_init(void) free(g.append); if ((g.append = calloc(g.append_max, sizeof(uint64_t))) == NULL) - die(errno, "calloc"); + testutil_die(errno, "calloc"); } /* @@ -948,7 +924,7 @@ static void table_append(uint64_t keyno) { uint64_t *p, *ep; - int done, ret; + int done; ep = g.append + g.append_max; @@ -979,8 +955,7 @@ table_append(uint64_t keyno) * and we find a slot. 
*/ for (done = 0;;) { - if ((ret = pthread_rwlock_wrlock(&g.append_lock)) != 0) - die(ret, "pthread_rwlock_wrlock: append_lock"); + testutil_check(pthread_rwlock_wrlock(&g.append_lock)); /* * If this is the thread we've been waiting for, and its record @@ -1017,8 +992,7 @@ table_append(uint64_t keyno) break; } - if ((ret = pthread_rwlock_unlock(&g.append_lock)) != 0) - die(ret, "pthread_rwlock_unlock: append_lock"); + testutil_check(pthread_rwlock_unlock(&g.append_lock)); if (done) break; @@ -1055,7 +1029,8 @@ row_insert(TINFO *tinfo, if (ret == WT_ROLLBACK) return (WT_ROLLBACK); if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "row_insert: insert row %" PRIu64 " by key", keyno); + testutil_die(ret, + "row_insert: insert row %" PRIu64 " by key", keyno); #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) @@ -1094,10 +1069,9 @@ col_insert(TINFO *tinfo, if ((ret = cursor->insert(cursor)) != 0) { if (ret == WT_ROLLBACK) return (WT_ROLLBACK); - die(ret, "cursor.insert"); + testutil_die(ret, "cursor.insert"); } - if ((ret = cursor->get_key(cursor, &keyno)) != 0) - die(ret, "cursor.get_key"); + testutil_check(cursor->get_key(cursor, &keyno)); *keynop = (uint32_t)keyno; table_append(keyno); /* Extend the object. 
*/ @@ -1157,7 +1131,8 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp) if (ret == WT_ROLLBACK) return (WT_ROLLBACK); if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "row_remove: remove %" PRIu64 " by key", keyno); + testutil_die(ret, + "row_remove: remove %" PRIu64 " by key", keyno); *notfoundp = (ret == WT_NOTFOUND); #ifdef HAVE_BERKELEY_DB @@ -1200,7 +1175,8 @@ col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp) if (ret == WT_ROLLBACK) return (WT_ROLLBACK); if (ret != 0 && ret != WT_NOTFOUND) - die(ret, "col_remove: remove %" PRIu64 " by key", keyno); + testutil_die(ret, + "col_remove: remove %" PRIu64 " by key", keyno); *notfoundp = (ret == WT_NOTFOUND); #ifdef HAVE_BERKELEY_DB @@ -1245,7 +1221,7 @@ notfound_chk(const char *f, int wt_ret, int bdb_notfound, uint64_t keyno) fprintf(stderr, " row %" PRIu64 ":", keyno); fprintf(stderr, " not found in Berkeley DB, found in WiredTiger\n"); - die(0, NULL); + testutil_die(0, NULL); } if (wt_ret == WT_NOTFOUND) { fprintf(stderr, "%s: %s:", g.progname, f); @@ -1253,7 +1229,7 @@ notfound_chk(const char *f, int wt_ret, int bdb_notfound, uint64_t keyno) fprintf(stderr, " row %" PRIu64 ":", keyno); fprintf(stderr, " found in Berkeley DB, not found in WiredTiger\n"); - die(0, NULL); + testutil_die(0, NULL); } return (0); } diff --git a/test/format/rebalance.c b/test/format/rebalance.c index 8e8fa1a371f..d35dcec1d53 100644 --- a/test/format/rebalance.c +++ b/test/format/rebalance.c @@ -33,7 +33,6 @@ wts_rebalance(void) { WT_CONNECTION *conn; WT_SESSION *session; - int ret; char cmd[1024]; if (g.c_rebalance == 0) @@ -45,26 +44,23 @@ wts_rebalance(void) (void)snprintf(cmd, sizeof(cmd), "../../wt -h %s dump -f %s/rebalance.orig %s", g.home, g.home, g.uri); - if ((ret = system(cmd)) != 0) - die(ret, "command failed: %s", cmd); + testutil_checkfmt(system(cmd), "command failed: %s", cmd); /* Rebalance, then verify the object. 
*/ wts_reopen(); conn = g.wts_conn; - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, "=============== rebalance start ==============="); - if ((ret = session->rebalance(session, g.uri, NULL)) != 0) - die(ret, "session.rebalance: %s: %s", g.uri); + testutil_checkfmt( + session->rebalance(session, g.uri, NULL), "%s", g.uri); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, "=============== rebalance stop ==============="); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); wts_verify("post-rebalance verify"); wts_close(); @@ -72,13 +68,11 @@ wts_rebalance(void) (void)snprintf(cmd, sizeof(cmd), "../../wt -h %s dump -f %s/rebalance.new %s", g.home, g.home, g.uri); - if ((ret = system(cmd)) != 0) - die(ret, "command failed: %s", cmd); + testutil_checkfmt(system(cmd), "command failed: %s", cmd); /* Compare the old/new versions of the object. 
*/ (void)snprintf(cmd, sizeof(cmd), "cmp %s/rebalance.orig %s/rebalance.new > /dev/null", g.home, g.home); - if ((ret = system(cmd)) != 0) - die(ret, "command failed: %s", cmd); + testutil_checkfmt(system(cmd), "command failed: %s", cmd); } diff --git a/test/format/salvage.c b/test/format/salvage.c index d0358e998b4..526e1563390 100644 --- a/test/format/salvage.c +++ b/test/format/salvage.c @@ -42,12 +42,10 @@ salvage(void) conn = g.wts_conn; track("salvage", 0ULL, NULL); - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); if ((ret = session->salvage(session, g.uri, "force=true")) != 0) - die(ret, "session.salvage: %s", g.uri); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_die(ret, "session.salvage: %s", g.uri); + testutil_check(session->close(session, NULL)); } /* @@ -101,37 +99,37 @@ corrupt(void) return (0); found: if (fstat(fd, &sb) == -1) - die(errno, "salvage-corrupt: fstat"); + testutil_die(errno, "salvage-corrupt: fstat"); offset = mmrand(NULL, 0, (u_int)sb.st_size); len = (size_t)(20 + (sb.st_size / 100) * 2); (void)snprintf(buf, sizeof(buf), "%s/slvg.corrupt", g.home); if ((fp = fopen(buf, "w")) == NULL) - die(errno, "salvage-corrupt: open: %s", buf); + testutil_die(errno, "salvage-corrupt: open: %s", buf); (void)fprintf(fp, "salvage-corrupt: offset %" PRIuMAX ", length " SIZET_FMT "\n", (uintmax_t)offset, len); fclose_and_clear(&fp); if (lseek(fd, offset, SEEK_SET) == -1) - die(errno, "salvage-corrupt: lseek"); + testutil_die(errno, "salvage-corrupt: lseek"); memset(buf, 'z', sizeof(buf)); for (; len > 0; len -= nw) { nw = (size_t)(len > sizeof(buf) ? 
sizeof(buf) : len); if (write(fd, buf, nw) == -1) - die(errno, "salvage-corrupt: write"); + testutil_die(errno, "salvage-corrupt: write"); } if (close(fd) == -1) - die(errno, "salvage-corrupt: close"); + testutil_die(errno, "salvage-corrupt: close"); /* * Save a copy of the corrupted file so we can replay the salvage step * as necessary. */ if ((ret = system(copycmd)) != 0) - die(ret, "salvage corrupt copy step failed"); + testutil_die(ret, "salvage corrupt copy step failed"); return (1); } @@ -157,7 +155,7 @@ wts_salvage(void) * step as necessary. */ if ((ret = system(g.home_salvage_copy)) != 0) - die(ret, "salvage copy step failed"); + testutil_die(ret, "salvage copy step failed"); /* Salvage, then verify. */ wts_open(g.home, 1, &g.wts_conn); diff --git a/test/format/t.c b/test/format/t.c index ccbc0442e4a..28c22e23cb8 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -30,17 +30,20 @@ GLOBAL g; +static void format_die(void); static void startup(void); static void usage(void); extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = format_die; /* Local death handler. */ + int main(int argc, char *argv[]) { time_t start; - int ch, i, onerun, reps, ret; + int ch, i, onerun, reps; const char *config, *home; config = NULL; @@ -64,7 +67,7 @@ main(int argc, char *argv[]) #endif /* Track progress unless we're re-directing output to a file. */ - g.track = isatty(1) ? 1 : 0; + g.c_quiet = isatty(1) ? 0 : 1; /* Set values from the command line. */ home = NULL; @@ -99,7 +102,7 @@ main(int argc, char *argv[]) g.logging = LOG_OPS; break; case 'q': /* Quiet */ - g.track = 0; + g.c_quiet = 1; break; case 'r': /* Replay a run */ g.replay = 1; @@ -125,9 +128,9 @@ main(int argc, char *argv[]) /* If it's a replay, use the home directory's CONFIG file. 
*/ if (g.replay) { if (config != NULL) - die(EINVAL, "-c incompatible with -r"); + testutil_die(EINVAL, "-c incompatible with -r"); if (access(g.home_config, R_OK) != 0) - die(ENOENT, "%s", g.home_config); + testutil_die(ENOENT, "%s", g.home_config); config = g.home_config; } @@ -176,12 +179,9 @@ main(int argc, char *argv[]) * Initialize locks to single-thread named checkpoints and backups, last * last-record updates, and failures. */ - if ((ret = pthread_rwlock_init(&g.append_lock, NULL)) != 0) - die(ret, "pthread_rwlock_init: append lock"); - if ((ret = pthread_rwlock_init(&g.backup_lock, NULL)) != 0) - die(ret, "pthread_rwlock_init: backup lock"); - if ((ret = pthread_rwlock_init(&g.death_lock, NULL)) != 0) - die(ret, "pthread_rwlock_init: death lock"); + testutil_check(pthread_rwlock_init(&g.append_lock, NULL)); + testutil_check(pthread_rwlock_init(&g.backup_lock, NULL)); + testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); printf("%s: process %" PRIdMAX "\n", g.progname, (intmax_t)getpid()); while (++g.run_cnt <= g.c_runs || g.c_runs == 0 ) { @@ -259,7 +259,7 @@ main(int argc, char *argv[]) wts_salvage(); /* Overwrite the progress line with a completion line. */ - if (g.track) + if (!g.c_quiet) printf("\r%78s\r", " "); printf("%4d: %s, %s (%.0f seconds)\n", g.run_cnt, g.c_data_source, @@ -273,10 +273,8 @@ main(int argc, char *argv[]) config_print(0); - if ((ret = pthread_rwlock_destroy(&g.append_lock)) != 0) - die(ret, "pthread_rwlock_destroy: append lock"); - if ((ret = pthread_rwlock_destroy(&g.backup_lock)) != 0) - die(ret, "pthread_rwlock_destroy: backup lock"); + testutil_check(pthread_rwlock_destroy(&g.append_lock)); + testutil_check(pthread_rwlock_destroy(&g.backup_lock)); config_clear(); @@ -298,41 +296,33 @@ startup(void) /* Create or initialize the home and data-source directories. 
*/ if ((ret = system(g.home_init)) != 0) - die(ret, "home directory initialization failed"); + testutil_die(ret, "home directory initialization failed"); /* Open/truncate the logging file. */ if (g.logging != 0 && (g.logfp = fopen(g.home_log, "w")) == NULL) - die(errno, "fopen: %s", g.home_log); + testutil_die(errno, "fopen: %s", g.home_log); /* Open/truncate the random number logging file. */ if ((g.randfp = fopen(g.home_rand, g.replay ? "r" : "w")) == NULL) - die(errno, "%s", g.home_rand); + testutil_die(errno, "%s", g.home_rand); } /* * die -- - * Report an error and quit, dumping the configuration. + * Report an error, dumping the configuration. */ -void -die(int e, const char *fmt, ...) +static void +format_die(void) { - va_list ap; - - /* Single-thread error handling. */ + /* + * Single-thread error handling, our caller exits after calling + * us - don't release the lock. + */ (void)pthread_rwlock_wrlock(&g.death_lock); /* Try and turn off tracking so it doesn't obscure the error message. */ - if (g.track) { - g.track = 0; - fprintf(stderr, "\n"); - } - if (fmt != NULL) { /* Death message. */ - fprintf(stderr, "%s: ", g.progname); - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - if (e != 0) - fprintf(stderr, ": %s", wiredtiger_strerror(e)); + if (!g.c_quiet) { + g.c_quiet = 1; fprintf(stderr, "\n"); } @@ -343,8 +333,6 @@ die(int e, const char *fmt, ...) /* Display the configuration that failed. 
*/ if (g.run_cnt) config_print(1); - - exit(EXIT_FAILURE); } /* diff --git a/test/format/util.c b/test/format/util.c index 2b6b9d67fc3..2e4c869366c 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -42,7 +42,7 @@ dmalloc(size_t len) void *p; if ((p = malloc(len)) == NULL) - die(errno, "malloc"); + testutil_die(errno, "malloc"); return (p); } @@ -56,7 +56,7 @@ dstrdup(const char *str) char *p; if ((p = strdup(str)) == NULL) - die(errno, "strdup"); + testutil_die(errno, "strdup"); return (p); } @@ -236,7 +236,7 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) int len; char msg[128]; - if (!g.track || tag == NULL) + if (g.c_quiet || tag == NULL) return; if (tinfo == NULL && cnt == 0) @@ -268,9 +268,9 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) lastlen = len; if (printf("%s\r", msg) < 0) - die(EIO, "printf"); + testutil_die(EIO, "printf"); if (fflush(stdout) == EOF) - die(errno, "fflush"); + testutil_die(errno, "fflush"); } /* @@ -310,6 +310,10 @@ path_setup(const char *home) g.home_backup = dmalloc(len); snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP"); + len = strlen(g.home) + strlen("BACKUP2") + 2; + g.home_backup2 = dmalloc(len); + snprintf(g.home_backup2, len, "%s/%s", g.home, "BACKUP2"); + /* BDB directory. */ len = strlen(g.home) + strlen("bdb") + 2; g.home_bdb = dmalloc(len); @@ -340,13 +344,15 @@ path_setup(const char *home) /* Backup directory initialize command, remove and re-create it. 
*/ #undef CMD #ifdef _WIN32 -#define CMD "del /s /q >:nul && mkdir %s" +#define CMD "del /s /q >:nul && mkdir %s %s" #else -#define CMD "rm -rf %s && mkdir %s" +#define CMD "rm -rf %s %s && mkdir %s %s" #endif - len = strlen(g.home_backup) * 2 + strlen(CMD) + 1; + len = strlen(g.home_backup) * 2 + + strlen(g.home_backup2) * 2 + strlen(CMD) + 1; g.home_backup_init = dmalloc(len); - snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup); + snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup2, + g.home_backup, g.home_backup2); /* * Salvage command, save the interesting files so we can replay the @@ -407,7 +413,7 @@ rng(WT_RAND_STATE *rnd) "\n" "end of random number log reached\n"); exit(EXIT_SUCCESS); } - die(errno, "random number log"); + testutil_die(errno, "random number log"); } return ((uint32_t)strtoul(buf, NULL, 10)); @@ -435,6 +441,6 @@ fclose_and_clear(FILE **fpp) return; *fpp = NULL; if (fclose(fp) != 0) - die(errno, "fclose"); + testutil_die(errno, "fclose"); return; } diff --git a/test/format/wts.c b/test/format/wts.c index 9d4d3fe5cb8..81e484296e2 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -53,7 +53,8 @@ compressor(uint32_t compress_flag) default: break; } - die(EINVAL, "illegal compression flag: 0x%x", compress_flag); + testutil_die(EINVAL, + "illegal compression flag: %#" PRIx32, compress_flag); } /* @@ -71,7 +72,8 @@ encryptor(uint32_t encrypt_flag) default: break; } - die(EINVAL, "illegal encryption flag: 0x%x", encrypt_flag); + testutil_die(EINVAL, + "illegal encryption flag: %#" PRIx32, encrypt_flag); } static int @@ -222,7 +224,8 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp) p += snprintf(p, REMAIN(p, end), ",%s", g.config_open); if (REMAIN(p, end) == 0) - die(ENOMEM, "wiredtiger_open configuration buffer too small"); + testutil_die(ENOMEM, + "wiredtiger_open configuration buffer too small"); /* * Direct I/O may not work with backups, doing copies through the buffer @@ -233,8 +236,8 
@@ wts_open(const char *home, int set_api, WT_CONNECTION **connp) if (strstr(config, "direct_io") != NULL) g.c_backups = 0; - if ((ret = wiredtiger_open(home, &event_handler, config, &conn)) != 0) - die(ret, "wiredtiger_open: %s", home); + testutil_checkfmt( + wiredtiger_open(home, &event_handler, config, &conn), "%s", home); if (set_api) g.wt_api = conn->get_extension_api(conn); @@ -247,7 +250,7 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp) */ if (DATASOURCE("helium")) { if (g.helium_mount == NULL) - die(EINVAL, "no Helium mount point specified"); + testutil_die(EINVAL, "no Helium mount point specified"); (void)snprintf(helium_config, sizeof(helium_config), "entry=wiredtiger_extension_init,config=[" "helium_verbose=0," @@ -256,7 +259,7 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp) g.helium_mount); if ((ret = conn->load_extension( conn, HELIUM_PATH, helium_config)) != 0) - die(ret, + testutil_die(ret, "WT_CONNECTION.load_extension: %s:%s", HELIUM_PATH, helium_config); } @@ -270,11 +273,8 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp) void wts_reopen(void) { - int ret; - - if ((ret = wiredtiger_open(g.home, - &event_handler, g.wiredtiger_open_config, &g.wts_conn)) != 0) - die(ret, "wiredtiger_open: %s", g.home); + testutil_checkfmt(wiredtiger_open(g.home, &event_handler, + g.wiredtiger_open_config, &g.wts_conn), "%s", g.home); } /* @@ -287,7 +287,6 @@ wts_create(void) WT_CONNECTION *conn; WT_SESSION *session; uint32_t maxintlpage, maxintlkey, maxleafpage, maxleafkey, maxleafvalue; - int ret; char config[4096], *end, *p; conn = g.wts_conn; @@ -316,7 +315,7 @@ wts_create(void) p += snprintf(p, REMAIN(p, end), "key_format=%s," "allocation_size=512,%s" - "internal_page_max=%d,leaf_page_max=%d", + "internal_page_max=%" PRIu32 ",leaf_page_max=%" PRIu32, (g.type == ROW) ? "u" : "r", g.c_firstfit ? 
"block_allocation=first," : "", maxintlpage, maxleafpage); @@ -328,15 +327,15 @@ wts_create(void) maxintlkey = mmrand(NULL, maxintlpage / 50, maxintlpage / 40); if (maxintlkey > 20) p += snprintf(p, REMAIN(p, end), - ",internal_key_max=%d", maxintlkey); + ",internal_key_max=%" PRIu32, maxintlkey); maxleafkey = mmrand(NULL, maxleafpage / 50, maxleafpage / 40); if (maxleafkey > 20) p += snprintf(p, REMAIN(p, end), - ",leaf_key_max=%d", maxleafkey); + ",leaf_key_max=%" PRIu32, maxleafkey); maxleafvalue = mmrand(NULL, maxleafpage * 10, maxleafpage / 40); if (maxleafvalue > 40 && maxleafvalue < 100 * 1024) p += snprintf(p, REMAIN(p, end), - ",leaf_value_max=%d", maxleafvalue); + ",leaf_value_max=%" PRIu32, maxleafvalue); switch (g.type) { case FIX: @@ -364,7 +363,7 @@ wts_create(void) ",huffman_value=english"); if (g.c_dictionary) p += snprintf(p, REMAIN(p, end), - ",dictionary=%d", mmrand(NULL, 123, 517)); + ",dictionary=%" PRIu32, mmrand(NULL, 123, 517)); break; } @@ -431,32 +430,28 @@ wts_create(void) } if (REMAIN(p, end) == 0) - die(ENOMEM, "WT_SESSION.create configuration buffer too small"); + testutil_die(ENOMEM, + "WT_SESSION.create configuration buffer too small"); /* * Create the underlying store. */ - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); - if ((ret = session->create(session, g.uri, config)) != 0) - die(ret, "session.create: %s", g.uri); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + testutil_checkfmt(session->create(session, g.uri, config), "%s", g.uri); + testutil_check(session->close(session, NULL)); } void wts_close(void) { WT_CONNECTION *conn; - int ret; const char *config; conn = g.wts_conn; config = g.c_leak_memory ? 
"leak_memory" : NULL; - if ((ret = conn->close(conn, config)) != 0) - die(ret, "connection.close"); + testutil_check(conn->close(conn, config)); g.wts_conn = NULL; g.wt_api = NULL; } @@ -466,7 +461,6 @@ wts_dump(const char *tag, int dump_bdb) { #ifdef HAVE_BERKELEY_DB size_t len; - int ret; char *cmd; /* @@ -491,8 +485,7 @@ wts_dump(const char *tag, int dump_bdb) g.uri == NULL ? "" : "-n", g.uri == NULL ? "" : g.uri); - if ((ret = system(cmd)) != 0) - die(ret, "%s: dump comparison failed", tag); + testutil_checkfmt(system(cmd), "%s: dump comparison failed", tag); free(cmd); #else (void)tag; /* [-Wunused-variable] */ @@ -513,8 +506,7 @@ wts_verify(const char *tag) conn = g.wts_conn; track("verify", 0ULL, NULL); - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, "=============== verify start ==============="); @@ -522,13 +514,12 @@ wts_verify(const char *tag) /* Session operations for LSM can return EBUSY. */ ret = session->verify(session, g.uri, "strict"); if (ret != 0 && !(ret == EBUSY && DATASOURCE("lsm"))) - die(ret, "session.verify: %s: %s", g.uri, tag); + testutil_die(ret, "session.verify: %s: %s", g.uri, tag); if (g.logging != 0) (void)g.wt_api->msg_printf(g.wt_api, session, "=============== verify stop ==============="); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); } /* @@ -558,49 +549,43 @@ wts_stats(void) conn = g.wts_conn; track("stat", 0ULL, NULL); - if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) - die(ret, "connection.open_session"); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); if ((fp = fopen(g.home_stats, "w")) == NULL) - die(errno, "fopen: %s", g.home_stats); + testutil_die(errno, "fopen: %s", g.home_stats); /* Connection statistics. 
*/ fprintf(fp, "====== Connection statistics:\n"); - if ((ret = session->open_cursor(session, - "statistics:", NULL, NULL, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(session->open_cursor( + session, "statistics:", NULL, NULL, &cursor)); while ((ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, &v)) == 0) if (fprintf(fp, "%s=%s\n", desc, pval) < 0) - die(errno, "fprintf"); + testutil_die(errno, "fprintf"); if (ret != WT_NOTFOUND) - die(ret, "cursor.next"); - if ((ret = cursor->close(cursor)) != 0) - die(ret, "cursor.close"); + testutil_die(ret, "cursor.next"); + testutil_check(cursor->close(cursor)); /* Data source statistics. */ fprintf(fp, "\n\n====== Data source statistics:\n"); stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1); sprintf(stat_name, "statistics:%s", g.uri); - if ((ret = session->open_cursor( - session, stat_name, NULL, NULL, &cursor)) != 0) - die(ret, "session.open_cursor"); + testutil_check(session->open_cursor( + session, stat_name, NULL, NULL, &cursor)); free(stat_name); while ((ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, &v)) == 0) if (fprintf(fp, "%s=%s\n", desc, pval) < 0) - die(errno, "fprintf"); + testutil_die(errno, "fprintf"); if (ret != WT_NOTFOUND) - die(ret, "cursor.next"); - if ((ret = cursor->close(cursor)) != 0) - die(ret, "cursor.close"); + testutil_die(ret, "cursor.next"); + testutil_check(cursor->close(cursor)); fclose_and_clear(&fp); - if ((ret = session->close(session, NULL)) != 0) - die(ret, "session.close"); + testutil_check(session->close(session, NULL)); } diff --git a/test/huge/huge.c b/test/huge/huge.c index d09f6f375fb..ad19035ff99 100644 --- a/test/huge/huge.c +++ b/test/huge/huge.c @@ -167,6 +167,8 @@ run(CONFIG *cp, int bigkey, size_t bytes) extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { diff --git a/test/manydbs/Makefile.am 
b/test/manydbs/Makefile.am new file mode 100644 index 00000000000..53559b25243 --- /dev/null +++ b/test/manydbs/Makefile.am @@ -0,0 +1,13 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/test/utility + +noinst_PROGRAMS = t +t_SOURCES = manydbs.c +t_LDADD = $(top_builddir)/libwiredtiger.la +t_LDFLAGS = -static + +# Run this during a "make check" smoke test. +TESTS = smoke.sh + +clean-local: + rm -rf WiredTiger* *.core __* diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c new file mode 100644 index 00000000000..1d3412a7b06 --- /dev/null +++ b/test/manydbs/manydbs.c @@ -0,0 +1,264 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <sys/wait.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifndef _WIN32 +#include <unistd.h> +#endif + +#include <wiredtiger.h> + +#include "test_util.i" + +#define HOME_SIZE 512 +#define HOME_BASE "WT_HOME" +static char home[HOME_SIZE]; /* Base home directory */ +static char hometmp[HOME_SIZE]; /* Each conn home directory */ +static const char *progname; /* Program name */ +static const char * const uri = "table:main"; + +#define WTOPEN_CFG_COMMON \ + "create,log=(file_max=10M,archive=false,enabled)," \ + "statistics=(fast),statistics_log=(wait=5)," +#define WT_CONFIG0 \ + WTOPEN_CFG_COMMON \ + "transaction_sync=(enabled=false)" +#define WT_CONFIG1 \ + WTOPEN_CFG_COMMON \ + "transaction_sync=(enabled,method=none)" +#define WT_CONFIG2 \ + WTOPEN_CFG_COMMON \ + "transaction_sync=(enabled,method=fsync)" + +#define MAX_DBS 10 +#define MAX_IDLE_TIME 30 +#define IDLE_INCR 5 + +#define MAX_KV 100 +#define MAX_VAL 128 + +static void +usage(void) +{ + fprintf(stderr, + "usage: %s [-I] [-D maxdbs] [-h dir]\n", progname); + exit(EXIT_FAILURE); +} + +extern int __wt_optind; +extern char *__wt_optarg; + +void (*custom_die)(void) = NULL; + +WT_CONNECTION **connections = NULL; +WT_CURSOR **cursors = NULL; +WT_RAND_STATE rnd; +WT_SESSION **sessions = NULL; + +static int +get_stat(WT_SESSION *stat_session, int stat_field, uint64_t *valuep) +{ + WT_CURSOR *statc; + const char *desc, *pvalue; + int ret; + + testutil_check(stat_session->open_cursor(stat_session, + "statistics:", NULL, NULL, &statc)); + statc->set_key(statc, stat_field); + if ((ret = statc->search(statc)) != 0) + return (ret); + + ret = statc->get_value(statc, &desc, &pvalue, valuep); + testutil_check(statc->close(statc)); + return (ret); +} + +static int +run_ops(int dbs) +{ + WT_ITEM data; + int db_set, i, key; + uint32_t db; + uint8_t buf[MAX_VAL]; + + memset(buf, 0, sizeof(buf)); + for (i = 0; i < MAX_VAL; ++i) + buf[i] = 
(uint8_t)__wt_random(&rnd); + data.data = buf; + /* + * Write a small amount of data into a random subset of the databases. + */ + db_set = dbs / 4; + for (i = 0; i < db_set; ++i) { + db = __wt_random(&rnd) % (uint32_t)dbs; + printf("Write to database %" PRIu32 "\n", db); + for (key = 0; key < MAX_KV; ++key) { + data.size = __wt_random(&rnd) % MAX_VAL; + cursors[db]->set_key(cursors[db], key); + cursors[db]->set_value(cursors[db], &data); + testutil_check(cursors[db]->insert(cursors[db])); + } + } + return (0); +} + +int +main(int argc, char *argv[]) +{ + uint64_t cond_reset, cond_wait; + uint64_t *cond_reset_orig; + int cfg, ch, dbs, i; + bool idle; + const char *working_dir, *wt_cfg; + char cmd[128]; + + if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) + progname = argv[0]; + else + ++progname; + dbs = MAX_DBS; + working_dir = HOME_BASE; + idle = false; + while ((ch = __wt_getopt(progname, argc, argv, "D:h:I")) != EOF) + switch (ch) { + case 'D': + dbs = atoi(__wt_optarg); + break; + case 'h': + working_dir = __wt_optarg; + break; + case 'I': + idle = true; + break; + default: + usage(); + } + argc -= __wt_optind; + argv += __wt_optind; + if (argc != 0) + usage(); + + /* + * Allocate arrays for connection handles, sessions, statistics + * cursors and, if needed, data cursors. + */ + if ((connections = calloc( + (size_t)dbs, sizeof(WT_CONNECTION *))) == NULL) + testutil_die(ENOMEM, "connection array malloc"); + if ((sessions = calloc( + (size_t)dbs, sizeof(WT_SESSION *))) == NULL) + testutil_die(ENOMEM, "session array malloc"); + if ((cond_reset_orig = calloc((size_t)dbs, sizeof(uint64_t))) == NULL) + testutil_die(ENOMEM, "orig stat malloc"); + if (!idle && ((cursors = calloc( + (size_t)dbs, sizeof(WT_CURSOR *))) == NULL)) + testutil_die(ENOMEM, "cursor array malloc"); + memset(cmd, 0, sizeof(cmd)); + /* + * Set up all the directory names. 
+ */ + testutil_work_dir_from_path(home, HOME_SIZE, working_dir); + testutil_make_work_dir(home); + __wt_random_init(&rnd); + for (i = 0; i < dbs; ++i) { + snprintf(hometmp, HOME_SIZE, "%s/%s.%d", home, HOME_BASE, i); + testutil_make_work_dir(hometmp); + /* + * Open each database. Rotate different configurations + * among them. Open a session and statistics cursor. + * If writing data, create the table and open a data cursor. + */ + cfg = i % 3; + if (cfg == 0) + wt_cfg = WT_CONFIG0; + else if (cfg == 1) + wt_cfg = WT_CONFIG1; + else + wt_cfg = WT_CONFIG2; + testutil_check(wiredtiger_open( + hometmp, NULL, wt_cfg, &connections[i])); + testutil_check(connections[i]->open_session(connections[i], + NULL, NULL, &sessions[i])); + if (!idle) { + testutil_check(sessions[i]->create(sessions[i], + uri, "key_format=Q,value_format=u")); + testutil_check(sessions[i]->open_cursor(sessions[i], + uri, NULL, NULL, &cursors[i])); + } + } + + sleep(10); + + /* + * Record original reset setting. There could have been some + * activity during the creation period. + */ + for (i = 0; i < dbs; ++i) + testutil_check(get_stat(sessions[i], + WT_STAT_CONN_COND_AUTO_WAIT_RESET, &cond_reset_orig[i])); + for (i = 0; i < MAX_IDLE_TIME; i += IDLE_INCR) { + if (!idle) + testutil_check(run_ops(dbs)); + printf("Sleep %d (%d of %d)\n", IDLE_INCR, i, MAX_IDLE_TIME); + sleep(IDLE_INCR); + } + for (i = 0; i < dbs; ++i) { + testutil_check(get_stat(sessions[i], + WT_STAT_CONN_COND_AUTO_WAIT_RESET, &cond_reset)); + testutil_check(get_stat(sessions[i], + WT_STAT_CONN_COND_AUTO_WAIT, &cond_wait)); + /* + * On an idle workload there should be no resets of condition + * variables during the idle period. Even with a light + * workload, resets should not be very common. We look for 5%. 
+ */ + if (idle && cond_reset != cond_reset_orig[i]) + testutil_die(ERANGE, + "condition reset on idle connection %d of %" PRIu64, + i, cond_reset); + if (!idle && cond_reset > cond_wait / 20) + testutil_die(ERANGE, "connection %d condition reset %" + PRIu64 " exceeds 5%% of %" PRIu64, + i, cond_reset, cond_wait); + testutil_check(connections[i]->close(connections[i], NULL)); + } + + /* Cleanup allocated memory. */ + free(connections); + free(sessions); + free(cond_reset_orig); + if (!idle) + free(cursors); + + return (EXIT_SUCCESS); +} diff --git a/test/manydbs/smoke.sh b/test/manydbs/smoke.sh new file mode 100755 index 00000000000..c0e2976f154 --- /dev/null +++ b/test/manydbs/smoke.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +set -e + +# Smoke-test format as part of running "make check". +# Run with: +# 1. The defaults +# 2. Set idle flag to turn off operations. +# 3. More dbs. +# +echo "manydbs: default with operations turned on" +$TEST_WRAPPER ./t +echo "manydbs: totally idle databases" +$TEST_WRAPPER ./t -I +echo "manydbs: 40 databases with operations" +$TEST_WRAPPER ./t -D 40 +echo "manydbs: 40 idle databases" +$TEST_WRAPPER ./t -I -D 40 diff --git a/test/readonly/Makefile.am b/test/readonly/Makefile.am new file mode 100644 index 00000000000..3abcd2386a1 --- /dev/null +++ b/test/readonly/Makefile.am @@ -0,0 +1,13 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/test/utility + +noinst_PROGRAMS = t +t_SOURCES = readonly.c +t_LDADD = $(top_builddir)/libwiredtiger.la +t_LDFLAGS = -static + +# Run this during a "make check" smoke test. +TESTS = smoke.sh + +clean-local: + rm -rf WT_RD* WiredTiger* *.core __* diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c new file mode 100644 index 00000000000..41400da2605 --- /dev/null +++ b/test/readonly/readonly.c @@ -0,0 +1,409 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. 
+ * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <sys/wait.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifndef _WIN32 +#include <unistd.h> +#endif + +#include <wiredtiger.h> + +#include "test_util.i" + +#define HOME_SIZE 512 +static char home[HOME_SIZE]; /* Program working dir lock file */ +#define HOME_WR_SUFFIX ".WRNOLOCK" /* Writable dir copy no lock file */ +static char home_wr[HOME_SIZE + sizeof(HOME_WR_SUFFIX)]; +#define HOME_RD_SUFFIX ".RD" /* Read-only dir */ +static char home_rd[HOME_SIZE + sizeof(HOME_RD_SUFFIX)]; +#define HOME_RD2_SUFFIX ".RDNOLOCK" /* Read-only dir no lock file */ +static char home_rd2[HOME_SIZE + sizeof(HOME_RD2_SUFFIX)]; + +static const char *progname; /* Program name */ +static const char *saved_argv0; /* Program command */ +static const char * const uri = "table:main"; + +#define ENV_CONFIG \ + "create,log=(file_max=10M,archive=false,enabled)," \ + "transaction_sync=(enabled,method=none)" +#define ENV_CONFIG_RD "readonly=true" +#define ENV_CONFIG_WR "readonly=false" +#define MAX_VAL 4096 +#define MAX_KV 10000 + +#define EXPECT_ERR 1 +#define EXPECT_SUCCESS 0 + +#define OP_READ 0 +#define OP_WRITE 1 + +static void +usage(void) +{ + fprintf(stderr, "usage: %s [-h dir]\n", progname); + exit(EXIT_FAILURE); +} + +static int +run_child(const char *homedir, int op, int expect) +{ + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_SESSION *session; + int i, ret; + const char *cfg; + + /* + * We expect the read-only database will allow the second read-only + * handle to succeed because no one can create or set the lock file. 
+ */ + if (op == OP_READ) + cfg = ENV_CONFIG_RD; + else + cfg = ENV_CONFIG_WR; + if ((ret = wiredtiger_open(homedir, NULL, cfg, &conn)) == 0) { + if (expect == EXPECT_ERR) + testutil_die( + ret, "wiredtiger_open expected error, succeeded"); + } else { + if (expect == EXPECT_SUCCESS) + testutil_die( + ret, "wiredtiger_open expected success, error"); + /* + * If we expect an error and got one, we're done. + */ + return (0); + } + + /* + * Make sure we can read the data. + */ + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri); + + i = 0; + while ((ret = cursor->next(cursor)) == 0) + ++i; + if (i != MAX_KV) + testutil_die(EPERM, "cursor walk"); + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "conn_close"); + return (0); +} + +/* + * Child process opens both databases readonly. + */ +static void +open_dbs(int op, const char *dir, + const char *dir_wr, const char *dir_rd, const char *dir_rd2) +{ + int expect, ret; + + /* + * The parent has an open connection to all directories. + * We expect opening the writeable homes to return an error. + * It is a failure if the child successfully opens that. + */ + expect = EXPECT_ERR; + if ((ret = run_child(dir, op, expect)) != 0) + testutil_die(ret, "wiredtiger_open readonly allowed"); + if ((ret = run_child(dir_wr, op, expect)) != 0) + testutil_die(ret, "wiredtiger_open readonly allowed"); + + /* + * The parent must have a read-only connection open to the + * read-only databases. If the child is opening read-only + * too, we expect success. Otherwise an error if the child + * attempts to open read/write (permission error). 
+ */ + if (op == OP_READ) + expect = EXPECT_SUCCESS; + if ((ret = run_child(dir_rd, op, expect)) != 0) + testutil_die(ret, "run child 1"); + if ((ret = run_child(dir_rd2, op, expect)) != 0) + testutil_die(ret, "run child 2"); + exit(EXIT_SUCCESS); +} + +extern int __wt_optind; +extern char *__wt_optarg; + +void (*custom_die)(void) = NULL; + +int +main(int argc, char *argv[]) +{ + WT_CONNECTION *conn, *conn2, *conn3, *conn4; + WT_CURSOR *cursor; + WT_ITEM data; + WT_SESSION *session; + uint64_t i; + int ch, status, op, ret; + bool child; + const char *working_dir; + char cmd[512]; + uint8_t buf[MAX_VAL]; + + if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) + progname = argv[0]; + else + ++progname; + /* + * Needed unaltered for system command later. + */ + saved_argv0 = argv[0]; + + working_dir = "WT_RD"; + child = false; + op = OP_READ; + while ((ch = __wt_getopt(progname, argc, argv, "Rh:W")) != EOF) + switch (ch) { + case 'R': + child = true; + op = OP_READ; + break; + case 'W': + child = true; + op = OP_WRITE; + break; + case 'h': + working_dir = __wt_optarg; + break; + default: + usage(); + } + argc -= __wt_optind; + argv += __wt_optind; + if (argc != 0) + usage(); + + /* + * Set up all the directory names. + */ + testutil_work_dir_from_path(home, sizeof(home), working_dir); + (void)snprintf(home_wr, sizeof(home_wr), "%s%s", home, HOME_WR_SUFFIX); + (void)snprintf(home_rd, sizeof(home_rd), "%s%s", home, HOME_RD_SUFFIX); + (void)snprintf( + home_rd2, sizeof(home_rd2), "%s%s", home, HOME_RD2_SUFFIX); + if (!child) { + testutil_make_work_dir(home); + testutil_make_work_dir(home_wr); + testutil_make_work_dir(home_rd); + testutil_make_work_dir(home_rd2); + } else + /* + * We are a child process, we just want to call + * the open_dbs with the directories we have. + * The child function will exit. + */ + open_dbs(op, home, home_wr, home_rd, home_rd2); + + /* + * Parent creates a database and table. Then cleanly shuts down. 
+ * Then copy database to read-only directory and chmod. + * Also copy database to read-only directory and remove the lock + * file. One read-only database will have a lock file in the + * file system and the other will not. + * Parent opens all databases with read-only configuration flag. + * Parent forks off child who tries to also open all databases + * with the read-only flag. It should error on the writeable + * directory, but allow it on the read-only directories. + * The child then confirms it can read all the data. + */ + /* + * Run in the home directory and create the table. + */ + if ((ret = wiredtiger_open(home, NULL, ENV_CONFIG, &conn)) != 0) + testutil_die(ret, "wiredtiger_open"); + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "WT_CONNECTION:open_session"); + if ((ret = session->create(session, + uri, "key_format=Q,value_format=u")) != 0) + testutil_die(ret, "WT_SESSION.create: %s", uri); + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "WT_SESSION.open_cursor: %s", uri); + + /* + * Write data into the table and then cleanly shut down connection. + */ + memset(buf, 0, sizeof(buf)); + data.data = buf; + data.size = MAX_VAL; + for (i = 0; i < MAX_KV; ++i) { + cursor->set_key(cursor, i); + cursor->set_value(cursor, &data); + if ((ret = cursor->insert(cursor)) != 0) + testutil_die(ret, "WT_CURSOR.insert"); + } + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + + /* + * Copy the database. Remove any lock file from one copy + * and chmod the copies to be read-only permissions. 
+ */ + (void)snprintf(cmd, sizeof(cmd), + "cp -rp %s/* %s; rm -f %s/WiredTiger.lock", + home, home_wr, home_wr); + (void)system(cmd); + + (void)snprintf(cmd, sizeof(cmd), + "cp -rp %s/* %s; chmod 0555 %s; chmod -R 0444 %s/*", + home, home_rd, home_rd, home_rd); + (void)system(cmd); + + (void)snprintf(cmd, sizeof(cmd), + "cp -rp %s/* %s; rm -f %s/WiredTiger.lock; " + "chmod 0555 %s; chmod -R 0444 %s/*", + home, home_rd2, home_rd2, home_rd2, home_rd2); + (void)system(cmd); + + /* + * Run four scenarios. Sometimes expect errors, sometimes success. + * The writable database directories should always fail to allow the + * child to open due to the lock file. The read-only ones will only + * succeed when the child attempts read-only. + * + * 1. Parent has read-only handle to all databases. Child opens + * read-only also. + * 2. Parent has read-only handle to all databases. Child opens + * read-write. + * 3. Parent has read-write handle to writable databases and + * read-only to read-only databases. Child opens read-only. + * 4. Parent has read-write handle to writable databases and + * read-only to read-only databases. Child opens read-write. + */ + /* + * Open a connection handle to all databases. + */ + fprintf(stderr, " *** Expect several error messages from WT ***\n"); + /* + * Scenario 1. + */ + if ((ret = wiredtiger_open(home, NULL, ENV_CONFIG_RD, &conn)) != 0) + testutil_die(ret, "wiredtiger_open original home"); + if ((ret = wiredtiger_open(home_wr, NULL, ENV_CONFIG_RD, &conn2)) != 0) + testutil_die(ret, "wiredtiger_open write nolock"); + if ((ret = wiredtiger_open(home_rd, NULL, ENV_CONFIG_RD, &conn3)) != 0) + testutil_die(ret, "wiredtiger_open readonly"); + if ((ret = wiredtiger_open(home_rd2, NULL, ENV_CONFIG_RD, &conn4)) != 0) + testutil_die(ret, "wiredtiger_open readonly nolock"); + + /* + * Create a child to also open a connection handle to the databases. + * We cannot use fork here because using fork the child inherits the + * same memory image. 
Therefore the WT process structure is set in + * the child even though it should not be. So use 'system' to spawn + * an entirely new process. + */ + (void)snprintf( + cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir); + if ((status = system(cmd)) < 0) + testutil_die(status, "system"); + /* + * The child will exit with success if its test passes. + */ + if (WEXITSTATUS(status) != 0) + testutil_die(WEXITSTATUS(status), "system"); + + /* + * Scenario 2. Run child with writable config. + */ + (void)snprintf( + cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir); + if ((status = system(cmd)) < 0) + testutil_die(status, "system"); + + if (WEXITSTATUS(status) != 0) + testutil_die(WEXITSTATUS(status), "system"); + + /* + * Reopen the two writable directories and rerun the child. + */ + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if ((ret = conn2->close(conn2, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if ((ret = wiredtiger_open(home, NULL, ENV_CONFIG_RD, &conn)) != 0) + testutil_die(ret, "wiredtiger_open original home"); + if ((ret = wiredtiger_open(home_wr, NULL, ENV_CONFIG_RD, &conn2)) != 0) + testutil_die(ret, "wiredtiger_open write nolock"); + /* + * Scenario 3. Child read-only. + */ + (void)snprintf( + cmd, sizeof(cmd), "%s -h %s -R", saved_argv0, working_dir); + if ((status = system(cmd)) < 0) + testutil_die(status, "system"); + if (WEXITSTATUS(status) != 0) + testutil_die(WEXITSTATUS(status), "system"); + + /* + * Scenario 4. Run child with writable config. + */ + (void)snprintf( + cmd, sizeof(cmd), "%s -h %s -W", saved_argv0, working_dir); + if ((status = system(cmd)) < 0) + testutil_die(status, "system"); + if (WEXITSTATUS(status) != 0) + testutil_die(WEXITSTATUS(status), "system"); + + /* + * Clean-up. 
+ */ + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if ((ret = conn2->close(conn2, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if ((ret = conn3->close(conn3, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + if ((ret = conn4->close(conn4, NULL)) != 0) + testutil_die(ret, "WT_CONNECTION:close"); + /* + * We need to chmod the read-only databases back so that they can + * be removed by scripts. + */ + (void)snprintf(cmd, sizeof(cmd), "chmod 0777 %s %s", home_rd, home_rd2); + (void)system(cmd); + (void)snprintf(cmd, sizeof(cmd), "chmod -R 0666 %s/* %s/*", + home_rd, home_rd2); + (void)system(cmd); + printf(" *** Readonly test successful ***\n"); + return (EXIT_SUCCESS); +} diff --git a/test/readonly/smoke.sh b/test/readonly/smoke.sh new file mode 100755 index 00000000000..740deb5743a --- /dev/null +++ b/test/readonly/smoke.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +trap 'chmod -R u+w WT_*; exit 0' 0 1 2 3 13 15 + +set -e + +# Smoke-test format as part of running "make check". +$TEST_WRAPPER ./t diff --git a/test/recovery/random-abort.c b/test/recovery/random-abort.c index ddcafbc80fd..f9c3ed28814 100644 --- a/test/recovery/random-abort.c +++ b/test/recovery/random-abort.c @@ -42,7 +42,7 @@ static char home[512]; /* Program working dir */ static const char *progname; /* Program name */ -static const char *uri = "table:main"; +static const char * const uri = "table:main"; #define RECORDS_FILE "records" @@ -88,7 +88,8 @@ fill_db(void) /* * Run in the home directory so that the records file is in there too. */ - chdir(home); + if (chdir(home) != 0) + testutil_die(errno, "chdir: %s", home); if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) @@ -109,7 +110,7 @@ fill_db(void) /* * Set to no buffering. 
*/ - setvbuf(fp, NULL, _IONBF, 0); + (void)setvbuf(fp, NULL, _IONBF, 0); /* * Write data into the table until we are killed by the parent. @@ -135,6 +136,8 @@ fill_db(void) extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { @@ -201,13 +204,15 @@ main(int argc, char *argv[]) printf("Kill child\n"); if (kill(pid, SIGKILL) != 0) testutil_die(errno, "kill"); - waitpid(pid, &status, 0); + if (waitpid(pid, &status, 0) == -1) + testutil_die(errno, "waitpid"); /* * !!! If we wanted to take a copy of the directory before recovery, * this is the place to do it. */ - chdir(home); + if (chdir(home) != 0) + testutil_die(errno, "chdir: %s", home); printf("Open database, run recovery and verify content\n"); if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG_REC, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); @@ -239,13 +244,15 @@ main(int argc, char *argv[]) ++absent; } } - fclose(fp); + if (fclose(fp) != 0) + testutil_die(errno, "fclose"); if ((ret = conn->close(conn, NULL)) != 0) testutil_die(ret, "WT_CONNECTION:close"); if (absent) { - printf("%u record(s) absent from %u\n", absent, count); + printf("%" PRIu32 " record(s) absent from %" PRIu32 "\n", + absent, count); return (EXIT_FAILURE); } - printf("%u records verified\n", count); + printf("%" PRIu32 " records verified\n", count); return (EXIT_SUCCESS); } diff --git a/test/recovery/truncated-log.c b/test/recovery/truncated-log.c index 4add7a61f66..67fdb932c27 100644 --- a/test/recovery/truncated-log.c +++ b/test/recovery/truncated-log.c @@ -45,7 +45,7 @@ static char home[512]; /* Program working dir */ static const char *progname; /* Program name */ -static const char *uri = "table:main"; +static const char * const uri = "table:main"; #define RECORDS_FILE "records" @@ -54,7 +54,6 @@ static const char *uri = "table:main"; "transaction_sync=(enabled,method=none)" #define ENV_CONFIG_REC "log=(recover=on)" #define LOG_FILE_1 "WiredTigerLog.0000000001" 
-#define MAX_VAL 4096 #define K_SIZE 16 #define V_SIZE 256 @@ -86,7 +85,8 @@ fill_db(void) /* * Run in the home directory so that the records file is in there too. */ - chdir(home); + if (chdir(home) != 0) + testutil_die(errno, "chdir: %s", home); if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) @@ -107,7 +107,7 @@ fill_db(void) /* * Set to no buffering. */ - setvbuf(fp, NULL, _IONBF, 0); + (void)setvbuf(fp, NULL, _IONBF, 0); save_lsn.l.file = 0; /* @@ -156,18 +156,23 @@ fill_db(void) "%" PRIu32 " %" PRIu32 "\n", save_lsn.l.offset, i - 1) == -1) testutil_die(errno, "fprintf"); - fclose(fp); - abort(); + break; } } first = false; } } + if (fclose(fp) != 0) + testutil_die(errno, "fclose"); + abort(); + /* NOTREACHED */ } extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { @@ -218,26 +223,32 @@ main(int argc, char *argv[]) /* parent */ /* Wait for child to kill itself. */ - waitpid(pid, &status, 0); + if (waitpid(pid, &status, 0) == -1) + testutil_die(errno, "waitpid"); /* * !!! If we wanted to take a copy of the directory before recovery, * this is the place to do it. */ - chdir(home); + if (chdir(home) != 0) + testutil_die(errno, "chdir: %s", home); + printf("Open database, run recovery and verify content\n"); if ((fp = fopen(RECORDS_FILE, "r")) == NULL) testutil_die(errno, "fopen"); ret = fscanf(fp, "%" SCNu64 " %" SCNu32 "\n", &offset, &max_key); - fclose(fp); if (ret != 2) testutil_die(errno, "fscanf"); + if (fclose(fp) != 0) + testutil_die(errno, "fclose"); /* * The offset is the beginning of the last record. Truncate to * the middle of that last record (i.e. ahead of that offset). 
*/ + if (offset > UINT64_MAX - V_SIZE) + testutil_die(ERANGE, "offset"); new_offset = offset + V_SIZE; - printf("Parent: Truncate to %u\n", (uint32_t)new_offset); + printf("Parent: Truncate to %" PRIu64 "\n", new_offset); if ((ret = truncate(LOG_FILE_1, (wt_off_t)new_offset)) != 0) testutil_die(errno, "truncate"); @@ -260,9 +271,10 @@ main(int argc, char *argv[]) if ((ret = conn->close(conn, NULL)) != 0) testutil_die(ret, "WT_CONNECTION:close"); if (count > max_key) { - printf("expected %u records found %u\n", max_key, count); + printf("expected %" PRIu32 " records found %" PRIu32 "\n", + max_key, count); return (EXIT_FAILURE); } - printf("%u records verified\n", count); + printf("%" PRIu32 " records verified\n", count); return (EXIT_SUCCESS); } diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c index c2ad6224b11..a1517d70787 100644 --- a/test/salvage/salvage.c +++ b/test/salvage/salvage.c @@ -64,6 +64,8 @@ static int verbose; /* -v flag */ extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { diff --git a/test/suite/helper.py b/test/suite/helper.py index 3c460e23d08..f85d708880f 100644 --- a/test/suite/helper.py +++ b/test/suite/helper.py @@ -107,7 +107,10 @@ def copy_wiredtiger_home(olddir, newdir, aligned=True): for fname in os.listdir(olddir): fullname = os.path.join(olddir, fname) # Skip lock file, on Windows it is locked. - if os.path.isfile(fullname) and "WiredTiger.lock" not in fullname: + # Skip temporary log files. + if os.path.isfile(fullname) and "WiredTiger.lock" not in fullname and \ + "WiredTigerTmplog" not in fullname and \ + "WiredTigerPreplog" not in fullname: # Use a dd command that does not align on a block boundary. 
if aligned: shutil.copy(fullname, newdir) @@ -196,31 +199,36 @@ def complex_populate_index_count(): # config: prefix of the session.create configuration string # rows: entries to insert def complex_populate(self, uri, config, rows): - complex_populate_type(self, uri, config, rows, '') + complex_populate_type(self, uri, config, '', rows, '') +def complex_populate_cgconfig(self, uri, config, rows): + complex_populate_type(self, uri, config, config, rows, '') def complex_populate_lsm(self, uri, config, rows): - complex_populate_type(self, uri, config, rows, 'type=lsm') -def complex_populate_type(self, uri, config, rows, type): + complex_populate_type(self, uri, config, '', rows, 'type=lsm') +def complex_populate_cgconfig_lsm(self, uri, config, rows): + complex_populate_type(self, uri, config, config, rows, 'type=lsm') +def complex_populate_type(self, uri, config, cgconfig, rows, type): self.session.create(uri, config + ',value_format=SiSS,' + 'columns=(record,column2,column3,column4,column5),' + 'colgroups=(cgroup1,cgroup2,cgroup3,cgroup4,cgroup5,cgroup6)') cgname = 'colgroup:' + uri.split(":")[1] - self.session.create(cgname + ':cgroup1', 'columns=(column2)' + ',' + type) - self.session.create(cgname + ':cgroup2', 'columns=(column3)' + ',' + type) - self.session.create(cgname + ':cgroup3', 'columns=(column4)' + ',' + type) + cgcfg = ',' + cgconfig + ',' + type + self.session.create(cgname + ':cgroup1', 'columns=(column2)' + ',' + cgcfg) + self.session.create(cgname + ':cgroup2', 'columns=(column3)' + ',' + cgcfg) + self.session.create(cgname + ':cgroup3', 'columns=(column4)' + ',' + cgcfg) self.session.create( - cgname + ':cgroup4', 'columns=(column2,column3)' + ',' + type) + cgname + ':cgroup4', 'columns=(column2,column3)' + ',' + cgcfg) self.session.create( - cgname + ':cgroup5', 'columns=(column3,column4)' + ',' + type) + cgname + ':cgroup5', 'columns=(column3,column4)' + ',' + cgcfg) self.session.create( - cgname + ':cgroup6', 'columns=(column2,column4,column5)' 
+ ',' + type) + cgname + ':cgroup6', 'columns=(column2,column4,column5)' + ',' + cgcfg) indxname = 'index:' + uri.split(":")[1] - self.session.create(indxname + ':indx1', 'columns=(column2)' + ',' + type) - self.session.create(indxname + ':indx2', 'columns=(column3)' + ',' + type) - self.session.create(indxname + ':indx3', 'columns=(column4)' + ',' + type) + self.session.create(indxname + ':indx1', 'columns=(column2)' + ',' + cgcfg) + self.session.create(indxname + ':indx2', 'columns=(column3)' + ',' + cgcfg) + self.session.create(indxname + ':indx3', 'columns=(column4)' + ',' + cgcfg) self.session.create( - indxname + ':indx4', 'columns=(column2,column4)' + ',' + type) + indxname + ':indx4', 'columns=(column2,column4)' + ',' + cgcfg) cursor = self.session.open_cursor(uri, None) for i in range(1, rows + 1): cursor[key_populate(cursor, i)] = \ @@ -228,9 +236,9 @@ def complex_populate_type(self, uri, config, rows, type): cursor.close() # add some indices after populating self.session.create( - indxname + ':indx5', 'columns=(column3,column5)' + ',' + type) + indxname + ':indx5', 'columns=(column3,column5)' + ',' + cgcfg) self.session.create( - indxname + ':indx6', 'columns=(column3,column5,column4)' + ',' + type) + indxname + ':indx6', 'columns=(column3,column5,column4)' + ',' + cgcfg) def complex_populate_colgroup_name(self, uri, i): return 'colgroup:' + uri.split(":")[1] + ':cgroup' + str(i + 1) diff --git a/test/suite/test_backup05.py b/test/suite/test_backup05.py index 8b176d0f7d7..991a9f71b19 100644 --- a/test/suite/test_backup05.py +++ b/test/suite/test_backup05.py @@ -44,14 +44,6 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): create_params = 'key_format=i,value_format=i' freq = 5 - def copy_windows(self, olddir, newdir): - os.mkdir(newdir) - for fname in os.listdir(olddir): - fullname = os.path.join(olddir, fname) - # Skip lock file on Windows since it is locked - if os.path.isfile(fullname) and "WiredTiger.lock" not in fullname: - 
shutil.copy(fullname, newdir) - def check_manual_backup(self, i, olddir, newdir): ''' Simulate a manual backup from olddir and restart in newdir. ''' self.session.checkpoint() @@ -71,7 +63,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): session.verify(self.uri) conn.close() - def test_backup(self): + def backup(self): '''Check manual fsyncLock backup strategy''' # Here's the strategy: @@ -95,5 +87,9 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): else: self.session.verify(self.uri) + def test_backup(self): + with self.expectedStdoutPattern('Recreating metadata'): + self.backup() + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_bug008.py b/test/suite/test_bug008.py index 8f0526d9cef..0243887e258 100644 --- a/test/suite/test_bug008.py +++ b/test/suite/test_bug008.py @@ -33,65 +33,208 @@ import wiredtiger, wttest from helper import simple_populate, key_populate, value_populate from wtscenario import check_scenarios -# Tests for invisible updates. +# Test search/search-near operations, including invisible values and keys +# past the end of the table. class test_bug008(wttest.WiredTigerTestCase): + uri = 'file:test_bug008' # This is a btree layer test. scenarios = check_scenarios([ - ('fix', dict(fmt='key_format=r,value_format=8t', empty=1)), - ('row', dict(fmt='key_format=S', empty=0)), - ('var', dict(fmt='key_format=r', empty=0)) + ('fix', dict(fmt='key_format=r,value_format=8t', empty=1, colvar=0)), + ('row', dict(fmt='key_format=S', empty=0, colvar=0)), + ('var', dict(fmt='key_format=r', empty=0, colvar=1)) ]) + # Verify cursor search and search-near operations in an empty table. + def test_search_empty(self): + # Create the object and open a cursor. + self.session.create(self.uri, self.fmt) + cursor = self.session.open_cursor(self.uri, None) + + # Search for a record past the end of the table, which should fail. 
+ cursor.set_key(key_populate(cursor, 100)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + + # Search-near for a record past the end of the table, which should fail. + cursor.set_key(key_populate(cursor, 100)) + self.assertEqual(cursor.search_near(), wiredtiger.WT_NOTFOUND) + + # Verify cursor search and search-near operations at and past the end of + # a file, with a set of on-page visible records. + def test_search_eot(self): + # Populate the tree and reopen the connection, forcing it to disk + # and moving the records to an on-page format. + simple_populate(self, self.uri, self.fmt, 100) + self.reopen_conn() + + # Open a cursor. + cursor = self.session.open_cursor(self.uri, None) + + # Search for a record at the end of the table, which should succeed. + cursor.set_key(key_populate(cursor, 100)) + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_key(), key_populate(cursor, 100)) + self.assertEqual(cursor.get_value(), value_populate(cursor, 100)) + + # Search-near for a record at the end of the table, which should + # succeed, returning the last record. + cursor.set_key(key_populate(cursor, 100)) + self.assertEqual(cursor.search_near(), 0) + self.assertEqual(cursor.get_key(), key_populate(cursor, 100)) + self.assertEqual(cursor.get_value(), value_populate(cursor, 100)) + + # Search for a record past the end of the table, which should fail. + cursor.set_key(key_populate(cursor, 200)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + + # Search-near for a record past the end of the table, which should + # succeed, returning the last record. + cursor.set_key(key_populate(cursor, 200)) + self.assertEqual(cursor.search_near(), -1) + self.assertEqual(cursor.get_key(), key_populate(cursor, 100)) + self.assertEqual(cursor.get_value(), value_populate(cursor, 100)) + + # Verify cursor search-near operations before and after a set of + # column-store duplicates. 
+ def test_search_duplicate(self): + if self.colvar == 0: + return + + # Populate the tree. + simple_populate(self, self.uri, self.fmt, 105) + + # Set up deleted records before and after a set of duplicate records, + # and make sure search/search-near returns the correct record. + cursor = self.session.open_cursor(self.uri, None) + for i in range(20, 100): + cursor[key_populate(cursor, i)] = '=== IDENTICAL VALUE ===' + for i in range(15, 25): + cursor.set_key(key_populate(cursor, i)) + self.assertEqual(cursor.remove(), 0) + for i in range(95, 106): + cursor.set_key(key_populate(cursor, i)) + self.assertEqual(cursor.remove(), 0) + cursor.close() + + # Reopen the connection, forcing it to disk and moving the records to + # an on-page format. + self.reopen_conn() + + # Open a cursor. + cursor = self.session.open_cursor(self.uri, None) + + # Search-near for a record in the deleted set before the duplicate set, + # which should succeed, returning the first record in the duplicate set. + cursor.set_key(key_populate(cursor, 18)) + self.assertEqual(cursor.search_near(), 1) + self.assertEqual(cursor.get_key(), key_populate(cursor, 25)) + + # Search-near for a record in the deleted set after the duplicate set, + # which should succeed, returning the last record in the duplicate set. + cursor.set_key(key_populate(cursor, 98)) + self.assertEqual(cursor.search_near(), -1) + self.assertEqual(cursor.get_key(), key_populate(cursor, 94)) + # Verify cursor search and search-near operations on a file with a set of # on-page visible records, and a set of insert-list invisible records. def test_search_invisible_one(self): - uri = 'file:test_bug008' # This is a btree layer test. + # Populate the tree. + simple_populate(self, self.uri, self.fmt, 100) - # Populate the tree and reopen the connection, forcing it to disk - # and moving the records to an on-page format. - simple_populate(self, uri, self.fmt, 100) + # Delete a range of records. 
+ for i in range(5, 10): + cursor = self.session.open_cursor(self.uri, None) + cursor.set_key(key_populate(cursor, i)) + self.assertEqual(cursor.remove(), 0) + + # Reopen the connection, forcing it to disk and moving the records to + # an on-page format. self.reopen_conn() - # Begin a transaction, and add some additional records. + # Add updates to the existing records (in both the deleted an undeleted + # range), as well as some new records after the end. Put the updates in + # a separate transaction so they're invisible to another cursor. self.session.begin_transaction() - cursor = self.session.open_cursor(uri, None) + cursor = self.session.open_cursor(self.uri, None) + for i in range(5, 10): + cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000) + for i in range(30, 40): + cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000) for i in range(100, 140): - cursor[key_populate(cursor, i)] = value_populate(cursor, i) + cursor[key_populate(cursor, i)] = value_populate(cursor, i + 1000) # Open a separate session and cursor. s = self.conn.open_session() - cursor = s.open_cursor(uri, None) + cursor = s.open_cursor(self.uri, None) - # Search for an invisible record. - cursor.set_key(key_populate(cursor, 130)) - if self.empty: - # Invisible updates to fixed-length column-store objects are - # invisible to the reader, but the fact that they exist past - # the end of the initial records causes the instantiation of - # empty records: confirm successful return of an empty row. - cursor.search() - self.assertEqual(cursor.get_key(), 130) - self.assertEqual(cursor.get_value(), 0) - else: - # Otherwise, we should not find any matching records. - self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + # Search for an existing record in the deleted range, should not find + # it. + for i in range(5, 10): + cursor.set_key(key_populate(cursor, i)) + if self.empty: + # Fixed-length column-store rows always exist. 
+ self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_key(), i) + self.assertEqual(cursor.get_value(), 0) + else: + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) - # Search-near for an invisible record, which should succeed, returning - # the last visible record. - cursor.set_key(key_populate(cursor, 130)) - cursor.search_near() - if self.empty: - # Invisible updates to fixed-length column-store objects are - # invisible to the reader, but the fact that they exist past - # the end of the initial records causes the instantiation of - # empty records: confirm successful return of an empty row. - cursor.search() - self.assertEqual(cursor.get_key(), 130) - self.assertEqual(cursor.get_value(), 0) - else: - # Otherwise, we should find the closest record for which we can see - # the value. - self.assertEqual(cursor.get_key(), key_populate(cursor, 100)) - self.assertEqual(cursor.get_value(), value_populate(cursor, 100)) + # Search for an existing record in the updated range, should see the + # original value. + for i in range(30, 40): + cursor.set_key(key_populate(cursor, i)) + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_key(), key_populate(cursor, i)) + + # Search for a added record, should not find it. + for i in range(120, 130): + cursor.set_key(key_populate(cursor, i)) + if self.empty: + # Invisible updates to fixed-length column-store objects are + # invisible to the reader, but the fact that they exist past + # the end of the initial records causes the instantiation of + # empty records: confirm successful return of an empty row. + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_key(), i) + self.assertEqual(cursor.get_value(), 0) + else: + # Otherwise, we should not find any matching records. + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + + # Search-near for an existing record in the deleted range, should find + # the next largest record. 
(This depends on the implementation behavior + # which currently includes a bias to prefix search.) + for i in range(5, 10): + cursor.set_key(key_populate(cursor, i)) + if self.empty: + # Fixed-length column-store rows always exist. + self.assertEqual(cursor.search_near(), 0) + self.assertEqual(cursor.get_key(), i) + self.assertEqual(cursor.get_value(), 0) + else: + self.assertEqual(cursor.search_near(), 1) + self.assertEqual(cursor.get_key(), key_populate(cursor, 10)) + + # Search-near for an existing record in the updated range, should see + # the original value. + for i in range(30, 40): + cursor.set_key(key_populate(cursor, i)) + self.assertEqual(cursor.search_near(), 0) + self.assertEqual(cursor.get_key(), key_populate(cursor, i)) + + # Search-near for an added record, should find the previous largest + # record. + for i in range(120, 130): + cursor.set_key(key_populate(cursor, i)) + if self.empty: + # Invisible updates to fixed-length column-store objects are + # invisible to the reader, but the fact that they exist past + # the end of the initial records causes the instantiation of + # empty records: confirm successful return of an empty row. + self.assertEqual(cursor.search_near(), 0) + self.assertEqual(cursor.get_key(), i) + self.assertEqual(cursor.get_value(), 0) + else: + self.assertEqual(cursor.search_near(), -1) + self.assertEqual(cursor.get_key(), key_populate(cursor, 100)) # Verify cursor search and search-near operations on a file with a set of # on-page visible records, a set of insert-list visible records, and a set @@ -101,28 +244,26 @@ class test_bug008(wttest.WiredTigerTestCase): # fallback happens, whether the correct position is in the page slots or # the insert list.) def test_search_invisible_two(self): - uri = 'file:test_bug008' # This is a btree layer test. - # Populate the tree and reopen the connection, forcing it to disk # and moving the records to an on-page format. 
- simple_populate(self, uri, self.fmt, 100) + simple_populate(self, self.uri, self.fmt, 100) self.reopen_conn() # Add some additional visible records. - cursor = self.session.open_cursor(uri, None) + cursor = self.session.open_cursor(self.uri, None) for i in range(100, 120): cursor[key_populate(cursor, i)] = value_populate(cursor, i) cursor.close() # Begin a transaction, and add some additional records. self.session.begin_transaction() - cursor = self.session.open_cursor(uri, None) + cursor = self.session.open_cursor(self.uri, None) for i in range(120, 140): cursor[key_populate(cursor, i)] = value_populate(cursor, i) # Open a separate session and cursor. s = self.conn.open_session() - cursor = s.open_cursor(uri, None) + cursor = s.open_cursor(self.uri, None) # Search for an invisible record. cursor.set_key(key_populate(cursor, 130)) diff --git a/test/suite/test_bulk02.py b/test/suite/test_bulk02.py index eeca6a56967..fe8118209f2 100644 --- a/test/suite/test_bulk02.py +++ b/test/suite/test_bulk02.py @@ -49,8 +49,7 @@ class test_bulkload_checkpoint(wttest.WiredTigerTestCase, suite_subprocess): scenarios = number_scenarios(multiply_scenarios('.', types, ckpt_type)) - # Bulk-load handles return EBUSY to the checkpoint code, causing the - # checkpoint call to find a handle anyway, and create fake checkpoint. + # Bulk-load handles are skipped by checkpoints. # Named and unnamed checkpoint versions. def test_bulkload_checkpoint(self): # Open a bulk cursor and insert a few records. @@ -72,11 +71,8 @@ class test_bulkload_checkpoint(wttest.WiredTigerTestCase, suite_subprocess): # In the case of named checkpoints, verify they're still there, # reflecting an empty file. 
if self.ckpt_type == 'named': - cursor = self.session.open_cursor( - self.uri, None, 'checkpoint=myckpt') - self.assertEquals(cursor.next(), wiredtiger.WT_NOTFOUND) - cursor.close() - + self.assertRaises(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor(self.uri, None, 'checkpoint=myckpt')) # test_bulkload_backup # Test bulk-load with hot-backup. diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py index 7d4503b84b7..6e1ad7814ed 100644 --- a/test/suite/test_checkpoint01.py +++ b/test/suite/test_checkpoint01.py @@ -185,7 +185,7 @@ class test_checkpoint_cursor(wttest.WiredTigerTestCase): # Check dropping all checkpoints fails. msg = '/checkpoints cannot be dropped/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.checkpoint("name=checkpoint-2"), msg) + lambda: self.session.checkpoint("force,name=checkpoint-2"), msg) self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.checkpoint("drop=(checkpoint-2)"), msg) self.assertRaisesWithMessage(wiredtiger.WiredTigerError, @@ -265,9 +265,13 @@ class test_checkpoint_cursor_update(wttest.WiredTigerTestCase): cursor = self.session.open_cursor(self.uri, None, "checkpoint=ckpt") cursor.set_key(key_populate(cursor, 10)) cursor.set_value("XXX") - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.update()) + msg = "/Unsupported cursor/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: cursor.insert(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: cursor.remove(), msg) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: cursor.update(), msg) cursor.close() diff --git a/test/suite/test_collator.py b/test/suite/test_collator.py new file mode 100644 index 00000000000..34b5c20247f --- /dev/null +++ b/test/suite/test_collator.py 
@@ -0,0 +1,161 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +import wiredtiger, wttest, run +from wtscenario import check_scenarios, number_scenarios + +# test_collator.py +# Test indices using a custom extractor and collator. +class test_collator(wttest.WiredTigerTestCase): + """ + Test indices with a custom extractor to create an index, + with our own collator. + Our set of rows looks like a multiplication table: + row '0': '0,0,0,0' + row '1': '0,1,2,3' + row '2': '0,2,4,6' + with the twist that entries are mod 100. So, looking further: + row '40': '0,40,80,20' + + Each column is placed into its own index. Our collator reverses + the values. 
+ """ + nentries = 100 + nindices = 4 + + # Return the wiredtiger_open extension argument for a shared library. + def extensionArg(self, exts): + extfiles = [] + for ext in exts: + (dirname, name, libname) = ext + if name != None and name != 'none': + testdir = os.path.dirname(__file__) + extdir = os.path.join(run.wt_builddir, 'ext', dirname) + extfile = os.path.join( + extdir, name, '.libs', 'libwiredtiger_' + libname + '.so') + if not os.path.exists(extfile): + self.skipTest('extension "' + extfile + '" not built') + if not extfile in extfiles: + extfiles.append(extfile) + if len(extfiles) == 0: + return '' + else: + return ',extensions=["' + '","'.join(extfiles) + '"]' + + # Override WiredTigerTestCase, we have extensions. + def setUpConnectionOpen(self, dir): + extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor'), + ('collators', 'revint', 'revint_collator')]) + connarg = 'create,error_prefix="{0}: ",{1}'.format( + self.shortid(), extarg) + conn = self.wiredtiger_open(dir, connarg) + self.pr(`conn`) + return conn + + def create_indices(self): + # Create self.nindices index files, each with a column from the CSV + for i in range(0, self.nindices): + si = str(i) + self.session.create('index:collator:x' + si, + 'key_format=i,columns=(key),' + + 'collator=revint,' + + 'extractor=csv,app_metadata={"format" : "i",' + + '"field" : "' + si + '"}') + + def drop_indices(self): + for i in range(0, self.nindices): + self.session.drop("index:collator:x" + str(i)) + + def csv(self, s, i): + return s.split(',')[i] + + def expected_main_value(self, i): + return ','.join([str((i*j)%100) for j in range(0, self.nindices)]) + + # We split the population into two phases + # (in anticipation of future tests that create + # indices between the two population steps). 
+ def populate(self): + cursor = self.session.open_cursor('table:collator', None, None) + for i in range(0, self.nentries): + cursor[i] = self.expected_main_value(i) + cursor.close() + + def check_entries(self): + cursor = self.session.open_cursor('table:collator', None, None) + icursor = [] + for i in range(0, self.nindices): + icursor.append(self.session.open_cursor('index:collator:x' + str(i), + None, None)) + i = 0 + for primkey, value in cursor: + # Check main table + expect = self.expected_main_value(i) + self.assertEqual(i, primkey) + self.assertEqual(value, expect) + for idx in range(0, self.nindices): + c = icursor[idx] + indexkey = (i*idx)%100 + c.set_key(indexkey) + self.assertEqual(c.search(), 0) + value = c.get_value() + key = c.get_key() + while value != expect and key == indexkey and \ + self.csv(value, idx) == self.csv(expect, idx): + self.assertEqual(0, c.next()) + value = c.get_value() + key = c.get_key() + self.assertEqual(value, expect) + i += 1 + self.assertEqual(self.nentries, i) + for i in range(0, self.nindices): + c = icursor[i] + c.reset() + expected = set(range(0, self.nentries)) + for key, val in c: + primkey = int(val.split(',')[1]) + expected.remove(primkey) + self.assertEquals(0, len(expected)) + c.close() + + def test_index(self): + self.session.create("table:collator", "key_format=i,value_format=S," + "columns=(primarykey,value)") + self.create_indices() + self.populate() + self.check_entries() + + # Drop and recreate all indices, everything should be there. + self.drop_indices() + self.create_indices() + self.check_entries() + + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_compact02.py b/test/suite/test_compact02.py index 14781b0f050..7ad05cd2536 100644 --- a/test/suite/test_compact02.py +++ b/test/suite/test_compact02.py @@ -50,11 +50,12 @@ class test_compact02(wttest.WiredTigerTestCase): # being stored: compaction doesn't work on tables with many overflow items # because we don't rewrite them. 
Experimentally, 8KB is as small as the test # can go. Additionally, we can't set the maximum page size too large because - # there won't be enough pages to rewrite. Experimentally, 32KB (the default) - # is as large as the test can go. + # there won't be enough pages to rewrite. Experimentally, 128KB works. fileConfig = [ ('default', dict(fileConfig='')), ('8KB', dict(fileConfig='leaf_page_max=8kb')), + ('64KB', dict(fileConfig='leaf_page_max=64KB')), + ('128KB', dict(fileConfig='leaf_page_max=128KB')), ] scenarios = \ number_scenarios(multiply_scenarios('.', types, cacheSize, fileConfig)) diff --git a/test/suite/test_cursor06.py b/test/suite/test_cursor06.py index ff7c1144344..5545c862dd7 100644 --- a/test/suite/test_cursor06.py +++ b/test/suite/test_cursor06.py @@ -89,10 +89,11 @@ class test_cursor06(wttest.WiredTigerTestCase): self.session.drop(uri, "force") self.populate(uri) cursor = self.session.open_cursor(uri, None, open_config) + msg = '/Unsupported cursor/' if open_config == "readonly=1": self.set_kv(cursor) - self.assertRaises(wiredtiger.WiredTigerError, - lambda: cursor.update()) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: cursor.update(), msg) else: self.set_kv(cursor) cursor.update() diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index 2cef62b218a..16ce5cae685 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -51,15 +51,21 @@ class test_cursor_random(wttest.WiredTigerTestCase): uri = self.type self.session.create(uri, 'key_format=S,value_format=S') cursor = self.session.open_cursor(uri, None, self.config) - self.assertRaises( - wiredtiger.WiredTigerError, lambda: cursor.compare(cursor)) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.insert()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.prev()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.remove()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: 
cursor.search()) - self.assertRaises( - wiredtiger.WiredTigerError, lambda: cursor.search_near()) - self.assertRaises(wiredtiger.WiredTigerError, lambda: cursor.update()) + msg = "/Unsupported cursor/" + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.compare(cursor), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.insert(), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.prev(), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.remove(), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.search(), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.search_near(), msg) + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, lambda: cursor.update(), msg) self.assertTrue(cursor.next(), wiredtiger.WT_NOTFOUND) self.assertEquals(cursor.reconfigure(), 0) @@ -137,7 +143,7 @@ class test_cursor_random_column(wttest.WiredTigerTestCase): def test_cursor_random_column(self): self.session.create(self.uri, 'key_format=r,value_format=S') - msg = '/Operation not supported/' + msg = '/next_random .* not supported/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.open_cursor(self.uri, None, "next_random=true"), msg) diff --git a/test/suite/test_drop.py b/test/suite/test_drop.py index 5663b85d661..52ea7251ab5 100644 --- a/test/suite/test_drop.py +++ b/test/suite/test_drop.py @@ -41,12 +41,11 @@ class test_drop(wttest.WiredTigerTestCase): scenarios = check_scenarios([ ('file', dict(uri='file:')), ('table', dict(uri='table:')), - #Not yet: drop failing with an open cursor needs handle locking - #('table-lsm', dict(uri='table:', extra_config=',type=lsm')), + ('table-lsm', dict(uri='table:', extra_config=',type=lsm')), ]) # Populate an object, remove it and confirm it no longer exists. 
- def drop(self, populate, with_cursor, close_session, drop_index): + def drop(self, populate, with_cursor, reopen, drop_index): uri = self.uri + self.name populate(self, uri, 'key_format=S' + self.extra_config, 10) @@ -57,7 +56,7 @@ class test_drop(wttest.WiredTigerTestCase): lambda: self.session.drop(uri, None)) cursor.close() - if close_session: + if reopen: self.reopen_conn() if drop_index: @@ -73,17 +72,17 @@ class test_drop(wttest.WiredTigerTestCase): # Try all combinations except dropping the index, the simple # case has no indices. for with_cursor in [False, True]: - for close_session in [False, True]: - self.drop(simple_populate, with_cursor, close_session, False) + for reopen in [False, True]: + self.drop(simple_populate, with_cursor, reopen, False) # A complex, multi-file table object. # Try all test combinations. if self.uri == "table:": for with_cursor in [False, True]: - for close_session in [False, True]: + for reopen in [False, True]: for drop_index in [False, True]: self.drop(complex_populate, with_cursor, - close_session, drop_index) + reopen, drop_index) # Test drop of a non-existent object: force succeeds, without force fails. def test_drop_dne(self): diff --git a/test/suite/test_drop02.py b/test/suite/test_drop02.py new file mode 100644 index 00000000000..677ba3866b2 --- /dev/null +++ b/test/suite/test_drop02.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from helper import simple_populate + +# test_drop02.py +# Test dropping an LSM tree on first open. There was a bug where this +# would cause an assertion failure: WT-2501 +class test_drop02(wttest.WiredTigerTestCase): + name = 'test_drop02' + + # Populate an object, remove it and confirm it no longer exists. 
+ def test_drop(self): + uri = 'lsm:' + self.name + simple_populate(self, uri, 'key_format=S', 100000) + self.reopen_conn() + + self.session.drop(uri, None) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_dump.py b/test/suite/test_dump.py index c850d1b5d3f..fc1422155e2 100644 --- a/test/suite/test_dump.py +++ b/test/suite/test_dump.py @@ -29,8 +29,8 @@ import os import wiredtiger, wttest from helper import \ - complex_populate, complex_populate_check_cursor,\ - simple_populate, simple_populate_check_cursor + complex_populate, complex_populate_check, \ + simple_populate, simple_populate_check from suite_subprocess import suite_subprocess from wtscenario import multiply_scenarios, number_scenarios @@ -54,15 +54,24 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): ('string', dict(keyfmt='S')) ] types = [ - ('file', dict(type='file:', + ('file', dict(uri='file:', config='', lsm=False, populate=simple_populate, - populate_check=simple_populate_check_cursor)), - ('table-simple', dict(type='table:', + populate_check=simple_populate_check)), + ('lsm', dict(uri='lsm:', config='', lsm=True, populate=simple_populate, - populate_check=simple_populate_check_cursor)), - ('table-complex', dict(type='table:', + populate_check=simple_populate_check)), + ('table-simple', dict(uri='table:', config='', lsm=False, + populate=simple_populate, + populate_check=simple_populate_check)), + ('table-simple-lsm', dict(uri='table:', config='type=lsm', lsm=True, + populate=simple_populate, + populate_check=simple_populate_check)), + ('table-complex', dict(uri='table:', config='', lsm=False, + populate=complex_populate, + populate_check=complex_populate_check)), + ('table-complex-lsm', dict(uri='table:', config='type=lsm', lsm=True, populate=complex_populate, - populate_check=complex_populate_check_cursor)) + populate_check=complex_populate_check)) ] scenarios = number_scenarios( multiply_scenarios('.', types, keyfmt, dumpfmt)) @@ -94,9 +103,14 @@ class 
test_dump(wttest.WiredTigerTestCase, suite_subprocess): # Dump, re-load and do a content comparison. def test_dump(self): + # LSM and column-store isn't a valid combination. + if self.lsm and self.keyfmt == 'r': + return + # Create the object. - uri = self.type + self.name - self.populate(self, uri, 'key_format=' + self.keyfmt, self.nentries) + uri = self.uri + self.name + self.populate(self, uri, + self.config + ',key_format=' + self.keyfmt, self.nentries) # Dump the object. os.mkdir(self.dir) @@ -108,11 +122,17 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): # Re-load the object. self.runWt(['-h', self.dir, 'load', '-f', 'dump.out']) - # Check the contents + # Check the database contents + self.runWt(['list'], outfilename='list.out') + self.runWt(['-h', self.dir, 'list'], outfilename='list.out.new') + s1 = set(open('list.out').read().split()) + s2 = set(open('list.out.new').read().split()) + self.assertEqual(not s1.symmetric_difference(s2), True) + + # Check the object's contents conn = self.wiredtiger_open(self.dir) session = conn.open_session() - cursor = session.open_cursor(uri, None, None) - self.populate_check(self, cursor, self.nentries) + self.populate_check(self, uri, self.nentries) conn.close() # Re-load the object again. @@ -121,8 +141,7 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): # Check the contents, they shouldn't have changed. conn = self.wiredtiger_open(self.dir) session = conn.open_session() - cursor = session.open_cursor(uri, None, None) - self.populate_check(self, cursor, self.nentries) + self.populate_check(self, uri, self.nentries) conn.close() # Re-load the object again, but confirm -n (no overwrite) fails. @@ -130,7 +149,7 @@ class test_dump(wttest.WiredTigerTestCase, suite_subprocess): 'load', '-n', '-f', 'dump.out'], errfilename='errfile.out') self.check_non_empty_file('errfile.out') - # If there is are indices, dump one of them and check the output. 
+ # If there are indices, dump one of them and check the output. if self.populate == complex_populate: indexuri = 'index:' + self.name + ':indx1' hexopt = ['-x'] if self.hex == 1 else [] diff --git a/test/suite/test_join01.py b/test/suite/test_join01.py index f03c7c6f06c..4aa2bc6e269 100644 --- a/test/suite/test_join01.py +++ b/test/suite/test_join01.py @@ -33,7 +33,6 @@ from wtscenario import check_scenarios, multiply_scenarios, number_scenarios # Join operations # Basic tests for join class test_join01(wttest.WiredTigerTestCase): - table_name1 = 'test_join01' nentries = 100 scenarios = [ @@ -75,8 +74,18 @@ class test_join01(wttest.WiredTigerTestCase): # the join cursor and iterating again. def stats(self, jc, which): statcur = self.session.open_cursor('statistics:join', jc, None) - self.check_stats(statcur, 0, 'join: index:join01:index1: ' + - 'bloom filter false positives') + # pick a stat we always expect to see + statdesc = 'bloom filter false positives' + expectstats = [ + 'join: index:join01:index1: ' + statdesc, + 'join: index:join01:index2: ' + statdesc ] + if self.ref == 'index': + expectstats.append('join: index:join01:index0: ' + statdesc) + else: + expectstats.append('join: table:join01: ' + statdesc) + self.check_stats(statcur, expectstats) + statcur.reset() + self.check_stats(statcur, expectstats) statcur.close() def statstr_to_int(self, str): @@ -87,16 +96,14 @@ class test_join01(wttest.WiredTigerTestCase): parts = str.rpartition('(') return int(parts[2].rstrip(')')) - # string should appear with a minimum value of least "min". - def check_stats(self, statcursor, min, lookfor): + # All of the expect strings should appear + def check_stats(self, statcursor, expectstats): stringclass = ''.__class__ intclass = (0).__class__ # Reset the cursor, we're called multiple times. 
statcursor.reset() - found = False - foundval = 0 self.printVerbose(3, 'statistics:') for id, desc, valstr, val in statcursor: self.assertEqual(type(desc), stringclass) @@ -105,12 +112,11 @@ class test_join01(wttest.WiredTigerTestCase): self.assertEqual(val, self.statstr_to_int(valstr)) self.printVerbose(3, ' stat: \'' + desc + '\', \'' + valstr + '\', ' + str(val)) - if desc == lookfor: - found = True - foundval = val + if desc in expectstats: + expectstats.remove(desc) - self.assertTrue(found, 'in stats, did not see: ' + lookfor) - self.assertTrue(foundval >= min) + self.assertTrue(len(expectstats) == 0, + 'missing expected values in stats: ' + str(expectstats)) # Common function for testing the most basic functionality # of joins @@ -142,7 +148,8 @@ class test_join01(wttest.WiredTigerTestCase): # and examine primary keys 2,5,8,...,95,98,1,4,7,...,94,97. jc = self.session.open_cursor('join:table:join01' + proj_suffix, None, None) - c2 = self.session.open_cursor('index:join01:index2', None, None) + # Adding a projection to a reference cursor should be allowed. + c2 = self.session.open_cursor('index:join01:index2(v1)', None, None) c2.set_key(99) # skips all entries w/ primary key divisible by three self.assertEquals(0, c2.search()) self.session.join(jc, c2, 'compare=gt') @@ -160,12 +167,12 @@ class test_join01(wttest.WiredTigerTestCase): # Then select all numbers whose reverse string representation # is in '20' < x < '40'. 
- c1a = self.session.open_cursor('index:join01:index1', None, None) + c1a = self.session.open_cursor('index:join01:index1(v1)', None, None) c1a.set_key('21') self.assertEquals(0, c1a.search()) self.session.join(jc, c1a, 'compare=gt' + joincfg1) - c1b = self.session.open_cursor('index:join01:index1', None, None) + c1b = self.session.open_cursor('index:join01:index1(v1)', None, None) c1b.set_key('41') self.assertEquals(0, c1b.search()) self.session.join(jc, c1b, 'compare=lt' + joincfg1) @@ -342,11 +349,12 @@ class test_join01(wttest.WiredTigerTestCase): '/index cursor is being used in a join/') # Only a small number of operations allowed on a join cursor - self.assertRaises(wiredtiger.WiredTigerError, - lambda: jc.search()) + msg = "/Unsupported cursor/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: jc.search(), msg) - self.assertRaises(wiredtiger.WiredTigerError, - lambda: jc.prev()) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: jc.prev(), msg) self.assertEquals(jc.next(), 0) self.assertEquals(jc.next(), wiredtiger.WT_NOTFOUND) @@ -390,6 +398,7 @@ class test_join01(wttest.WiredTigerTestCase): def test_cursor_close2(self): self.cursor_close_common(False) + # test statistics using the framework set up for this test def test_stats(self): bloomcfg1000 = ',strategy=bloom,count=1000' bloomcfg10 = ',strategy=bloom,count=10' @@ -399,6 +408,40 @@ class test_join01(wttest.WiredTigerTestCase): # statistics should pick up some false positives. 
self.join_common(bloomcfg10, bloomcfg10, False, True) + # test statistics with a simple one index join cursor + def test_simple_stats(self): + self.session.create("table:join01b", + "key_format=i,value_format=i,columns=(k,v)") + self.session.create("index:join01b:index", "columns=(v)") + + cursor = self.session.open_cursor("table:join01b", None, None) + cursor[1] = 11 + cursor[2] = 12 + cursor[3] = 13 + cursor.close() + + cursor = self.session.open_cursor("index:join01b:index", None, None) + cursor.set_key(11) + cursor.search() + + jcursor = self.session.open_cursor("join:table:join01b", None, None) + self.session.join(jcursor, cursor, "compare=gt") + + while jcursor.next() == 0: + [k] = jcursor.get_keys() + [v] = jcursor.get_values() + + statcur = self.session.open_cursor("statistics:join", jcursor, None) + found = False + while statcur.next() == 0: + [desc, pvalue, value] = statcur.get_values() + #self.tty(str(desc) + "=" + str(pvalue)) + found = True + self.assertEquals(found, True) + + jcursor.close() + cursor.close() + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_join02.py b/test/suite/test_join02.py index d122de8a0eb..a691c499cf6 100644 --- a/test/suite/test_join02.py +++ b/test/suite/test_join02.py @@ -179,15 +179,16 @@ class test_join02(wttest.WiredTigerTestCase): c.close() # Use the primary table in one of the joins. 
+ # Use various projections, which should not matter for ref cursors c0a = self.session.open_cursor('table:join02', None, None) - c0b = self.session.open_cursor('table:join02', None, None) - c1a = self.session.open_cursor('index:join02:index1', None, None) + c0b = self.session.open_cursor('table:join02(v4)', None, None) + c1a = self.session.open_cursor('index:join02:index1(v0)', None, None) c1b = self.session.open_cursor('index:join02:index1', None, None) c2a = self.session.open_cursor('index:join02:index2', None, None) c2b = self.session.open_cursor('index:join02:index2', None, None) - c3a = self.session.open_cursor('index:join02:index3', None, None) - c3b = self.session.open_cursor('index:join02:index3', None, None) - c4a = self.session.open_cursor('index:join02:index4', None, None) + c3a = self.session.open_cursor('index:join02:index3(v4)', None, None) + c3b = self.session.open_cursor('index:join02:index3(v0)', None, None) + c4a = self.session.open_cursor('index:join02:index4(v1)', None, None) # Attach extra properties to each cursor. For cursors that # may appear on the 'left' side of a range CA < x < CB, diff --git a/test/suite/test_join05.py b/test/suite/test_join05.py new file mode 100644 index 00000000000..ef2be4c6460 --- /dev/null +++ b/test/suite/test_join05.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import check_scenarios, multiply_scenarios, number_scenarios + +# test_join05.py +# Tests based on JIRA reports +class test_join05(wttest.WiredTigerTestCase): + + # test join having the first index just be lt/le + def test_wt_2384(self): + self.session.create("table:test_2384", + "key_format=i,value_format=i,columns=(k,v)") + self.session.create("index:test_2384:index", "columns=(v)") + cursor = self.session.open_cursor("table:test_2384", None, None) + cursor[1] = 11 + cursor[2] = 12 + cursor[3] = 13 + cursor.close() + + cursor = self.session.open_cursor("index:test_2384:index", None, None) + cursor.set_key(13) + self.assertEquals(cursor.search(), 0) + + jcursor = self.session.open_cursor("join:table:test_2384", None, None) + self.session.join(jcursor, cursor, "compare=lt") + + nr_found = 0 + while jcursor.next() == 0: + [k] = jcursor.get_keys() + [v] = jcursor.get_values() + #self.tty("jcursor: k=" + str(k) + ", v=" + str(v)) + nr_found += 1 + + self.assertEquals(nr_found, 2) + jcursor.close() + cursor.close() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_join06.py b/test/suite/test_join06.py new file mode 100644 index 00000000000..9af6f93792f --- /dev/null +++ 
b/test/suite/test_join06.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import os +import wiredtiger, wttest, run +from wtscenario import check_scenarios, multiply_scenarios, number_scenarios + +# test_join06.py +# Join operations +# Joins with a read-uncommitted +class test_join06(wttest.WiredTigerTestCase): + nentries = 1000 + + isoscen = [ + ('isolation_read_uncommitted', dict(uncommitted=True)), + ('isolation_default', dict(uncommitted=False)) + ] + + bloomscen = [ + ('bloom', dict(bloom=True)), + ('nobloom', dict(bloom=False)) + ] + + scenarios = number_scenarios(multiply_scenarios('.', isoscen, bloomscen)) + + def gen_values(self, i): + s = str(i) # 345 => "345" + f = s[0:1] + s[0:1] + s[0:1] # 345 => "333" + return [s, f] + + def gen_values2(self, i): + s = str(i) # 345 => "345" + l = s[-1:] + s[-1:] + s[-1:] # 345 => "555" + return [s, l] + + def populate(self, s, gen_values): + c = s.open_cursor('table:join06', None, None) + for i in range(0, self.nentries): + c.set_key(i) + c.set_value(*gen_values(i)) + c.insert() + c.close() + + # Common function for testing the most basic functionality + # of joins + def test_join(self): + self.session.create('table:join06', + 'columns=(k,v0,v1),key_format=i,value_format=SS') + self.session.create('index:join06:index0','columns=(v0)') + self.session.create('index:join06:index1','columns=(v1)') + + self.populate(self.session, self.gen_values) + + # TODO: needed? 
+ #self.reopen_conn() + + if self.uncommitted: + self.session.begin_transaction('isolation=read-uncommitted') + + jc = self.session.open_cursor('join:table:join06', None, None) + c0 = self.session.open_cursor('index:join06:index0', None, None) + c0.set_key('520') + self.assertEquals(0, c0.search()) + self.session.join(jc, c0, 'compare=ge') + + joinconfig = 'compare=eq' + if self.bloom: + joinconfig += ',strategy=bloom,count=1000' + c1 = self.session.open_cursor('index:join06:index1', None, None) + c1.set_key('555') + self.assertEquals(0, c1.search()) + self.session.join(jc, c1, joinconfig) + + if self.uncommitted and self.bloom: + # Make sure that read-uncommitted with Bloom is not allowed. + # This is detected on the first next() operation. + msg = '/cannot be used with read-uncommitted/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: jc.next(), msg) + return + + # Changes made in another session may or may not be visible to us, + # depending on the isolation level. + if self.uncommitted: + # isolation level is read-uncommitted, so we will see + # additions deletions made in our other session. + mbr = set(range(525,1000,10)) | set(range(55,100,10)) | set([520]) + else: + # default isolation level, so we should see a consistent + # set at the time we begin iteration. + mbr = set(range(520,600)) | set(range(53,60)) + + altered = False + + while jc.next() == 0: + [k] = jc.get_keys() + [v0,v1] = jc.get_values() + #self.tty('GOT: ' + str(k) + ': ' + str(jc.get_values())) + if altered and self.uncommitted: + self.assertEquals(self.gen_values2(k), [v0, v1]) + else: + self.assertEquals(self.gen_values(k), [v0, v1]) + if not k in mbr: + self.tty('**** ERROR: result ' + str(k) + ' is not in: ' + + str(mbr)) + self.assertTrue(k in mbr) + mbr.remove(k) + + # In another session, we remove entries for keys ending in 6, + # and add entries for keys ending in 5. 
Depending on the + # isolation level for the transaction, these changes may or + # may not be visible for the original session. + if not altered: + s = self.conn.open_session(None) + s.begin_transaction(None) + self.populate(s, self.gen_values2) + s.commit_transaction() + s.close() + altered = True + + if len(mbr) != 0: + self.tty('**** ERROR: did not see these: ' + str(mbr)) + self.assertEquals(0, len(mbr)) + + jc.close() + c1.close() + c0.close() + if self.uncommitted: + self.session.commit_transaction() + self.session.drop('table:join06') + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_lsm03.py b/test/suite/test_lsm03.py new file mode 100644 index 00000000000..448d864c646 --- /dev/null +++ b/test/suite/test_lsm03.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wtscenario, wttest +from helper import simple_populate + +# test_lsm03.py +# Check to make sure that LSM schema operations don't get EBUSY when +# there are no user operations active. +class test_lsm03(wttest.WiredTigerTestCase): + name = 'test_lsm03' + + # Use small pages so we generate some internal layout + # Setup LSM so multiple chunks are present + config = 'key_format=S,allocation_size=512,internal_page_max=512' + \ + ',leaf_page_max=1k,lsm=(chunk_size=512k,merge_min=10)' + + # Populate an object then drop it. + def test_lsm_drop_active(self): + uri = 'lsm:' + self.name + simple_populate(self, uri, self.config, 10000) + + # Force to disk + self.reopen_conn() + + # An open cursors should cause failure. + cursor = self.session.open_cursor(uri, None, None) + self.assertRaises(wiredtiger.WiredTigerError, + lambda: self.session.drop(uri, None)) + cursor.close() + + # Add enough records that a merge should be running + simple_populate(self, uri, self.config, 50000) + # The drop should succeed even when LSM work units are active + self.session.drop(uri) diff --git a/test/suite/test_readonly01.py b/test/suite/test_readonly01.py new file mode 100644 index 00000000000..59e9743ab7e --- /dev/null +++ b/test/suite/test_readonly01.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +# +# Public Domain 2016-2016 MongoDB, Inc. +# Public Domain 2008-2016 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_readonly01.py +# Readonly: Test readonly mode. +# + +import fnmatch, os, shutil, time +from suite_subprocess import suite_subprocess +from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios +import wttest + +class test_readonly01(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_readonly01' + create = True + entries = 10000 + + # + # We want a list of directory writable or readonly. 
+ # + basecfg_list = [ + ('basecfg', dict(basecfg='config_base=true,')), + ('no_basecfg', dict(basecfg='config_base=false,')), + ] + dir_list = [ + ('write', dict(dirchmod=False)), + ('readonly', dict(dirchmod=True)), + ] + log_list = [ + ('logging', dict(logcfg='log=(archive=false,enabled,file_max=100K),')), + ('no_logging', dict(logcfg='log=(enabled=false),')), + ] + + types = [ + ('lsm', dict(tabletype='lsm', uri='lsm', + create_params = 'key_format=i,value_format=i')), + ('file-row', dict(tabletype='row', uri='file', + create_params = 'key_format=i,value_format=i')), + ('file-var', dict(tabletype='var', uri='file', + create_params = 'key_format=r,value_format=i')), + ('file-fix', dict(tabletype='fix', uri='file', + create_params = 'key_format=r,value_format=8t')), + ('table-row', dict(tabletype='row', uri='table', + create_params = 'key_format=i,value_format=i')), + ('table-var', dict(tabletype='var', uri='table', + create_params = 'key_format=r,value_format=i')), + ('table-fix', dict(tabletype='fix', uri='table', + create_params = 'key_format=r,value_format=8t')), + ] + + scenarios = multiply_scenarios('.', + basecfg_list, dir_list, log_list, types) + + def conn_config(self, dir): + self.home = dir + params = \ + 'error_prefix="%s",' % self.shortid() + \ + '%s' % self.logcfg + \ + '%s' % self.basecfg + if self.create: + conn_params = 'create,' + params + else: + conn_params = 'readonly=true,' + params + return conn_params + + def close_reopen(self): + ''' Close the connection and reopen readonly''' + # + # close the original connection. If needed, chmod the + # database directory to readonly mode. Then reopen the + # connection with readonly. + # + self.close_conn() + # + # The chmod command is not fully portable to windows. 
+ # + if self.dirchmod and os.name == 'posix': + for f in os.listdir(self.home): + if os.path.isfile(f): + os.chmod(f, 0444) + os.chmod(self.home, 0555) + self.conn = self.setUpConnectionOpen(self.home) + self.session = self.setUpSessionOpen(self.conn) + + def readonly(self): + # Here's the strategy: + # - Create a table. + # - Insert data into table. + # - Close connection. + # - Possibly chmod to readonly + # - Open connection readonly + # - Confirm we can read the data. + # + tablearg = self.uri + ':' + self.tablename + self.session.create(tablearg, self.create_params) + c = self.session.open_cursor(tablearg, None, None) + for i in range(self.entries): + c[i+1] = i % 255 + # Close the connection. Reopen readonly + self.create = False + self.close_reopen() + c = self.session.open_cursor(tablearg, None, None) + i = 0 + for key, value in c: + self.assertEqual(i+1, key) + self.assertEqual(i % 255, value) + i += 1 + self.assertEqual(i, self.entries) + self.pr('Read %d entries' % i) + c.close() + self.create = True + + def test_readonly(self): + if self.dirchmod and os.name == 'posix': + with self.expectedStderrPattern('Permission'): + self.readonly() + else: + self.readonly() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_readonly02.py b/test/suite/test_readonly02.py new file mode 100644 index 00000000000..0df5465642d --- /dev/null +++ b/test/suite/test_readonly02.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# +# Public Domain 2016-2016 MongoDB, Inc. +# Public Domain 2008-2016 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_readonly02.py +# Readonly: Test readonly mode with illegal config combinations +# and error checking during updates. +# + +from helper import copy_wiredtiger_home +from suite_subprocess import suite_subprocess +import os, wiredtiger, wttest + +class test_readonly02(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'table:test_readonly02' + create = True + create_params = 'key_format=i,value_format=i' + entries = 10 + + conn_params = \ + 'create,statistics=(fast),log=(enabled,file_max=100K,zero_fill=true),' + conn_params_rd = \ + 'create,readonly=true,statistics=(fast),log=(enabled,zero_fill=false),' + conn_params_rdcfg = \ + 'create,readonly=true,statistics=(fast),log=(enabled),' + + # + # Run to make sure incompatible configuration options return an error. + # The situations that cause failures (instead of silent overrides) are: + # 1. setting readonly on a new database directory + # 2. an unclean shutdown and reopening readonly + # 3. 
logging with zero-fill enabled and readonly + # + badcfg1 = 'log=(enabled,zero_fill=true)' + + def setUpConnectionOpen(self, dir): + self.home = dir + rdonlydir = dir + '.rdonly' + # + # First time through check readonly on a non-existent database. + # + if self.create: + # 1. setting readonly on a new database directory + # Setting readonly prevents creation so we should see an + # error because the lock file does not exist. + msg = '/No such file/' + if os.name != 'posix': + msg = '/cannot find the file/' + os.mkdir(rdonlydir) + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.wiredtiger_open( + rdonlydir, self.conn_params_rd), msg) + + self.create = False + conn = self.wiredtiger_open(dir, self.conn_params) + return conn + + def check_unclean(self): + backup = "WT_COPYDIR" + copy_wiredtiger_home(self.home, backup, True) + msg = '/needs recovery/' + # 2. an unclean shutdown and reopening readonly + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.wiredtiger_open(backup, self.conn_params_rd), msg) + + def close_checkerror(self, cfg): + ''' Close the connection and reopen readonly''' + # + # Close the original connection. Reopen readonly and also with + # the given configuration string. + # + self.close_conn() + conn_params = self.conn_params_rd + cfg + msg = '/Invalid argument/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.wiredtiger_open(self.home, conn_params), msg) + + def test_readonly(self): + tablearg = self.tablename + self.session.create(tablearg, self.create_params) + c = self.session.open_cursor(tablearg, None, None) + for i in range(self.entries): + c[i+1] = i % 255 + # Check for an error on an unclean recovery/restart. + self.check_unclean() + + # Close the connection. Reopen readonly with other bad settings. + # 3. 
logging with zero-fill enabled and readonly + self.close_checkerror(self.badcfg1) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_readonly03.py b/test/suite/test_readonly03.py new file mode 100644 index 00000000000..d9930e8f553 --- /dev/null +++ b/test/suite/test_readonly03.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# +# Public Domain 2016-2016 MongoDB, Inc. +# Public Domain 2008-2016 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_readonly03.py +# Readonly: Test connection readonly mode with modifying methods. Confirm +# all return ENOTSUP. 
+# + +from helper import simple_populate +from suite_subprocess import suite_subprocess +import os, sys, wiredtiger, wttest + +class test_readonly03(wttest.WiredTigerTestCase, suite_subprocess): + uri = 'table:test_readonly03' + uri2 = 'table:test_readonly03_2' + create = True + + conn_params = 'create,log=(enabled),' + conn_params_rd = 'readonly=true' + + session_ops = [ 'create', 'compact', 'drop', 'log_flush', 'log_printf', + 'rebalance', 'rename', 'salvage', 'truncate', 'upgrade', ] + cursor_ops = [ 'insert', 'remove', 'update', ] + + def setUpConnectionOpen(self, dir): + self.home = dir + if self.create: + conn_cfg = self.conn_params + else: + conn_cfg = self.conn_params_rd + conn = self.wiredtiger_open(dir, conn_cfg) + self.create = False + return conn + + + def test_readonly(self): + create_params = 'key_format=i,value_format=i' + entries = 10 + # Create a database and a table. + simple_populate(self, self.uri, create_params, entries) + + # + # Now close and reopen. Note that the connection function + # above will reopen it readonly. 
+ self.reopen_conn() + msg = '/Unsupported/' + c = self.session.open_cursor(self.uri, None, None) + for op in self.cursor_ops: + c.set_key(1) + c.set_value(1) + if op == 'insert': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: c.insert(), msg) + elif op == 'remove': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: c.remove(), msg) + elif op == 'update': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: c.update(), msg) + else: + self.fail('Unknown cursor operation: ' + op) + c.close() + for op in self.session_ops: + if op == 'create': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.create(self.uri2, create_params), + msg) + elif op == 'compact': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.compact(self.uri, None), msg) + elif op == 'drop': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.drop(self.uri, None), msg) + elif op == 'log_flush': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.log_flush(None), msg) + elif op == 'log_printf': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.log_printf("test"), msg) + elif op == 'rebalance': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.rebalance(self.uri, None), msg) + elif op == 'rename': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.rename(self.uri, self.uri2, None), msg) + elif op == 'salvage': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.salvage(self.uri, None), msg) + elif op == 'truncate': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.truncate(self.uri, None, None, None), + msg) + elif op == 'upgrade': + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.upgrade(self.uri, None), msg) + else: + self.fail('Unknown 
session method: ' + op) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_rebalance.py b/test/suite/test_rebalance.py index 80cce6ed514..f2167e864c9 100644 --- a/test/suite/test_rebalance.py +++ b/test/suite/test_rebalance.py @@ -59,7 +59,7 @@ class test_rebalance(wttest.WiredTigerTestCase): if with_cursor: cursor = self.session.open_cursor(uri, None, None) self.assertRaises(wiredtiger.WiredTigerError, - lambda: self.session.drop(uri, None)) + lambda: self.session.rebalance(uri, None)) cursor.close() self.session.rebalance(uri, None) diff --git a/test/suite/test_schema07.py b/test/suite/test_schema07.py new file mode 100644 index 00000000000..ac397c6e1a1 --- /dev/null +++ b/test/suite/test_schema07.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest + +# test_schema07.py +# Test that long-running tests don't fill the cache with metadata +class test_schema07(wttest.WiredTigerTestCase): + tablename = 'table:test_schema07' + + def conn_config(self, dir): + return 'cache_size=10MB' + + @wttest.longtest("Creating many tables shouldn't fill the cache") + def test_many_tables(self): + s = self.session + # We have a 10MB cache, metadata is (well) over 512B per table, + # if we can create 20K tables, something must be cleaning up. + for i in xrange(20000): + uri = '%s-%06d' % (self.tablename, i) + s.create(uri) + c = s.open_cursor(uri) + # This will block if the metadata fills the cache + c["key"] = "value" + c.close() + self.session.drop(uri) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index f996dbfa06d..bccd2bce012 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -40,7 +40,7 @@ import wttest class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): tablebase = 'test_sweep01' uri = 'table:' + tablebase - numfiles = 50 + numfiles = 30 numkv = 1000 conn_config = 'file_manager=(close_handle_minimum=0,' + \ 'close_idle_time=6,close_scan_interval=2),' + \ @@ -87,7 +87,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # # We've configured checkpoints to run every 5 seconds, sweep server to # run every 2 seconds and idle time to be 6 seconds. It should take - # about 8 seconds for a handle to be closed. Sleep for 12 seconds to be + # about 8 seconds for a handle to be closed. Sleep for double to be # safe. 
# uri = '%s.test' % self.uri @@ -105,13 +105,24 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): c = self.session.open_cursor(uri, None) k = 0 sleep = 0 - while sleep < 12: + max = 60 + final_nfile = 4 + while sleep < max: self.session.checkpoint() k = k+1 c[k] = 1 sleep += 2 time.sleep(2) + # Give slow machines time to process files. + stat_cursor = self.session.open_cursor('statistics:', None, None) + this_nfile = stat_cursor[stat.conn.file_open][2] + stat_cursor.close() + self.pr("==== loop " + str(sleep)) + self.pr("this_nfile " + str(this_nfile)) + if this_nfile == final_nfile: + break c.close() + self.pr("Sweep loop took " + str(sleep)) stat_cursor = self.session.open_cursor('statistics:', None, None) close2 = stat_cursor[stat.conn.dh_sweep_close][2] @@ -177,7 +188,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(nfile2 < nfile1, True) # The only files that should be left are the metadata, the lookaside # file, the lock file, and the active file. 
- if (nfile2 != 4): + if (nfile2 != final_nfile): print "close1: " + str(close1) + " close2: " + str(close2) print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) @@ -186,7 +197,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): print "tod1: " + str(tod1) + " tod2: " + str(tod2) print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) - self.assertEqual(nfile2 == 4, True) + self.assertEqual(nfile2 == final_nfile, True) if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py index de49c5fe235..bbd6ce8c4e2 100644 --- a/test/suite/test_txn04.py +++ b/test/suite/test_txn04.py @@ -121,17 +121,14 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): cmd += self.backup_dir self.runWt(cmd.split()) - self.exception='false' backup_conn_params = 'log=(enabled,file_max=%s)' % self.logmax backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: self.check(backup_conn.open_session(), None, committed) - except: - self.exception='true' finally: backup_conn.close() - def test_ops(self): + def ops(self): self.session.create(self.uri, self.create_params) c = self.session.open_cursor(self.uri, None, 'overwrite') # Set up the table with entries for 1-5. @@ -149,7 +146,6 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # The runWt command closes our connection and sessions so # we need to reopen them here. self.hot_backup(None, committed) - self.assertEqual(True, self.exception == 'false') c = self.session.open_cursor(self.uri, None, 'overwrite') c.set_value(1) # Then do the given modification. @@ -192,14 +188,13 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # Check the state after each commit/rollback. self.check_all(current, committed) - # Backup the target we modified. 
We expect that running - # recovery now will generate an exception if we committed. + # Backup the target we modified and verify the data. # print 'Call hot_backup with ' + self.uri self.hot_backup(self.uri, committed) - if txn == 'commit': - self.assertEqual(True, self.exception == 'true') - else: - self.assertEqual(True, self.exception == 'false') + + def test_ops(self): + with self.expectedStdoutPattern('Recreating metadata'): + self.ops() if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_util13.py b/test/suite/test_util13.py new file mode 100644 index 00000000000..222f42cd7f1 --- /dev/null +++ b/test/suite/test_util13.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +import os, re, string +from suite_subprocess import suite_subprocess +import itertools, wiredtiger, wttest + +from helper import complex_populate_cgconfig, complex_populate_cgconfig_lsm +from helper import simple_populate +from helper import complex_populate_check, simple_populate_check +from wtscenario import multiply_scenarios, number_scenarios + +# test_util13.py +# Utilities: wt dump, as well as the dump cursor +# Test that dump and load retain table configuration information. +# +class test_util13(wttest.WiredTigerTestCase, suite_subprocess): + """ + Test wt dump. We check for specific output and preservation of + non-default table create parameters. + """ + + pfx = 'test_util13' + nentries = 100 + dir = "dump_dir" + # + # Select table configuration settings that are not the default. + # + types = [ + ('file-simple', dict(uri='file:' + pfx, pop=simple_populate, + populate_check=simple_populate_check, + table_config='prefix_compression_min=3', cfg='')), + ('lsm-simple', dict(uri='lsm:' + pfx, pop=simple_populate, + populate_check=simple_populate_check, + table_config='lsm=(bloom_bit_count=29)', + cfg='bloom_bit_count=29')), + ('table-simple', dict(uri='table:' + pfx, pop=simple_populate, + populate_check=simple_populate_check, + table_config='split_pct=50', cfg='')), + ('table-complex', + dict(uri='table:' + pfx, pop=complex_populate_cgconfig, + populate_check=complex_populate_check, + table_config='allocation_size=512B', cfg='')), + ('table-complex-lsm', + dict(uri='table:' + pfx, pop=complex_populate_cgconfig_lsm, + populate_check=complex_populate_check, + table_config='lsm=(merge_max=5)', + cfg='merge_max=5')), + ] + + scenarios = number_scenarios(multiply_scenarios('.', types)) + + def compare_config(self, expected_cfg, actual_cfg): + # Replace '(' characters so configuration groups don't break parsing. + # If we ever want to look for config groups this will need to change. 
+ #print "compare_config Actual config " + #print actual_cfg + #print "compare_config Expected config " + #print expected_cfg + cfg_orig = actual_cfg + if self.pop != simple_populate: + # + # If we have a complex config, strip out the colgroups and + # columns from the config. Doing so allows us to keep the + # split commands below usable because those two items don't + # have assignments in them. + # + nocolgrp = re.sub("colgroups=\((.+?)\),", '', actual_cfg) + cfg_orig = re.sub("columns=\((.+?)\),", '', nocolgrp) + + #print "Using original config " + #print cfg_orig + da = dict(kv.split('=') for kv in + cfg_orig.strip().replace('(',',').split(',')) + dx = dict(kv.split('=') for kv in + expected_cfg.strip().replace('(',',').split(',')) + + # Check that all items in our expected config subset are in + # the actual configuration and they match. + match = all(item in da.items() for item in dx.items()) + if match == False: + print "MISMATCH:" + print "Original dict: " + print da + print "Expected config: " + print dx + return match + + def compare_files(self, expect_subset, dump_out): + inheader = isconfig = False + for l1, l2 in zip(open(expect_subset, "rb"), open(dump_out, "rb")): + if isconfig: + if not self.compare_config(l1, l2): + return False + if inheader: + # This works because the expected subset has a format + # of URI and config lines alternating. 
+ isconfig = not isconfig + if l1.strip() == 'Header': + inheader = True + if l1.strip() == 'Data': + break + return True + + def load_recheck(self, expect_subset, dump_out): + newdump = "newdump.out" + os.mkdir(self.dir) + self.runWt(['-h', self.dir, 'load', '-f', dump_out]) + # Check the contents + conn = self.wiredtiger_open(self.dir) + session = conn.open_session() + cursor = session.open_cursor(self.uri, None, None) + self.populate_check + conn.close() + dumpargs = ["-h"] + dumpargs.append(self.dir) + dumpargs.append("dump") + dumpargs.append(self.uri) + self.runWt(dumpargs, outfilename=newdump) + + self.assertTrue(self.compare_files(expect_subset, newdump)) + return True + + def test_dump_config(self): + # The number of btree_entries reported is influenced by the + # number of column groups and indices. Each insert will have + # a multiplied effect. + self.pop(self, self.uri, + 'key_format=S,value_format=S,' + self.table_config, self.nentries) + + ver = wiredtiger.wiredtiger_version() + verstring = str(ver[1]) + '.' + str(ver[2]) + '.' + str(ver[3]) + expectfile="expect.out" + with open(expectfile, "w") as expectout: + # Note: this output is sensitive to the precise output format + # generated by wt dump. If this is likely to change, we should + # make this test more accommodating. + expectout.write( + 'WiredTiger Dump (WiredTiger Version ' + verstring + ')\n') + expectout.write('Format=print\n') + expectout.write('Header\n') + expectout.write(self.uri + '\n') + # Check the config on the colgroup itself for complex tables. 
+ if self.pop != simple_populate: + expectout.write('key_format=S\n') + expectout.write('colgroup:' + self.pfx + ':cgroup1\n') + if self.cfg == '': + expectout.write(self.table_config + '\n') + else: + expectout.write(self.cfg + '\n') + expectout.write('Data\n') + + self.pr('calling dump') + outfile="dump.out" + dumpargs = ["dump"] + dumpargs.append(self.uri) + self.runWt(dumpargs, outfilename=outfile) + + self.assertTrue(self.compare_files(expectfile, outfile)) + self.assertTrue(self.load_recheck(expectfile, outfile)) + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/wttest.py b/test/suite/wttest.py index b5a58d1566f..a1945b4325d 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -335,6 +335,14 @@ class WiredTigerTestCase(unittest.TestCase): # always get back to original directory os.chdir(self.origcwd) + # Make sure no read-only files or directories were left behind + os.chmod(self.testdir, 0777) + for root, dirs, files in os.walk(self.testdir): + for d in dirs: + os.chmod(os.path.join(root, d), 0777) + for f in files: + os.chmod(os.path.join(root, f), 0666) + # Clean up unless there's a failure if (passed or skipped) and not WiredTigerTestCase._preserveFiles: shutil.rmtree(self.testdir, ignore_errors=True) diff --git a/test/thread/t.c b/test/thread/t.c index e72b54bf62a..22334076ee1 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -51,6 +51,8 @@ static void wt_shutdown(void); extern int __wt_optind; extern char *__wt_optarg; +void (*custom_die)(void) = NULL; + int main(int argc, char *argv[]) { diff --git a/test/utility/test_util.i b/test/utility/test_util.i index 3b88d375381..c5cebadcb5c 100644 --- a/test/utility/test_util.i +++ b/test/utility/test_util.i @@ -42,25 +42,60 @@ #define DEFAULT_DIR "WT_TEST" #define MKDIR_COMMAND "mkdir " +/* Allow tests to add their own death handling. */ +extern void (*custom_die)(void); + +static void testutil_die(int, const char *, ...) 
+#if defined(__GNUC__) +__attribute__((__noreturn__)) +#endif +; + /* * die -- * Report an error and quit. */ -static inline void +static void testutil_die(int e, const char *fmt, ...) { va_list ap; + /* Allow test programs to cleanup on fatal error. */ + if (custom_die != NULL) + (*custom_die)(); + va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); if (e != 0) fprintf(stderr, ": %s", wiredtiger_strerror(e)); fprintf(stderr, "\n"); + exit(EXIT_FAILURE); } /* + * testutil_check -- + * Complain and quit if a function call fails. + */ +#define testutil_check(call) do { \ + int __r; \ + if ((__r = (call)) != 0) \ + testutil_die(__r, "%s/%d: %s", __func__, __LINE__, #call);\ +} while (0) + +/* + * testutil_checkfmt -- + * Complain and quit if a function call fails, with additional arguments. + */ +#define testutil_checkfmt(call, fmt, ...) do { \ + int __r; \ + if ((__r = (call)) != 0) \ + testutil_die(__r, "%s/%d: %s: " fmt, \ + __func__, __LINE__, #call, __VA_ARGS__); \ +} while (0) + +/* * testutil_work_dir_from_path -- * Takes a buffer, its size and the intended work directory. * Creates the full intended work directory in buffer. diff --git a/test/windows/windows_shim.h b/test/windows/windows_shim.h index c35c27cb7b0..f32edce88e7 100644 --- a/test/windows/windows_shim.h +++ b/test/windows/windows_shim.h @@ -44,6 +44,11 @@ typedef int u_int; #define R_OK 04 #define X_OK R_OK +/* MSVC Doesn't provide __func__, it has __FUNCTION__ */ +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif + /* snprintf does not exist on <= VS 2013 */ #if _MSC_VER < 1900 #define snprintf _wt_snprintf diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py index 7cee87e49ed..c75e4f194dd 100644 --- a/tools/wtstats/stat_data.py +++ b/tools/wtstats/stat_data.py @@ -1,8 +1,10 @@ # DO NOT EDIT: automatically built by dist/stat.py. 
*/ no_scale_per_second_list = [ + 'async: current work queue length', 'async: maximum work queue length', 'cache: bytes currently in the cache', + 'cache: eviction currently operating in aggressive mode', 'cache: maximum bytes configured', 'cache: maximum page size at eviction', 'cache: pages currently held in the cache', @@ -35,6 +37,7 @@ no_scale_per_second_list = [ 'transaction: transaction range of IDs currently pinned by named snapshots', 'block-manager: checkpoint size', 'block-manager: file allocation unit size', + 'block-manager: file bytes available for reuse', 'block-manager: file magic number', 'block-manager: file major version number', 'block-manager: file size in bytes', @@ -67,6 +70,7 @@ no_scale_per_second_list = [ no_clear_list = [ 'async: maximum work queue length', 'cache: bytes currently in the cache', + 'cache: eviction currently operating in aggressive mode', 'cache: maximum bytes configured', 'cache: maximum page size at eviction', 'cache: pages currently held in the cache', |