diff options
| | | |
|---|---|---|
| author | Michael Cahill <michael.cahill@mongodb.com> | 2015-07-17 22:40:41 +1000 |
| committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-07-17 22:40:41 +1000 |
| commit | d7e9b92a8117edab8869c132a4f7bfae3d3ff2ff (patch) | |
| tree | 662253d1f3793c5119e5a9fdb608a3aa520621b6 /src/third_party/wiredtiger/src | |
| parent | 5170a33c1a3632cff838c4b9291938cc3a4ad41c (diff) | |
| download | mongo-d7e9b92a8117edab8869c132a4f7bfae3d3ff2ff.tar.gz | |
Import wiredtiger-wiredtiger-2.6.1-284-g42823c9.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party/wiredtiger/src')
36 files changed, 646 insertions, 513 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 8e45ec85a97..df42a14816f 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -100,11 +100,10 @@ __wt_block_manager_create( WT_TRET(__wt_close(session, &fh)); /* - * If checkpoint syncing is enabled, some filesystems require that we - * sync the directory to be confident that the file will appear. + * Some filesystems require that we sync the directory to be confident + * that the file will appear. */ - if (ret == 0 && F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) && - (ret = __wt_filename(session, filename, &path)) == 0) { + if (ret == 0 && (ret = __wt_filename(session, filename, &path)) == 0) { ret = __wt_directory_sync(session, path); __wt_free(session, path); } @@ -180,10 +179,10 @@ __wt_block_open(WT_SESSION_IMPL *session, WT_DECL_RET; uint64_t bucket, hash; - WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename)); + WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename)); conn = S2C(session); - *blockp = NULL; + *blockp = block = NULL; hash = __wt_hash_city64(filename, strlen(filename)); bucket = hash % WT_HASH_ARRAY_SIZE; __wt_spin_lock(session, &conn->block_lock); @@ -264,7 +263,8 @@ __wt_block_open(WT_SESSION_IMPL *session, __wt_spin_unlock(session, &conn->block_lock); return (0); -err: WT_TRET(__block_destroy(session, block)); +err: if (block != NULL) + WT_TRET(__block_destroy(session, block)); __wt_spin_unlock(session, &conn->block_lock); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 4625865fbf7..77d80cdb3a2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -334,6 +334,8 @@ __wt_debug_disk( if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) __dmsg(ds, ", compressed"); + if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) + 
__dmsg(ds, ", encrypted"); if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL)) __dmsg(ds, ", empty-all"); if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index f08909a4b85..86edd992b28 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -21,7 +21,7 @@ static int __inmem_row_leaf_entries( * Check if a page matches the criteria for forced eviction. */ static int -__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) +__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; @@ -35,10 +35,6 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) if (WT_PAGE_IS_INTERNAL(page)) return (0); - /* Eviction may be turned off. */ - if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) - return (0); - /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. @@ -68,11 +64,14 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, cache_work, force_attempts, oldgen; + btree = S2BT(session); + for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: @@ -115,7 +114,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * be evicting if no hazard pointer is required, we're * done. */ - if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) + if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* @@ -140,7 +139,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * the page's generation number. If eviction isn't being * done on this file, we're done. 
*/ - if (F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) + if (LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* @@ -148,7 +148,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags */ page = ref->page; if (force_attempts < 10 && - __evict_force_check(session, page, flags)) { + __evict_force_check(session, page)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 6f27e077109..dbd4042129d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -753,7 +753,7 @@ __split_multi_inmem( /* * We modified the page above, which will have set the first dirty - * transaction to the last transaction current running. However, the + * transaction to the last transaction currently running. However, the * updates we installed may be older than that. Set the first dirty * transaction to an impossibly old value so this page is never skipped * in a checkpoint. diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 6c5b1fb98e8..838d778dadf 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -109,17 +109,6 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { - /* - * If we have a page, and it was ever modified, track - * the highest transaction ID in the tree. We do this - * here because we want the value after reconciling - * dirty pages. 
- */ - if (walk != NULL && walk->page != NULL && - (mod = walk->page->modify) != NULL && - WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn)) - btree->rec_max_txn = mod->rec_max_txn; - WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; @@ -190,6 +179,18 @@ err: /* On error, clear any left-over tree walk. */ if (btree->checkpointing) { /* + * Update the checkpoint generation for this handle so visible + * updates newer than the checkpoint can be evicted. + * + * This has to be published before eviction is enabled again, + * so that eviction knows that the checkpoint has completed. + */ + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, btree->checkpoint_gen); + + /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index f7b65a8f73d..73837c46ee8 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -57,7 +57,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_file_manager_subconfigs[] = { { "close_handle_minimum", "int", NULL, "min=0", NULL, 0 }, { "close_idle_time", "int", - NULL, "min=1,max=100000", + NULL, "min=0,max=100000", NULL, 0 }, { "close_scan_interval", "int", NULL, "min=1,max=100000", diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index de7e9e3486f..fdc95a32387 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -11,15 +11,29 @@ /* * Tuning constants. 
*/ -/* Threshold when a connection is allocated more cache */ -#define WT_CACHE_POOL_BUMP_THRESHOLD 6 -/* Threshold when a connection is allocated less cache */ -#define WT_CACHE_POOL_REDUCE_THRESHOLD 2 +/* + * Threshold when a connection is allocated more cache, as a percentage of + * the amount of pressure the busiest participant has. + */ +#define WT_CACHE_POOL_BUMP_THRESHOLD 60 +/* + * Threshold when a connection is allocated less cache, as a percentage of + * the amount of pressure the busiest participant has. + */ +#define WT_CACHE_POOL_REDUCE_THRESHOLD 20 /* Balancing passes after a bump before a connection is a candidate. */ #define WT_CACHE_POOL_BUMP_SKIPS 10 /* Balancing passes after a reduction before a connection is a candidate. */ #define WT_CACHE_POOL_REDUCE_SKIPS 5 +/* + * Constants that control how much influence different metrics have on + * the pressure calculation. + */ +#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10 +#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50 +#define WT_CACHE_POOL_READ_MULTIPLIER 1 + static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *); static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *); static int __cache_pool_balance(WT_SESSION_IMPL *); @@ -441,10 +455,12 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) WT_CACHE_POOL *cp; WT_CACHE *cache; WT_CONNECTION_IMPL *entry; - uint64_t entries, highest, new; + uint64_t app_evicts, app_waits, reads; + uint64_t entries, highest, tmp; cp = __wt_process.cache_pool; - entries = highest = 0; + entries = 0; + highest = 1; /* Avoid divide by zero */ /* Generate read pressure information. */ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { @@ -453,22 +469,54 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) continue; cache = entry->cache; ++entries; - new = cache->bytes_read; - /* Handle wrapping of eviction requests. 
*/ - if (new >= cache->cp_saved_read) - cache->cp_current_read = new - cache->cp_saved_read; + + /* + * Figure out a delta since the last time we did an assessment + * for each metric we are tracking. Watch out for wrapping + * of values. + */ + tmp = cache->bytes_read; + if (tmp >= cache->cp_saved_read) + reads = tmp - cache->cp_saved_read; else - cache->cp_current_read = new; - cache->cp_saved_read = new; - if (cache->cp_current_read > highest) - highest = cache->cp_current_read; + reads = (UINT64_MAX - cache->cp_saved_read) + tmp; + cache->cp_saved_read = tmp; + + /* Update the application eviction count information */ + tmp = cache->app_evicts; + if (tmp >= cache->cp_saved_app_evicts) + app_evicts = tmp - cache->cp_saved_app_evicts; + else + app_evicts = + (UINT64_MAX - cache->cp_saved_app_evicts) + tmp; + cache->cp_saved_app_evicts = tmp; + + /* Update the eviction wait information */ + tmp = cache->app_waits; + if (tmp >= cache->cp_saved_app_waits) + app_waits = tmp - cache->cp_saved_app_waits; + else + app_waits = + (UINT64_MAX - cache->cp_saved_app_waits) + tmp; + cache->cp_saved_app_waits = tmp; + + /* Calculate the weighted pressure for this member */ + cache->cp_pass_pressure = + (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) + + (app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) + + (reads * WT_CACHE_POOL_READ_MULTIPLIER); + + if (cache->cp_pass_pressure > highest) + highest = cache->cp_pass_pressure; + + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, + "Assess entry. reads: %" PRIu64 ", app evicts: %" PRIu64 + ", app waits: %" PRIu64 ", pressure: %" PRIu64, + reads, app_evicts, app_waits, cache->cp_pass_pressure)); } WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Highest eviction count: %" PRIu64 ", entries: %" PRIu64, highest, entries)); - /* Normalize eviction information across connections. */ - highest = highest / (entries + 1); - ++highest; /* Avoid divide by zero. 
*/ *phighest = highest; return (0); @@ -487,18 +535,21 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_CACHE_POOL *cp; WT_CACHE *cache; WT_CONNECTION_IMPL *entry; - uint64_t adjusted, reserved, read_pressure; + uint64_t adjusted, highest_percentile, pressure, reserved; int force, grew; *adjustedp = 0; cp = __wt_process.cache_pool; force = (cp->currently_used > cp->size); grew = 0; + /* Highest as a percentage, avoid 0 */ + highest_percentile = (highest / 100) + 1; + if (WT_VERBOSE_ISSET(session, WT_VERB_SHARED_CACHE)) { WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Cache pool distribution: ")); WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t" "cache_size, read_pressure, skips: ")); + "\t" "cache_size, pressure, skips: ")); } TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { @@ -506,10 +557,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, reserved = cache->cp_reserved; adjusted = 0; - read_pressure = cache->cp_current_read / highest; + /* + * The read pressure is calculated as a percentage of how + * much read pressure there is on this participant compared + * to the participant with the most activity. The closer we + * are to the most active the more cache we should get + * assigned. + */ + pressure = cache->cp_pass_pressure / highest_percentile; WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32, - entry->cache_size, read_pressure, cache->cp_skip_count)); + entry->cache_size, pressure, cache->cp_skip_count)); /* Allow to stabilize after changes. */ if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) @@ -523,6 +581,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, if (entry->cache_size < reserved) { grew = 1; adjusted = reserved - entry->cache_size; + /* * Conditions for reducing the amount of resources for an * entry: @@ -534,9 +593,9 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * space in the pool. 
*/ } else if ((force && entry->cache_size > reserved) || - (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && - highest > 1 && entry->cache_size > reserved && - cp->currently_used >= cp->size)) { + (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && + highest > 1 && entry->cache_size > reserved && + cp->currently_used >= cp->size)) { grew = 0; /* * Shrink by a chunk size if that doesn't drop us @@ -553,14 +612,15 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * - This entry is using less than the entire cache pool * - The connection is using enough cache to require eviction * - There is space available in the pool - * - Additional cache would benefit the connection + * - Additional cache would benefit the connection OR + * - The pool is less than half distributed */ - } else if (highest > 1 && - entry->cache_size < cp->size && - cache->bytes_inmem >= - (entry->cache_size * cache->eviction_target) / 100 && - cp->currently_used < cp->size && - read_pressure > bump_threshold) { + } else if (entry->cache_size < cp->size && + __wt_cache_bytes_inuse(cache) >= + (entry->cache_size * cache->eviction_target) / 100 && + ((cp->currently_used < cp->size && + pressure > bump_threshold) || + cp->currently_used < cp->size * 0.5)) { grew = 1; adjusted = WT_MIN(cp->chunk, cp->size - cp->currently_used); diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 0e7ab0a03d8..94e69897c1d 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -53,7 +53,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle")); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); - WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_rwlock_alloc(session, + &conn->hot_backup_lock, "hot backup")); 
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -136,7 +137,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); - __wt_spin_destroy(session, &conn->hot_backup_lock); + WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index bc80152f6bf..be7ce2e9344 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -139,7 +139,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) * We can only archive files if a hot backup is not in progress or * if we are the backup. 
*/ - __wt_spin_lock(session, &conn->hot_backup_lock); + WT_RET(__wt_readlock(session, conn->hot_backup_lock)); locked = 1; if (conn->hot_backup == 0 || backup_file != 0) { for (i = 0; i < logcount; i++) { @@ -151,7 +151,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) } } } - __wt_spin_unlock(session, &conn->hot_backup_lock); + WT_ERR(__wt_readunlock(session, conn->hot_backup_lock)); locked = 0; __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; @@ -167,7 +167,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) - __wt_spin_unlock(session, &conn->hot_backup_lock); + WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); if (logfiles != NULL) __wt_log_files_free(session, logfiles, logcount); return (ret); @@ -207,9 +207,8 @@ __log_prealloc_once(WT_SESSION_IMPL *session) if (log->prep_missed > 0) { conn->log_prealloc += log->prep_missed; WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "Now pre-allocating up to %" PRIu32, - conn->log_prealloc)); - log->prep_missed = 0; + "Missed %" PRIu32 ". Now pre-allocating up to %" PRIu32, + log->prep_missed, conn->log_prealloc)); } WT_STAT_FAST_CONN_SET(session, log_prealloc_max, conn->log_prealloc); @@ -221,6 +220,13 @@ __log_prealloc_once(WT_SESSION_IMPL *session) session, ++log->prep_fileid, WT_LOG_PREPNAME, 1)); WT_STAT_FAST_CONN_INCR(session, log_prealloc_files); } + /* + * Reset the missed count now. If we missed during pre-allocating + * the log files, it means the allocation is not keeping up, not that + * we didn't allocate enough. So we don't just want to keep adding + * in more. 
+ */ + log->prep_missed = 0; if (0) err: __wt_err(session, ret, "log pre-alloc server error"); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 08137c9c9ff..ec6f628a02e 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -35,7 +35,8 @@ __sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp) continue; } if (dhandle->session_inuse != 0 || - now <= dhandle->timeofdeath + conn->sweep_idle_time) + now <= dhandle->timeofdeath + conn->sweep_idle_time || + conn->sweep_idle_time == 0) continue; if (dhandle->timeofdeath == 0) { dhandle->timeofdeath = now; @@ -121,6 +122,10 @@ __sweep_expire(WT_SESSION_IMPL *session) conn = S2C(session); + /* If sweep_idle_time is 0, then we won't expire any cursors */ + if (conn->sweep_idle_time == 0) + return (0); + /* Don't discard handles that have been open recently. */ WT_RET(__wt_seconds(session, &now)); @@ -265,8 +270,14 @@ __sweep_server(void *arg) */ WT_ERR(__sweep_mark(session, &dead_handles)); + /* + * We only want to flush and expire if there are no dead handles + * and if either the sweep_idle_time is not 0, or if we have + * reached the configured limit of handles. + */ if (dead_handles == 0 && - conn->open_file_count < conn->sweep_handles_min) + (conn->open_file_count < conn->sweep_handles_min || + conn->sweep_idle_time != 0)) continue; /* Close handles if we have reached the configured limit */ diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 8f43e98e2f7..60d94697189 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -217,9 +217,9 @@ __backup_start( * could start a hot backup that would race with an already-started * checkpoint. 
*/ - __wt_spin_lock(session, &conn->hot_backup_lock); + WT_RET(__wt_writelock(session, conn->hot_backup_lock)); conn->hot_backup = 1; - __wt_spin_unlock(session, &conn->hot_backup_lock); + WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock)); /* Create the hot backup file. */ WT_ERR(__backup_file_create(session, cb, 0)); @@ -318,9 +318,9 @@ __backup_stop(WT_SESSION_IMPL *session) ret = __wt_backup_file_remove(session); /* Checkpoint deletion can proceed, as can the next hot backup. */ - __wt_spin_lock(session, &conn->hot_backup_lock); + WT_TRET(__wt_writelock(session, conn->hot_backup_lock)); conn->hot_backup = 0; - __wt_spin_unlock(session, &conn->hot_backup_lock); + WT_TRET(__wt_writeunlock(session, conn->hot_backup_lock)); return (ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c index 9860eb65a55..460c46c0d29 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_metadata.c +++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c @@ -30,15 +30,42 @@ WT_CURSTD_VALUE_EXT); \ } while (0) -#define WT_MD_SET_KEY_VALUE(c, mc, fc) do { \ - (c)->key.data = (fc)->key.data; \ - (c)->key.size = (fc)->key.size; \ - (c)->value.data = (fc)->value.data; \ - (c)->value.size = (fc)->value.size; \ - F_SET((c), WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ - F_CLR((mc), WT_MDC_ONMETADATA); \ - F_SET((mc), WT_MDC_POSITIONED); \ -} while (0) +/* + * __curmetadata_setkv -- + * Copy key/value into the public cursor, stripping internal metadata for + * "create-only" cursors. 
+ */ +static int +__curmetadata_setkv(WT_CURSOR_METADATA *mdc, WT_CURSOR *fc) +{ + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + char *value; + + c = &mdc->iface; + session = (WT_SESSION_IMPL *)c->session; + + c->key.data = fc->key.data; + c->key.size = fc->key.size; + if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { + WT_RET(__wt_schema_create_strip( + session, fc->value.data, NULL, &value)); + ret = __wt_buf_set( + session, &c->value, value, strlen(value) + 1); + __wt_free(session, value); + WT_RET(ret); + } else { + c->value.data = fc->value.data; + c->value.size = fc->value.size; + } + + F_SET(c, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); + F_CLR(mdc, WT_MDC_ONMETADATA); + F_SET(mdc, WT_MDC_POSITIONED); + + return (0); +} /* * Check if a key matches the metadata. The public value is "metadata:", @@ -57,17 +84,21 @@ __curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor) { WT_CURSOR_METADATA *mdc; WT_DECL_RET; - char *value; + char *value, *stripped; mdc = (WT_CURSOR_METADATA *)cursor; /* The metadata search interface allocates a new string in value. */ WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value)); - /* - * Copy the value to the underlying btree cursor's tmp item which will - * be freed when the cursor is closed. 
- */ + if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { + ret = __wt_schema_create_strip( + session, value, NULL, &stripped); + __wt_free(session, value); + WT_RET(ret); + value = stripped; + } + ret = __wt_buf_setstr(session, &cursor->value, value); __wt_free(session, value); WT_RET(ret); @@ -141,7 +172,7 @@ __curmetadata_next(WT_CURSOR *cursor) WT_ERR(__curmetadata_metadata_search(session, cursor)); else { WT_ERR(file_cursor->next(mdc->file_cursor)); - WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); + WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } err: if (ret != 0) { @@ -174,9 +205,9 @@ __curmetadata_prev(WT_CURSOR *cursor) } ret = file_cursor->prev(file_cursor); - if (ret == 0) { - WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); - } else if (ret == WT_NOTFOUND) + if (ret == 0) + WT_ERR(__curmetadata_setkv(mdc, file_cursor)); + else if (ret == WT_NOTFOUND) WT_ERR(__curmetadata_metadata_search(session, cursor)); err: if (ret != 0) { @@ -234,7 +265,7 @@ __curmetadata_search(WT_CURSOR *cursor) WT_ERR(__curmetadata_metadata_search(session, cursor)); else { WT_ERR(file_cursor->search(file_cursor)); - WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); + WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } err: if (ret != 0) { @@ -268,7 +299,7 @@ __curmetadata_search_near(WT_CURSOR *cursor, int *exact) *exact = 1; } else { WT_ERR(file_cursor->search_near(file_cursor, exact)); - WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor); + WT_ERR(__curmetadata_setkv(mdc, file_cursor)); } err: if (ret != 0) { @@ -438,6 +469,10 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); + /* If we are only returning create config, strip internal metadata. */ + if (WT_STREQ(uri, "metadata:create")) + F_SET(mdc, WT_MDC_CREATEONLY); + /* * Metadata cursors default to readonly; if not set to not-readonly, * they are permanently readonly and cannot be reconfigured. 
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 1bf62fc7130..513da401ae6 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -1475,7 +1475,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) * to make sure there is free space in the cache. */ txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); txn_busy = txn_state->id != WT_TXN_NONE || session->nhazard > 0 || (txn_state->snap_min != WT_TXN_NONE && @@ -1512,6 +1512,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) q_found = 0; switch (ret = __evict_page(session, 0)) { case 0: + cache->app_evicts++; if (--count == 0) return (0); @@ -1550,6 +1551,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) WT_RET( __wt_cond_wait(session, cache->evict_waiter_cond, 100000)); + cache->app_waits++; /* Check if things have changed so that we are busy. */ if (!busy && txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 8680a644421..1e5faf45de2 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -59,6 +59,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing) conn = S2C(session); + /* Checkpoints should never do eviction. 
*/ + WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session)); + page = ref->page; forced_eviction = page->read_gen == WT_READGEN_OLDEST; inmem_split = 0; diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 4809d257e7e..f13504d66ca 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -195,6 +195,11 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; +#ifdef HAVE_DIAGNOSTIC + /* Check that transaction time moves forward. */ + uint64_t last_oldest_id; +#endif + /* Dirty bytes added to the cache. */ size_t bytes_dirty; diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index cb7e66d2bbd..ed93f82538c 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -65,6 +65,9 @@ struct __wt_cache { uint64_t pages_dirty; uint64_t bytes_read; /* Bytes read into memory */ + uint64_t app_evicts; /* Pages evicted by user threads */ + uint64_t app_waits; /* User threads waited for cache */ + uint64_t evict_max_page_size; /* Largest page seen at eviction */ /* @@ -105,12 +108,15 @@ struct __wt_cache { /* * Cache pool information. 
*/ - uint64_t cp_saved_read; /* Read count from last pass */ - uint64_t cp_current_read; /* Read count from current pass */ - uint32_t cp_skip_count; /* Post change stabilization */ + uint64_t cp_pass_pressure; /* Calculated pressure from this pass */ uint64_t cp_reserved; /* Base size for this cache */ WT_SESSION_IMPL *cp_session; /* May be used for cache management */ + uint32_t cp_skip_count; /* Post change stabilization */ wt_thread_t cp_tid; /* Thread ID for cache pool manager */ + /* State seen at the last pass of the shared cache manager */ + uint64_t cp_saved_app_evicts; /* User eviction count at last review */ + uint64_t cp_saved_app_waits; /* User wait count at last review */ + uint64_t cp_saved_read; /* Read count at last review */ /* * Flags. diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index d6a2bb0b17a..cd55aadfc07 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -266,7 +266,7 @@ struct __wt_connection_impl { WT_TXN_GLOBAL txn_global; /* Global transaction state */ - WT_SPINLOCK hot_backup_lock; /* Hot backup serialization */ + WT_RWLOCK *hot_backup_lock; /* Hot backup serialization */ int hot_backup; WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 9b61318aacc..36f36f2c46c 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -291,8 +291,9 @@ struct __wt_cursor_metadata { WT_CURSOR *file_cursor; /* Queries of regular metadata */ -#define WT_MDC_POSITIONED 0x01 +#define WT_MDC_CREATEONLY 0x01 #define WT_MDC_ONMETADATA 0x02 +#define WT_MDC_POSITIONED 0x04 uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index e17b309cf5d..87099ac839f 
100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -361,7 +361,6 @@ extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); @@ -533,6 +532,7 @@ extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf); @@ -575,7 +575,6 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); -extern int 
__wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, char **value_ret); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); diff --git a/src/third_party/wiredtiger/src/include/intpack.i b/src/third_party/wiredtiger/src/include/intpack.i index e1bcdb42ebd..d3fdfeaf1a6 100644 --- a/src/third_party/wiredtiger/src/include/intpack.i +++ b/src/third_party/wiredtiger/src/include/intpack.i @@ -300,7 +300,6 @@ __wt_vunpack_int(const uint8_t **pp, size_t maxlen, int64_t *xp) *xp = (int64_t)(GET_BITS(*p++, 5, 0) << 8); *xp |= *p++; *xp += NEG_2BYTE_MIN; - p += 2; break; case NEG_1BYTE_MARKER: case NEG_1BYTE_MARKER | 0x10: diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 3de72b8b9a6..051f9fb262e 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -12,7 +12,7 @@ /* Logging subsystem declarations. */ #define WT_LOG_ALIGN 128 -#define WT_LOG_SLOT_BUF_INIT_SIZE 64 * 1024 +#define WT_LOG_SLOT_BUF_SIZE 256 * 1024 #define WT_INIT_LSN(l) do { \ (l)->file = 1; \ @@ -91,11 +91,10 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { WT_ITEM slot_buf; /* Buffer for grouped writes */ int32_t slot_churn; /* Active slots are scarce. 
*/ -#define WT_SLOT_BUF_GROW 0x01 /* Grow buffer on release */ -#define WT_SLOT_BUFFERED 0x02 /* Buffer writes */ -#define WT_SLOT_CLOSEFH 0x04 /* Close old fh on release */ -#define WT_SLOT_SYNC 0x08 /* Needs sync on release */ -#define WT_SLOT_SYNC_DIR 0x10 /* Directory sync on release */ +#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */ +#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */ +#define WT_SLOT_SYNC 0x04 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */ uint32_t flags; /* Flags */ } WT_LOGSLOT; @@ -117,6 +116,7 @@ typedef struct { */ uint32_t fileid; /* Current log file number */ uint32_t prep_fileid; /* Pre-allocated file number */ + uint32_t tmp_fileid; /* Temporary file number */ uint32_t prep_missed; /* Pre-allocated file misses */ WT_FH *log_fh; /* Logging file handle */ WT_FH *log_close_fh; /* Logging file handle to close */ @@ -157,10 +157,11 @@ typedef struct { * slot count of one. */ #define WT_SLOT_ACTIVE 1 -#define WT_SLOT_POOL 16 +#define WT_SLOT_POOL 128 uint32_t pool_index; /* Global pool index */ WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + uint32_t slot_buf_size; /* Buffer size for slots */ #define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i index b97b3a322ce..bf6b5abce67 100644 --- a/src/third_party/wiredtiger/src/include/packing.i +++ b/src/third_party/wiredtiger/src/include/packing.i @@ -181,6 +181,7 @@ next: if (pack->cur == pack->end) /* Integral types repeat <size> times. 
*/ if (pv->size == 0) goto next; + pv->havesize = 0; pack->repeats = pv->size - 1; pack->lastv = *pv; return (0); @@ -322,18 +323,19 @@ __pack_write( *pp += pv->size; break; case 's': + WT_SIZE_CHECK(pv->size, maxlen); + memcpy(*pp, pv->u.s, pv->size); + *pp += pv->size; + break; case 'S': - /* - * XXX if pv->havesize, only want to know if there is a - * '\0' in the first pv->size characters. - */ s = strlen(pv->u.s); - if ((pv->type == 's' || pv->havesize) && pv->size < s) { - s = pv->size; - pad = 0; - } else if (pv->havesize) - pad = pv->size - s; - else + if (pv->havesize) { + if (pv->size < s) { + s = pv->size; + pad = 0; + } else + pad = pv->size - s; + } else pad = 1; WT_SIZE_CHECK(s + pad, maxlen); if (s > 0) @@ -665,6 +667,7 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, if (fmt[0] != '\0' && fmt[1] == '\0') { pv.type = fmt[0]; + pv.size = 1; if ((ret = __unpack_read(session, &pv, &p, size)) == 0) WT_UNPACK_PUT(session, pv, ap); return (0); diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 728c8c9fe8e..f05d87c058b 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -203,7 +203,6 @@ struct __wt_connection_stats { WT_STATS dh_session_handles; WT_STATS dh_session_sweeps; WT_STATS file_open; - WT_STATS log_buffer_grow; WT_STATS log_buffer_size; WT_STATS log_bytes_payload; WT_STATS log_bytes_written; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 85c11e19685..7a67f713244 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -25,6 +25,9 @@ #define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) +#define WT_SESSION_IS_CHECKPOINT(s) \ + ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) + struct __wt_named_snapshot { const char *name; @@ -64,7 +67,7 @@ struct __wt_txn_global { */ volatile 
uint32_t checkpoint_id; /* Checkpoint's session ID */ volatile uint64_t checkpoint_gen; - volatile uint64_t checkpoint_snap_min; + volatile uint64_t checkpoint_pinned; /* Named snapshot state. */ WT_RWLOCK *nsnap_rwlock; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 1e3afbd4df3..a9b54d26e47 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -105,19 +105,20 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; - uint64_t checkpoint_snap_min, oldest_id; - uint32_t checkpoint_id; + uint64_t checkpoint_gen, checkpoint_pinned, oldest_id; txn_global = &S2C(session)->txn_global; btree = S2BT_SAFE(session); /* * Take a local copy of these IDs in case they are updated while we are - * checking visibility. + * checking visibility. Only the generation needs to be carefully + * ordered: if a checkpoint is starting and the generation is bumped, + * we take the minimum of the other two IDs, which is what we want. */ - checkpoint_id = txn_global->checkpoint_id; - checkpoint_snap_min = txn_global->checkpoint_snap_min; oldest_id = txn_global->oldest_id; + WT_ORDERED_READ(checkpoint_gen, txn_global->checkpoint_gen); + checkpoint_pinned = txn_global->checkpoint_pinned; /* * Checkpoint transactions often fall behind ordinary application @@ -129,17 +130,13 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * checkpoint, or this handle is up to date with the active checkpoint * then it's safe to ignore the checkpoint ID in the visibility check. */ - if (checkpoint_snap_min != WT_TXN_NONE && - checkpoint_id != session->id && (btree == NULL || - btree->checkpoint_gen != txn_global->checkpoint_gen) && - WT_TXNID_LT(checkpoint_snap_min, oldest_id)) - /* - * Use the checkpoint ID for the visibility check if it is the - * oldest ID in the system. 
- */ - oldest_id = checkpoint_snap_min; + if (checkpoint_pinned == WT_TXN_NONE || + WT_TXNID_LT(oldest_id, checkpoint_pinned) || + WT_SESSION_IS_CHECKPOINT(session) || + (btree != NULL && btree->checkpoint_gen == checkpoint_gen)) + return (oldest_id); - return (oldest_id); + return (checkpoint_pinned); } /* @@ -355,7 +352,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) if (!F_ISSET(txn, WT_TXN_HAS_ID)) { conn = S2C(session); txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn_state->id == WT_TXN_NONE); @@ -447,7 +444,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) txn = &session->txn; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); /* * If there is no transaction running (so we don't have an ID), and no diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 77cccfcf9d3..096fea3eeb3 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1721,8 +1721,9 @@ struct __wt_connection { * handles open before the file manager will look for handles to close., * an integer greater than or equal to 0; default \c 250.} * @config{ close_idle_time, amount of time in - * seconds a file handle needs to be idle before attempting to close - * it., an integer between 1 and 100000; default \c 30.} + * seconds a file handle needs to be idle before attempting to close it. 
+ * A setting of 0 means that idle handles are not closed., an integer + * between 0 and 100000; default \c 30.} * @config{ close_scan_interval, interval in * seconds at which to check for files that are inactive and close * them., an integer between 1 and 100000; default \c 10.} @@ -2152,11 +2153,12 @@ struct __wt_connection { * before the file manager will look for handles to close., an integer greater * than or equal to 0; default \c 250.} * @config{ close_idle_time, amount of time in seconds a - * file handle needs to be idle before attempting to close it., an integer - * between 1 and 100000; default \c 30.} - * @config{ close_scan_interval, interval in seconds at - * which to check for files that are inactive and close them., an integer - * between 1 and 100000; default \c 10.} + * file handle needs to be idle before attempting to close it. A setting of 0 + * means that idle handles are not closed., an integer between 0 and 100000; + * default \c 30.} + * @config{ close_scan_interval, interval + * in seconds at which to check for files that are inactive and close them., an + * integer between 1 and 100000; default \c 10.} * @config{ ),,} * @config{hazard_max, maximum number of simultaneous hazard pointers per * session handle., an integer greater than or equal to 15; default \c 1000.} @@ -3682,150 +3684,148 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_DH_SESSION_SWEEPS 1069 /*! connection: files currently open */ #define WT_STAT_CONN_FILE_OPEN 1070 -/*! log: log buffer size increases */ -#define WT_STAT_CONN_LOG_BUFFER_GROW 1071 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1072 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1073 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1074 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073 /*! 
log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1075 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1076 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1077 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1078 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1079 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1080 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1081 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1082 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1083 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1084 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1083 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1085 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1084 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1086 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1085 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1087 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1086 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1088 +#define WT_STAT_CONN_LOG_SCANS 1087 /*! 
log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1089 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1088 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1089 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1091 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1090 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1092 +#define WT_STAT_CONN_LOG_SLOT_RACES 1091 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1093 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1092 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1094 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1095 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1094 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1096 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1095 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1097 +#define WT_STAT_CONN_LOG_SYNC 1096 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1098 +#define WT_STAT_CONN_LOG_SYNC_DIR 1097 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1099 +#define WT_STAT_CONN_LOG_WRITE_LSN 1098 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1100 +#define WT_STAT_CONN_LOG_WRITES 1099 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1101 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1100 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1102 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1101 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1103 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1102 /*! 
LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1104 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1103 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1105 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1104 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1105 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1107 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1106 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1108 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1107 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1109 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1108 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1110 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1109 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1111 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1110 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1112 +#define WT_STAT_CONN_MEMORY_FREE 1111 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1113 +#define WT_STAT_CONN_MEMORY_GROW 1112 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1114 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1113 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1115 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1114 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1115 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1117 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1116 /*! 
thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1118 +#define WT_STAT_CONN_PAGE_SLEEP 1117 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1119 +#define WT_STAT_CONN_READ_IO 1118 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1120 +#define WT_STAT_CONN_REC_PAGES 1119 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1121 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1120 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1122 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1121 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1123 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1122 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1124 +#define WT_STAT_CONN_RWLOCK_READ 1123 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1125 +#define WT_STAT_CONN_RWLOCK_WRITE 1124 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1126 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1125 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1127 +#define WT_STAT_CONN_SESSION_OPEN 1126 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1128 +#define WT_STAT_CONN_TXN_BEGIN 1127 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1129 +#define WT_STAT_CONN_TXN_CHECKPOINT 1128 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1130 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1129 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1130 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1131 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1132 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1133 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1135 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1134 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1136 +#define WT_STAT_CONN_TXN_COMMIT 1135 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1137 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1136 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1138 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1137 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1139 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1138 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1140 +#define WT_STAT_CONN_TXN_ROLLBACK 1139 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1141 +#define WT_STAT_CONN_TXN_SYNC 1140 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1142 +#define WT_STAT_CONN_WRITE_IO 1141 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 7776b68e3d2..77ae0383cbe 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -577,7 +577,6 @@ __log_file_header( tmp.slot_fh = fh; } else { WT_ASSERT(session, fh == NULL); - log->prep_missed++; WT_ERR(__log_acquire(session, logrec->len, &tmp)); } WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); @@ -777,25 +776,28 @@ __wt_log_allocfile( WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; + uint32_t tmp_id; conn = S2C(session); log = conn->log; log_fh = NULL; + /* * Preparing a log file entails creating a temporary file: * - Writing the header. * - Truncating to the offset of the first record. * - Pre-allocating the file if needed. - * - Renaming it to the pre-allocated file name. + * - Renaming it to the desired file name. */ WT_RET(__wt_scr_alloc(session, 0, &from_path)); WT_ERR(__wt_scr_alloc(session, 0, &to_path)); - WT_ERR(__log_filename(session, lognum, WT_LOG_TMPNAME, from_path)); + tmp_id = WT_ATOMIC_ADD4(log->tmp_fileid, 1); + WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path)); WT_ERR(__log_filename(session, lognum, dest, to_path)); /* * Set up the temporary file. */ - WT_ERR(__log_openfile(session, 1, &log_fh, WT_LOG_TMPNAME, lognum)); + WT_ERR(__log_openfile(session, 1, &log_fh, WT_LOG_TMPNAME, tmp_id)); WT_ERR(__log_file_header(session, log_fh, NULL, 1)); WT_ERR(__wt_ftruncate(session, log_fh, WT_LOG_FIRST_RECORD)); if (prealloc) @@ -1245,9 +1247,12 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created) /* * If we need to create the log file, do so now. 
*/ - if (create_log && (ret = __wt_log_allocfile( - session, log->fileid, WT_LOG_FILENAME, 0)) != 0) - return (ret); + if (create_log) { + log->prep_missed++; + if ((ret = __wt_log_allocfile( + session, log->fileid, WT_LOG_FILENAME, 0)) != 0) + return (ret); + } WT_RET(__log_openfile(session, 0, &log->log_fh, WT_LOG_FILENAME, log->fileid)); /* @@ -1811,11 +1816,6 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, session, record, lsnp, flags)) == EAGAIN) ; WT_ERR(ret); - /* - * Increase the buffer size of any slots we can get access - * to, so future consolidations are likely to succeed. - */ - WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); return (0); } WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 741a8caf108..8723d492e13 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -54,13 +54,18 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) * Allocate memory for buffers now that the arrays are setup. Split * this out to make error handling simpler. */ + /* + * Cap the slot buffer to the log file size. 
+ */ + log->slot_buf_size = (uint32_t)WT_MIN( + conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); + &log->slot_pool[i].slot_buf, (size_t)log->slot_buf_size)); F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * WT_SLOT_POOL); + log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); if (0) { err: while (--i >= 0) __wt_buf_free(session, &log->slot_pool[i].slot_buf); @@ -101,12 +106,16 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOG *log; WT_LOGSLOT *slot; int64_t new_state, old_state; - uint32_t allocated_slot, slot_grow_attempts; + uint32_t allocated_slot, slot_attempts; conn = S2C(session); log = conn->log; - slot_grow_attempts = 0; + slot_attempts = 0; + if (mysize >= (uint64_t)log->slot_buf_size) { + WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); + return (ENOMEM); + } find_slot: #if WT_SLOT_ACTIVE == 1 allocated_slot = 0; @@ -146,12 +155,11 @@ join_slot: goto find_slot; } /* - * If the slot buffer isn't big enough to hold this update, mark - * the slot for a buffer size increase and find another slot. + * If the slot buffer isn't big enough to hold this update, try + * to find another slot. */ if (new_state > (int64_t)slot->slot_buf.memsize) { - F_SET(slot, WT_SLOT_BUF_GROW); - if (++slot_grow_attempts > 5) { + if (++slot_attempts > 5) { WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); return (ENOMEM); } @@ -310,24 +318,8 @@ __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - WT_DECL_RET; - ret = 0; - /* - * Grow the buffer if needed before returning it to the pool. 
- */ - if (F_ISSET(slot, WT_SLOT_BUF_GROW)) { - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, slot->slot_buf.memsize); - WT_ERR(__wt_buf_grow(session, - &slot->slot_buf, slot->slot_buf.memsize * 2)); - } -err: - /* - * No matter if there is an error, we always want to free - * the slot back to the pool. - */ + WT_UNUSED(session); /* * Make sure flags don't get retained between uses. * We have to reset them them here because multiple threads may @@ -335,62 +327,5 @@ err: */ slot->flags = WT_SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; - return (ret); -} - -/* - * __wt_log_slot_grow_buffers -- - * Increase the buffer size of all available slots in the buffer pool. - * Go to some lengths to include active (but unused) slots to handle - * the case where all log write record sizes exceed the size of the - * active buffer. - */ -int -__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LOGSLOT *slot; - int64_t orig_state; - uint64_t old_size, total_growth; - int i; - - conn = S2C(session); - log = conn->log; - total_growth = 0; - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - /* - * Take the log slot lock to prevent other threads growing buffers - * at the same time. Could tighten the scope of this lock, or have - * a separate lock if there is contention. - */ - __wt_spin_lock(session, &log->log_slot_lock); - for (i = 0; i < WT_SLOT_POOL; i++) { - slot = &log->slot_pool[i]; - - /* Don't keep growing unrelated buffers. */ - if (slot->slot_buf.memsize > (10 * newsize) && - !F_ISSET(slot, WT_SLOT_BUF_GROW)) - continue; - - /* Avoid atomic operations if they won't succeed. 
*/ - orig_state = slot->slot_state; - if ((orig_state != WT_LOG_SLOT_FREE && - orig_state != WT_LOG_SLOT_READY) || - !WT_ATOMIC_CAS8( - slot->slot_state, orig_state, WT_LOG_SLOT_PENDING)) - continue; - - /* We have a slot - now go ahead and grow the buffer. */ - old_size = slot->slot_buf.memsize; - F_CLR(slot, WT_SLOT_BUF_GROW); - WT_ERR(__wt_buf_grow(session, &slot->slot_buf, - WT_MAX(slot->slot_buf.memsize * 2, newsize))); - slot->slot_state = orig_state; - total_growth += slot->slot_buf.memsize - old_size; - } -err: __wt_spin_unlock(session, &log->log_slot_lock); - WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth); - return (ret); + return (0); } diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 5946f81290d..c887af58540 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -15,7 +15,7 @@ */ typedef struct __wt_meta_track { enum { - WT_ST_EMPTY, /* Unused slot */ + WT_ST_EMPTY = 0, /* Unused slot */ WT_ST_CHECKPOINT, /* Complete a checkpoint */ WT_ST_DROP_COMMIT, /* Drop post commit */ WT_ST_FILEOP, /* File operation */ @@ -67,6 +67,35 @@ __meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp) } /* + * __meta_track_clear -- + * Clear the structure. + */ +static void +__meta_track_clear(WT_SESSION_IMPL *session, WT_META_TRACK *trk) +{ + __wt_free(session, trk->a); + __wt_free(session, trk->b); + memset(trk, 0, sizeof(WT_META_TRACK)); +} + +/* + * __meta_track_err -- + * Drop the last operation off the end of the list, something went wrong + * during initialization. + */ +static void +__meta_track_err(WT_SESSION_IMPL *session) +{ + WT_META_TRACK *trk; + + trk = session->meta_track_next; + --trk; + __meta_track_clear(session, trk); + + session->meta_track_next = trk; +} + +/* * __wt_meta_track_discard -- * Cleanup metadata tracking when closing a session. 
*/ @@ -185,10 +214,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_ILLEGAL_VALUE(session); } -free: trk->op = WT_ST_EMPTY; - __wt_free(session, trk->a); - __wt_free(session, trk->b); - trk->dhandle = NULL; +free: __meta_track_clear(session, trk); return (ret); } @@ -346,14 +372,17 @@ __wt_meta_track_checkpoint(WT_SESSION_IMPL *session) int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key) { + WT_DECL_RET; WT_META_TRACK *trk; WT_RET(__meta_track_next(session, &trk)); trk->op = WT_ST_REMOVE; - WT_RET(__wt_strdup(session, key, &trk->a)); - + WT_ERR(__wt_strdup(session, key, &trk->a)); return (0); + +err: __meta_track_err(session); + return (ret); } /* @@ -369,7 +398,7 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key) WT_RET(__meta_track_next(session, &trk)); trk->op = WT_ST_SET; - WT_RET(__wt_strdup(session, key, &trk->a)); + WT_ERR(__wt_strdup(session, key, &trk->a)); /* * If there was a previous value, keep it around -- if not, then this @@ -380,6 +409,10 @@ __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key) trk->op = WT_ST_REMOVE; ret = 0; } + WT_ERR(ret); + return (0); + +err: __meta_track_err(session); return (ret); } @@ -391,14 +424,18 @@ int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri) { + WT_DECL_RET; WT_META_TRACK *trk; WT_RET(__meta_track_next(session, &trk)); trk->op = WT_ST_FILEOP; - WT_RET(__wt_strdup(session, olduri, &trk->a)); - WT_RET(__wt_strdup(session, newuri, &trk->b)); + WT_ERR(__wt_strdup(session, olduri, &trk->a)); + WT_ERR(__wt_strdup(session, newuri, &trk->b)); return (0); + +err: __meta_track_err(session); + return (ret); } /* @@ -409,13 +446,17 @@ int __wt_meta_track_drop( WT_SESSION_IMPL *session, const char *filename) { + WT_DECL_RET; WT_META_TRACK *trk; WT_RET(__meta_track_next(session, &trk)); trk->op = WT_ST_DROP_COMMIT; - WT_RET(__wt_strdup(session, filename, &trk->a)); + WT_ERR(__wt_strdup(session, 
filename, &trk->a)); return (0); + +err: __meta_track_err(session); + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index c69344cb6b6..703bebb1597 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -29,7 +29,7 @@ typedef struct { /* Track whether all changes to the page are written. */ uint64_t max_txn; - uint64_t skipped_txn; + uint64_t first_dirty_txn; uint32_t orig_write_gen; /* @@ -162,7 +162,7 @@ typedef struct { * be evicted as new, in-memory pages, restoring the updates on * those pages. */ - WT_UPD_SKIPPED *skip; /* Skipped updates */ + WT_UPD_SKIPPED *skip; /* Skipped updates */ uint32_t skip_next; size_t skip_allocated; @@ -363,6 +363,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); } +#ifdef HAVE_DIAGNOSTIC + { + /* + * Check that transaction time always moves forward for a given page. + * If this check fails, reconciliation can free something that a future + * reconciliation will need. + */ + uint64_t oldest_id = __wt_txn_oldest_id(session); + WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); + mod->last_oldest_id = oldest_id; + } +#endif + /* Record the most recent transaction ID we will *not* write. */ mod->disk_snap_min = session->txn.snap_min; @@ -689,7 +702,7 @@ __rec_write_init(WT_SESSION_IMPL *session, * Running transactions may update the page after we write it, so * this is the highest ID we can be confident we will see. 
*/ - r->skipped_txn = S2C(session)->txn_global.last_running; + r->first_dirty_txn = S2C(session)->txn_global.last_running; return (0); } @@ -838,6 +851,7 @@ static inline int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_DECL_RET; WT_ITEM ovfl; WT_PAGE *page; WT_UPDATE *upd, *upd_list, *upd_ovfl; @@ -850,12 +864,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, page = r->page; /* - * If we're called with an WT_INSERT reference, use its WT_UPDATE - * list, else is an on-page row-store WT_UPDATE list. + * If called with a WT_INSERT item, use its WT_UPDATE list (which must + * exist), otherwise check for an on-page row-store WT_UPDATE list + * (which may not exist). Return immediately if the item has no updates. */ - upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd; - skipped = 0; + if (ins == NULL) { + if ((upd_list = WT_ROW_UPDATE(page, rip)) == NULL) + return (0); + } else + upd_list = ins->upd; + skipped = 0; for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; upd != NULL; upd = upd->next) { if ((txnid = upd->txnid) == WT_TXN_ABORTED) @@ -866,9 +885,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, max_txn = txnid; if (WT_TXNID_LT(txnid, min_txn)) min_txn = txnid; - if (WT_TXNID_LT(txnid, r->skipped_txn) && + if (WT_TXNID_LT(txnid, r->first_dirty_txn) && !__wt_txn_visible_all(session, txnid)) - r->skipped_txn = txnid; + r->first_dirty_txn = txnid; /* * Record whether any updates were skipped on the way to finding @@ -898,15 +917,15 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->max_txn = max_txn; /* - * If all updates are globally visible and no updates were skipped, the + * If no updates were skipped and all updates are globally visible, the * page can be marked clean and we're done, regardless of whether we're * evicting or checkpointing. 
* - * The oldest transaction ID may have moved while we were scanning the - * page, so it is possible to skip an update but then find that by the - * end of the scan, all updates are stable. + * We have to check both: the oldest transaction ID may have moved while + * we were scanning the update list, so it is possible to skip an update + * but then find that by the end of the scan, all updates are stable. */ - if (__wt_txn_visible_all(session, max_txn) && !skipped) + if (!skipped && __wt_txn_visible_all(session, max_txn)) return (0); /* @@ -976,8 +995,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && !__wt_txn_visible_all(session, min_txn)) { - WT_RET(__wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)); + if ((ret = __wt_ovfl_txnc_search( + page, vpack->data, vpack->size, &ovfl)) != 0) + WT_PANIC_RET(session, ret, + "cached overflow item discarded early"); + /* * Create an update structure with an impossibly low transaction * ID and append it to the update list we're about to save. @@ -5064,23 +5086,37 @@ err: __wt_scr_free(session, &tkey); * be set before a subsequent checkpoint reads it, and because the * current checkpoint is waiting on this reconciliation to complete, * there's no risk of that happening). - * - * Otherwise, if no updates were skipped, we have a new maximum - * transaction written for the page (used to decide if a clean page can - * be evicted). The page only might be clean; if the write generation - * is unchanged since reconciliation started, clear it and update cache - * dirty statistics, if the write generation changed, then the page has - * been written since we started reconciliation, it cannot be - * discarded. 
*/ if (r->leave_dirty) { - mod->first_dirty_txn = r->skipped_txn; + mod->first_dirty_txn = r->first_dirty_txn; btree->modified = 1; WT_FULL_BARRIER(); } else { + /* + * If no updates were skipped, we have a new maximum transaction + * written for the page (used to decide if a clean page can be + * evicted). Set the highest transaction ID for the page. + * + * Track the highest transaction ID for the tree (used to decide + * if it's safe to discard all of the pages in the tree without + * further checking). Reconciliation in the service of eviction + * is multi-threaded, only update the tree's maximum transaction + * ID when doing a checkpoint. That's sufficient, we only care + * about the highest transaction ID of any update currently in + * the tree, and checkpoint visits every dirty page in the tree. + */ mod->rec_max_txn = r->max_txn; + if (!F_ISSET(r, WT_EVICTING) && + !WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) + btree->rec_max_txn = r->max_txn; + /* + * The page only might be clean; if the write generation is + * unchanged since reconciliation started, it's clean. If the + * write generation changed, the page has been written since + * we started reconciliation and remains dirty. + */ if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0)) __wt_cache_dirty_decr(session, page); } diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c index c7c47a88f3c..4041a1d7b9f 100644 --- a/src/third_party/wiredtiger/src/schema/schema_create.c +++ b/src/third_party/wiredtiger/src/schema/schema_create.c @@ -9,6 +9,22 @@ #include "wt_internal.h" /* + * __wt_schema_create_strip -- + * Discard any configuration information from a schema entry that is not + * applicable to an session.create call, here for the wt dump command utility, + * which only wants to dump the schema information needed for load. 
+ */ +int +__wt_schema_create_strip(WT_SESSION_IMPL *session, + const char *v1, const char *v2, char **value_ret) +{ + const char *cfg[] = + { WT_CONFIG_BASE(session, WT_SESSION_create), v1, v2, NULL }; + + return (__wt_config_collapse(session, cfg, value_ret)); +} + +/* * __wt_direct_io_size_check -- * Return a size from the configuration, complaining if it's insufficient * for direct I/O. diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 4f698806511..ef9735a8b98 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -314,8 +314,10 @@ __wt_open_cursor(WT_SESSION_IMPL *session, * copied. */ if ((*cursorp)->uri == NULL && - (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0) + (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0) { WT_TRET((*cursorp)->close(*cursorp)); + *cursorp = NULL; + } return (ret); } @@ -381,23 +383,6 @@ err: if (cursor != NULL) } /* - * __wt_session_create_strip -- - * Discard any configuration information from a schema entry that is not - * applicable to an session.create call, here for the wt dump command utility, - * which only wants to dump the schema information needed for load. - */ -int -__wt_session_create_strip(WT_SESSION *wt_session, - const char *v1, const char *v2, char **value_ret) -{ - WT_SESSION_IMPL *session = (WT_SESSION_IMPL *)wt_session; - const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_create), v1, v2, NULL }; - - return (__wt_config_collapse(session, cfg, value_ret)); -} - -/* * __session_create -- * WT_SESSION->create method. 
*/ diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 44c2daa3802..0310fdc207c 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -458,7 +458,6 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->log_slot_joins.desc = "log: consolidated slot joins"; stats->log_slot_toosmall.desc = "log: failed to find a slot large enough for record"; - stats->log_buffer_grow.desc = "log: log buffer size increases"; stats->log_bytes_payload.desc = "log: log bytes of payload data"; stats->log_bytes_written.desc = "log: log bytes written"; stats->log_compress_writes.desc = "log: log records compressed"; @@ -631,7 +630,6 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_slot_transitions.v = 0; stats->log_slot_joins.v = 0; stats->log_slot_toosmall.v = 0; - stats->log_buffer_grow.v = 0; stats->log_bytes_payload.v = 0; stats->log_bytes_written.v = 0; stats->log_compress_writes.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 432746186fc..c9924056e91 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -98,7 +98,6 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE || session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - session->id == S2C(session)->txn_global.checkpoint_id || !__wt_txn_visible_all(session, txn_state->snap_min)); txn_state->snap_min = WT_TXN_NONE; @@ -118,13 +117,13 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; - uint32_t ckpt_id, i, n, session_cnt; + uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); 
current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -157,12 +156,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip the checkpoint transaction; it is never read from. */ - if (i == ckpt_id) - continue; - /* * Build our snapshot of any concurrent transaction IDs. * @@ -221,7 +215,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min; - uint32_t ckpt_id, i, session_cnt; + uint32_t i, session_cnt; int32_t count; int last_running_moved; @@ -257,12 +251,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip the checkpoint transaction; it is never read from. */ - if (i == ckpt_id) - continue; - /* * Update the oldest ID. * @@ -310,15 +299,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) if (WT_TXNID_LT(prev_oldest_id, oldest_id) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* - * Skip the checkpoint transaction; it is never read - * from. 
- */ - if (i == ckpt_id) - continue; - if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; @@ -408,19 +389,31 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; + int was_oldest; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); + was_oldest = 0; /* Clear the transaction's ID from the global table. */ - if (F_ISSET(txn, WT_TXN_HAS_ID)) { + if (WT_SESSION_IS_CHECKPOINT(session)) { + WT_ASSERT(session, txn_state->id == WT_TXN_NONE); + txn->id = WT_TXN_NONE; + + /* Clear the global checkpoint transaction IDs. */ + txn_global->checkpoint_id = 0; + txn_global->checkpoint_pinned = WT_TXN_NONE; + } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); + + /* Quick check for the oldest transaction. */ + was_oldest = (txn->id == txn_global->last_running); txn->id = WT_TXN_NONE; } @@ -439,6 +432,14 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; + + /* + * When the oldest transaction in the system completes, bump the oldest + * ID. This is racy and so not guaranteed, but in practice it keeps + * the oldest ID from falling too far behind. 
+ */ + if (was_oldest) + __wt_txn_update_oldest(session, 1); } /* @@ -518,6 +519,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); + WT_ASSERT(session, ret == 0); } /* @@ -648,19 +650,19 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS *stats; - uint64_t checkpoint_snap_min; + uint64_t checkpoint_pinned; conn = S2C(session); txn_global = &conn->txn_global; stats = &conn->stats; - checkpoint_snap_min = txn_global->checkpoint_snap_min; + checkpoint_pinned = txn_global->checkpoint_pinned; WT_STAT_SET(stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); WT_STAT_SET(stats, txn_pinned_checkpoint_range, - checkpoint_snap_min == WT_TXN_NONE ? - 0 : txn_global->current - checkpoint_snap_min); + checkpoint_pinned == WT_TXN_NONE ? + 0 : txn_global->current - checkpoint_pinned); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index cfc993418c5..f317a3dc697 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -349,6 +349,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; + WT_TXN_STATE *txn_state; void *saved_meta_next; u_int i; int full, fullckpt_logging, idle, tracking; @@ -358,6 +359,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; full = fullckpt_logging = idle = tracking = 0; @@ -429,6 +431,22 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_verbose_track(session, "starting transaction", &verb_timer)); + if (full) + WT_ERR(__wt_epoch(session, &start)); + + /* + 
* Bump the global checkpoint generation, used to figure out whether + * checkpoint has visited a tree. There is no need for this to be + * atomic: it is only written while holding the checkpoint lock. + * + * We do need to update it before clearing the checkpoint's entry out + * of the transaction table, or a thread evicting in a tree could + * ignore the checkpoint's transaction. + */ + ++txn_global->checkpoint_gen; + WT_STAT_FAST_CONN_SET(session, + txn_checkpoint_generation, txn_global->checkpoint_gen); + /* * Start a snapshot transaction for the checkpoint. * @@ -436,30 +454,44 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * side effects on cursors, which applications can hold open across * calls to checkpoint. */ - if (full) - WT_ERR(__wt_epoch(session, &start)); WT_ERR(__wt_txn_begin(session, txn_cfg)); /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_ERR(__wt_txn_id_check(session)); /* - * Save a copy of the checkpoint session ID so that refresh can skip - * the checkpoint transactions. We never do checkpoints in the default - * session with id zero. Save a copy of the snap min so that visibility - * checks for the checkpoint use the right ID. + * Save the checkpoint session ID. We never do checkpoints in the + * default session (with id zero). */ - WT_ASSERT(session, session->id != 0); + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); txn_global->checkpoint_id = session->id; - txn_global->checkpoint_snap_min = session->txn.snap_min; + + txn_global->checkpoint_pinned = + WT_MIN(txn_state->id, txn_state->snap_min); /* - * No need for this to be atomic it is only written while holding the - * checkpoint lock. + * We're about to clear the checkpoint transaction from the global + * state table so the oldest ID can move forward. Make sure everything + * we've done above is scheduled. 
*/ - txn_global->checkpoint_gen += 1; - WT_STAT_FAST_CONN_SET(session, - txn_checkpoint_generation, txn_global->checkpoint_gen); + WT_FULL_BARRIER(); + + /* + * Sanity check that the oldest ID hasn't moved on before we have + * cleared our entry. + */ + WT_ASSERT(session, + WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && + WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + + /* + * Clear our entry from the global transaction session table. Any + * operation that needs to know about the ID for this checkpoint will + * consider the checkpoint ID in the global structure. Most operations + * can safely ignore the checkpoint ID (see the visible all check for + * details). + */ + txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. */ if (fullckpt_logging) @@ -478,10 +510,6 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Release the snapshot so we aren't pinning pages in cache. */ __wt_txn_release_snapshot(session); - /* Clear the global checkpoint transaction IDs */ - txn_global->checkpoint_id = 0; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - WT_ERR(__checkpoint_verbose_track(session, "committing transaction", &verb_timer)); @@ -558,10 +586,6 @@ err: /* WT_TRET(__wt_txn_rollback(session, NULL)); } - /* Ensure the checkpoint IDs are cleared on the error path. */ - txn_global->checkpoint_id = 0; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. 
@@ -813,10 +837,8 @@ __checkpoint_worker( force = 1; } if (!btree->modified && !force) { - if (!is_checkpoint) { - F_SET(btree, WT_BTREE_SKIP_CKPT); - goto done; - } + if (!is_checkpoint) + goto nockpt; deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -835,7 +857,12 @@ __checkpoint_worker( (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && deleted < 2) { - F_SET(btree, WT_BTREE_SKIP_CKPT); +nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, + btree->checkpoint_gen); goto done; } } @@ -853,7 +880,7 @@ __checkpoint_worker( * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. */ - __wt_spin_lock(session, &conn->hot_backup_lock); + WT_ERR(__wt_readlock(session, conn->hot_backup_lock)); hot_backup_locked = 1; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { @@ -1063,16 +1090,8 @@ fake: /* WT_ERR(__wt_txn_checkpoint_log( session, 0, WT_TXN_LOG_CKPT_STOP, NULL)); - /* - * Update the checkpoint generation for this handle so visible - * updates newer than the checkpoint can be evicted. - */ -done: btree->checkpoint_gen = conn->txn_global.checkpoint_gen; - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); - -err: - /* +done: +err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. 
*/ @@ -1080,7 +1099,7 @@ err: btree->modified = 1; if (hot_backup_locked) - __wt_spin_unlock(session, &conn->hot_backup_lock); + WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); __wt_meta_ckptlist_free(session, ckptbase); __wt_free(session, name_alloc); diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c index 0ae201ea21e..28d0309242d 100644 --- a/src/third_party/wiredtiger/src/utilities/util_dump.c +++ b/src/third_party/wiredtiger/src/utilities/util_dump.c @@ -149,9 +149,9 @@ dump_config(WT_SESSION *session, const char *uri, int hex) /* Open a metadata cursor. */ if ((ret = session->open_cursor( - session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) { - fprintf(stderr, "%s: %s: session.open_cursor: %s\n", - progname, WT_METADATA_URI, session->strerror(session, ret)); + session, "metadata:create", NULL, NULL, &cursor)) != 0) { + fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, + "metadata:create", session->strerror(session, ret)); return (1); } /* @@ -225,7 +225,7 @@ dump_json_table_begin( { WT_DECL_RET; const char *name; - char *jsonconfig, *stripped; + char *jsonconfig; jsonconfig = NULL; @@ -236,12 +236,7 @@ dump_json_table_begin( } ++name; - if ((ret = - __wt_session_create_strip(session, config, NULL, &stripped)) != 0) - return (util_err(session, ret, NULL)); - ret = dup_json_string(stripped, &jsonconfig); - free(stripped); - if (ret != 0) + if ((ret = dup_json_string(config, &jsonconfig)) != 0) return (util_cerr(cursor, "config dup", ret)); if (printf(" \"%s\" : [\n {\n", uri) < 0) goto eio; @@ -278,7 +273,7 @@ dump_json_table_cg(WT_SESSION *session, WT_CURSOR *cursor, WT_DECL_RET; const char *key, *skip, *value; int exact, once; - char *jsonconfig, *stripped; + char *jsonconfig; static const char * const indent = " "; once = 0; @@ -326,12 +321,7 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) if ((ret = cursor->get_value(cursor, &value)) != 0) return 
(util_cerr(cursor, "get_value", ret)); - if ((ret = __wt_session_create_strip( - session, value, NULL, &stripped)) != 0) - return (util_err(session, ret, NULL)); - ret = dup_json_string(stripped, &jsonconfig); - free(stripped); - if (ret != 0) + if ((ret = dup_json_string(value, &jsonconfig)) != 0) return (util_cerr(cursor, "config dup", ret)); ret = printf("%s\n" "%s{\n" @@ -362,67 +352,42 @@ dump_json_table_config(WT_SESSION *session, const char *uri) { WT_CURSOR *cursor; WT_DECL_RET; - WT_EXTENSION_API *wtext; int tret; char *value; /* Dump the config. */ - if (WT_PREFIX_MATCH(uri, "table:")) { - /* Open a metadata cursor. */ - if ((ret = session->open_cursor( - session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) { - fprintf(stderr, "%s: %s: session.open_cursor: %s\n", - progname, WT_METADATA_URI, - session->strerror(session, ret)); - return (1); - } + /* Open a metadata cursor. */ + if ((ret = session->open_cursor( + session, "metadata:create", NULL, NULL, &cursor)) != 0) { + fprintf(stderr, "%s: %s: session.open_cursor: %s\n", + progname, "metadata:create", + session->strerror(session, ret)); + return (1); + } - /* - * Search for the object itself, to make sure it - * exists, and get its config string. This where we - * find out a table object doesn't exist, use a simple - * error message. - */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) == 0) { - if ((ret = cursor->get_value(cursor, &value)) != 0) - ret = util_cerr(cursor, "get_value", ret); - else if (dump_json_table_begin( - session, cursor, uri, value) != 0) - ret = 1; - } else if (ret == WT_NOTFOUND) - ret = util_err( - session, 0, "%s: No such object exists", uri); - else - ret = util_err(session, ret, "%s", uri); + /* + * Search for the object itself, to make sure it + * exists, and get its config string. This where we + * find out a table object doesn't exist, use a simple + * error message. 
+ */ + cursor->set_key(cursor, uri); + if ((ret = cursor->search(cursor)) == 0) { + if ((ret = cursor->get_value(cursor, &value)) != 0) + ret = util_cerr(cursor, "get_value", ret); + else if (dump_json_table_begin( + session, cursor, uri, value) != 0) + ret = 1; + } else if (ret == WT_NOTFOUND) + ret = util_err( + session, 0, "%s: No such object exists", uri); + else + ret = util_err(session, ret, "%s", uri); - if ((tret = cursor->close(cursor)) != 0) { - tret = util_cerr(cursor, "close", tret); - if (ret == 0) - ret = tret; - } - } else { - /* - * We want to be able to dump the metadata file itself, but the - * configuration for that file lives in the turtle file. Reach - * down into the library and ask for the file's configuration, - * that will work in all cases. - * - * This where we find out a file object doesn't exist, use a - * simple error message. - */ - wtext = session-> - connection->get_extension_api(session->connection); - if ((ret = - wtext->metadata_search(wtext, session, uri, &value)) == 0) { - if (dump_json_table_begin( - session, NULL, uri, value) != 0) - ret = 1; - } else if (ret == WT_NOTFOUND) - ret = util_err( - session, 0, "%s: No such object exists", uri); - else - ret = util_err(session, ret, "%s", uri); + if ((tret = cursor->close(cursor)) != 0) { + tret = util_cerr(cursor, "close", tret); + if (ret == 0) + ret = tret; } return (ret); @@ -687,17 +652,19 @@ print_config(WT_SESSION *session, { WT_DECL_RET; char *value_ret; + const char *cfg[] = { v1, v2, NULL }; /* - * The underlying call will ignore v2 if v1 is NULL -- check here and - * swap in that case. + * The underlying call will stop if the first string is NULL -- check + * here and swap in that case. 
*/ - if (v1 == NULL) { - v1 = v2; - v2 = NULL; + if (cfg[0] == NULL) { + cfg[0] = cfg[1]; + cfg[1] = NULL; } - if ((ret = __wt_session_create_strip(session, v1, v2, &value_ret)) != 0) + if ((ret = __wt_config_collapse( + (WT_SESSION_IMPL *)session, cfg, &value_ret)) != 0) return (util_err(session, ret, NULL)); ret = printf("%s\n%s\n", key, value_ret); free((char *)value_ret); |