path: root/src/third_party/wiredtiger/src/lsm/lsm_tree.c
author      Luke Chen <luke.chen@mongodb.com>  2019-09-05 15:55:04 +1000
committer   Luke Chen <luke.chen@mongodb.com>  2019-09-05 15:55:04 +1000
commit      d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371 (patch)
tree        457f5fe506097b766e5e1695ba9d7d2662910416 /src/third_party/wiredtiger/src/lsm/lsm_tree.c
parent      41a74df493503fec4ce054cc380a0d7eb01d374c (diff)
download    mongo-d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371.tar.gz
Import wiredtiger: 543111d3d8737ada1b741b3a25a201feb2ed13a3 from branch mongodb-4.0
ref: 48bf8dae7c..543111d3d8
for: 4.0.13

WT-4502 Assertion checking hazard pointers on page discard is too strong
WT-4658 Apply Clang Format
WT-4792 Add stat to track pages queued for eviction after LRU sorting
WT-4840 WT_CURSOR.modify must require explicit, snapshot-isolation transaction
WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4882 Improve checkpoint performance when there are large metadata pages
WT-4892 Improve statistics about forced eviction
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4895 Fix debug eviction mode so it chooses skew more randomly
WT-4898 Don't allow the eviction server to reconcile if it's busy
WT-4920 Add statistics tracking when eviction server is waiting for page transitions
WT-4957 Revert part of a change about when pages are queued for urgent eviction
WT-5050 Assertion failure during urgent eviction of metadata page
Diffstat (limited to 'src/third_party/wiredtiger/src/lsm/lsm_tree.c')
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_tree.c | 2237
1 files changed, 1055 insertions, 1182 deletions
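
Note on reading the diff: nearly every function below leans on WiredTiger's error-handling macros (WT_DECL_RET, WT_RET, WT_ERR, WT_TRET). As a reading aid, here is a minimal, self-contained sketch of the idiom. The definitions are simplified stand-ins, not the real macros from src/include/error.h (which carry extra bookkeeping such as panic handling), and the "if (0) { err: ... }" shape mirrors the pattern used in __lsm_tree_open below.

#include <stdio.h>

/* Simplified stand-ins for WiredTiger's error-handling macros. */
#define WT_DECL_RET int ret = 0
#define WT_RET(a)                   /* on error: return immediately */ \
    do {                                                               \
        int __r = (a);                                                 \
        if (__r != 0)                                                  \
            return (__r);                                              \
    } while (0)
#define WT_ERR(a)                   /* on error: jump to cleanup */    \
    do {                                                               \
        if ((ret = (a)) != 0)                                          \
            goto err;                                                  \
    } while (0)
#define WT_TRET(a)                  /* track error, keep the first */  \
    do {                                                               \
        int __r = (a);                                                 \
        if (__r != 0 && ret == 0)                                      \
            ret = __r;                                                 \
    } while (0)

static int step(int rc) { return (rc); } /* stand-in for a __wt_ call */

static int
example(void)
{
    WT_DECL_RET;

    WT_RET(step(0)); /* a nonzero return here leaves with no cleanup */
    WT_ERR(step(0)); /* a nonzero return here jumps to the err label */

    if (0) {
err:
        ; /* error path lands here; success path skips the if (0) block */
    }
    WT_TRET(step(0)); /* cleanup runs on both paths; the first error wins */
    return (ret);
}

int
main(void)
{
    printf("example() returned %d\n", example());
    return (0);
}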
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 5b0639f6a96..9b6933a61e2 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -10,1443 +10,1316 @@
static int __lsm_tree_cleanup_old(WT_SESSION_IMPL *, const char *);
static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
-static int __lsm_tree_open(
- WT_SESSION_IMPL *, const char *, bool, WT_LSM_TREE **);
+static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, bool, WT_LSM_TREE **);
static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
/*
* __lsm_tree_discard_state --
- * Free the metadata configuration state-related LSM tree pointers.
+ * Free the metadata configuration state-related LSM tree pointers.
*/
static void
__lsm_tree_discard_state(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- WT_LSM_CHUNK *chunk;
- u_int i;
-
- __wt_free(session, lsm_tree->config);
- __wt_free(session, lsm_tree->key_format);
- __wt_free(session, lsm_tree->value_format);
- __wt_free(session, lsm_tree->collator_name);
- __wt_free(session, lsm_tree->custom_prefix);
- __wt_free(session, lsm_tree->custom_suffix);
- __wt_free(session, lsm_tree->bloom_config);
- __wt_free(session, lsm_tree->file_config);
-
- for (i = 0; i < lsm_tree->nchunks; i++) {
- if ((chunk = lsm_tree->chunk[i]) == NULL)
- continue;
-
- __wt_spin_destroy(session, &chunk->timestamp_spinlock);
- __wt_free(session, chunk->bloom_uri);
- __wt_free(session, chunk->uri);
- __wt_free(session, chunk);
- }
-
- for (i = 0; i < lsm_tree->nold_chunks; i++) {
- chunk = lsm_tree->old_chunks[i];
- WT_ASSERT(session, chunk != NULL);
-
- __wt_spin_destroy(session, &chunk->timestamp_spinlock);
- __wt_free(session, chunk->bloom_uri);
- __wt_free(session, chunk->uri);
- __wt_free(session, chunk);
- }
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+
+ __wt_free(session, lsm_tree->config);
+ __wt_free(session, lsm_tree->key_format);
+ __wt_free(session, lsm_tree->value_format);
+ __wt_free(session, lsm_tree->collator_name);
+ __wt_free(session, lsm_tree->custom_prefix);
+ __wt_free(session, lsm_tree->custom_suffix);
+ __wt_free(session, lsm_tree->bloom_config);
+ __wt_free(session, lsm_tree->file_config);
+
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ if ((chunk = lsm_tree->chunk[i]) == NULL)
+ continue;
+
+ __wt_spin_destroy(session, &chunk->timestamp_spinlock);
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+
+ __wt_spin_destroy(session, &chunk->timestamp_spinlock);
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
}
/*
* __lsm_tree_discard --
- * Free an LSM tree structure.
+ * Free an LSM tree structure.
*/
static int
__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
{
- WT_DECL_RET;
+ WT_DECL_RET;
- WT_UNUSED(final); /* Only used in diagnostic builds */
+ WT_UNUSED(final); /* Only used in diagnostic builds */
- WT_ASSERT(session, !lsm_tree->active);
- /*
- * The work unit queue should be empty, but it's worth checking
- * since work units use a different locking scheme to regular tree
- * operations.
- */
- WT_ASSERT(session, lsm_tree->queue_ref == 0);
+ WT_ASSERT(session, !lsm_tree->active);
+ /*
+ * The work unit queue should be empty, but it's worth checking since work units use a different
+ * locking scheme to regular tree operations.
+ */
+ WT_ASSERT(session, lsm_tree->queue_ref == 0);
- /* We may be destroying an lsm_tree before it was added. */
- if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
- WT_ASSERT(session, final ||
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
- TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
- }
+ /* We may be destroying an lsm_tree before it was added. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
+ WT_ASSERT(session, final || F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
+ TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+ }
- if (lsm_tree->collator_owned &&
- lsm_tree->collator->terminate != NULL)
- WT_TRET(lsm_tree->collator->terminate(
- lsm_tree->collator, &session->iface));
+ if (lsm_tree->collator_owned && lsm_tree->collator->terminate != NULL)
+ WT_TRET(lsm_tree->collator->terminate(lsm_tree->collator, &session->iface));
- __wt_free(session, lsm_tree->name);
- __lsm_tree_discard_state(session, lsm_tree);
- __wt_free(session, lsm_tree->chunk);
- __wt_free(session, lsm_tree->old_chunks);
+ __wt_free(session, lsm_tree->name);
+ __lsm_tree_discard_state(session, lsm_tree);
+ __wt_free(session, lsm_tree->chunk);
+ __wt_free(session, lsm_tree->old_chunks);
- __wt_rwlock_destroy(session, &lsm_tree->rwlock);
+ __wt_rwlock_destroy(session, &lsm_tree->rwlock);
- __wt_free(session, lsm_tree);
+ __wt_free(session, lsm_tree);
- return (ret);
+ return (ret);
}
/*
* __lsm_tree_close --
- * Close an LSM tree structure.
+ * Close an LSM tree structure.
*/
static void
__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
{
- /*
- * Stop any new work units being added. The barrier is necessary
- * because we rely on the state change being visible before checking
- * the tree queue state.
- */
- lsm_tree->active = false;
- WT_FULL_BARRIER();
-
- /*
- * Wait for all LSM operations to drain. If WiredTiger is shutting
- * down also wait for the tree reference count to go to zero, otherwise
- * we know a user is holding a reference to the tree, so exclusive
- * access is not available.
- */
- while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) {
- /*
- * Remove any work units from the manager queues. Do this step
- * repeatedly in case a work unit was in the process of being
- * created when we cleared the active flag.
- *
- * !!! Drop the schema and handle list locks whilst completing
- * this step so that we don't block any operations that require
- * the schema lock to complete. This is safe because any
- * operation that is closing the tree should first have gotten
- * exclusive access to the LSM tree via __wt_lsm_tree_get, so
- * other schema level operations will return EBUSY, even though
- * we're dropping the schema lock here.
- */
- WT_WITHOUT_LOCKS(session,
- __wt_lsm_manager_clear_tree(session, lsm_tree));
- }
+ /*
+ * Stop any new work units being added. The barrier is necessary because we rely on the state
+ * change being visible before checking the tree queue state.
+ */
+ lsm_tree->active = false;
+ WT_FULL_BARRIER();
+
+ /*
+ * Wait for all LSM operations to drain. If WiredTiger is shutting down also wait for the tree
+ * reference count to go to zero, otherwise we know a user is holding a reference to the tree,
+ * so exclusive access is not available.
+ */
+ while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) {
+ /*
+ * Remove any work units from the manager queues. Do this step
+ * repeatedly in case a work unit was in the process of being
+ * created when we cleared the active flag.
+ *
+ * !!! Drop the schema and handle list locks whilst completing
+ * this step so that we don't block any operations that require
+ * the schema lock to complete. This is safe because any
+ * operation that is closing the tree should first have gotten
+ * exclusive access to the LSM tree via __wt_lsm_tree_get, so
+ * other schema level operations will return EBUSY, even though
+ * we're dropping the schema lock here.
+ */
+ WT_WITHOUT_LOCKS(session, __wt_lsm_manager_clear_tree(session, lsm_tree));
+ }
}
/*
* __wt_lsm_tree_close_all --
- * Close all LSM tree structures.
+ * Close all LSM tree structures.
*/
int
__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
{
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree, *lsm_tree_tmp;
-
- /* We are shutting down: the handle list lock isn't required. */
-
- WT_TAILQ_SAFE_REMOVE_BEGIN(lsm_tree,
- &S2C(session)->lsmqh, q, lsm_tree_tmp) {
- /*
- * Tree close assumes that we have a reference to the tree
- * so it can tell when it's safe to do the close. We could
- * get the tree here, but we short circuit instead. There
- * is no need to decrement the reference count since discard
- * is unconditional.
- */
- (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
- __lsm_tree_close(session, lsm_tree, true);
- WT_TRET(__lsm_tree_discard(session, lsm_tree, true));
- } WT_TAILQ_SAFE_REMOVE_END
-
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree, *lsm_tree_tmp;
+
+ /* We are shutting down: the handle list lock isn't required. */
+
+ WT_TAILQ_SAFE_REMOVE_BEGIN(lsm_tree, &S2C(session)->lsmqh, q, lsm_tree_tmp)
+ {
+ /*
+ * Tree close assumes that we have a reference to the tree so it can tell when it's safe to
+ * do the close. We could get the tree here, but we short circuit instead. There is no need
+ * to decrement the reference count since discard is unconditional.
+ */
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
+ __lsm_tree_close(session, lsm_tree, true);
+ WT_TRET(__lsm_tree_discard(session, lsm_tree, true));
+ }
+ WT_TAILQ_SAFE_REMOVE_END
+
+ return (ret);
}
/*
* __lsm_tree_set_name --
- * Set or reset the name of an LSM tree
+ * Set or reset the name of an LSM tree
*/
static int
-__lsm_tree_set_name(WT_SESSION_IMPL *session,
- WT_LSM_TREE *lsm_tree, const char *uri)
+__lsm_tree_set_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *uri)
{
- void *p;
+ void *p;
- WT_RET(__wt_strdup(session, uri, &p));
+ WT_RET(__wt_strdup(session, uri, &p));
- __wt_free(session, lsm_tree->name);
- lsm_tree->name = p;
- lsm_tree->filename = lsm_tree->name + strlen("lsm:");
- return (0);
+ __wt_free(session, lsm_tree->name);
+ lsm_tree->name = p;
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+ return (0);
}
/*
* __wt_lsm_tree_bloom_name --
- * Get the URI of the Bloom filter for a given chunk.
+ * Get the URI of the Bloom filter for a given chunk.
*/
int
-__wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
- WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+__wt_lsm_tree_bloom_name(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR(__wt_buf_fmt(
- session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
- WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
-err: __wt_scr_free(session, &tmp);
- return (ret);
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
/*
* __wt_lsm_tree_chunk_name --
- * Get the URI of the file for a given chunk.
+ * Get the URI of the file for a given chunk.
*/
int
-__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
- WT_LSM_TREE *lsm_tree, uint32_t id, uint32_t generation, const char **retp)
+__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id,
+ uint32_t generation, const char **retp)
{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
- if (lsm_tree->custom_generation != 0 &&
- generation >= lsm_tree->custom_generation)
- WT_ERR(__wt_buf_fmt(session, tmp, "%s:%s-%06" PRIu32 "%s",
- lsm_tree->custom_prefix, lsm_tree->filename, id,
- lsm_tree->custom_suffix));
- else
- WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%06" PRIu32 ".lsm",
- lsm_tree->filename, id));
+ if (lsm_tree->custom_generation != 0 && generation >= lsm_tree->custom_generation)
+ WT_ERR(__wt_buf_fmt(session, tmp, "%s:%s-%06" PRIu32 "%s", lsm_tree->custom_prefix,
+ lsm_tree->filename, id, lsm_tree->custom_suffix));
+ else
+ WT_ERR(__wt_buf_fmt(session, tmp, "file:%s-%06" PRIu32 ".lsm", lsm_tree->filename, id));
- WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
-err: __wt_scr_free(session, &tmp);
- return (ret);
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
/*
* __wt_lsm_tree_set_chunk_size --
- * Set the size of the chunk. Should only be called for chunks that are
- * on disk, or about to become on disk.
+ * Set the size of the chunk. Should only be called for chunks that are on disk, or about to
+ * become on disk.
*/
int
-__wt_lsm_tree_set_chunk_size(
- WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+__wt_lsm_tree_set_chunk_size(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
- WT_DATA_SOURCE *dsrc;
- wt_off_t size;
- const char *filename;
-
- size = 0;
- if (lsm_tree->custom_generation != 0 &&
- chunk->generation >= lsm_tree->custom_generation) {
- dsrc = __wt_schema_get_source(session, chunk->uri);
- /*
- * We can only retrieve a size if the data source exposes the
- * information.
- */
- if (dsrc != NULL && dsrc->size != NULL) {
- /* Call the callback. */
- WT_RET(dsrc->size(
- dsrc, (WT_SESSION*)session, chunk->uri, &size));
- }
- } else {
- filename = chunk->uri;
- if (!WT_PREFIX_SKIP(filename, "file:"))
- WT_RET_MSG(session, EINVAL,
- "Expected a 'file:' URI: %s", chunk->uri);
- WT_RET(__wt_fs_size(session, filename, &size));
- }
-
- chunk->size = (uint64_t)size;
-
- return (0);
+ WT_DATA_SOURCE *dsrc;
+ wt_off_t size;
+ const char *filename;
+
+ size = 0;
+ if (lsm_tree->custom_generation != 0 && chunk->generation >= lsm_tree->custom_generation) {
+ dsrc = __wt_schema_get_source(session, chunk->uri);
+ /*
+ * We can only retrieve a size if the data source exposes the information.
+ */
+ if (dsrc != NULL && dsrc->size != NULL) {
+ /* Call the callback. */
+ WT_RET(dsrc->size(dsrc, (WT_SESSION *)session, chunk->uri, &size));
+ }
+ } else {
+ filename = chunk->uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", chunk->uri);
+ WT_RET(__wt_fs_size(session, filename, &size));
+ }
+
+ chunk->size = (uint64_t)size;
+
+ return (0);
}
/*
* __lsm_tree_cleanup_old --
- * Cleanup any old LSM chunks that might conflict with one we are
- * about to create. Sometimes failed LSM metadata operations can
- * leave old files and bloom filters behind.
+ * Cleanup any old LSM chunks that might conflict with one we are about to create. Sometimes
+ * failed LSM metadata operations can leave old files and bloom filters behind.
*/
static int
__lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri)
{
- WT_DECL_RET;
- const char *cfg[] =
- { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };
- bool exists, is_file;
-
- exists = false;
- is_file = WT_PREFIX_MATCH(uri, "file:");
- if (is_file)
- WT_RET(__wt_fs_exist(session, uri + strlen("file:"), &exists));
- if (!is_file || exists)
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_drop(session, uri, cfg));
- return (ret);
+ WT_DECL_RET;
+ const char *cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL};
+ bool exists, is_file;
+
+ exists = false;
+ is_file = WT_PREFIX_MATCH(uri, "file:");
+ if (is_file)
+ WT_RET(__wt_fs_exist(session, uri + strlen("file:"), &exists));
+ if (!is_file || exists)
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop(session, uri, cfg));
+ return (ret);
}
/*
* __wt_lsm_tree_setup_chunk --
- * Initialize a chunk of an LSM tree.
+ * Initialize a chunk of an LSM tree.
*/
int
-__wt_lsm_tree_setup_chunk(
- WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+__wt_lsm_tree_setup_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
- __wt_epoch(session, &chunk->create_time);
-
- WT_RET(__wt_spin_init(session,
- &chunk->timestamp_spinlock, "LSM chunk timestamp"));
- WT_RET(__wt_lsm_tree_chunk_name(
- session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
-
- /*
- * If the underlying file exists, drop the chunk first - there may be
- * some content hanging over from an aborted merge or checkpoint.
- *
- * Don't do this for the very first chunk: we are called during
- * WT_SESSION::create, and doing a drop inside there does interesting
- * things with handle locks and metadata tracking. It can never have
- * been the result of an interrupted merge, anyway.
- */
- if (chunk->id > 1)
- WT_RET(__lsm_tree_cleanup_old(session, chunk->uri));
-
- return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
+ __wt_epoch(session, &chunk->create_time);
+
+ WT_RET(__wt_spin_init(session, &chunk->timestamp_spinlock, "LSM chunk timestamp"));
+ WT_RET(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
+
+ /*
+ * If the underlying file exists, drop the chunk first - there may be
+ * some content hanging over from an aborted merge or checkpoint.
+ *
+ * Don't do this for the very first chunk: we are called during
+ * WT_SESSION::create, and doing a drop inside there does interesting
+ * things with handle locks and metadata tracking. It can never have
+ * been the result of an interrupted merge, anyway.
+ */
+ if (chunk->id > 1)
+ WT_RET(__lsm_tree_cleanup_old(session, chunk->uri));
+
+ return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
}
/*
* __wt_lsm_tree_setup_bloom --
- * Initialize a bloom filter for an LSM tree.
+ * Initialize a bloom filter for an LSM tree.
*/
int
-__wt_lsm_tree_setup_bloom(
- WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+__wt_lsm_tree_setup_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
- /*
- * The Bloom URI can be populated when the chunk is created, but
- * it isn't set yet on open or merge.
- */
- if (chunk->bloom_uri == NULL)
- WT_RET(__wt_lsm_tree_bloom_name(
- session, lsm_tree, chunk->id, &chunk->bloom_uri));
-
- return (__lsm_tree_cleanup_old(session, chunk->bloom_uri));
+ /*
+ * The Bloom URI can be populated when the chunk is created, but it isn't set yet on open or
+ * merge.
+ */
+ if (chunk->bloom_uri == NULL)
+ WT_RET(__wt_lsm_tree_bloom_name(session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ return (__lsm_tree_cleanup_old(session, chunk->bloom_uri));
}
/*
* __wt_lsm_tree_create --
- * Create an LSM tree structure for the given name.
+ * Create an LSM tree structure for the given name.
*/
int
-__wt_lsm_tree_create(WT_SESSION_IMPL *session,
- const char *uri, bool exclusive, const char *config)
+__wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config)
{
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree;
- const char *cfg[] =
- { WT_CONFIG_BASE(session, lsm_meta), config, NULL };
- const char *metadata;
-
- metadata = NULL;
-
- /* If the tree can be opened, it already exists. */
- if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
- __wt_lsm_tree_release(session, lsm_tree);
- return (exclusive ? EEXIST : 0);
- }
- WT_RET_NOTFOUND_OK(ret);
-
- if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
- /* LSM doesn't yet support the 'r' format. */
- WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
- if (WT_STRING_MATCH("r", cval.str, cval.len))
- WT_ERR_MSG(session, EINVAL,
- "LSM trees do not support a key format of 'r'");
-
- WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
- WT_ERR(__wt_metadata_insert(session, uri, metadata));
- }
-
- /*
- * Open our new tree and add it to the handle cache. Don't discard on
- * error: the returned handle is NULL on error, and the metadata
- * tracking macros handle cleaning up on failure.
- */
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- ret = __lsm_tree_open(session, uri, true, &lsm_tree));
- if (ret == 0)
- __wt_lsm_tree_release(session, lsm_tree);
-
-err: __wt_free(session, metadata);
- return (ret);
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ const char *cfg[] = {WT_CONFIG_BASE(session, lsm_meta), config, NULL};
+ const char *metadata;
+
+ metadata = NULL;
+
+ /* If the tree can be opened, it already exists. */
+ if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (!F_ISSET(S2C(session), WT_CONN_READONLY)) {
+ /* LSM doesn't yet support the 'r' format. */
+ WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ WT_ERR_MSG(session, EINVAL, "LSM trees do not support a key format of 'r'");
+
+ WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata));
+ WT_ERR(__wt_metadata_insert(session, uri, metadata));
+ }
+
+ /*
+ * Open our new tree and add it to the handle cache. Don't discard on error: the returned handle
+ * is NULL on error, and the metadata tracking macros handle cleaning up on failure.
+ */
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_open(session, uri, true, &lsm_tree));
+ if (ret == 0)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+err:
+ __wt_free(session, metadata);
+ return (ret);
}
/*
* __lsm_tree_find --
- * Find an LSM tree structure for the given name. Optionally get exclusive
- * access to the handle. Exclusive access works separately from the LSM tree
- * lock, since operations that need exclusive access may also need to take
- * the LSM tree lock, for example for outstanding work unit operations.
+ * Find an LSM tree structure for the given name. Optionally get exclusive access to the handle.
+ * Exclusive access works separately from the LSM tree lock, since operations that need exclusive
+ * access may also need to take the LSM tree lock, for example for outstanding work unit operations.
*/
static int
-__lsm_tree_find(WT_SESSION_IMPL *session,
- const char *uri, bool exclusive, WT_LSM_TREE **treep)
+__lsm_tree_find(WT_SESSION_IMPL *session, const char *uri, bool exclusive, WT_LSM_TREE **treep)
{
- WT_LSM_TREE *lsm_tree;
-
- *treep = NULL;
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
-
- /* See if the tree is already open. */
- TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
- if (strcmp(uri, lsm_tree->name) == 0) {
- if (exclusive) {
- /*
- * Make sure we win the race to switch on the
- * exclusive flag.
- */
- if (!__wt_atomic_cas_ptr(
- &lsm_tree->excl_session, NULL, session))
- return (__wt_set_return(
- session, EBUSY));
-
- /*
- * Drain the work queue before checking for
- * open cursors - otherwise we can generate
- * spurious busy returns.
- */
- (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
- __lsm_tree_close(session, lsm_tree, false);
- if (lsm_tree->refcnt != 1) {
- __wt_lsm_tree_release(
- session, lsm_tree);
- return (__wt_set_return(
- session, EBUSY));
- }
- } else {
- (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
-
- /*
- * We got a reference, check if an exclusive
- * lock beat us to it.
- */
- if (lsm_tree->excl_session != NULL) {
- WT_ASSERT(session,
- lsm_tree->refcnt > 0);
- __wt_lsm_tree_release(
- session, lsm_tree);
- return (__wt_set_return(
- session, EBUSY));
- }
- }
-
- *treep = lsm_tree;
-
- WT_ASSERT(session, lsm_tree->excl_session ==
- (exclusive ? session : NULL));
- return (0);
- }
-
- return (WT_NOTFOUND);
+ WT_LSM_TREE *lsm_tree;
+
+ *treep = NULL;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+
+ /* See if the tree is already open. */
+ TAILQ_FOREACH (lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ if (exclusive) {
+ /*
+ * Make sure we win the race to switch on the exclusive flag.
+ */
+ if (!__wt_atomic_cas_ptr(&lsm_tree->excl_session, NULL, session))
+ return (__wt_set_return(session, EBUSY));
+
+ /*
+ * Drain the work queue before checking for open cursors - otherwise we can generate
+ * spurious busy returns.
+ */
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
+ __lsm_tree_close(session, lsm_tree, false);
+ if (lsm_tree->refcnt != 1) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (__wt_set_return(session, EBUSY));
+ }
+ } else {
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
+
+ /*
+ * We got a reference, check if an exclusive lock beat us to it.
+ */
+ if (lsm_tree->excl_session != NULL) {
+ WT_ASSERT(session, lsm_tree->refcnt > 0);
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (__wt_set_return(session, EBUSY));
+ }
+ }
+
+ *treep = lsm_tree;
+
+ WT_ASSERT(session, lsm_tree->excl_session == (exclusive ? session : NULL));
+ return (0);
+ }
+
+ return (WT_NOTFOUND);
}
/*
* __lsm_tree_open_check --
- * Validate the configuration of an LSM tree.
+ * Validate the configuration of an LSM tree.
*/
static int
__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- WT_CONFIG_ITEM cval;
- WT_CONNECTION_IMPL *conn;
- uint64_t maxleafpage, required;
- const char *cfg[] = { WT_CONFIG_BASE(
- session, WT_SESSION_create), lsm_tree->file_config, NULL };
-
- conn = S2C(session);
-
- WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
- maxleafpage = (uint64_t)cval.val;
-
- required = WT_LSM_TREE_MINIMUM_SIZE(
- lsm_tree->chunk_size, lsm_tree->merge_max, maxleafpage);
- if (conn->cache_size < required)
- WT_RET_MSG(session, EINVAL,
- "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
- "must be at least %" PRIu64 " (%" PRIu64 "MB)",
- conn->cache_size, conn->cache_size / WT_MEGABYTE,
- required, (required + (WT_MEGABYTE - 1))/ WT_MEGABYTE);
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t maxleafpage, required;
+ const char *cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_create), lsm_tree->file_config, NULL};
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
+ maxleafpage = (uint64_t)cval.val;
+
+ required = WT_LSM_TREE_MINIMUM_SIZE(lsm_tree->chunk_size, lsm_tree->merge_max, maxleafpage);
+ if (conn->cache_size < required)
+ WT_RET_MSG(session, EINVAL, "LSM cache size %" PRIu64 " (%" PRIu64
+ "MB) too small, "
+ "must be at least %" PRIu64 " (%" PRIu64 "MB)",
+ conn->cache_size, conn->cache_size / WT_MEGABYTE, required,
+ (required + (WT_MEGABYTE - 1)) / WT_MEGABYTE);
+ return (0);
}
/*
* __lsm_tree_open --
- * Open an LSM tree structure.
+ * Open an LSM tree structure.
*/
static int
-__lsm_tree_open(WT_SESSION_IMPL *session,
- const char *uri, bool exclusive, WT_LSM_TREE **treep)
+__lsm_tree_open(WT_SESSION_IMPL *session, const char *uri, bool exclusive, WT_LSM_TREE **treep)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LSM_TREE *lsm_tree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
- conn = S2C(session);
- lsm_tree = NULL;
+ conn = S2C(session);
+ lsm_tree = NULL;
- WT_ASSERT(session,
- F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE));
- /* Start the LSM manager thread if it isn't running. */
- WT_RET(__wt_lsm_manager_start(session));
+ /* Start the LSM manager thread if it isn't running. */
+ WT_RET(__wt_lsm_manager_start(session));
- /* Make sure no one beat us to it. */
- if ((ret = __lsm_tree_find(
- session, uri, exclusive, treep)) != WT_NOTFOUND)
- return (ret);
+ /* Make sure no one beat us to it. */
+ if ((ret = __lsm_tree_find(session, uri, exclusive, treep)) != WT_NOTFOUND)
+ return (ret);
- /* Try to open the tree. */
- WT_RET(__wt_calloc_one(session, &lsm_tree));
- WT_ERR(__wt_rwlock_init(session, &lsm_tree->rwlock));
+ /* Try to open the tree. */
+ WT_RET(__wt_calloc_one(session, &lsm_tree));
+ WT_ERR(__wt_rwlock_init(session, &lsm_tree->rwlock));
- WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
- WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
+ WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
- /*
- * Sanity check the configuration. Do it now since this is the first
- * time we have the LSM tree configuration.
- */
- WT_ERR(__lsm_tree_open_check(session, lsm_tree));
+ /*
+ * Sanity check the configuration. Do it now since this is the first time we have the LSM tree
+ * configuration.
+ */
+ WT_ERR(__lsm_tree_open_check(session, lsm_tree));
- /* Set the generation number so cursors are opened on first usage. */
- lsm_tree->dsk_gen = 1;
+ /* Set the generation number so cursors are opened on first usage. */
+ lsm_tree->dsk_gen = 1;
- /*
- * Setup reference counting. Use separate reference counts for tree
- * handles and queue entries, so that queue entries don't interfere
- * with getting handles exclusive.
- */
- lsm_tree->refcnt = 1;
- lsm_tree->excl_session = exclusive ? session : NULL;
- lsm_tree->queue_ref = 0;
+ /*
+ * Setup reference counting. Use separate reference counts for tree handles and queue entries,
+ * so that queue entries don't interfere with getting handles exclusive.
+ */
+ lsm_tree->refcnt = 1;
+ lsm_tree->excl_session = exclusive ? session : NULL;
+ lsm_tree->queue_ref = 0;
- /* Set a flush timestamp as a baseline. */
- __wt_epoch(session, &lsm_tree->last_flush_time);
+ /* Set a flush timestamp as a baseline. */
+ __wt_epoch(session, &lsm_tree->last_flush_time);
- /* Now the tree is setup, make it visible to others. */
- TAILQ_INSERT_HEAD(&conn->lsmqh, lsm_tree, q);
- if (!exclusive)
- lsm_tree->active = true;
- F_SET(lsm_tree, WT_LSM_TREE_OPEN);
+ /* Now the tree is setup, make it visible to others. */
+ TAILQ_INSERT_HEAD(&conn->lsmqh, lsm_tree, q);
+ if (!exclusive)
+ lsm_tree->active = true;
+ F_SET(lsm_tree, WT_LSM_TREE_OPEN);
- *treep = lsm_tree;
+ *treep = lsm_tree;
- if (0) {
-err: WT_TRET(__lsm_tree_discard(session, lsm_tree, false));
- }
- return (ret);
+ if (0) {
+err:
+ WT_TRET(__lsm_tree_discard(session, lsm_tree, false));
+ }
+ return (ret);
}
/*
* __wt_lsm_tree_get --
- * Find an LSM tree handle or open a new one.
+ * Find an LSM tree handle or open a new one.
*/
int
-__wt_lsm_tree_get(WT_SESSION_IMPL *session,
- const char *uri, bool exclusive, WT_LSM_TREE **treep)
+__wt_lsm_tree_get(WT_SESSION_IMPL *session, const char *uri, bool exclusive, WT_LSM_TREE **treep)
{
- WT_DECL_RET;
-
- /*
- * Dropping and re-acquiring the lock is safe here, since the tree open
- * call checks to see if another thread beat it to opening the tree
- * before proceeding.
- */
- if (exclusive)
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- ret = __lsm_tree_find(session, uri, exclusive, treep));
- else
- WT_WITH_HANDLE_LIST_READ_LOCK(session,
- ret = __lsm_tree_find(session, uri, exclusive, treep));
- if (ret == WT_NOTFOUND)
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- ret = __lsm_tree_open(session, uri, exclusive, treep));
-
- return (ret);
+ WT_DECL_RET;
+
+ /*
+ * Dropping and re-acquiring the lock is safe here, since the tree open call checks to see if
+ * another thread beat it to opening the tree before proceeding.
+ */
+ if (exclusive)
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(
+ session, ret = __lsm_tree_find(session, uri, exclusive, treep));
+ else
+ WT_WITH_HANDLE_LIST_READ_LOCK(
+ session, ret = __lsm_tree_find(session, uri, exclusive, treep));
+ if (ret == WT_NOTFOUND)
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(
+ session, ret = __lsm_tree_open(session, uri, exclusive, treep));
+
+ return (ret);
}
/*
* __wt_lsm_tree_release --
- * Release an LSM tree structure.
+ * Release an LSM tree structure.
*/
void
__wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- WT_ASSERT(session, lsm_tree->refcnt > 0);
- if (lsm_tree->excl_session == session) {
- /* We cleared the active flag when getting exclusive access. */
- lsm_tree->active = true;
- lsm_tree->excl_session = NULL;
- }
- (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
+ WT_ASSERT(session, lsm_tree->refcnt > 0);
+ if (lsm_tree->excl_session == session) {
+ /* We cleared the active flag when getting exclusive access. */
+ lsm_tree->active = true;
+ lsm_tree->excl_session = NULL;
+ }
+ (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
}
/* How aggressively to ramp up or down throttle due to level 0 merging */
-#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
+#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
/* Number of level 0 chunks that need to be present to throttle inserts */
-#define WT_LSM_MERGE_THROTTLE_THRESHOLD \
- (2 * lsm_tree->merge_min)
+#define WT_LSM_MERGE_THROTTLE_THRESHOLD (2 * lsm_tree->merge_min)
/* Minimal throttling time */
-#define WT_LSM_THROTTLE_START 20
-
-#define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \
- (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
- if ((val) < WT_LSM_THROTTLE_START) \
- (val) = WT_LSM_THROTTLE_START; \
- } while (0)
-
-#define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \
- (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
- if ((val) < WT_LSM_THROTTLE_START) \
- (val) = 0; \
- } while (0)
+#define WT_LSM_THROTTLE_START 20
+
+#define WT_LSM_MERGE_THROTTLE_INCREASE(val) \
+ do { \
+ (val) += ((val)*WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = WT_LSM_THROTTLE_START; \
+ } while (0)
+
+#define WT_LSM_MERGE_THROTTLE_DECREASE(val) \
+ do { \
+ (val) -= ((val)*WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = 0; \
+ } while (0)
/*
* __wt_lsm_tree_throttle --
- * Calculate whether LSM updates need to be throttled. Must be called
- * with the LSM tree lock held.
+ * Calculate whether LSM updates need to be throttled. Must be called with the LSM tree lock
+ * held.
*/
void
-__wt_lsm_tree_throttle(
- WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool decrease_only)
+__wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool decrease_only)
{
- WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
- uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
- uint32_t in_memory, gen0_chunks;
-
- /* Never throttle in small trees. */
- if (lsm_tree->nchunks < 3) {
- lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
- return;
- }
-
- cache_sz = S2C(session)->cache_size;
-
- /*
- * In the steady state, we expect that the checkpoint worker thread
- * will keep up with inserts. If not, throttle the insert rate to
- * avoid filling the cache with in-memory chunks. Threads sleep every
- * 100 operations, so take that into account in the calculation.
- *
- * Also throttle based on whether merge threads are keeping up. If
- * there are enough chunks that have never been merged we slow down
- * inserts so that merges have some chance of keeping up.
- *
- * Count the number of in-memory chunks, the number of unmerged chunk
- * on disk, and find the most recent on-disk chunk (if any).
- */
- record_count = 1;
- gen0_chunks = in_memory = 0;
- ondisk = NULL;
- for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
- cp >= lsm_tree->chunk;
- --cp)
- if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
- record_count += (*cp)->count;
- ++in_memory;
- } else {
- /*
- * Assign ondisk to the last chunk that has been
- * flushed since the tree was last opened (i.e. it's on
- * disk and stable is not set).
- */
- if (ondisk == NULL &&
- ((*cp)->generation == 0 &&
- !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
- ondisk = *cp;
-
- if ((*cp)->generation == 0 &&
- !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
- ++gen0_chunks;
- }
-
- last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
-
- /* Checkpoint throttling, based on the number of in-memory chunks. */
- if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
- lsm_tree->ckpt_throttle = 0;
- else if (decrease_only)
- ; /* Nothing to do */
- else if (ondisk == NULL) {
- /*
- * No checkpoint has completed this run. Keep slowing down
- * inserts until one does.
- */
- lsm_tree->ckpt_throttle =
- WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
- } else {
- WT_ASSERT(session, WT_TIMECMP(
- last_chunk->create_time, ondisk->create_time) >= 0);
- timediff = WT_TIMEDIFF_NS(
- last_chunk->create_time, ondisk->create_time);
- lsm_tree->ckpt_throttle =
- (in_memory - 2) * timediff / (20 * record_count);
-
- /*
- * Get more aggressive as the number of in memory chunks
- * consumes a large proportion of the cache. In memory chunks
- * are allowed to grow up to twice as large as the configured
- * value when checkpoints aren't keeping up. That worst case
- * is when this calculation is relevant.
- * There is nothing particularly special about the chosen
- * multipliers.
- */
- cache_used = in_memory * lsm_tree->chunk_size * 2;
- if (cache_used > cache_sz * 0.8)
- lsm_tree->ckpt_throttle *= 5;
- }
-
- /*
- * Merge throttling, based on the number of on-disk, level 0 chunks.
- *
- * Don't throttle if the tree has less than a single level's number
- * of chunks.
- */
- if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) {
- if (lsm_tree->nchunks < lsm_tree->merge_max)
- lsm_tree->merge_throttle = 0;
- else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
- WT_LSM_MERGE_THROTTLE_DECREASE(
- lsm_tree->merge_throttle);
- else if (!decrease_only)
- WT_LSM_MERGE_THROTTLE_INCREASE(
- lsm_tree->merge_throttle);
- }
-
- /* Put an upper bound of 1s on both throttle calculations. */
- lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
- lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
-
- /*
- * Update our estimate of how long each in-memory chunk stays active.
- * Filter out some noise by keeping a weighted history of the
- * calculated value. Wait until we have enough chunks that we can
- * check that the new value is sane: otherwise, after a long idle
- * period, we can calculate a crazy value.
- */
- if (in_memory > 1 && ondisk != NULL) {
- prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
- WT_ASSERT(session, prev_chunk->generation == 0);
- WT_ASSERT(session, WT_TIMECMP(
- last_chunk->create_time, prev_chunk->create_time) >= 0);
- timediff = WT_TIMEDIFF_NS(
- last_chunk->create_time, prev_chunk->create_time);
- WT_ASSERT(session, WT_TIMECMP(
- prev_chunk->create_time, ondisk->create_time) >= 0);
- oldtime = WT_TIMEDIFF_NS(
- prev_chunk->create_time, ondisk->create_time);
- if (timediff < 10 * oldtime)
- lsm_tree->chunk_fill_ms =
- (3 * lsm_tree->chunk_fill_ms +
- timediff / WT_MILLION) / 4;
- }
+ WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
+ uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
+ uint32_t in_memory, gen0_chunks;
+
+ /* Never throttle in small trees. */
+ if (lsm_tree->nchunks < 3) {
+ lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
+ return;
+ }
+
+ cache_sz = S2C(session)->cache_size;
+
+ /*
+ * In the steady state, we expect that the checkpoint worker thread
+ * will keep up with inserts. If not, throttle the insert rate to
+ * avoid filling the cache with in-memory chunks. Threads sleep every
+ * 100 operations, so take that into account in the calculation.
+ *
+ * Also throttle based on whether merge threads are keeping up. If
+ * there are enough chunks that have never been merged we slow down
+ * inserts so that merges have some chance of keeping up.
+ *
+ * Count the number of in-memory chunks, the number of unmerged chunk
+ * on disk, and find the most recent on-disk chunk (if any).
+ */
+ record_count = 1;
+ gen0_chunks = in_memory = 0;
+ ondisk = NULL;
+ for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1; cp >= lsm_tree->chunk; --cp)
+ if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
+ record_count += (*cp)->count;
+ ++in_memory;
+ } else {
+ /*
+ * Assign ondisk to the last chunk that has been flushed since the tree was last opened
+ * (i.e. it's on disk and stable is not set).
+ */
+ if (ondisk == NULL && ((*cp)->generation == 0 && !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
+ ondisk = *cp;
+
+ if ((*cp)->generation == 0 && !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
+ ++gen0_chunks;
+ }
+
+ last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
+
+ /* Checkpoint throttling, based on the number of in-memory chunks. */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
+ lsm_tree->ckpt_throttle = 0;
+ else if (decrease_only)
+ ; /* Nothing to do */
+ else if (ondisk == NULL) {
+ /*
+ * No checkpoint has completed this run. Keep slowing down inserts until one does.
+ */
+ lsm_tree->ckpt_throttle = WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
+ } else {
+ WT_ASSERT(session, WT_TIMECMP(last_chunk->create_time, ondisk->create_time) >= 0);
+ timediff = WT_TIMEDIFF_NS(last_chunk->create_time, ondisk->create_time);
+ lsm_tree->ckpt_throttle = (in_memory - 2) * timediff / (20 * record_count);
+
+ /*
+ * Get more aggressive as the number of in memory chunks consumes a large proportion of the
+ * cache. In memory chunks are allowed to grow up to twice as large as the configured value
+ * when checkpoints aren't keeping up. That worst case is when this calculation is relevant.
+ * There is nothing particularly special about the chosen multipliers.
+ */
+ cache_used = in_memory * lsm_tree->chunk_size * 2;
+ if (cache_used > cache_sz * 0.8)
+ lsm_tree->ckpt_throttle *= 5;
+ }
+
+ /*
+ * Merge throttling, based on the number of on-disk, level 0 chunks.
+ *
+ * Don't throttle if the tree has less than a single level's number
+ * of chunks.
+ */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_MERGES)) {
+ if (lsm_tree->nchunks < lsm_tree->merge_max)
+ lsm_tree->merge_throttle = 0;
+ else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
+ WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
+ else if (!decrease_only)
+ WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);
+ }
+
+ /* Put an upper bound of 1s on both throttle calculations. */
+ lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle);
+
+ /*
+ * Update our estimate of how long each in-memory chunk stays active. Filter out some noise by
+ * keeping a weighted history of the calculated value. Wait until we have enough chunks that we
+ * can check that the new value is sane: otherwise, after a long idle period, we can calculate a
+ * crazy value.
+ */
+ if (in_memory > 1 && ondisk != NULL) {
+ prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
+ WT_ASSERT(session, prev_chunk->generation == 0);
+ WT_ASSERT(session, WT_TIMECMP(last_chunk->create_time, prev_chunk->create_time) >= 0);
+ timediff = WT_TIMEDIFF_NS(last_chunk->create_time, prev_chunk->create_time);
+ WT_ASSERT(session, WT_TIMECMP(prev_chunk->create_time, ondisk->create_time) >= 0);
+ oldtime = WT_TIMEDIFF_NS(prev_chunk->create_time, ondisk->create_time);
+ if (timediff < 10 * oldtime)
+ lsm_tree->chunk_fill_ms = (3 * lsm_tree->chunk_fill_ms + timediff / WT_MILLION) / 4;
+ }
}
/*
* __wt_lsm_tree_switch --
- * Switch to a new in-memory tree.
+ * Switch to a new in-memory tree.
*/
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk, *last_chunk;
- uint32_t chunks_moved, nchunks, new_id;
- bool first_switch;
-
- __wt_lsm_tree_writelock(session, lsm_tree);
-
- nchunks = lsm_tree->nchunks;
-
- first_switch = nchunks == 0;
-
- /*
- * Check if a switch is still needed: we may have raced while waiting
- * for a lock.
- */
- last_chunk = NULL;
- if (!first_switch &&
- (last_chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
- !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK) &&
- !lsm_tree->need_switch)
- goto err;
-
- /* Update the throttle time. */
- __wt_lsm_tree_throttle(session, lsm_tree, false);
-
- new_id = __wt_atomic_add32(&lsm_tree->last, 1);
-
- WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
- nchunks + 1, &lsm_tree->chunk));
-
- __wt_verbose(session, WT_VERB_LSM,
- "Tree %s switch to: %" PRIu32 ", checkpoint throttle %" PRIu64
- ", merge throttle %" PRIu64, lsm_tree->name,
- new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle);
-
- WT_ERR(__wt_calloc_one(session, &chunk));
- chunk->id = new_id;
- chunk->switch_txn = WT_TXN_NONE;
- lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
- WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
-
- WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
- lsm_tree->need_switch = false;
- lsm_tree->modified = true;
-
- /*
- * Ensure the updated disk generation is visible to all other threads
- * before updating the transaction ID.
- */
- ++lsm_tree->dsk_gen;
- WT_FULL_BARRIER();
-
- /*
- * Set the switch transaction in the previous chunk unless this is
- * the first chunk in a new or newly opened tree.
- */
- if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE &&
- !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK))
- last_chunk->switch_txn = __wt_txn_id_alloc(session, false);
-
- /*
- * If a maximum number of chunks is configured, drop any chunks
- * past the limit.
- */
- if (lsm_tree->chunk_count_limit != 0 &&
- lsm_tree->nchunks > lsm_tree->chunk_count_limit) {
- chunks_moved = lsm_tree->nchunks - lsm_tree->chunk_count_limit;
- /* Move the last chunk onto the old chunk list. */
- WT_ERR(__wt_lsm_tree_retire_chunks(
- session, lsm_tree, 0, chunks_moved));
-
- /* Update the active chunk list. */
- lsm_tree->nchunks -= chunks_moved;
- /* Move the remaining chunks to the start of the active list */
- memmove(lsm_tree->chunk,
- lsm_tree->chunk + chunks_moved,
- lsm_tree->nchunks * sizeof(*lsm_tree->chunk));
- /* Clear out the chunks at the end of the tree */
- memset(lsm_tree->chunk + lsm_tree->nchunks,
- 0, chunks_moved * sizeof(*lsm_tree->chunk));
-
- /* Make sure the manager knows there is work to do. */
- WT_ERR(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_DROP, 0, lsm_tree));
- }
-
-err: __wt_lsm_tree_writeunlock(session, lsm_tree);
- /*
- * Errors that happen during a tree switch leave the tree in a state
- * where we can't make progress. Error out of WiredTiger.
- */
- if (ret != 0)
- WT_PANIC_RET(session, ret, "Failed doing LSM switch");
- else if (!first_switch)
- WT_RET(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk, *last_chunk;
+ uint32_t chunks_moved, nchunks, new_id;
+ bool first_switch;
+
+ __wt_lsm_tree_writelock(session, lsm_tree);
+
+ nchunks = lsm_tree->nchunks;
+
+ first_switch = nchunks == 0;
+
+ /*
+ * Check if a switch is still needed: we may have raced while waiting for a lock.
+ */
+ last_chunk = NULL;
+ if (!first_switch && (last_chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
+ !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK) && !lsm_tree->need_switch)
+ goto err;
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, false);
+
+ new_id = __wt_atomic_add32(&lsm_tree->last, 1);
+
+ WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk));
+
+ __wt_verbose(session, WT_VERB_LSM,
+ "Tree %s switch to: %" PRIu32 ", checkpoint throttle %" PRIu64 ", merge throttle %" PRIu64,
+ lsm_tree->name, new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle);
+
+ WT_ERR(__wt_calloc_one(session, &chunk));
+ chunk->id = new_id;
+ chunk->switch_txn = WT_TXN_NONE;
+ lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
+ lsm_tree->need_switch = false;
+ lsm_tree->modified = true;
+
+ /*
+ * Ensure the updated disk generation is visible to all other threads before updating the
+ * transaction ID.
+ */
+ ++lsm_tree->dsk_gen;
+ WT_FULL_BARRIER();
+
+ /*
+ * Set the switch transaction in the previous chunk unless this is the first chunk in a new or
+ * newly opened tree.
+ */
+ if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE &&
+ !F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK))
+ last_chunk->switch_txn = __wt_txn_id_alloc(session, false);
+
+ /*
+ * If a maximum number of chunks is configured, drop any chunks past the limit.
+ */
+ if (lsm_tree->chunk_count_limit != 0 && lsm_tree->nchunks > lsm_tree->chunk_count_limit) {
+ chunks_moved = lsm_tree->nchunks - lsm_tree->chunk_count_limit;
+ /* Move the last chunk onto the old chunk list. */
+ WT_ERR(__wt_lsm_tree_retire_chunks(session, lsm_tree, 0, chunks_moved));
+
+ /* Update the active chunk list. */
+ lsm_tree->nchunks -= chunks_moved;
+ /* Move the remaining chunks to the start of the active list */
+ memmove(lsm_tree->chunk, lsm_tree->chunk + chunks_moved,
+ lsm_tree->nchunks * sizeof(*lsm_tree->chunk));
+ /* Clear out the chunks at the end of the tree */
+ memset(lsm_tree->chunk + lsm_tree->nchunks, 0, chunks_moved * sizeof(*lsm_tree->chunk));
+
+ /* Make sure the manager knows there is work to do. */
+ WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_DROP, 0, lsm_tree));
+ }
+
+err:
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ /*
+ * Errors that happen during a tree switch leave the tree in a state where we can't make
+ * progress. Error out of WiredTiger.
+ */
+ if (ret != 0)
+ WT_PANIC_RET(session, ret, "Failed doing LSM switch");
+ else if (!first_switch)
+ WT_RET(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ return (ret);
}
/*
* __wt_lsm_tree_retire_chunks --
- * Move a set of chunks onto the old chunks list.
- * It's the caller's responsibility to update the active chunks list.
- * Must be called with the LSM lock held.
+ * Move a set of chunks onto the old chunks list. It's the caller's responsibility to update the
+ * active chunks list. Must be called with the LSM lock held.
*/
int
-__wt_lsm_tree_retire_chunks(WT_SESSION_IMPL *session,
- WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks)
+__wt_lsm_tree_retire_chunks(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks)
{
- u_int i;
+ u_int i;
- WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
- /* Setup the array of obsolete chunks. */
- WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
- lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
+ /* Setup the array of obsolete chunks. */
+ WT_RET(__wt_realloc_def(
+ session, &lsm_tree->old_alloc, lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
- /* Copy entries one at a time, so we can reuse gaps in the list. */
- for (i = 0; i < nchunks; i++)
- lsm_tree->old_chunks[lsm_tree->nold_chunks++] =
- lsm_tree->chunk[start_chunk + i];
+ /* Copy entries one at a time, so we can reuse gaps in the list. */
+ for (i = 0; i < nchunks; i++)
+ lsm_tree->old_chunks[lsm_tree->nold_chunks++] = lsm_tree->chunk[start_chunk + i];
- return (0);
+ return (0);
}
/*
* __wt_lsm_tree_drop --
- * Drop an LSM tree.
+ * Drop an LSM tree.
*/
int
-__wt_lsm_tree_drop(
- WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+__wt_lsm_tree_drop(WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- u_int i;
- int tret;
- bool locked;
-
- WT_NOT_READ(locked, false);
-
- /* Get the LSM tree. */
- WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
- WT_ASSERT(session, !lsm_tree->active);
-
- /* Prevent any new opens. */
- __wt_lsm_tree_writelock(session, lsm_tree);
- locked = true;
-
- /* Drop the chunks. */
- for (i = 0; i < lsm_tree->nchunks; i++) {
- chunk = lsm_tree->chunk[i];
- WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
- if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
- WT_ERR(
- __wt_schema_drop(session, chunk->bloom_uri, cfg));
- }
-
- /* Drop any chunks on the obsolete list. */
- for (i = 0; i < lsm_tree->nold_chunks; i++) {
- if ((chunk = lsm_tree->old_chunks[i]) == NULL)
- continue;
- WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
- if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
- WT_ERR(
- __wt_schema_drop(session, chunk->bloom_uri, cfg));
- }
-
- locked = false;
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- ret = __wt_metadata_remove(session, name);
-
- WT_ASSERT(session, !lsm_tree->active);
-err: if (locked)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- tret = __lsm_tree_discard(session, lsm_tree, false));
- WT_TRET(tret);
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int tret;
+ bool locked;
+
+ WT_NOT_READ(locked, false);
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
+ WT_ASSERT(session, !lsm_tree->active);
+
+ /* Prevent any new opens. */
+ __wt_lsm_tree_writelock(session, lsm_tree);
+ locked = true;
+
+ /* Drop the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ /* Drop any chunks on the obsolete list. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ locked = false;
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ ret = __wt_metadata_remove(session, name);
+
+ WT_ASSERT(session, !lsm_tree->active);
+err:
+ if (locked)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
+ return (ret);
}
/*
* __wt_lsm_tree_rename --
- * Rename an LSM tree.
+ * Rename an LSM tree.
*/
int
-__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
- const char *olduri, const char *newuri, const char *cfg[])
+__wt_lsm_tree_rename(
+ WT_SESSION_IMPL *session, const char *olduri, const char *newuri, const char *cfg[])
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- u_int i;
- int tret;
- const char *old;
- bool locked;
-
- old = NULL;
- WT_NOT_READ(locked, false);
-
- /* Get the LSM tree. */
- WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
-
- /* Prevent any new opens. */
- __wt_lsm_tree_writelock(session, lsm_tree);
- locked = true;
-
- /* Set the new name. */
- WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
-
- /* Rename the chunks. */
- for (i = 0; i < lsm_tree->nchunks; i++) {
- chunk = lsm_tree->chunk[i];
- old = chunk->uri;
- chunk->uri = NULL;
-
- WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree,
- chunk->id, chunk->generation, &chunk->uri));
- WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
- __wt_free(session, old);
-
- if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
- old = chunk->bloom_uri;
- chunk->bloom_uri = NULL;
- WT_ERR(__wt_lsm_tree_bloom_name(
- session, lsm_tree, chunk->id, &chunk->bloom_uri));
- F_SET(chunk, WT_LSM_CHUNK_BLOOM);
- WT_ERR(__wt_schema_rename(
- session, old, chunk->uri, cfg));
- __wt_free(session, old);
- }
- }
-
- WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
- locked = false;
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- WT_ERR(__wt_metadata_remove(session, olduri));
-
-err: if (locked)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- __wt_free(session, old);
-
- /*
- * Discard this LSM tree structure. The first operation on the renamed
- * tree will create a new one.
- */
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- tret = __lsm_tree_discard(session, lsm_tree, false));
- WT_TRET(tret);
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int tret;
+ const char *old;
+ bool locked;
+
+ old = NULL;
+ WT_NOT_READ(locked, false);
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree));
+
+ /* Prevent any new opens. */
+ __wt_lsm_tree_writelock(session, lsm_tree);
+ locked = true;
+
+ /* Set the new name. */
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
+
+ /* Rename the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ old = chunk->uri;
+ chunk->uri = NULL;
+
+ WT_ERR(
+ __wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, chunk->generation, &chunk->uri));
+ WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ old = chunk->bloom_uri;
+ chunk->bloom_uri = NULL;
+ WT_ERR(__wt_lsm_tree_bloom_name(session, lsm_tree, chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+            WT_ERR(__wt_schema_rename(session, old, chunk->bloom_uri, cfg));
+ __wt_free(session, old);
+ }
+ }
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
+ locked = false;
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ WT_ERR(__wt_metadata_remove(session, olduri));
+
+err:
+ if (locked)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ __wt_free(session, old);
+
+ /*
+ * Discard this LSM tree structure. The first operation on the renamed tree will create a new
+ * one.
+ */
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
+ return (ret);
}
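
For context, the rename above is reached through the public WT_SESSION::rename entry point. A
hedged usage sketch; the home directory and URIs are illustrative and error checks are elided:

#include <wiredtiger.h>

int
rename_example(void)
{
    WT_CONNECTION *conn;
    WT_SESSION *session;

    (void)wiredtiger_open("WT_HOME", NULL, "create", &conn);
    (void)conn->open_session(conn, NULL, NULL, &session);
    (void)session->create(session, "lsm:bucket", "key_format=S,value_format=S");

    /* Renames every chunk and Bloom file, then rewrites the tree metadata. */
    (void)session->rename(session, "lsm:bucket", "lsm:bucket2", NULL);
    return (conn->close(conn, NULL));
}
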
/*
* __wt_lsm_tree_truncate --
- * Truncate an LSM tree.
+ * Truncate an LSM tree.
*/
int
-__wt_lsm_tree_truncate(
- WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+__wt_lsm_tree_truncate(WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- int tret;
- bool locked;
-
- WT_UNUSED(cfg);
-
- chunk = NULL;
- WT_NOT_READ(locked, false);
-
- /* Get the LSM tree. */
- WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
-
- /* Prevent any new opens. */
- __wt_lsm_tree_writelock(session, lsm_tree);
- locked = true;
-
- /* Create the new chunk. */
- WT_ERR(__wt_calloc_one(session, &chunk));
- chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
- WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
-
- /* Mark all chunks old. */
- WT_ERR(__wt_lsm_merge_update_tree(
- session, lsm_tree, 0, lsm_tree->nchunks, chunk));
-
- WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
-
- locked = false;
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- __wt_lsm_tree_release(session, lsm_tree);
-
-err: if (locked)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- if (ret != 0) {
- if (chunk != NULL) {
- WT_TRET(__wt_schema_drop(session, chunk->uri, NULL));
- __wt_free(session, chunk);
- }
- /*
- * Discard the LSM tree structure on error. This will force the
- * LSM tree to be re-opened the next time it is accessed and
- * the last good version of the metadata will be used, resulting
- * in a valid (not truncated) tree.
- */
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- tret = __lsm_tree_discard(session, lsm_tree, false));
- WT_TRET(tret);
- }
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int tret;
+ bool locked;
+
+ WT_UNUSED(cfg);
+
+ chunk = NULL;
+ WT_NOT_READ(locked, false);
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree));
+
+ /* Prevent any new opens. */
+ __wt_lsm_tree_writelock(session, lsm_tree);
+ locked = true;
+
+ /* Create the new chunk. */
+ WT_ERR(__wt_calloc_one(session, &chunk));
+ chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ /* Mark all chunks old. */
+ WT_ERR(__wt_lsm_merge_update_tree(session, lsm_tree, 0, lsm_tree->nchunks, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree, NULL));
+
+ locked = false;
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ __wt_lsm_tree_release(session, lsm_tree);
+
+err:
+ if (locked)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ if (ret != 0) {
+ if (chunk != NULL) {
+ WT_TRET(__wt_schema_drop(session, chunk->uri, NULL));
+ __wt_free(session, chunk);
+ }
+ /*
+ * Discard the LSM tree structure on error. This will force the LSM tree to be re-opened the
+         * next time it is accessed, and the last good version of the metadata will be used,
+ * resulting in a valid (not truncated) tree.
+ */
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(
+ session, tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
+ }
+ return (ret);
}
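
Truncation is likewise driven from the schema layer: a new empty chunk replaces the old ones,
which move to the obsolete list. A hedged sketch of the public call, with an illustrative URI;
passing NULL for both cursor arguments selects the entire object:

#include <wiredtiger.h>

static int
truncate_example(WT_SESSION *session)
{
    /* Truncate the whole tree; on error the previous metadata remains in effect. */
    return (session->truncate(session, "lsm:bucket", NULL, NULL, NULL));
}
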
/*
* __wt_lsm_tree_readlock --
- * Acquire a shared lock on an LSM tree.
+ * Acquire a shared lock on an LSM tree.
*/
void
__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- __wt_readlock(session, &lsm_tree->rwlock);
-
- /*
- * Diagnostic: avoid deadlocks with the schema lock: if we need it for
- * an operation, we should already have it.
- */
- F_SET(session,
- WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
+ __wt_readlock(session, &lsm_tree->rwlock);
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for an operation, we should
+ * already have it.
+ */
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
* __wt_lsm_tree_readunlock --
- * Release a shared lock on an LSM tree.
+ * Release a shared lock on an LSM tree.
*/
void
__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session,
- WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
- __wt_readunlock(session, &lsm_tree->rwlock);
+ __wt_readunlock(session, &lsm_tree->rwlock);
}
/*
* __wt_lsm_tree_writelock --
- * Acquire an exclusive lock on an LSM tree.
+ * Acquire an exclusive lock on an LSM tree.
*/
void
__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- __wt_writelock(session, &lsm_tree->rwlock);
-
- /*
- * Diagnostic: avoid deadlocks with the schema lock: if we need it for
- * an operation, we should already have it.
- */
- F_SET(session,
- WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
+ __wt_writelock(session, &lsm_tree->rwlock);
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for an operation, we should
+ * already have it.
+ */
+ F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
}
/*
* __wt_lsm_tree_writeunlock --
- * Release an exclusive lock on an LSM tree.
+ * Release an exclusive lock on an LSM tree.
*/
void
__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
- F_CLR(session,
- WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE | WT_SESSION_NO_SCHEMA_LOCK);
- __wt_writeunlock(session, &lsm_tree->rwlock);
+ __wt_writeunlock(session, &lsm_tree->rwlock);
}
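
Readers of the chunk array bracket their access with the shared-lock pair above so that a
chunk switch or merge cannot change the array underneath them. A minimal sketch under the same
internal-header assumption; __example_count_ondisk is hypothetical:

static u_int
__example_count_ondisk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    u_int count, i;

    count = 0;
    __wt_lsm_tree_readlock(session, lsm_tree);
    /* The chunk array is stable while the shared lock is held. */
    for (i = 0; i < lsm_tree->nchunks; i++)
        if (F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK))
            ++count;
    __wt_lsm_tree_readunlock(session, lsm_tree);
    return (count);
}
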
/*
* __wt_lsm_compact --
- * Compact an LSM tree called via __wt_schema_worker.
+ * Compact an LSM tree called via __wt_schema_worker.
*/
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- uint64_t progress;
- uint32_t i;
- bool compacting, flushing, locked, push_flush, ref;
-
- compacting = flushing = locked = ref = false;
- chunk = NULL;
- /*
- * This function is applied to all matching sources: ignore anything
- * that is not an LSM tree.
- */
- if (!WT_PREFIX_MATCH(name, "lsm:"))
- return (0);
-
- /* Tell __wt_schema_worker not to look inside the LSM tree. */
- *skipp = true;
-
- WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
-
- if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
- WT_ERR_MSG(session, EINVAL,
- "LSM compaction requires active merge threads");
-
- /*
- * There is no work to do if there is only a single chunk in the tree
- * and it has a bloom filter or is configured to never have a bloom
- * filter.
- */
- if (lsm_tree->nchunks == 1 &&
- (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
- F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM))) {
- __wt_lsm_tree_release(session, lsm_tree);
- return (0);
- }
-
- /*
- * Compacting has two distinct phases.
- * 1. All in-memory chunks up to and including the current
- * current chunk must be flushed. Normally, the flush code
- * does not flush the last, in-use chunk, so we set a force
- * flag to include that last chunk. We monitor the state of the
- * last chunk and periodically push another forced flush work
- * unit until it is complete.
- * 2. After all flushing is done, we move onto the merging
- * phase for compaction. Again, we monitor the state and
- * continue to push merge work units until all merging is done.
- */
-
- /* Lock the tree: single-thread compaction. */
- __wt_lsm_tree_writelock(session, lsm_tree);
- locked = true;
-
- /* Clear any merge throttle: compact throws out that calculation. */
- lsm_tree->merge_throttle = 0;
- lsm_tree->merge_aggressiveness = 0;
- progress = lsm_tree->merge_progressing;
-
- /* If another thread started a compact on this tree, we're done. */
- if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
- goto err;
-
- /*
- * Set the switch transaction on the current chunk, if it
- * hasn't been set before. This prevents further writes, so it
- * can be flushed by the checkpoint worker. If this is a newly
- * opened tree the primary chunk may already be stable. Only
- * push a flush work unit if necessary.
- */
- push_flush = false;
- if (lsm_tree->nchunks > 0 &&
- (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL &&
- !F_ISSET(chunk, (WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE))) {
- push_flush = true;
- if (chunk->switch_txn == WT_TXN_NONE) {
- /*
- * Make sure any cursors open on the tree see the
- * new switch generation before updating.
- */
- ++lsm_tree->dsk_gen;
- WT_FULL_BARRIER();
- chunk->switch_txn = __wt_txn_id_alloc(session, false);
- }
- /*
- * If we have a chunk, we want to look for it to be on-disk.
- * So we need to add a reference to keep it available.
- */
- (void)__wt_atomic_add32(&chunk->refcnt, 1);
- ref = true;
- }
-
- if (push_flush) {
- __wt_verbose(session, WT_VERB_LSM,
- "Compact force flush %s flags 0x%" PRIx32
- " chunk %" PRIu32 " flags 0x%" PRIx32,
- name, lsm_tree->flags, chunk->id, chunk->flags);
- flushing = true;
- locked = false;
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- /*
- * Make sure the in-memory chunk gets flushed do not push a
- * switch, because we don't want to create a new in-memory
- * chunk if the tree is being used read-only now.
- */
- WT_ERR(__wt_lsm_manager_push_entry(session,
- WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
- } else {
- /*
- * If there is no chunk to flush, go straight to the
- * compacting state.
- */
- compacting = true;
- progress = lsm_tree->merge_progressing;
- F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
- __wt_verbose(session, WT_VERB_LSM,
- "COMPACT: Start compacting %s", lsm_tree->name);
- locked = false;
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- }
-
- /* Wait for the work unit queues to drain. */
- while (lsm_tree->active) {
- /*
- * The flush flag is cleared when the chunk has been flushed.
- * Continue to push forced flushes until the chunk is on disk.
- * Once it is on disk move to the compacting phase.
- */
- if (flushing) {
- WT_ASSERT(session, chunk != NULL);
- if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
- __wt_verbose(session,
- WT_VERB_LSM,
- "Compact flush done %s chunk %" PRIu32 ". "
- "Start compacting progress %" PRIu64,
- name, chunk->id,
- lsm_tree->merge_progressing);
- (void)__wt_atomic_sub32(&chunk->refcnt, 1);
- flushing = ref = false;
- compacting = true;
- F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
- progress = lsm_tree->merge_progressing;
- } else {
- __wt_verbose(session, WT_VERB_LSM,
- "Compact flush retry %s chunk %" PRIu32,
- name, chunk->id);
- WT_ERR(__wt_lsm_manager_push_entry(session,
- WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
- lsm_tree));
- }
- }
-
- /*
- * The compacting flag is cleared when no merges can be done.
- * Ensure that we push through some aggressive merges before
- * stopping otherwise we might not do merges that would
- * span chunks with different generations.
- */
- if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
- if (lsm_tree->merge_aggressiveness < 10 ||
- (progress < lsm_tree->merge_progressing) ||
- lsm_tree->merge_syncing) {
- progress = lsm_tree->merge_progressing;
- F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
- lsm_tree->merge_aggressiveness = 10;
- } else
- break;
- }
-
- /*
- * Periodically check if we've timed out or eviction is stuck.
- * Quit if eviction is stuck, we're making the problem worse.
- */
- WT_ERR(__wt_session_compact_check_timeout(session));
- if (__wt_cache_stuck(session))
- WT_ERR(EBUSY);
- __wt_sleep(1, 0);
-
- /*
- * Push merge operations while they are still getting work
- * done. If we are pushing merges, make sure they are
- * aggressive, to avoid duplicating effort.
- */
- if (compacting)
-#define COMPACT_PARALLEL_MERGES 5
- for (i = lsm_tree->queue_ref;
- i < COMPACT_PARALLEL_MERGES; i++) {
- lsm_tree->merge_aggressiveness = 10;
- WT_ERR(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_MERGE, 0, lsm_tree));
- }
- }
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ uint64_t progress;
+ uint32_t i;
+ bool compacting, flushing, locked, push_flush, ref;
+
+ compacting = flushing = locked = ref = false;
+ chunk = NULL;
+ /*
+ * This function is applied to all matching sources: ignore anything that is not an LSM tree.
+ */
+ if (!WT_PREFIX_MATCH(name, "lsm:"))
+ return (0);
+
+ /* Tell __wt_schema_worker not to look inside the LSM tree. */
+ *skipp = true;
+
+ WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree));
+
+ if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
+ WT_ERR_MSG(session, EINVAL, "LSM compaction requires active merge threads");
+
+ /*
+ * There is no work to do if there is only a single chunk in the tree and it has a bloom filter
+ * or is configured to never have a bloom filter.
+ */
+ if (lsm_tree->nchunks == 1 && (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
+ F_ISSET(lsm_tree->chunk[0], WT_LSM_CHUNK_BLOOM))) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (0);
+ }
+
+ /*
+ * Compacting has two distinct phases.
+ * 1. All in-memory chunks up to and including the current
+     *    chunk must be flushed. Normally, the flush code
+ * does not flush the last, in-use chunk, so we set a force
+ * flag to include that last chunk. We monitor the state of the
+ * last chunk and periodically push another forced flush work
+ * unit until it is complete.
+ * 2. After all flushing is done, we move onto the merging
+ * phase for compaction. Again, we monitor the state and
+ * continue to push merge work units until all merging is done.
+ */
+
+ /* Lock the tree: single-thread compaction. */
+ __wt_lsm_tree_writelock(session, lsm_tree);
+ locked = true;
+
+ /* Clear any merge throttle: compact throws out that calculation. */
+ lsm_tree->merge_throttle = 0;
+ lsm_tree->merge_aggressiveness = 0;
+ progress = lsm_tree->merge_progressing;
+
+ /* If another thread started a compact on this tree, we're done. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ goto err;
+
+ /*
+ * Set the switch transaction on the current chunk, if it hasn't been set before. This prevents
+     * further writes, so it can be flushed by the checkpoint worker. If this is a newly opened
+     * tree, the primary chunk may already be stable. Only push a flush work unit if necessary.
+ */
+ push_flush = false;
+ if (lsm_tree->nchunks > 0 && (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL &&
+ !F_ISSET(chunk, (WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE))) {
+ push_flush = true;
+ if (chunk->switch_txn == WT_TXN_NONE) {
+ /*
+ * Make sure any cursors open on the tree see the new switch generation before updating.
+ */
+ ++lsm_tree->dsk_gen;
+ WT_FULL_BARRIER();
+ chunk->switch_txn = __wt_txn_id_alloc(session, false);
+ }
+ /*
+         * If we have a chunk, we want to wait for it to reach disk, so add a reference to keep it
+         * available.
+ */
+ (void)__wt_atomic_add32(&chunk->refcnt, 1);
+ ref = true;
+ }
+
+ if (push_flush) {
+ __wt_verbose(session, WT_VERB_LSM,
+ "Compact force flush %s flags 0x%" PRIx32 " chunk %" PRIu32 " flags 0x%" PRIx32, name,
+ lsm_tree->flags, chunk->id, chunk->flags);
+ flushing = true;
+ locked = false;
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ /*
+         * Make sure the in-memory chunk gets flushed, but do not push a switch: we don't want to
+         * create a new in-memory chunk if the tree is now being used read-only.
+ */
+ WT_ERR(
+ __wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
+ } else {
+ /*
+ * If there is no chunk to flush, go straight to the compacting state.
+ */
+ compacting = true;
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ __wt_verbose(session, WT_VERB_LSM, "COMPACT: Start compacting %s", lsm_tree->name);
+ locked = false;
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ }
+
+ /* Wait for the work unit queues to drain. */
+ while (lsm_tree->active) {
+ /*
+ * The flush flag is cleared when the chunk has been flushed. Continue to push forced
+ * flushes until the chunk is on disk. Once it is on disk move to the compacting phase.
+ */
+ if (flushing) {
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ __wt_verbose(session, WT_VERB_LSM, "Compact flush done %s chunk %" PRIu32
+ ". "
+ "Start compacting progress %" PRIu64,
+ name, chunk->id, lsm_tree->merge_progressing);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
+ flushing = ref = false;
+ compacting = true;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ progress = lsm_tree->merge_progressing;
+ } else {
+ __wt_verbose(
+ session, WT_VERB_LSM, "Compact flush retry %s chunk %" PRIu32, name, chunk->id);
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
+ }
+ }
+
+ /*
+ * The compacting flag is cleared when no merges can be done. Ensure that we push through
+         * some aggressive merges before stopping; otherwise we might not do merges that would
+         * span chunks with different generations.
+ */
+ if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
+ if (lsm_tree->merge_aggressiveness < 10 || (progress < lsm_tree->merge_progressing) ||
+ lsm_tree->merge_syncing) {
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 10;
+ } else
+ break;
+ }
+
+ /*
+         * Periodically check if we've timed out or eviction is stuck. Quit if eviction is stuck;
+         * we're making the problem worse.
+ */
+ WT_ERR(__wt_session_compact_check_timeout(session));
+ if (__wt_cache_stuck(session))
+ WT_ERR(EBUSY);
+ __wt_sleep(1, 0);
+
+ /*
+ * Push merge operations while they are still getting work done. If we are pushing merges,
+ * make sure they are aggressive, to avoid duplicating effort.
+ */
+ if (compacting)
+#define COMPACT_PARALLEL_MERGES 5
+ for (i = lsm_tree->queue_ref; i < COMPACT_PARALLEL_MERGES; i++) {
+ lsm_tree->merge_aggressiveness = 10;
+ WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
err:
- /* Ensure anything we set is cleared. */
- if (ref)
- (void)__wt_atomic_sub32(&chunk->refcnt, 1);
- if (compacting) {
- F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
- lsm_tree->merge_aggressiveness = 0;
- }
- if (locked)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
-
- __wt_verbose(session, WT_VERB_LSM,
- "Compact %s complete, return %d", name, ret);
-
- __wt_lsm_tree_release(session, lsm_tree);
- return (ret);
+ /* Ensure anything we set is cleared. */
+ if (ref)
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
+ if (compacting) {
+ F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 0;
+ }
+ if (locked)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+
+ __wt_verbose(session, WT_VERB_LSM, "Compact %s complete, return %d", name, ret);
+
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
}
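
Since the function errors out unless WT_CONN_LSM_MERGE is set, callers need a connection with
LSM merging enabled. A hedged sketch of the public entry point; the home directory, URI and
configuration strings are illustrative:

#include <wiredtiger.h>

static int
compact_example(void)
{
    WT_CONNECTION *conn;
    WT_SESSION *session;

    /* Merge threads must be running for LSM compaction to make progress. */
    (void)wiredtiger_open("WT_HOME", NULL, "create,lsm_manager=(merge=true)", &conn);
    (void)conn->open_session(conn, NULL, NULL, &session);

    /* Give up after two minutes rather than waiting indefinitely. */
    (void)session->compact(session, "lsm:bucket", "timeout=120");
    return (conn->close(conn, NULL));
}
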
/*
* __wt_lsm_tree_worker --
- * Run a schema worker operation on each level of a LSM tree.
+ * Run a schema worker operation on each level of a LSM tree.
*/
int
-__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
- const char *uri,
- int (*file_func)(WT_SESSION_IMPL *, const char *[]),
- int (*name_func)(WT_SESSION_IMPL *, const char *, bool *),
- const char *cfg[], uint32_t open_flags)
+__wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags)
{
- WT_DECL_RET;
- WT_LSM_CHUNK *chunk;
- WT_LSM_TREE *lsm_tree;
- u_int i;
- bool exclusive, locked, need_release;
-
- WT_NOT_READ(locked, false);
- WT_NOT_READ(need_release, false);
- exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
-
- WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
- need_release = true;
-
- /*
- * We mark that we're busy using the tree to coordinate
- * with merges so that merging doesn't change the chunk
- * array out from underneath us.
- */
- if (exclusive)
- __wt_lsm_tree_writelock(session, lsm_tree);
- else
- __wt_lsm_tree_readlock(session, lsm_tree);
- locked = true;
- for (i = 0; i < lsm_tree->nchunks; i++) {
- chunk = lsm_tree->chunk[i];
- /*
- * If the chunk is on disk, don't include underlying handles in
- * the checkpoint. Checking the "get handles" function is all
- * we need to do, no further checkpoint calls are done if the
- * handle is not gathered.
- */
- if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
- file_func == __wt_checkpoint_get_handles)
- continue;
- WT_ERR(__wt_schema_worker(session, chunk->uri,
- file_func, name_func, cfg, open_flags));
- if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
- WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
- file_func, name_func, cfg, open_flags));
- }
- /*
- * If this was an alter operation, we need to alter the configuration
- * for the overall tree and then reread it so it isn't out of date.
- * Reread it here so that we update the configuration of the
- * current tree's structure to any new, altered values.
- */
- if (FLD_ISSET(open_flags, WT_BTREE_ALTER)) {
- WT_ERR(__wt_lsm_meta_write(session, lsm_tree, cfg[0]));
-
- locked = false;
- if (exclusive)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- else
- __wt_lsm_tree_readunlock(session, lsm_tree);
-
- /*
- * We rewrote the meta-data. Discard the tree and the next
- * access will reopen it.
- */
- need_release = false;
- WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
- ret = __lsm_tree_discard(session, lsm_tree, false));
- WT_ERR(ret);
- }
-
-err: if (locked) {
- if (exclusive)
- __wt_lsm_tree_writeunlock(session, lsm_tree);
- else
- __wt_lsm_tree_readunlock(session, lsm_tree);
- }
- if (need_release)
- __wt_lsm_tree_release(session, lsm_tree);
- return (ret);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ bool exclusive, locked, need_release;
+
+ WT_NOT_READ(locked, false);
+ WT_NOT_READ(need_release, false);
+ exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
+
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
+ need_release = true;
+
+ /*
+ * We mark that we're busy using the tree to coordinate with merges so that merging doesn't
+ * change the chunk array out from underneath us.
+ */
+ if (exclusive)
+ __wt_lsm_tree_writelock(session, lsm_tree);
+ else
+ __wt_lsm_tree_readlock(session, lsm_tree);
+ locked = true;
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ /*
+ * If the chunk is on disk, don't include underlying handles in the checkpoint. Checking the
+ * "get handles" function is all we need to do, no further checkpoint calls are done if the
+ * handle is not gathered.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && file_func == __wt_checkpoint_get_handles)
+ continue;
+ WT_ERR(__wt_schema_worker(session, chunk->uri, file_func, name_func, cfg, open_flags));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_worker(session, chunk->bloom_uri, file_func, name_func, cfg, open_flags));
+ }
+ /*
+     * If this was an alter operation, we need to rewrite the configuration for the overall tree,
+     * then reread it here so it isn't out of date and the current tree's structure picks up any
+     * new, altered values.
+ */
+ if (FLD_ISSET(open_flags, WT_BTREE_ALTER)) {
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree, cfg[0]));
+
+ locked = false;
+ if (exclusive)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ else
+ __wt_lsm_tree_readunlock(session, lsm_tree);
+
+ /*
+ * We rewrote the meta-data. Discard the tree and the next access will reopen it.
+ */
+ need_release = false;
+ WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_ERR(ret);
+ }
+
+err:
+ if (locked) {
+ if (exclusive)
+ __wt_lsm_tree_writeunlock(session, lsm_tree);
+ else
+ __wt_lsm_tree_readunlock(session, lsm_tree);
+ }
+ if (need_release)
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
}
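
The worker applies file_func to every chunk URI (and Bloom URI) through __wt_schema_worker. A
hedged sketch of a callback matching the file_func signature; the function name and message are
illustrative:

/* Invoked once per underlying file, with the data handle set in the session. */
static int
__example_file_func(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_UNUSED(cfg);
    __wt_verbose(session, WT_VERB_LSM, "schema worker visiting %s", session->dhandle->name);
    return (0);
}
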