diff options
author | Michael Cahill <mjc@wiredtiger.com> | 2013-12-10 20:47:25 -0800 |
---|---|---|
committer | Michael Cahill <mjc@wiredtiger.com> | 2013-12-10 20:47:25 -0800 |
commit | b470a4ced35f7dc76ebaa4ea5a70862c46d31146 (patch) | |
tree | 4f3c379b15500505bd1a91f0dfd05ba86afdd3d7 /src | |
parent | a6daa15df6d4ff3e306fc08d0f751d8581ad47a8 (diff) | |
parent | de3729860eaea10cded87cc1599a23466b35eb43 (diff) | |
download | mongo-b470a4ced35f7dc76ebaa4ea5a70862c46d31146.tar.gz |
Merge pull request #792 from wiredtiger/compact-lsm
Allow LSM trees to be compacted.
Diffstat (limited to 'src')
-rw-r--r-- | src/config/config_def.c | 9 | ||||
-rw-r--r-- | src/docs/compact.dox | 16 | ||||
-rw-r--r-- | src/include/compact.h | 12 | ||||
-rw-r--r-- | src/include/extern.h | 2 | ||||
-rw-r--r-- | src/include/lsm.h | 10 | ||||
-rw-r--r-- | src/include/session.h | 1 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 7 | ||||
-rw-r--r-- | src/include/wt_internal.h | 3 | ||||
-rw-r--r-- | src/lsm/lsm_merge.c | 27 | ||||
-rw-r--r-- | src/lsm/lsm_tree.c | 46 | ||||
-rw-r--r-- | src/lsm/lsm_worker.c | 6 | ||||
-rw-r--r-- | src/schema/schema_worker.c | 5 | ||||
-rw-r--r-- | src/session/session_api.c | 15 | ||||
-rw-r--r-- | src/session/session_compact.c | 124 |
14 files changed, 219 insertions, 64 deletions
diff --git a/src/config/config_def.c b/src/config/config_def.c index 1ea0a9e4d71..056f2218af1 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -115,6 +115,11 @@ static const WT_CONFIG_CHECK confchk_session_checkpoint[] = { { NULL, NULL, NULL, NULL } }; +static const WT_CONFIG_CHECK confchk_session_compact[] = { + { "timeout", "int", NULL, NULL}, + { NULL, NULL, NULL, NULL } +}; + static const WT_CONFIG_CHECK confchk_lsm_subconfigs[] = { { "auto_throttle", "boolean", NULL, NULL }, { "bloom", "boolean", NULL, NULL }, @@ -355,8 +360,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { NULL }, { "session.compact", - "", - NULL + "timeout=1200", + confchk_session_compact }, { "session.create", "allocation_size=4KB,block_allocation=best,block_compressor=," diff --git a/src/docs/compact.dox b/src/docs/compact.dox index 634bb98d9b5..c68415b5324 100644 --- a/src/docs/compact.dox +++ b/src/docs/compact.dox @@ -1,13 +1,13 @@ /*! @page compaction Compaction -The WT_SESSION::compact method can be used to compact a row- or column-store -btree (log-structured merge trees cannot be compacted). +The WT_SESSION::compact method can be used to compact btree and log-structured +merge tree data sources. -The data source does not have to be quiescent, compaction may be performed on -a live data source. - -Because checkpoints named by the application are not discarded until -explicitly removed or replaced, they may prevent WT_SESSION::compact -from accomplishing anything. +The data source does not have to be quiescent, compaction may be performed on a +live data source. +Because checkpoints named by the application are not discarded until explicitly +removed or replaced, they may prevent WT_SESSION::compact from accomplishing +anything. + * */ diff --git a/src/include/compact.h b/src/include/compact.h new file mode 100644 index 00000000000..f799b75ebca --- /dev/null +++ b/src/include/compact.h @@ -0,0 +1,12 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. 
+ * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +struct __wt_compact { + uint32_t lsm_count; /* Number of LSM trees seen */ + uint32_t file_count; /* Number of files seen */ + uint64_t max_time; /* Configured timeout */ +}; diff --git a/src/include/extern.h b/src/include/extern.h index 4374a80dc2c..34cc0ae382b 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -899,6 +899,7 @@ extern int __wt_lsm_tree_lock( WT_SESSION_IMPL *session, int exclusive); extern int __wt_lsm_tree_unlock( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name); extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, @@ -1280,6 +1281,7 @@ extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp); +extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); diff --git a/src/include/lsm.h b/src/include/lsm.h index cd957c9d96f..9000e4cb226 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -91,6 +91,7 @@ struct __wt_lsm_tree { long throttle_sleep; /* Rate limiting */ uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */ + uint64_t merge_progressing; /* Bumped when merges are active */ /* Configuration parameters */ uint32_t bloom_bit_count; @@ -123,10 +124,11 @@ struct __wt_lsm_tree { size_t old_alloc; /* Space allocated for old chunks */ u_int nold_chunks; /* Number of old chunks */ -#define WT_LSM_TREE_NEED_SWITCH 0x01 -#define WT_LSM_TREE_OPEN 0x02 -#define WT_LSM_TREE_THROTTLE 0x04 -#define WT_LSM_TREE_WORKING 0x08 +#define WT_LSM_TREE_COMPACTING 0x01 +#define WT_LSM_TREE_NEED_SWITCH 0x02 +#define WT_LSM_TREE_OPEN 0x04 +#define WT_LSM_TREE_THROTTLE 0x08 +#define WT_LSM_TREE_WORKING 0x10 uint32_t flags; }; 
diff --git a/src/include/session.h b/src/include/session.h index 5c31280f979..a8c215b0e9c 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -70,6 +70,7 @@ struct __wt_session_impl { TAILQ_HEAD(__cursors, __wt_cursor) cursors; WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ + WT_COMPACT *compact; /* Compact state */ WT_BTREE *metafile; /* Metadata file */ void *meta_track; /* Metadata operation tracking */ diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 8d68b3240ad..0a1c7f3156c 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -901,7 +901,12 @@ struct __wt_session { * @param session the session handle * @param name the URI of the object to compact, such as * \c "table:stock" - * @configempty{session.compact, see dist/api_data.py} + * @configstart{session.compact, see dist/api_data.py} + * @config{timeout, maximum amount of time to allow for compact in + * seconds. The actual amount of time spent in compact may exceed the + * configured value. 
A value of zero disables the timeout., an integer; + * default \c 1200.} + * @configend * @errors */ int __F(compact)(WT_SESSION *session, diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index d36656be947..3377f4d721e 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -83,6 +83,8 @@ struct __wt_col_rle; typedef struct __wt_col_rle WT_COL_RLE; struct __wt_colgroup; typedef struct __wt_colgroup WT_COLGROUP; +struct __wt_compact; + typedef struct __wt_compact WT_COMPACT; struct __wt_condvar; typedef struct __wt_condvar WT_CONDVAR; struct __wt_config; @@ -237,6 +239,7 @@ struct __wt_update; #include "btree.h" #include "cache.h" #include "config.h" +#include "compact.h" #include "cursor.h" #include "dlh.h" #include "error.h" diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index eebcf02c93f..f92b8610f80 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -61,6 +61,7 @@ __wt_lsm_merge( uint32_t generation, start_id; uint64_t insert_count, record_count, chunk_size; u_int dest_id, end_chunk, i, merge_min, nchunks, start_chunk; + u_int max_generation_gap; int create_bloom; const char *cfg[3]; @@ -71,13 +72,15 @@ __wt_lsm_merge( start_id = 0; /* - * If the tree is open read-only, be very aggressive. Otherwise, we - * can spend a long time waiting for merges to start in read-only - * applications. + * If the tree is open read-only or we are compacting, be very + * aggressive. Otherwise, we can spend a long time waiting for merges + * to start in read-only applications. */ - if (!lsm_tree->modified) + if (!lsm_tree->modified || + F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) aggressive = 10; merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min; + max_generation_gap = 1 + aggressive / 5; /* * If there aren't any chunks to merge, or some of the chunks aren't @@ -153,13 +156,16 @@ __wt_lsm_merge( break; /* - * If we have enough chunks for a merge and the next chunk is - * in a different generation, stop. 
+ * In normal operation, if we have enough chunks for a merge + * and the next chunk is in a different generation, stop. + * In aggressive mode, look for the biggest merge we can do. */ if (nchunks >= merge_min) { previous = lsm_tree->chunk[start_chunk]; - if (chunk->generation > previous->generation && - previous->generation <= youngest->generation + 1) + if (previous->generation <= + youngest->generation + max_generation_gap && + chunk->generation > + previous->generation + max_generation_gap - 1) break; } @@ -188,7 +194,8 @@ __wt_lsm_merge( * generations. */ if (nchunks < merge_min || - chunk->generation > youngest->generation + 1) { + chunk->generation > + youngest->generation + max_generation_gap) { for (i = 0; i < nchunks; i++) F_CLR(lsm_tree->chunk[start_chunk + i], WT_LSM_CHUNK_MERGING); @@ -262,6 +269,7 @@ __wt_lsm_merge( WT_ERR(EINTR); WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL); + ++lsm_tree->merge_progressing; } WT_ERR(src->get_key(src, &key)); @@ -276,6 +284,7 @@ __wt_lsm_merge( WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL); + ++lsm_tree->merge_progressing; WT_VERBOSE_ERR(session, lsm, "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.", record_count, insert_count); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 7f95ccff9de..135916afa3d 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -927,6 +927,52 @@ __wt_lsm_tree_unlock( } /* + * __wt_lsm_compact -- + * Compact an LSM tree called via __wt_schema_worker. + */ +int +__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name) +{ + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + uint64_t last_merge_progressing; + time_t begin, end; + + /* Ignore non LSM names. 
*/ + if (!WT_PREFIX_MATCH(name, "lsm:")) + return (0); + + WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree)); + + if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) || + lsm_tree->merge_threads == 0) + WT_RET_MSG(session, EINVAL, + "LSM compaction requires active merge threads"); + + WT_RET(__wt_seconds(session, &begin)); + + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + + /* Wake up the merge threads. */ + WT_RET(__wt_cond_signal(session, lsm_tree->work_cond)); + + /* Now wait for merge activity to stop. */ + do { + last_merge_progressing = lsm_tree->merge_progressing; + __wt_sleep(1, 0); + WT_RET(__wt_seconds(session, &end)); + if (session->compact->max_time > 0 && + session->compact->max_time < (uint64_t)(end - begin)) + WT_ERR(ETIMEDOUT); + } while (lsm_tree->merge_progressing != last_merge_progressing && + lsm_tree->nchunks > 1); + +err: F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING); + + return (ret); +} + +/* * __wt_lsm_tree_worker -- * Run a schema worker operation on each level of a LSM tree. */ diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index d6c5011c33e..3cfb4119b85 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -445,7 +445,11 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force")); bloom = NULL; - + /* + * This is merge-like activity, and we don't want compacts to give up + * because we are creating a bunch of bloom filters before merging. 
+ */ + ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index 7c774e178d8..60143f97f77 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -52,9 +52,8 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(__wt_schema_worker(session, idx->source, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "lsm:")) { - if (file_func != __wt_compact) - WT_ERR(__wt_lsm_tree_worker(session, - uri, file_func, name_func, cfg, open_flags)); + WT_ERR(__wt_lsm_tree_worker(session, + uri, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_SKIP(tablename, "table:")) { WT_ERR(__wt_schema_get_table(session, tablename, strlen(tablename), 0, &table)); diff --git a/src/session/session_api.c b/src/session/session_api.c index 133348f2a01..67d43f39ff1 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -382,31 +382,19 @@ static int __session_compact(WT_SESSION *wt_session, const char *uri, const char *config) { WT_SESSION_IMPL *session; - WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; /* Disallow objects in the WiredTiger name space. */ WT_RET(__wt_schema_name_check(session, uri)); - /* Compaction makes no sense for LSM objects, ignore requests. */ - if (WT_PREFIX_MATCH(uri, "lsm:")) - return (0); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && + !WT_PREFIX_MATCH(uri, "lsm:") && !WT_PREFIX_MATCH(uri, "table:")) return (__wt_bad_object_type(session, uri)); - /* - * Compaction requires checkpoints, which will fail in a transactional - * context. Check now so the error message isn't confusing. 
- */ - if (F_ISSET(txn, TXN_RUNNING)) - WT_RET_MSG(session, EINVAL, - "Compaction not permitted in a transaction"); - return (__wt_session_compact(wt_session, uri, config)); } @@ -693,6 +681,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) WT_TXN *txn; session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; WT_STAT_FAST_CONN_INCR(session, txn_checkpoint); diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 03d519004ec..d3f5be6b71b 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -96,63 +96,141 @@ */ /* - * __session_compact_worker -- - * Worker function to do the actual compaction call. + * __wt_compact_uri_analyze -- + * Extract information relevant to deciding what work compact needs to + * do from a URI that is part of a table schema. + * Called via the schema_worker function. */ -static int -__session_compact_worker( - WT_SESSION *wt_session, const char *uri, const char *config) +int +__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri) { - WT_DECL_RET; - WT_SESSION_IMPL *session; + /* + * Add references to schema URI objects to the list of objects to be + * compacted. + */ + if (WT_PREFIX_MATCH(uri, "lsm:")) + session->compact->lsm_count++; + else if (WT_PREFIX_MATCH(uri, "file:")) + session->compact->file_count++; - session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, compact, config, cfg); + return (0); +} + +/* + * __session_compact_check_timeout -- + * Check if the timeout has been exceeded. 
+ */ +static int +__session_compact_check_timeout( + WT_SESSION_IMPL *session, struct timespec begin) +{ + struct timespec end; - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, __wt_compact, NULL, cfg, 0)); + if (session->compact->max_time == 0) + return (0); -err: API_END_NOTFOUND_MAP(session, ret); + WT_RET(__wt_epoch(session, &end)); + if (session->compact->max_time < + WT_TIMEDIFF(end, begin) / WT_BILLION) + WT_RET(ETIMEDOUT); + return (0); } /* - * __wt_session_compact -- - * Function to alternate between checkpoints and compaction calls. + * __compact_file -- + * */ -int -__wt_session_compact( - WT_SESSION *wt_session, const char *uri, const char *config) +static int +__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) { - WT_DECL_ITEM(t); WT_DECL_RET; - WT_SESSION_IMPL *session; + WT_DECL_ITEM(t); + WT_SESSION *wt_session; + WT_TXN *txn; int i; + struct timespec start_time; - session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; + wt_session = &session->iface; + + /* + * File compaction requires checkpoints, which will fail in a + * transactional context. Check now so the error message isn't + * confusing. + */ + if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING)) + WT_ERR_MSG(session, EINVAL, + " File compaction not permitted in a transaction"); /* * Force the checkpoint: we don't want to skip it because the work we * need to have done is done in the underlying block manager. */ - WT_RET(__wt_scr_alloc(session, 128, &t)); + WT_ERR(__wt_scr_alloc(session, 128, &t)); WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + WT_ERR(__wt_epoch(session, &start_time)); + /* * We compact 10% of the file on each pass, try 10 times (which is - * probably overkill), and quit if we make no progress. + * probably overkill), and quit if we make no progress. Check for a + * timeout each time through the loop. 
+ */ for (i = 0; i < 10; ++i) { WT_ERR(wt_session->checkpoint(wt_session, t->data)); session->compaction = 0; - WT_ERR(__session_compact_worker(wt_session, uri, config)); + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker( + session, uri, __wt_compact, NULL, cfg, 0)); + WT_ERR(ret); if (!session->compaction) break; WT_ERR(wt_session->checkpoint(wt_session, t->data)); WT_ERR(wt_session->checkpoint(wt_session, t->data)); + WT_ERR(__session_compact_check_timeout(session, start_time)); } err: __wt_scr_free(&t); return (ret); } + +/* + * __wt_session_compact -- + * Function to alternate between checkpoints and compaction calls. + */ +int +__wt_session_compact( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_COMPACT compact; + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + SESSION_API_CALL(session, compact, config, cfg); + + /* Set up the structure in the session handle */ + memset(&compact, 0, sizeof(WT_COMPACT)); + session->compact = &compact; + + WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); + session->compact->max_time = (uint64_t)cval.val; + + /* Find the types of data sources that are being compacted. */ + WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( + session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); + WT_ERR(ret); + + if (session->compact->lsm_count != 0) + WT_ERR(__wt_schema_worker( + session, uri, NULL, __wt_lsm_compact, cfg, 0)); + if (session->compact->file_count != 0) + WT_ERR(__compact_file(session, uri, cfg)); + +err: session->compact = NULL; + API_END_NOTFOUND_MAP(session, ret); + return (ret); +} |