diff options
author | Michael Cahill <mjc@wiredtiger.com> | 2012-09-14 06:56:21 -0700 |
---|---|---|
committer | Michael Cahill <mjc@wiredtiger.com> | 2012-09-14 06:56:21 -0700 |
commit | 0abd4252e3db7d623b25077fb5b0c1811cf454ed (patch) | |
tree | 67a14705d9a5e120a1f70a7489595e1c7a24b2f5 | |
parent | 016a035d3acefda6ee14e5fbac215a1bd28bb98f (diff) | |
parent | 45f1893bd8ef5bc327893f7662861e73e23f4d9d (diff) | |
download | mongo-0abd4252e3db7d623b25077fb5b0c1811cf454ed.tar.gz |
Merge pull request #325 from wiredtiger/lsm
Add support for LSM trees
closes #168
70 files changed, 2780 insertions, 240 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 9d9804b3a94..143a031a7aa 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -79,8 +79,20 @@ format_meta = column_meta + [ type='format'), ] +lsm_config = [ + Config('lsm_chunk_size', '2MB', r''' + the maximum size of the in-memory chunk of an LSM tree''', + min='512K',max='500MB'), + Config('lsm_bloom_hash_count', '4', r''' + the number of hash values per item used for LSM bloom filters.''', + min='2',max='100'), + Config('lsm_bloom_bit_count', '8', r''' + the number of bits used per item for LSM bloom filters.''', + min='2',max='1000'), +] + # Per-file configuration -file_config = format_meta + [ +file_config = format_meta + lsm_config + [ Config('allocation_size', '512B', r''' the file unit allocation size, in bytes, must a power-of-two; smaller values decrease the file space required by overflow @@ -215,6 +227,7 @@ connection_runtime_config = [ 'evictserver', 'fileops', 'hazard', + 'lsm', 'mutex', 'read', 'readserver', @@ -438,6 +451,7 @@ flags = { 'VERB_evict', 'VERB_evictserver', 'VERB_fileops', + 'VERB_lsm', 'VERB_hazard', 'VERB_mutex', 'VERB_read', diff --git a/dist/filelist b/dist/filelist index 1749cf047d7..e2ed4911683 100644 --- a/dist/filelist +++ b/dist/filelist @@ -66,6 +66,12 @@ src/cursor/cur_std.c src/cursor/cur_table.c src/log/log.c src/log/log_desc.c +src/lsm/lsm_cursor.c +src/lsm/lsm_dsrc.c +src/lsm/lsm_merge.c +src/lsm/lsm_meta.c +src/lsm/lsm_tree.c +src/lsm/lsm_worker.c src/meta/meta_api.c src/meta/meta_apply.c src/meta/meta_ckpt.c diff --git a/dist/s_funcs.list b/dist/s_funcs.list index b614aa5552d..574d97d42b6 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -2,6 +2,7 @@ WT_CURDUMP_PASS __bit_ffs __bit_nclr +__wt_bloom_drop __wt_bm_addr_stderr __wt_btree_lex_compare __wt_config_getone diff --git a/dist/s_string.ok b/dist/s_string.ok index 678f31e7b1b..d3e661b4cfc 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -104,7 +104,6 @@ LRU LSB LSM LSN -LSN's LSNs LeafGreen Llqr @@ -271,6 +270,7 @@ ckptfrag ckptlist cksum clr +clsm cmd cmp cnt diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index ba63970abf5..09a95d3aaee 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -523,12 +523,13 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session) /*! [WT_DATA_SOURCE create] */ static int my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config) + const char *name, int exclusive, const char *config) { /* Unused parameters */ (void)dsrc; (void)session; (void)name; + (void)exclusive; (void)config; return (0); @@ -538,13 +539,13 @@ my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session, /*! [WT_DATA_SOURCE drop] */ static int my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config) + const char *name, const char *cfg[]) { /* Unused parameters */ (void)dsrc; (void)session; (void)name; - (void)config; + (void)cfg; return (0); } @@ -553,16 +554,14 @@ my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session, /*! [WT_DATA_SOURCE open_cursor] */ static int my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *obj, WT_CURSOR *old_cursor, const char *config, - WT_CURSOR **new_cursor) + const char *obj, const char *cfg[], WT_CURSOR **new_cursor) { /* Unused parameters */ (void)dsrc; (void)session; (void)obj; - (void)old_cursor; - (void)config; + (void)cfg; (void)new_cursor; return (0); @@ -572,44 +571,29 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session, /*! [WT_DATA_SOURCE rename] */ static int my_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *oldname, const char *newname, const char *config) + const char *oldname, const char *newname, const char *cfg[]) { /* Unused parameters */ (void)dsrc; (void)session; (void)oldname; (void)newname; - (void)config; + (void)cfg; return (0); } /*! [WT_DATA_SOURCE rename] */ -/*! [WT_DATA_SOURCE sync] */ -static int -my_sync(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config) -{ - /* Unused parameters */ - (void)dsrc; - (void)session; - (void)name; - (void)config; - - return (0); -} -/*! [WT_DATA_SOURCE sync] */ - /*! [WT_DATA_SOURCE truncate] */ static int my_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config) + const char *name, const char *cfg[]) { /* Unused parameters */ (void)dsrc; (void)session; (void)name; - (void)config; + (void)cfg; return (0); } @@ -626,7 +610,6 @@ add_data_source(WT_CONNECTION *conn) my_drop, my_open_cursor, my_rename, - my_sync, my_truncate }; ret = conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL); diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index f21de05ac88..657cb56e04b 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -14,6 +14,10 @@ static int __bloom_init( WT_SESSION_IMPL *, const char *, const char *, WT_BLOOM **); static int __bloom_setup(WT_BLOOM *, uint64_t, uint64_t, uint32_t, uint32_t); +/* + * __bloom_init -- + * Allocate a WT_BLOOM handle. + */ static int __bloom_init(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_BLOOM **bloomp) @@ -51,10 +55,12 @@ err: if (bloom->uri != NULL) } /* - * Populate the bloom structure. - * Setup is passed in either the count of items expected (n), or the length - * of the bitstring (m). Depends on whether the function is called via create - * or open. + * __bloom_setup -- + * Populate the bloom structure. + * + * Setup is passed in either the count of items expected (n), or the length of + * the bitstring (m). Depends on whether the function is called via create or + * open. */ static int __bloom_setup( @@ -77,8 +83,8 @@ __bloom_setup( /* * __wt_bloom_create -- * - * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory - * to use while populating the bloom filter. + * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to + * use while populating the bloom filter. * * count - is the expected number of inserted items * factor - is the number of bits to use per inserted item @@ -101,28 +107,28 @@ __wt_bloom_create( /* * __wt_bloom_open -- - * Open a Bloom filter object for use by a single session. The filter must have - * been created and finalized. + * Open a Bloom filter object for use by a single session. The filter must + * have been created and finalized. */ int __wt_bloom_open(WT_SESSION_IMPL *session, - const char *uri, uint32_t factor, uint32_t k, WT_BLOOM **bloomp) + const char *uri, uint32_t factor, uint32_t k, + WT_CURSOR *owner, WT_BLOOM **bloomp) { WT_BLOOM *bloom; WT_CURSOR *c; - WT_SESSION *wt_session; + const char *cfg[] = API_CONF_DEFAULTS(session, open_cursor, NULL); uint64_t size; - wt_session = (WT_SESSION *)session; - WT_RET(__bloom_init(session, uri, NULL, &bloom)); /* Find the largest key, to get the size of the filter. */ - WT_RET(wt_session->open_cursor(wt_session, bloom->uri, NULL, NULL, &c)); + cfg[1] = bloom->config; + WT_RET(__wt_curfile_open(session, bloom->uri, owner, cfg, &c)); WT_RET(c->prev(c)); WT_RET(c->get_key(c, &size)); - WT_RET(c->close(c)); + bloom->c = c; WT_RET(__bloom_setup(bloom, 0, size, factor, k)); *bloomp = bloom; @@ -131,7 +137,7 @@ __wt_bloom_open(WT_SESSION_IMPL *session, /* * __wt_bloom_insert -- - * Adds the given key to the Bloom filter. + * Adds the given key to the Bloom filter. */ int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key) @@ -149,8 +155,8 @@ __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key) /* * __wt_bloom_finalize -- - * Writes the Bloom filter to stable storage. After calling finalize, only - * read operations can be performed on the bloom filter. + * Writes the Bloom filter to stable storage. After calling finalize, only + * read operations can be performed on the bloom filter. */ int __wt_bloom_finalize(WT_BLOOM *bloom) @@ -183,8 +189,8 @@ __wt_bloom_finalize(WT_BLOOM *bloom) /* * __wt_bloom_get -- - * Tests whether the given key is in the Bloom filter. - * Returns zero if found, WT_NOTFOUND if not. + * Tests whether the given key is in the Bloom filter. + * Returns zero if found, WT_NOTFOUND if not. */ int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key) @@ -243,7 +249,7 @@ err: /* Don't return WT_NOTFOUND from a failed search. */ /* * __wt_bloom_close -- - * Close the Bloom filter, release any resources. + * Close the Bloom filter, release any resources. */ int __wt_bloom_close(WT_BLOOM *bloom) @@ -265,7 +271,7 @@ __wt_bloom_close(WT_BLOOM *bloom) /* * __wt_bloom_drop -- - * Drop a Bloom filter, release any resources. + * Drop a Bloom filter, release any resources. */ int __wt_bloom_drop(WT_BLOOM *bloom, const char *config) diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index a9baa586bb3..34c2f7d3e3d 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -576,6 +576,14 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop) session, page, WT_REC_SINGLE)); break; case WT_SYNC_DISCARD_NOWRITE: + /* + * When we discard the root page, clear the reference + * from the btree handle. It is important to do this + * here, so that future eviction doesn't see root_page + * pointing to freed memory. + */ + if (WT_PAGE_IS_ROOT(page)) + session->btree->root_page = NULL; __wt_page_out(session, &page, 0); break; } @@ -734,13 +742,18 @@ __evict_walk(WT_SESSION_IMPL *session) i = WT_EVICT_WALK_BASE; TAILQ_FOREACH(btree, &conn->btqh, q) { /* - * Skip files marked as cache-resident, and files involved (or - * potentially involved), in a bulk load. The real problem is + * Skip files that aren't open or don't have a root page. + * + * Also skip files marked as cache-resident, and files + * potentially involved in a bulk load. The real problem is * eviction doesn't want to be walking the file as it converts * to a bulk-loaded object, and empty trees aren't worth trying * to evict, anyway. */ - if (btree->cache_resident || btree->bulk_load_ok) + if (!F_ISSET(btree, WT_BTREE_OPEN) || + btree->root_page == NULL || + F_ISSET(btree, WT_BTREE_NO_EVICTION) || + btree->bulk_load_ok) continue; /* Reference the correct WT_BTREE handle. */ diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 24e7e50bf75..a1602173498 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -210,11 +210,14 @@ __btree_conf(WT_SESSION_IMPL *session) /* Eviction; the metadata file is never evicted. */ if (strcmp(btree->name, WT_METADATA_URI) == 0) - btree->cache_resident = 1; + F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD); else { WT_RET(__wt_config_getones( session, config, "cache_resident", &cval)); - btree->cache_resident = cval.val ? 1 : 0; + if (cval.val) + F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD); + else + F_CLR(btree, WT_BTREE_NO_EVICTION); } /* Huffman encoding */ @@ -403,6 +406,42 @@ __wt_btree_leaf_create( } /* + * __wt_btree_get_memsize -- + * Access the size of an in-memory tree with a single leaf page. + */ +int +__wt_btree_get_memsize( + WT_SESSION_IMPL *session, WT_BTREE *btree, uint32_t **memsizep) +{ + WT_PAGE *root, *child; + + WT_UNUSED(session); + root = btree->root_page; + child = root->u.intl.t->page; + + if (root->entries != 1 || child == NULL) { + *memsizep = NULL; + return (WT_ERROR); + } + + *memsizep = &child->memory_footprint; + F_SET(btree, WT_BTREE_NO_EVICTION); + return (0); +} + +/* + * __wt_btree_release_memsize -- + * Release a cache-resident tree. + */ +int +__wt_btree_release_memsize(WT_SESSION_IMPL *session, WT_BTREE *btree) +{ + WT_UNUSED(session); + F_CLR(btree, WT_BTREE_NO_EVICTION); + return (0); +} + +/* * __btree_get_last_recno -- * Set the last record number for a column-store. */ diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 321d05608d2..8184bdaf058 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -70,20 +70,10 @@ __wt_bt_cache_flush(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) case WT_SYNC: break; case WT_SYNC_DISCARD: + case WT_SYNC_DISCARD_NOWRITE: /* If discarding the tree, the root page should be gone. */ WT_ASSERT(session, btree->root_page == NULL); break; - case WT_SYNC_DISCARD_NOWRITE: - /* - * XXX - * I'm not sure this is the right place to do this, but it's - * the point in the btree engine where we know the root page - * is gone. Unlike WT_SYNC_DISCARD, which writes, evicts and - * discards the root page, WT_SYNC_DISCARD_NOWRITE simply - * discards the pages, which means "eviction" never happens. - */ - btree->root_page = NULL; - break; } return (0); diff --git a/src/config/config_def.c b/src/config/config_def.c index 3fb6d54e39b..4b26ecd97b5 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -92,7 +92,7 @@ __wt_confchk_connection_reconfigure[] = { { "eviction_target", "int", "min=10,max=99" }, { "eviction_trigger", "int", "min=10,max=99" }, { "verbose", "list", "choices=[\"block\",\"ckpt\",\"evict\"," - "\"evictserver\",\"fileops\",\"hazard\",\"mutex\",\"read\"," + "\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\"," "\"readserver\",\"reconcile\",\"salvage\",\"verify\",\"write\"]" }, { NULL, NULL, NULL } }; @@ -112,7 +112,8 @@ __wt_confdfl_file_meta = "checksum=,collator=,columns=,dictionary=0,huffman_key=," "huffman_value=,internal_item_max=0,internal_key_truncate=," "internal_page_max=2KB,key_format=u,key_gap=10,leaf_item_max=0," - "leaf_page_max=1MB,prefix_compression=,split_pct=75,type=btree," + "leaf_page_max=1MB,lsm_bloom_bit_count=8,lsm_bloom_hash_count=4," + "lsm_chunk_size=2MB,prefix_compression=,split_pct=75,type=btree," "value_format=u,version=(major=0,minor=0)"; WT_CONFIG_CHECK @@ -134,6 +135,9 @@ __wt_confchk_file_meta[] = { { "key_gap", "int", "min=0" }, { "leaf_item_max", "int", "min=0" }, { "leaf_page_max", "int", "min=512B,max=512MB" }, + { "lsm_bloom_bit_count", "int", "min=2,max=1000" }, + { "lsm_bloom_hash_count", "int", "min=2,max=100" }, + { "lsm_chunk_size", "int", "min=512K,max=500MB" }, { "prefix_compression", "boolean", NULL }, { "split_pct", "int", "min=25,max=100" }, { "type", "string", "choices=[\"btree\"]" }, @@ -208,6 +212,7 @@ __wt_confdfl_session_create = "filename=,huffman_key=,huffman_value=,internal_item_max=0," "internal_key_truncate=,internal_page_max=2KB,key_format=u," "key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=1MB," + "lsm_bloom_bit_count=8,lsm_bloom_hash_count=4,lsm_chunk_size=2MB," "prefix_compression=,split_pct=75,type=btree,value_format=u," "value_format=u"; @@ -234,6 +239,9 @@ __wt_confchk_session_create[] = { { "key_gap", "int", "min=0" }, { "leaf_item_max", "int", "min=0" }, { "leaf_page_max", "int", "min=512B,max=512MB" }, + { "lsm_bloom_bit_count", "int", "min=2,max=1000" }, + { "lsm_bloom_hash_count", "int", "min=2,max=100" }, + { "lsm_chunk_size", "int", "min=512K,max=500MB" }, { "prefix_compression", "boolean", NULL }, { "split_pct", "int", "min=25,max=100" }, { "type", "string", "choices=[\"btree\"]" }, @@ -394,7 +402,7 @@ __wt_confchk_wiredtiger_open[] = { { "transactional", "boolean", NULL }, { "use_environment_priv", "boolean", NULL }, { "verbose", "list", "choices=[\"block\",\"ckpt\",\"evict\"," - "\"evictserver\",\"fileops\",\"hazard\",\"mutex\",\"read\"," + "\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\"," "\"readserver\",\"reconcile\",\"salvage\",\"verify\",\"write\"]" }, { NULL, NULL, NULL } }; diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index f69f0771b81..1133c428fa6 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -319,6 +319,9 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config) if (!F_ISSET(s, WT_SESSION_INTERNAL)) __wt_free(session, s->hazard); + /* Clean up open LSM handles. */ + WT_ERR(__wt_lsm_cleanup(&conn->iface)); + /* Close open btree handles. */ WT_TRET(__wt_conn_btree_discard(conn)); @@ -730,6 +733,7 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "evictserver",WT_VERB_evictserver }, { "fileops", WT_VERB_fileops }, { "hazard", WT_VERB_hazard }, + { "lsm", WT_VERB_lsm }, { "mutex", WT_VERB_mutex }, { "read", WT_VERB_read }, { "readserver", WT_VERB_readserver }, @@ -929,6 +933,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* If there's a hot-backup file, load it. */ WT_ERR(__wt_metadata_load_backup(session)); + /* + * XXX LSM initialization. + * This is structured so that it could be moved to an extension. + */ + WT_ERR(__wt_lsm_init(&conn->iface, NULL)); + STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c index 116ee404ccc..c39d0fd423f 100644 --- a/src/conn/conn_btree.c +++ b/src/conn/conn_btree.c @@ -244,10 +244,7 @@ __wt_conn_btree_get(WT_SESSION_IMPL *session, WT_STAT_INCR(conn->stats, file_open); - if (session->btree != NULL) - WT_RET(__conn_btree_open_lock(session, flags)); - else - WT_RET(__conn_btree_get(session, name, ckpt, flags)); + WT_RET(__conn_btree_get(session, name, ckpt, flags)); btree = session->btree; diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 0434feae0d1..4c44d6fe9cf 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -25,6 +25,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) TAILQ_INIT(&conn->collqh); /* Collator list */ TAILQ_INIT(&conn->compqh); /* Compressor list */ + TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */ + /* Statistics. */ WT_RET(__wt_stat_alloc_connection_stats(session, &conn->stats)); diff --git a/src/cursor/cur_bulk.c b/src/cursor/cur_bulk.c index ed14beded58..1c6c54783a6 100644 --- a/src/cursor/cur_bulk.c +++ b/src/cursor/cur_bulk.c @@ -48,8 +48,10 @@ __curbulk_close(WT_CURSOR *cursor) CURSOR_API_CALL_NOCONF(cursor, session, close, btree); WT_TRET(__wt_bulk_end(cbulk)); - if (session->btree != NULL) + if (btree != NULL) { + WT_ASSERT(session, session->btree == btree); WT_TRET(__wt_session_release_btree(session)); + } /* The URI is owned by the btree handle. */ cursor->uri = NULL; WT_TRET(__wt_cursor_close(cursor)); diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index db6660e1ba8..5dd91d33c16 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -68,7 +68,7 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, /* __wt_cursor_init is last so we don't have to clean up on error. */ STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0); - WT_ERR(__wt_cursor_init(cursor, uri, 0, cfg, cursorp)); + WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); if (0) { err: __wt_free(session, cconfig); diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 4838c33c9d8..01545c97d2f 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -435,7 +435,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session, WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); /* __wt_cursor_init is last so we don't have to clean up on error. */ - WT_ERR(__wt_cursor_init(cursor, cursor->uri, 0, cfg, cursorp)); + WT_ERR(__wt_cursor_init(cursor, cursor->uri, NULL, cfg, cursorp)); if (0) { err: (void)__curindex_close(cursor); diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index def8d3864a0..34b703246a2 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -45,7 +45,7 @@ The following are command-specific options for the \c backup command: @par <code>-t uri</code> By default, the \c backup command does a hot backup of the entire database; the \c -t option changes the \c backup command to do a hot -backup of only the listed \c file: and \c table: objects. +backup of only the named objects. <hr> @section util_create wt create diff --git a/src/docs/cursors.dox b/src/docs/cursors.dox index 302dabcf5d2..faf78c1a446 100644 --- a/src/docs/cursors.dox +++ b/src/docs/cursors.dox @@ -63,6 +63,8 @@ table cursor (key=table key\, value=table value)} file cursor (key=file key\, value=file value)} @row{<tt>index:\<tablename\>.\<indexname\></tt>, index cursor (key=index key\, value=table value)} + @row{<tt>lsm:\<name\></tt>, +LSM cursor (key=LSM key\, value=LSM value), see @ref lsm} @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>, database or file statistics (key=(int)\, value=(string)description\, (string)value\, (uint64_t)value)} @@ -97,4 +99,9 @@ WT_CURSOR::set_key and WT_CURSOR::set_value in raw mode, the WT_ITEM should be equivalent to calling ::wiredtiger_struct_pack for the cursor's \c key_format or \c value_format, respectively. +@section cursor_random Random lookup + +Cursors can be configured to return pseudo-random records from row-store +objects. See @subpage cursor_random for details. + */ diff --git a/src/docs/data_sources.dox b/src/docs/data_sources.dox index 3a879a2cb94..ec2cd682f2f 100644 --- a/src/docs/data_sources.dox +++ b/src/docs/data_sources.dox @@ -32,12 +32,14 @@ file cursor (key=file key\, value=file value)} index cursor (key=index key\, value=table value)} @row{<tt>join:\<cursor1\>\&\<cursor2\>[&\<cursor3\>...]</tt>, join cursor @notyet{join cursors}} + @row{<tt>lsm:\<name\></tt>, +LSM cursor (key=LSM key\, value=LSM value), see @ref lsm} @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>, database or file statistics (key=(int)\, value=(string)description\, (string)value\, (uint64_t)value)} </table> -@subsection hot_backup_cursors Hot backup cursors +@subsection data_backup Hot backup cursors See @ref hot_backup for more information. diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox index ee9022247b7..0b4c0f21592 100644 --- a/src/docs/file-formats.dox +++ b/src/docs/file-formats.dox @@ -61,7 +61,7 @@ additional CPU and memory use when searching the in-memory tree (if keys are encoded), and additional CPU and memory use when returning values from the in-memory tree and when writing pages to disk. Note the additional CPU cost of Huffman encoding can be high, and should be -considered. (See @ref huffman for details) +considered. (See @subpage huffman for details) - Stream compression reduces the size requirement of on-disk objects by compressing blocks of the backing object's file. The cost is additional diff --git a/src/docs/lsm.dox b/src/docs/lsm.dox new file mode 100644 index 00000000000..356f6777988 --- /dev/null +++ b/src/docs/lsm.dox @@ -0,0 +1,147 @@ +/*! @page lsm Log-Structured Merge Trees + +@section lsm_background Background + +A common requirement is sustained throughput under a workload that consists +of random inserts, where either the key range is chosen so that inserts are +very unlikely to conflict (e.g., 128-bit hashes), or where inserts are +expected to overwrite existing values. + +With traditional btree variants, inserts are very fast while the data set +remains in cache, but once the tree overflows the cache, performance drops +significantly. There are two factors involved: + +1. once the data fills the cache, new inserts have some probability of going +to a page that is not in cache, requiring a read; and +2. the cache is full of dirty pages, so pages have to be written to free +space in the cache before the read can be satisfied. + +@section lsm_description Description of LSM trees + +Log-Structured Merge Trees were described in this paper by Patrick O'Neil1 , +Edward Cheng, Dieter Gawlick and Elizabeth O'Neil1: +http://www.cs.umb.edu/~poneil/lsmtree.pdf + +A logical tree is split into several physical pieces so that the +most-recently-updated portion of data is in a tree that fits entirely in +memory. The size of the in-memory chunk can be configured with the +\c "lsm_chunk_size" configuration key to WT_SESSION::create. + +Once the in-memory tree reaches a threshold size, a new in-memory tree is +created and the old tree synced to disk. Once written to disk, trees are +read-only, though they are merged in the background with other on-disk +trees to reduce the cost of reads. + +With this structure, "blind writes" can be performed entirely on the +in-memory tree. Deletes are implemented by inserting a special "tombstone" +record into the in-memory tree. + +@section lsm_api Interface to LSM trees + +An LSM tree can be created as follows, in much the same way as a +WiredTiger btree file: + +@code +session->create(session, "lsm:bucket", "key_format=S,value_format=S"); +@endcode + +Once created, the LSM tree can be accessed using the same cursor interface +as other data sources in WiredTiger: + +@code +WT_CURSOR *c; + +session->open_cursor(session, "lsm:bucket", NULL, NULL, &c); +for(;;) { + c->set_key("key"); + c->set_value("data"); + c->insert(); +} +@endcode + +Unlike ordinary file cursors, LSM cursors default to \c overwrite mode, where: + +- WT_CURSOR::insert will update existing values without checking; +- WT_CURSOR::update will insert values regardless of whether they exist; and +- WT_CURSOR::remove will succeed regardless of whether the specified record + exists. + +This behavior can be disabled by passing \c "overwrite=false" to +WT_SESSION::open_cursor, but the result will be a search through the levels +of the LSM tree before every modification. + +@section lsm_merge Merging + +A background thread is opened for each active LSM tree. This thread is +responsible for both writing old chunks to stable storage, and for merging +multiple chunks together so that reads can be satisfied from a small number +of files. There is currently no way to configure merges: they are performed +automatically by the background thread. + +@section lsm_bloom Bloom filters + +WiredTiger creates a Bloom filter when merging. This is an additional file +that contains a configurable number of bits per key (default 8). Keys are +hashed a configurable number of times (default 4), and the corresponding +bits set. The Bloom filter is used to avoid reading from a chunk if the key +cannot be present. + +With the defaults, they Bloom filter only requires one byte per key, so it +usually fits in cache. The Bloom parameters can be configured with +\c "lsm_bloom_bit_count" and \c "lsm_bloom_hash_count" configuration keys to +WT_SESSION::create. + +@section lsm_caveats Caveats + +@subsection lsm_hazard Hazard configuration + +Reads from an LSM cursor may need to position a cursor in each active chunk. +The number of chunks depends on the chunk size, and how many chunks have +been merged. There must be at least as many hazard references available as +there are chunks in the tree. The number of hazard references is configured +with the \c "hazard_max" configuration key to ::wiredtiger_open. + +@subsection lsm_schema Creating tables using LSM trees + +It is not yet possible to create tables or indices using LSM trees for +storage. This will be addressed in a future release of WiredTiger. + +Schema support will be provided for LSM as with an extension to the +WT_SESSION::create method: + +@code +session->create(session, "table:T", "type=lsm"); +@endcode + +The default type for all schema objects will continue to be btree. + +@subsection lsm_tombstones Empty values + +Internally, WiredTiger's LSM trees use an empty value to represent a +record that has been removed (also known as a "tombstone"). For this +reason, applications cannot store records in LSM trees with empty values. + +@subsection lsm_txn Transactional access + +There are currently some significant limitations in transactional access to +data stored in LSM trees: + +- if an update transaction runs for long enough that the chunk it started + writing to has been replaced, its data may not be included in the + checkpoint when that chunk is swapped to "on disk" mode, and thus may not + become visible to readers of the tree immediately when the transaction + commits; + +- update transactions running at snapshot isolation may be permitted to + commit if they conflict with an update in a concurrent transaction and + the current chunk has been replaced; + +- read-only transactions running at snapshot isolation may read newer + changes after a chunk is written to stable storage; + +- named checkpoints are not fully supported on LSM trees: recent updates to + the tree may not appear in the checkpoint; + +We intend to address these limitations in future releases. + + */ diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 6e84632aad8..c2eb1f5c196 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -11,6 +11,7 @@ WiredTiger applications: - @subpage schema - @subpage file_formats +- @subpage lsm - @subpage transactions - @subpage checkpoints diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 1d707526ab9..5004c29958d 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -4,6 +4,7 @@ Atomicity BLOBs CFLAGS CPPFLAGS +Cheng DB's DBTs DbCursor @@ -12,6 +13,7 @@ DbMultiple EB EmpId FreeBSD +Gawlick GCC GitHub IEC @@ -158,6 +160,7 @@ loadtext logc lookup lrtf +lsm lsn lt mailto diff --git a/src/include/api.h b/src/include/api.h index d2d588f031c..1beccb5c45c 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -201,10 +201,11 @@ struct __wt_connection_impl { WT_FH *lock_fh; /* Lock file handle */ pthread_t cache_evict_tid; /* Cache eviction server thread ID */ - pthread_t cache_read_tid; /* Cache read server thread ID */ /* Locked: btree list */ TAILQ_HEAD(__wt_btree_qh, __wt_btree) btqh; + /* Locked: LSM handle list. */ + TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh; /* Locked: file list */ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh; @@ -230,10 +231,8 @@ struct __wt_connection_impl { uint32_t session_cnt; /* Session count */ /* - * WiredTiger allocates space for 15 hazard references in each thread of - * control, by default. There's no code path that requires more than 15 - * pages at a time (and if we find one, the right change is to increase - * the default). + * WiredTiger allocates space for a fixed number of hazard references + * in each thread of control. */ uint32_t hazard_size; /* Hazard array size */ @@ -318,7 +317,7 @@ struct __wt_connection_impl { #define CURSOR_API_CALL_NOCONF(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - API_CALL_NOCONF(s, cursor, n, cur, bt); \ + API_CALL_NOCONF(s, cursor, n, cur, bt) /******************************************* * Global variables. @@ -341,12 +340,13 @@ extern WT_PROCESS __wt_process; #define WT_SESSION_INTERNAL 0x00000004 #define WT_SESSION_SALVAGE_QUIET_ERR 0x00000002 #define WT_SESSION_SCHEMA_LOCKED 0x00000001 -#define WT_VERB_block 0x00001000 -#define WT_VERB_ckpt 0x00000800 -#define WT_VERB_evict 0x00000400 -#define WT_VERB_evictserver 0x00000200 -#define WT_VERB_fileops 0x00000100 -#define WT_VERB_hazard 0x00000080 +#define WT_VERB_block 0x00002000 +#define WT_VERB_ckpt 0x00001000 +#define WT_VERB_evict 0x00000800 +#define WT_VERB_evictserver 0x00000400 +#define WT_VERB_fileops 0x00000200 +#define WT_VERB_hazard 0x00000100 +#define WT_VERB_lsm 0x00000080 #define WT_VERB_mutex 0x00000040 #define WT_VERB_read 0x00000020 #define WT_VERB_readserver 0x00000010 diff --git a/src/include/btree.h b/src/include/btree.h index 8a049de541c..7211e8b0704 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -118,7 +118,6 @@ struct __wt_btree { WT_PAGE *evict_page; /* Eviction thread's location */ volatile uint32_t lru_count; /* Count of threads in LRU eviction */ - int cache_resident; /* If no eviction on this object */ WT_BTREE_STATS *stats; /* Btree statistics */ @@ -126,10 +125,12 @@ struct __wt_btree { #define WT_BTREE_DISCARD 0x0002 /* Discard on release */ #define WT_BTREE_EXCLUSIVE 0x0004 /* Need exclusive access to handle */ #define WT_BTREE_LOCK_ONLY 0x0008 /* Handle is only needed for locking */ -#define WT_BTREE_OPEN 0x0020 /* Handle is open */ -#define WT_BTREE_SALVAGE 0x0040 /* Handle is for salvage */ -#define WT_BTREE_UPGRADE 0x0080 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x0100 /* Handle is for verify */ +#define WT_BTREE_NO_EVICTION 0x0010 /* Disable eviction */ +#define WT_BTREE_NO_HAZARD 0x0020 /* Disable hazard references */ +#define WT_BTREE_OPEN 0x0040 /* Handle is open */ +#define WT_BTREE_SALVAGE 0x0080 /* Handle is for salvage */ +#define WT_BTREE_UPGRADE 0x0100 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x0200 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/cache.i b/src/include/cache.i index 3ed1a660c63..79b99653a78 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -48,8 +48,13 @@ __wt_eviction_page_check(WT_SESSION_IMPL *session, WT_PAGE *page) conn = S2C(session); mod = page->modify; - /* Root pages and clean pages are never forcibly evicted. */ - if (WT_PAGE_IS_ROOT(page) || !__wt_page_is_modified(page)) + /* + * Root pages and clean pages are never forcibly evicted. + * Nor are pages from files that are purely cache resident. + */ + if (WT_PAGE_IS_ROOT(page) || + !__wt_page_is_modified(page) || + F_ISSET(session->btree, WT_BTREE_NO_EVICTION)) return (0); /* Check the page's memory footprint. */ diff --git a/src/include/extern.h b/src/include/extern.h index ea1cb091fe7..403069a0920 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -234,6 +234,7 @@ extern int __wt_bloom_open(WT_SESSION_IMPL *session, const char *uri, uint32_t factor, uint32_t k, + WT_CURSOR *owner, WT_BLOOM **bloomp); extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key); extern int __wt_bloom_finalize(WT_BLOOM *bloom); @@ -307,6 +308,11 @@ extern int __wt_btree_leaf_create( WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, WT_PAGE **pagep); +extern int __wt_btree_get_memsize( WT_SESSION_IMPL *session, + WT_BTREE *btree, + uint32_t **memsizep); +extern int __wt_btree_release_memsize(WT_SESSION_IMPL *session, + WT_BTREE *btree); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session, const char *config); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); @@ -659,6 +665,60 @@ extern int __wt_log_printf(WT_SESSION_IMPL *session, 2, 3))); extern WT_LOGREC_DESC __wt_logdesc_debug; +extern int __wt_clsm_init_merge(WT_CURSOR *cursor, int nchunks); +extern int __wt_clsm_open(WT_SESSION_IMPL *session, + const char *uri, + const char *cfg[], + WT_CURSOR **cursorp); +extern int __wt_lsm_init(WT_CONNECTION *wt_conn, const char *config); +extern int __wt_lsm_cleanup(WT_CONNECTION *wt_conn); +extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, + int nchunks, + WT_LSM_CHUNK **chunkp); +extern int __wt_lsm_major_merge(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session); +extern int __wt_lsm_tree_bloom_name( WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, + int i, + WT_ITEM *buf); +extern int __wt_lsm_tree_chunk_name( WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, + int i, + WT_ITEM *buf); +extern int __wt_lsm_tree_create_chunk( WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, + int i, + const char **urip); +extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session, + const char *uri, + int exclusive, + const char *config); +extern int __wt_lsm_tree_get( WT_SESSION_IMPL *session, + const char *uri, + WT_LSM_TREE **treep); +extern int __wt_lsm_tree_switch( WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session, + const char *name, + const char *cfg[]); +extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session, + const char *oldname, + const char *newname, + const char *cfg[]); +extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session, + const char *name, + const char *cfg[]); +extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, + const char *uri, + int (*func)(WT_SESSION_IMPL *, + const char *[]), + const char *cfg[], + uint32_t open_flags); +extern void *__wt_lsm_worker(void *arg); extern int __wt_metadata_get(WT_SESSION *session, const char *uri, const char **valuep); @@ -1091,7 +1151,7 @@ extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, 4))); extern int __wt_scr_alloc_func(WT_SESSION_IMPL *session, - uint32_t size, WT_ITEM **scratchp + size_t size, WT_ITEM **scratchp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif diff --git a/src/include/lsm.h b/src/include/lsm.h new file mode 100644 index 00000000000..2308b6c9558 --- /dev/null +++ b/src/include/lsm.h @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +struct __wt_cursor_lsm { + WT_CURSOR iface; + + WT_LSM_TREE *lsm_tree; + uint64_t dsk_gen; + + int nchunks; + WT_BLOOM **blooms; + WT_CURSOR **cursors; + WT_CURSOR *current; /* The current cursor for iteration */ + + WT_LSM_CHUNK *primary_chunk; /* The current primary chunk. */ + +#define WT_CLSM_ITERATE_NEXT 0x01 /* Forward iteration */ +#define WT_CLSM_ITERATE_PREV 0x02 /* Backward iteration */ +#define WT_CLSM_MERGE 0x04 /* Merge cursor, don't update. */ +#define WT_CLSM_MULTIPLE 0x08 /* Multiple cursors have values for the + current key */ +#define WT_CLSM_UPDATED 0x10 /* Cursor has done updates */ + uint32_t flags; +}; + +struct __wt_lsm_chunk { + const char *uri; /* Data source for this chunk. */ + const char *bloom_uri; /* URI of Bloom filter, if any. */ + uint64_t count; /* Approximate count of records. */ + + uint32_t ncursor; /* Cursors with the chunk as primary. */ +#define WT_LSM_CHUNK_ONDISK 0x01 + uint32_t flags; +}; + +struct __wt_lsm_tree { + const char *name, *config, *filename; + const char *key_format, *value_format, *file_config; + + WT_COLLATOR *collator; + + WT_RWLOCK *rwlock; + TAILQ_ENTRY(__wt_lsm_tree) q; + + WT_SPINLOCK lock; + uint64_t dsk_gen; + uint32_t *memsizep; + + /* Configuration parameters */ + uint32_t threshold; + uint32_t bloom_bit_count; + uint32_t bloom_hash_count; + + WT_SESSION_IMPL *worker_session;/* Passed to thread_create */ + pthread_t worker_tid; /* LSM worker thread */ + + int nchunks; /* Number of active chunks */ + int last; /* Last allocated ID. */ + WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */ + size_t chunk_alloc; /* Space allocated for chunks */ + WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ + size_t old_alloc; /* Space allocated for old chunks */ + int nold_chunks; /* Number of old chunks */ + int old_avail; /* Available old chunk slots */ + +#define WT_LSM_TREE_OPEN 0x01 + uint32_t flags; +}; + +struct __wt_lsm_data_source { + WT_DATA_SOURCE iface; + + WT_RWLOCK *rwlock; + + TAILQ_HEAD(__trees, __wt_lsm_tree) trees; +}; diff --git a/src/include/packing.i b/src/include/packing.i index 99f097fbcdd..5896ef5ca3f 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -117,6 +117,7 @@ next: if (pack->cur == pack->end) } #define WT_PACK_GET(session, pv, ap) do { \ + WT_ITEM *__item; \ switch (pv.type) { \ case 'x': \ break; \ @@ -126,7 +127,9 @@ next: if (pack->cur == pack->end) break; \ case 'U': \ case 'u': \ - pv.u.item = *va_arg(ap, WT_ITEM *); \ + __item = va_arg(ap, WT_ITEM *); \ + pv.u.item.data = __item->data; \ + pv.u.item.size = __item->size; \ break; \ case 'b': \ case 'h': \ @@ -390,6 +393,7 @@ __unpack_read(WT_SESSION_IMPL *session, } #define WT_UNPACK_PUT(session, pv, ap) do { \ + WT_ITEM *__item; \ switch (pv.type) { \ case 'x': \ break; \ @@ -399,7 +403,9 @@ __unpack_read(WT_SESSION_IMPL *session, break; \ case 'U': \ case 'u': \ - *va_arg(ap, WT_ITEM *) = pv.u.item; \ + __item = va_arg(ap, WT_ITEM *); \ + __item->data = pv.u.item.data; \ + __item->size = pv.u.item.size; \ break; \ case 'b': \ *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \ diff --git a/src/include/schema.h b/src/include/schema.h index 8b2c7871a22..a3c2c32de2b 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -69,3 +69,10 @@ struct __wt_table { F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ __wt_spin_unlock(session, &S2C(session)->schema_lock); \ } while (0) + +#define WT_WITH_SCHEMA_LOCK_OPT(session, op) do { \ + if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) \ + (op); \ + else \ + WT_WITH_SCHEMA_LOCK(session, op); \ +} while (0) diff --git a/src/include/txn.i b/src/include/txn.i index e11d810316b..ce2e369e461 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -5,6 +5,9 @@ * See the file LICENSE for redistribution information. */ +static inline void __wt_txn_read_first(WT_SESSION_IMPL *session); +static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); + /* * __wt_txn_getid -- * Get a transaction ID for a non-transactional operation. @@ -34,6 +37,14 @@ __wt_txn_getid(WT_SESSION_IMPL *session) id == WT_TXN_NONE || id == WT_TXN_ABORTED); txn->id = id; + + /* + * We allocated a new transaction ID for updates without an explicit + * transaction. To ensure that our previous updates are visible, + * update our transaction context (if required). + */ + __wt_txn_read_last(session); + __wt_txn_read_first(session); } /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index c7cb00fb2ba..367b52a55cd 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -657,6 +657,12 @@ struct __wt_session { * significant for applications wanting to maximize sequential data * transfer from a storage device.,an integer between 512B and 512MB; * default \c 1MB.} + * @config{lsm_bloom_bit_count, the number of bits used per item for LSM + * bloom filters..,an integer between 2 and 1000; default \c 8.} + * @config{lsm_bloom_hash_count, the number of hash values per item used + * for LSM bloom filters..,an integer between 2 and 100; default \c 4.} + * @config{lsm_chunk_size, the maximum size of the in-memory chunk of an + * LSM tree.,an integer between 512K and 500MB; default \c 2MB.} * @config{prefix_compression, configure row-store format key prefix * compression.,a boolean flag; default \c true.} * @config{split_pct, the Btree page split size as a percentage of the @@ -964,8 +970,8 @@ struct __wt_connection { * given as a list\, such as * <code>"verbose=[evictserver\,read]"</code>.,a list\, with values * chosen from the following options: \c "block"\, \c "ckpt"\, \c - * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c - * "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c + * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "lsm"\, + * \c "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c * "salvage"\, \c "verify"\, \c "write"; default empty.} * @configend * @errors @@ -1160,9 +1166,9 @@ struct __wt_connection { * @config{verbose, enable messages for various events. Options are given as a * list\, such as <code>"verbose=[evictserver\,read]"</code>.,a list\, with * values chosen from the following options: \c "block"\, \c "ckpt"\, \c - * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "mutex"\, \c - * "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c "verify"\, \c - * "write"; default empty.} + * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "lsm"\, \c + * "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c + * "verify"\, \c "write"; default empty.} * @configend * Additionally, if a file named \c WiredTiger.config appears in the WiredTiger * home directory, it is read for configuration values (see @ref config_file @@ -1538,43 +1544,35 @@ struct __wt_data_source { * @snippet ex_all.c WT_DATA_SOURCE create */ int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config); + const char *name, int exclusive, const char *config); /*! Callback to drop an object. * * @snippet ex_all.c WT_DATA_SOURCE drop */ int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config); + const char *name, const char *cfg[]); /*! Callback to initialize a cursor. * * @snippet ex_all.c WT_DATA_SOURCE open_cursor */ int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *obj, WT_CURSOR *old_cursor, - const char *config, WT_CURSOR **new_cursor); + const char *obj, const char *cfg[], WT_CURSOR **new_cursor); /*! Callback to rename an object. * - * @snippet ex_all.c WT_DATA_SOURCE sync + * @snippet ex_all.c WT_DATA_SOURCE rename */ int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *oldname, const char *newname, const char *config); - - /*! Callback to sync an object. - * - * @snippet ex_all.c WT_DATA_SOURCE sync - */ - int (*sync)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config); + const char *oldname, const char *newname, const char *cfg[]); /*! Callback to truncate an object. * * @snippet ex_all.c WT_DATA_SOURCE truncate */ int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session, - const char *name, const char *config); + const char *name, const char *cfg[]); }; /*! diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index d95b1dbef05..b0e58ea1682 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -103,6 +103,8 @@ struct __wt_cursor_dump; typedef struct __wt_cursor_dump WT_CURSOR_DUMP; struct __wt_cursor_index; typedef struct __wt_cursor_index WT_CURSOR_INDEX; +struct __wt_cursor_lsm; + typedef struct __wt_cursor_lsm WT_CURSOR_LSM; struct __wt_cursor_stat; typedef struct __wt_cursor_stat WT_CURSOR_STAT; struct __wt_cursor_table; @@ -127,6 +129,12 @@ struct __wt_insert; typedef struct __wt_insert WT_INSERT; struct __wt_insert_head; typedef struct __wt_insert_head WT_INSERT_HEAD; +struct __wt_lsm_chunk; + typedef struct __wt_lsm_chunk WT_LSM_CHUNK; +struct __wt_lsm_data_source; + typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE; +struct __wt_lsm_tree; + typedef struct __wt_lsm_tree WT_LSM_TREE; struct __wt_named_collator; typedef struct __wt_named_collator WT_NAMED_COLLATOR; struct __wt_named_compressor; @@ -195,6 +203,7 @@ struct __wt_update; #include "api.h" #include "cursor.h" +#include "lsm.h" #include "meta.h" #include "schema.h" diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c new file mode 100644 index 00000000000..10b84a317d4 --- /dev/null +++ b/src/lsm/lsm_cursor.c @@ -0,0 +1,891 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#define FORALL_CURSORS(clsm, c, i) \ + for ((i) = (clsm)->nchunks - 1; (i) >= 0; (i)--) \ + if (((c) = (clsm)->cursors[i]) != NULL) + +#define WT_LSM_CMP(s, lsm_tree, k1, k2, cmp) \ + (((lsm_tree)->collator == NULL) ? \ + (((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \ + (lsm_tree)->collator->compare((lsm_tree)->collator, \ + &(s)->iface, (k1), (k2), &(cmp))) + +#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ + WT_LSM_CMP(s, lsm_tree, &(c1)->key, &(c2)->key, cmp) + +/* + * LSM API enter/leave: check that the cursor is in sync with the tree. + */ +#define WT_LSM_ENTER(clsm, cursor, session, n) \ + clsm = (WT_CURSOR_LSM *)cursor; \ + CURSOR_API_CALL_NOCONF(cursor, session, n, NULL); \ + WT_ERR(__clsm_enter(clsm)) + +#define WT_LSM_END(clsm, session) \ + API_END(session) + +static int __clsm_open_cursors(WT_CURSOR_LSM *); +static int __clsm_search(WT_CURSOR *); + +static inline int +__clsm_enter(WT_CURSOR_LSM *clsm) +{ + if (!F_ISSET(clsm, WT_CLSM_MERGE) && + clsm->dsk_gen != clsm->lsm_tree->dsk_gen) + WT_RET(__clsm_open_cursors(clsm)); + + return (0); +} + +/* + * TODO: use something other than an empty value as a tombstone: we need + * to support empty values from the application. + */ +static WT_ITEM __lsm_tombstone = { "", 0, 0, NULL, 0 }; + +#define WT_LSM_NEEDVALUE(c) do { \ + WT_CURSOR_NEEDVALUE(c); \ + if (__clsm_deleted(&(c)->value)) \ + WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \ +} while (0) + +/* + * __clsm_deleted -- + * Check whether the current value is a tombstone. + */ +static inline int +__clsm_deleted(WT_ITEM *item) +{ + return (item->size == 0); +} + +/* + * __clsm_close_cursors -- + * Close all of the btree cursors currently open. + */ +static int +__clsm_close_cursors(WT_CURSOR_LSM *clsm) +{ + WT_BLOOM *bloom; + WT_CURSOR *c; + int i; + + if (clsm->cursors == NULL) + return (0); + + /* Detach from our old primary. */ + if (clsm->primary_chunk != NULL) { + WT_ATOMIC_SUB(clsm->primary_chunk->ncursor, 1); + clsm->primary_chunk = NULL; + } + + FORALL_CURSORS(clsm, c, i) { + clsm->cursors[i] = NULL; + WT_RET(c->close(c)); + if ((bloom = clsm->blooms[i]) != NULL) { + clsm->blooms[i] = NULL; + WT_RET(__wt_bloom_close(bloom)); + } + } + + clsm->current = NULL; + return (0); +} + +/* + * __clsm_open_cursors -- + * Open cursors for the current set of files. + */ +static int +__clsm_open_cursors(WT_CURSOR_LSM *clsm) +{ + WT_CURSOR *c, **cp; + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + const char *ckpt_cfg[] = { "checkpoint=WiredTigerCheckpoint", NULL }; + int i, nchunks; + + session = (WT_SESSION_IMPL *)clsm->iface.session; + lsm_tree = clsm->lsm_tree; + c = &clsm->iface; + + /* Copy the key, so we don't lose the cursor position. */ + if (F_ISSET(c, WT_CURSTD_KEY_SET)) { + if (c->key.data != c->key.mem) + WT_RET(__wt_buf_set( + session, &c->key, c->key.data, c->key.size)); + F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); + } + + WT_RET(__clsm_close_cursors(clsm)); + + __wt_spin_lock(session, &lsm_tree->lock); + /* Merge cursors have already figured out how many chunks they need. */ + if (F_ISSET(clsm, WT_CLSM_MERGE)) + nchunks = clsm->nchunks; + else + nchunks = lsm_tree->nchunks; + + if (clsm->cursors == NULL || nchunks > clsm->nchunks) { + WT_ERR(__wt_realloc(session, NULL, + nchunks * sizeof(WT_BLOOM *), &clsm->blooms)); + WT_ERR(__wt_realloc(session, NULL, + nchunks * sizeof(WT_CURSOR *), &clsm->cursors)); + } + clsm->nchunks = nchunks; + + for (i = 0, cp = clsm->cursors; i != clsm->nchunks; i++, cp++) { + /* + * Read from the checkpoint if the file has been written. + * Once all cursors switch, the in-memory tree can be evicted. + */ + chunk = lsm_tree->chunk[i]; + WT_ERR(__wt_curfile_open(session, + chunk->uri, &clsm->iface, + F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? ckpt_cfg : NULL, cp)); + if (chunk->bloom_uri != NULL && !F_ISSET(clsm, WT_CLSM_MERGE)) + WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, + lsm_tree->bloom_bit_count, + lsm_tree->bloom_hash_count, + c, &clsm->blooms[i])); + + /* Child cursors always use overwrite and raw mode. */ + F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); + } + + /* The last chunk is our new primary. */ + WT_ASSERT(session, + !F_ISSET(clsm, WT_CLSM_UPDATED) || + !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)); + + clsm->primary_chunk = chunk; + WT_ATOMIC_ADD(clsm->primary_chunk->ncursor, 1); + + /* Peek into the btree layer to track the in-memory size. */ + if (lsm_tree->memsizep == NULL) + (void)__wt_btree_get_memsize( + session, session->btree, &lsm_tree->memsizep); + + clsm->dsk_gen = lsm_tree->dsk_gen; +err: __wt_spin_unlock(session, &lsm_tree->lock); + return (ret); +} + +/* __wt_clsm_init_merge -- + * Initialize an LSM cursor for a (major) merge. + */ +int +__wt_clsm_init_merge(WT_CURSOR *cursor, int nchunks) +{ + WT_CURSOR_LSM *clsm; + + clsm = (WT_CURSOR_LSM *)cursor; + F_SET(clsm, WT_CLSM_MERGE); + clsm->nchunks = nchunks; + + return (__clsm_open_cursors(clsm)); +} + +/* + * __clsm_get_current -- + * Find the smallest / largest of the cursors and copy its key/value. + */ +static int +__clsm_get_current( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp) +{ + WT_CURSOR *c, *current; + int i; + int cmp, multiple; + + current = NULL; + FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET)) + continue; + if (current == NULL) { + cmp = (smallest ? -1 : 1); + } else + WT_RET(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, current, cmp)); + if (smallest ? cmp < 0 : cmp > 0) { + current = c; + multiple = 0; + } else if (cmp == 0) + multiple = 1; + } + + c = &clsm->iface; + if ((clsm->current = current) == NULL) { + F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + return (WT_NOTFOUND); + } + + if (multiple) + F_SET(clsm, WT_CLSM_MULTIPLE); + else + F_CLR(clsm, WT_CLSM_MULTIPLE); + + WT_RET(current->get_key(current, &c->key)); + WT_RET(current->get_value(current, &c->value)); + + if ((*deletedp = __clsm_deleted(&c->value)) == 0) + F_SET(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + else + F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + return (0); +} + +/* + * __clsm_compare -- + * WT_CURSOR->compare implementation for the LSM cursor type. + */ +static int +__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_CURSOR_LSM *alsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int cmp; + + /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */ + alsm = (WT_CURSOR_LSM *)a; + CURSOR_API_CALL_NOCONF(a, session, compare, NULL); + + /* + * Confirm both cursors refer to the same source and have keys, then + * compare the keys. + */ + if (strcmp(a->uri, b->uri) != 0) + WT_ERR_MSG(session, EINVAL, + "comparison method cursors must reference the same object"); + + WT_ERR(WT_CURSOR_NEEDKEY(a)); + WT_ERR(WT_CURSOR_NEEDKEY(b)); + + WT_ERR(WT_LSM_CMP(session, alsm->lsm_tree, &a->key, &b->key, cmp)); + *cmpp = cmp; + +err: API_END(session); + return (ret); +} + +/* + * __clsm_next -- + * WT_CURSOR->next method for the LSM cursor type. + */ +static int +__clsm_next(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int check, cmp, deleted, i; + + WT_LSM_ENTER(clsm, cursor, session, next); + + /* If we aren't positioned for a forward scan, get started. */ + if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) { + F_CLR(clsm, WT_CLSM_MULTIPLE); + FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) { + WT_ERR(c->reset(c)); + ret = c->next(c); + } else if (c != clsm->current) { + c->set_key(c, &cursor->key); + if ((ret = c->search_near(c, &cmp)) == 0) { + if (cmp < 0) + ret = c->next(c); + else if (cmp == 0) { + if (clsm->current == NULL) + clsm->current = c; + else + F_SET(clsm, + WT_CLSM_MULTIPLE); + } + } + } + WT_ERR_NOTFOUND_OK(ret); + } + F_SET(clsm, WT_CLSM_ITERATE_NEXT); + F_CLR(clsm, WT_CLSM_ITERATE_PREV); + + /* We just positioned *at* the key, now move. */ + if (clsm->current != NULL) + goto retry; + } else { +retry: /* + * If there are multiple cursors on that key, move them + * forward. + */ + if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { + check = 0; + FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, + WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET)) + continue; + if (check) { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, clsm->current, + cmp)); + if (cmp == 0) + WT_ERR_NOTFOUND_OK(c->next(c)); + } + if (c == clsm->current) + check = 1; + } + } + + /* Move the smallest cursor forward. */ + c = clsm->current; + WT_ERR_NOTFOUND_OK(c->next(c)); + } + + /* Find the cursor(s) with the smallest key. */ + if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 && + deleted) + goto retry; +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_prev -- + * WT_CURSOR->prev method for the LSM cursor type. + */ +static int +__clsm_prev(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int check, cmp, deleted, i; + + WT_LSM_ENTER(clsm, cursor, session, next); + + /* If we aren't positioned for a reverse scan, get started. */ + if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) { + F_CLR(clsm, WT_CLSM_MULTIPLE); + FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) { + WT_ERR(c->reset(c)); + ret = c->prev(c); + } else if (c != clsm->current) { + c->set_key(c, &cursor->key); + if ((ret = c->search_near(c, &cmp)) == 0) { + if (cmp > 0) + ret = c->prev(c); + else if (cmp == 0) { + if (clsm->current == NULL) + clsm->current = c; + else + F_SET(clsm, + WT_CLSM_MULTIPLE); + } + } + } + WT_ERR_NOTFOUND_OK(ret); + } + F_SET(clsm, WT_CLSM_ITERATE_PREV); + F_CLR(clsm, WT_CLSM_ITERATE_NEXT); + + /* We just positioned *at* the key, now move. */ + if (clsm->current != NULL) + goto retry; + } else { +retry: /* + * If there are multiple cursors on that key, move them + * backwards. + */ + if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { + check = 0; + FORALL_CURSORS(clsm, c, i) { + if (!F_ISSET(c, + WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET)) + continue; + if (check) { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, clsm->current, + cmp)); + if (cmp == 0) + WT_ERR_NOTFOUND_OK(c->prev(c)); + } + if (c == clsm->current) + check = 1; + } + } + + /* Move the smallest cursor backwards. */ + c = clsm->current; + WT_ERR_NOTFOUND_OK(c->prev(c)); + } + + /* Find the cursor(s) with the largest key. */ + if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 && + deleted) + goto retry; +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_reset -- + * WT_CURSOR->reset method for the LSM cursor type. + */ +static int +__clsm_reset(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + WT_LSM_ENTER(clsm, cursor, session, reset); + if ((c = clsm->current) != NULL) { + ret = c->reset(c); + clsm->current = NULL; + } + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_search -- + * WT_CURSOR->search method for the LSM cursor type. + */ +static int +__clsm_search(WT_CURSOR *cursor) +{ + WT_BLOOM *bloom; + WT_CURSOR *c; + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + int i; + + WT_LSM_ENTER(clsm, cursor, session, search); + WT_CURSOR_NEEDKEY(cursor); + FORALL_CURSORS(clsm, c, i) { + /* If there is a Bloom filter, see if we can skip the read. */ + if ((bloom = clsm->blooms[i]) != NULL) { + ret = __wt_bloom_get(bloom, &cursor->key); + if (ret == WT_NOTFOUND) + continue; + WT_RET(ret); + } + c->set_key(c, &cursor->key); + if ((ret = c->search(c)) == 0) { + WT_ERR(c->get_key(c, &cursor->key)); + WT_ERR(c->get_value(c, &cursor->value)); + clsm->current = c; + if (__clsm_deleted(&cursor->value)) + ret = WT_NOTFOUND; + goto done; + } else if (ret != WT_NOTFOUND) + goto err; + } + ret = WT_NOTFOUND; + +done: +err: WT_LSM_END(clsm, session); + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + else + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + return (ret); +} + +/* + * __clsm_search_near -- + * WT_CURSOR->search_near method for the LSM cursor type. + */ +static int +__clsm_search_near(WT_CURSOR *cursor, int *exactp) +{ + WT_CURSOR *c, *larger, *smaller; + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_ITEM v; + WT_SESSION_IMPL *session; + int cmp, deleted, i; + + WT_LSM_ENTER(clsm, cursor, session, search_near); + WT_CURSOR_NEEDKEY(cursor); + + /* + * search_near is somewhat fiddly: we can't just return a nearby key + * from the in-memory chunk because there could be a closer key on + * disk. + * + * As we search down the chunks, we stop as soon as we find an exact + * match. Otherwise, we maintain the smallest cursor larger than the + * search key and the largest cursor smaller than the search key. At + * the bottom, if one of those is set, we use it, otherwise we return + * WT_NOTFOUND. + */ + larger = smaller = NULL; + FORALL_CURSORS(clsm, c, i) { + c->set_key(c, &cursor->key); + if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) { + ret = 0; + continue; + } else if (ret != 0) + goto err; + + WT_ERR(c->get_value(c, &v)); + deleted = __clsm_deleted(&v); + + if (cmp == 0 && !deleted) { + clsm->current = c; + *exactp = 0; + goto done; + } + + /* + * If we land on a deleted item, try going forwards or + * backwards to find one that isn't deleted. + */ + while (deleted && (ret = c->next(c)) == 0) { + cmp = 1; + WT_ERR(c->get_value(c, &v)); + deleted = __clsm_deleted(&v); + } + WT_ERR_NOTFOUND_OK(ret); + while (deleted && (ret = c->prev(c)) == 0) { + cmp = -1; + WT_ERR(c->get_value(c, &v)); + deleted = __clsm_deleted(&v); + } + WT_ERR_NOTFOUND_OK(ret); + if (deleted) + continue; + if (cmp > 0) { + if (larger == NULL) + larger = c; + else { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, larger, cmp)); + if (cmp < 0) + larger = c; + } + } else { + if (smaller == NULL) + smaller = c; + else { + WT_ERR(WT_LSM_CURCMP(session, + clsm->lsm_tree, c, smaller, cmp)); + if (cmp > 0) + smaller = c; + } + } + } + + if (smaller != NULL) { + clsm->current = smaller; + *exactp = -1; + } else if (larger != NULL) { + clsm->current = larger; + *exactp = 1; + } else + ret = WT_NOTFOUND; + +done: +err: WT_LSM_END(clsm, session); + if (ret == 0) { + c = clsm->current; + WT_TRET(c->get_key(c, &cursor->key)); + WT_TRET(c->get_value(c, &cursor->value)); + } + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + else + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + + return (ret); +} + +/* + * __clsm_put -- + * Put an entry into the in-memory tree, trigger a file switch if + * necessary. + */ +static inline int +__clsm_put( + WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, WT_ITEM *key, WT_ITEM *value) +{ + WT_BTREE *btree; + WT_CURSOR *primary; + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + uint32_t *memsizep; + + lsm_tree = clsm->lsm_tree; + + /* + * If this is the first update in this cursor, check if a new in-memory + * chunk is needed. + */ + if (!F_ISSET(clsm, WT_CLSM_UPDATED)) { + __wt_spin_lock(session, &lsm_tree->lock); + if (clsm->dsk_gen == lsm_tree->dsk_gen) + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_lsm_tree_switch(session, lsm_tree)); + __wt_spin_unlock(session, &lsm_tree->lock); + WT_RET(ret); + F_SET(clsm, WT_CLSM_UPDATED); + + /* We changed the structure, or someone else did: update. */ + WT_RET(__clsm_enter(clsm)); + } + + primary = clsm->cursors[clsm->nchunks - 1]; + primary->set_key(primary, key); + primary->set_value(primary, value); + WT_RET(primary->insert(primary)); + + /* + * The count is in a shared structure, but it's only approximate, so + * don't worry about protecting access. + */ + ++clsm->primary_chunk->count; + + /* + * Set the position for future scans. If we were already positioned in + * a non-primary chunk, we may now have multiple cursors matching the + * key. + */ + F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); + clsm->current = primary; + + if ((memsizep = lsm_tree->memsizep) != NULL && + *memsizep > lsm_tree->threshold) { + /* + * Close our cursors: if we are the only open cursor, this + * means the btree handle is unlocked. + * + * XXX this is insufficient if multiple cursors are open, need + * to move some operations (such as clearing the + * "cache_resident" flag) into the worker thread. + */ + btree = ((WT_CURSOR_BTREE *)primary)->btree; + WT_RET(__wt_btree_release_memsize(session, btree)); + + /* + * Take the LSM lock first: we can't acquire it while + * holding the schema lock, or we will deadlock. + */ + __wt_spin_lock(session, &lsm_tree->lock); + /* Make sure we don't race. */ + if (clsm->dsk_gen == lsm_tree->dsk_gen) + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_lsm_tree_switch(session, lsm_tree)); + __wt_spin_unlock(session, &lsm_tree->lock); + } + + return (ret); +} + +/* + * __clsm_insert -- + * WT_CURSOR->insert method for the LSM cursor type. + */ +static int +__clsm_insert(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + WT_LSM_ENTER(clsm, cursor, session, insert); + WT_CURSOR_NEEDKEY(cursor); + WT_LSM_NEEDVALUE(cursor); + + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && + (ret = __clsm_search(cursor)) != WT_NOTFOUND) { + if (ret == 0) + ret = WT_DUPLICATE_KEY; + return (ret); + } + + ret = __clsm_put(session, clsm, &cursor->key, &cursor->value); + +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_update -- + * WT_CURSOR->update method for the LSM cursor type. + */ +static int +__clsm_update(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + WT_LSM_ENTER(clsm, cursor, session, update); + WT_CURSOR_NEEDKEY(cursor); + WT_LSM_NEEDVALUE(cursor); + + if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || + (ret = __clsm_search(cursor)) == 0) + ret = __clsm_put(session, clsm, &cursor->key, &cursor->value); + +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_remove -- + * WT_CURSOR->remove method for the LSM cursor type. + */ +static int +__clsm_remove(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + WT_LSM_ENTER(clsm, cursor, session, remove); + WT_CURSOR_NEEDKEY(cursor); + + if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || + (ret = __clsm_search(cursor)) == 0) + ret = __clsm_put(session, clsm, &cursor->key, &__lsm_tombstone); + +err: WT_LSM_END(clsm, session); + + return (ret); +} + +/* + * __clsm_close -- + * WT_CURSOR->close method for the LSM cursor type. + */ +static int +__clsm_close(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_SESSION_IMPL *session; + + /* + * Don't use the normal __clsm_enter path: that is wasted work when + * closing, and the cursor may never have been used. + */ + clsm = (WT_CURSOR_LSM *)cursor; + CURSOR_API_CALL_NOCONF(cursor, session, close, NULL); + WT_TRET(__clsm_close_cursors(clsm)); + __wt_free(session, clsm->blooms); + __wt_free(session, clsm->cursors); + /* The WT_LSM_TREE owns the URI. */ + cursor->uri = NULL; + WT_TRET(__wt_cursor_close(cursor)); + API_END(session); + + return (ret); +} + +/* + * __wt_clsm_open -- + * WT_SESSION->open_cursor method for LSM cursors. + */ +int +__wt_clsm_open(WT_SESSION_IMPL *session, + const char *uri, const char *cfg[], WT_CURSOR **cursorp) +{ + static WT_CURSOR iface = { + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + __clsm_compare, + __clsm_next, + __clsm_prev, + __clsm_reset, + __clsm_search, + __clsm_search_near, + __clsm_insert, + __clsm_update, + __clsm_remove, + __clsm_close, + { NULL, NULL }, /* TAILQ_ENTRY q */ + 0, /* recno key */ + { 0 }, /* raw recno buffer */ + { NULL, 0, 0, NULL, 0 },/* WT_ITEM key */ + { NULL, 0, 0, NULL, 0 },/* WT_ITEM value */ + 0, /* int saved_err */ + 0 /* uint32_t flags */ + }; + WT_CONFIG_ITEM cval; + WT_CURSOR *cursor; + WT_CURSOR_LSM *clsm; + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + clsm = NULL; + + if (!WT_PREFIX_MATCH(uri, "lsm:")) + return (EINVAL); + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, uri, &lsm_tree)); + + WT_RET(__wt_calloc_def(session, 1, &clsm)); + + cursor = &clsm->iface; + *cursor = iface; + cursor->session = &session->iface; + cursor->uri = lsm_tree->name; + cursor->key_format = lsm_tree->key_format; + cursor->value_format = lsm_tree->value_format; + + clsm->lsm_tree = lsm_tree; + + /* + * The tree's dsk_gen starts at one, so starting the cursor on zero + * will force a call into open_cursors on the first operation. + */ + clsm->dsk_gen = 0; + + STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0); + WT_ERR(__wt_cursor_init(cursor, cursor->uri, NULL, cfg, cursorp)); + + /* + * LSM cursors default to overwrite: if no setting was supplied, turn + * it on. + */ + if (cfg[1] != NULL || __wt_config_getones( + session, cfg[1], "overwrite", &cval) == WT_NOTFOUND) + F_SET(cursor, WT_CURSTD_OVERWRITE); + + if (0) { +err: (void)__clsm_close(cursor); + } + + return (ret); +} diff --git a/src/lsm/lsm_dsrc.c b/src/lsm/lsm_dsrc.c new file mode 100644 index 00000000000..ed627ff9d4d --- /dev/null +++ b/src/lsm/lsm_dsrc.c @@ -0,0 +1,141 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __lsm_create -- + * Implementation of the create operation for LSM trees. + */ +static int +__lsm_create(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session, + const char *uri, int exclusive, const char *config) +{ + WT_SESSION_IMPL *session; + + WT_UNUSED(dsrc); + + session = (WT_SESSION_IMPL *)wt_session; + return (__wt_lsm_tree_create(session, uri, exclusive, config)); +} + +/* + * __lsm_drop -- + * Implementation of the drop operation for LSM trees. + */ +static int +__lsm_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session, + const char *uri, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + + WT_UNUSED(dsrc); + session = (WT_SESSION_IMPL *)wt_session; + + return (__wt_lsm_tree_drop(session, uri, cfg)); +} + +/* + * __lsm_open_cursor -- + * Implementation of the open_cursor operation for LSM trees. + */ +static int +__lsm_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session, + const char *obj, const char *cfg[], WT_CURSOR **new_cursor) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + WT_UNUSED(dsrc); + + return (__wt_clsm_open(session, obj, cfg, new_cursor)); +} + +/* + * __lsm_rename -- + * Implementation of the rename operation for LSM trees. + */ +static int +__lsm_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session, + const char *oldname, const char *newname, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + + WT_UNUSED(dsrc); + session = (WT_SESSION_IMPL *)wt_session; + + if (!WT_PREFIX_MATCH(newname, "lsm:")) + WT_RET_MSG(session, EINVAL, + "rename target type must match URI: %s to %s", + oldname, newname); + + return (__wt_lsm_tree_rename(session, oldname, newname, cfg)); +} + +/* + * __lsm_truncate -- + * Implementation of the truncate operation for LSM trees. + */ +static int +__lsm_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session, + const char *uri, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + + WT_UNUSED(dsrc); + session = (WT_SESSION_IMPL *)wt_session; + + return (__wt_lsm_tree_truncate(session, uri, cfg)); +} + +/* + * __wt_lsm_init -- + * Initialize LSM structures during wiredtiger_open. + */ +int +__wt_lsm_init(WT_CONNECTION *wt_conn, const char *config) +{ + WT_CONNECTION_IMPL *conn; + static WT_LSM_DATA_SOURCE *lsm_dsrc; + WT_SESSION_IMPL *session; + static WT_DATA_SOURCE iface = { + __lsm_create, + __lsm_drop, + __lsm_open_cursor, + __lsm_rename, + __lsm_truncate + }; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + session = conn->default_session; + + WT_RET(__wt_calloc_def(session, 1, &lsm_dsrc)); + + lsm_dsrc->iface = iface; + WT_RET( + __wt_rwlock_alloc(session, "lsm data source", &lsm_dsrc->rwlock)); + TAILQ_INIT(&lsm_dsrc->trees); + + return (wt_conn->add_data_source(wt_conn, + "lsm:", &lsm_dsrc->iface, config)); +} + +/* + * __wt_lsm_cleanup -- + * Clean up LSM structures during connection close. + */ +int +__wt_lsm_cleanup(WT_CONNECTION *wt_conn) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *session; + + conn = (WT_CONNECTION_IMPL *)wt_conn; + session = conn->default_session; + + return (__wt_lsm_tree_close_all(session)); +} diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c new file mode 100644 index 00000000000..db780193e1e --- /dev/null +++ b/src/lsm/lsm_merge.c @@ -0,0 +1,201 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_lsm_merge_update_tree -- + * Merge a set of chunks and create a new one. + * Must be called with the LSM lock held. + */ +int +__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, int nchunks, WT_LSM_CHUNK **chunkp) +{ + WT_LSM_CHUNK *chunk; + size_t chunk_sz; + int i, j; + + /* Setup the array of obsolete chunks. */ + if (nchunks > lsm_tree->old_avail) { + chunk_sz = sizeof(*lsm_tree->old_chunks); + WT_RET(__wt_realloc(session, + &lsm_tree->old_alloc, + chunk_sz * WT_MAX(10, lsm_tree->nold_chunks + 2 * nchunks), + &lsm_tree->old_chunks)); + lsm_tree->old_avail += (int)(lsm_tree->old_alloc / chunk_sz) - + lsm_tree->nold_chunks; + lsm_tree->nold_chunks = (int)(lsm_tree->old_alloc / chunk_sz); + } + /* Copy entries one at a time, so we can reuse gaps in the list. */ + for (i = j = 0; j < nchunks && i < lsm_tree->nold_chunks; i++) { + if (lsm_tree->old_chunks[i] == NULL) { + lsm_tree->old_chunks[i] = lsm_tree->chunk[j++]; + --lsm_tree->old_avail; + } + } + + WT_ASSERT(session, j == nchunks); + + /* Update the current chunk list. */ + memmove(lsm_tree->chunk + 1, lsm_tree->chunk + nchunks, + (lsm_tree->nchunks - nchunks) * sizeof(*lsm_tree->chunk)); + lsm_tree->nchunks -= nchunks - 1; + memset(lsm_tree->chunk + lsm_tree->nchunks, 0, + (nchunks - 1) * sizeof(*lsm_tree->chunk)); + WT_RET(__wt_calloc_def(session, 1, &chunk)); + lsm_tree->chunk[0] = chunk; + lsm_tree->dsk_gen++; + + *chunkp = chunk; + return (0); +} + +/* + * __wt_lsm_major_merge -- + * Merge a set of chunks of an LSM tree including the oldest. + */ +int +__wt_lsm_major_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_BLOOM *bloom; + WT_CURSOR *src, *dest; + WT_DECL_ITEM(bbuf); + WT_DECL_RET; + WT_ITEM key, value; + WT_LSM_CHUNK *chunk; + WT_SESSION *wt_session; + const char *dest_uri; + uint64_t insert_count, record_count; + int dest_id, i, nchunks; + + src = dest = NULL; + dest_uri = NULL; + bloom = NULL; + + /* + * Take a copy of the latest chunk id. This value needs to be atomically + * read. We need a copy, since other threads may alter the chunk count + * while we are doing a merge. + */ + nchunks = lsm_tree->nchunks - 1; + + /* + * If there aren't any chunks to merge, or some of the chunks aren't + * yet written, we're done. A non-zero error indicates that the worker + * should assume there is no work to do: if there are unwritten chunks, + * the worker should write them immediately. + */ + if (nchunks <= 1) + return (WT_NOTFOUND); + + /* + * We have a limited number of hazard references, and we want to bound + * the amount of work in the merge. + * + * Use the lsm_tree lock to read the chunks (so no switches occur), but + * avoid holding it while the merge is in progress: that may take a + * long time. + */ + nchunks = WT_MIN((int)S2C(session)->hazard_size / 2, nchunks); + __wt_spin_lock(session, &lsm_tree->lock); + while (nchunks > 1 && + (!F_ISSET(lsm_tree->chunk[nchunks - 1], WT_LSM_CHUNK_ONDISK) || + lsm_tree->chunk[nchunks - 1]->ncursor > 0)) + --nchunks; + for (record_count = 0, i = 0; i < nchunks; i++) + record_count += lsm_tree->chunk[i]->count; + __wt_spin_unlock(session, &lsm_tree->lock); + + if (nchunks <= 1) + return (0); + + /* Allocate an ID for the merge. */ + dest_id = WT_ATOMIC_ADD(lsm_tree->last, 1); + + WT_VERBOSE_RET(session, lsm, + "Merging first %d chunks into %d\n", nchunks, dest_id); + + if (record_count != 0) { + WT_RET(__wt_scr_alloc(session, 0, &bbuf)); + WT_ERR(__wt_lsm_tree_bloom_name( + session, lsm_tree, dest_id, bbuf)); + + WT_ERR(__wt_bloom_create(session, bbuf->data, + NULL, record_count, lsm_tree->bloom_bit_count, + lsm_tree->bloom_hash_count, &bloom)); + } + + /* + * Special setup for the merge cursor: + * first, reset to open the dependent cursors; + * then restrict the cursor to a specific number of chunks; + * then set MERGE so the cursor doesn't track updates to the tree. + */ + wt_session = &session->iface; + WT_ERR(wt_session->open_cursor( + wt_session, lsm_tree->name, NULL, NULL, &src)); + F_SET(src, WT_CURSTD_RAW); + WT_ERR(__wt_clsm_init_merge(src, nchunks)); + + WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_create_chunk( + session, lsm_tree, dest_id, &dest_uri)); + WT_ERR(ret); + WT_ERR(wt_session->open_cursor( + wt_session, dest_uri, NULL, "raw,bulk", &dest)); + + for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { + WT_ERR(src->get_key(src, &key)); + dest->set_key(dest, &key); + WT_ERR(src->get_value(src, &value)); + dest->set_value(dest, &value); + WT_ERR(dest->insert(dest)); + if (bloom != NULL) + WT_ERR(__wt_bloom_insert(bloom, &key)); + } + WT_VERBOSE_ERR(session, lsm, + "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.", + record_count, insert_count); + WT_ERR_NOTFOUND_OK(ret); + + /* We've successfully created the new chunk. Now install it. */ + WT_TRET(src->close(src)); + WT_TRET(dest->close(dest)); + src = dest = NULL; + if (bloom != NULL) { + WT_TRET(__wt_bloom_finalize(bloom)); + WT_TRET(__wt_bloom_close(bloom)); + bloom = NULL; + } + WT_ERR(ret); + + __wt_spin_lock(session, &lsm_tree->lock); + ret = __wt_lsm_merge_update_tree(session, lsm_tree, nchunks, &chunk); + + chunk->uri = dest_uri; + dest_uri = NULL; + if (bloom != NULL) + chunk->bloom_uri = __wt_buf_steal(session, bbuf, 0); + chunk->count = insert_count; + F_SET(chunk, WT_LSM_CHUNK_ONDISK); + + ret = __wt_lsm_meta_write(session, lsm_tree); + __wt_spin_unlock(session, &lsm_tree->lock); + +err: if (src != NULL) + WT_TRET(src->close(src)); + if (dest != NULL) + WT_TRET(dest->close(dest)); + if (bloom != NULL) + WT_TRET(__wt_bloom_close(bloom)); + __wt_scr_free(&bbuf); + __wt_free(session, dest_uri); + if (ret != 0) + WT_VERBOSE_VOID(session, lsm, + "Merge failed with %s\n", wiredtiger_strerror(ret)); + return (ret); +} diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c new file mode 100644 index 00000000000..d731b5356fe --- /dev/null +++ b/src/lsm/lsm_meta.c @@ -0,0 +1,173 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_lsm_meta_read -- + * Read the metadata for an LSM tree. + */ +int +__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONFIG cparser, lparser; + WT_CONFIG_ITEM ck, cv, lk, lv; + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + const char *config; + int nchunks; + size_t chunk_sz, alloc; + + WT_RET(__wt_metadata_read(session, lsm_tree->name, &config)); + WT_ERR(__wt_config_init(session, &cparser, config)); + while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { + if (WT_STRING_MATCH("file_config", ck.str, ck.len)) { + __wt_free(session, lsm_tree->file_config); + /* Don't include the brackets. */ + WT_ERR(__wt_strndup(session, + cv.str + 1, cv.len - 2, &lsm_tree->file_config)); + } else if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { + __wt_free(session, lsm_tree->key_format); + WT_ERR(__wt_strndup(session, + cv.str, cv.len, &lsm_tree->key_format)); + } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) { + __wt_free(session, lsm_tree->value_format); + WT_ERR(__wt_strndup(session, + cv.str, cv.len, &lsm_tree->value_format)); + } else if (WT_STRING_MATCH( + "lsm_bloom_bit_count", ck.str, ck.len)) + lsm_tree->bloom_bit_count = (uint32_t)cv.val; + else if (WT_STRING_MATCH( + "lsm_bloom_hash_count", ck.str, ck.len)) + lsm_tree->bloom_hash_count = (uint32_t)cv.val; + else if (WT_STRING_MATCH("threshold", ck.str, ck.len)) + lsm_tree->threshold = (uint32_t)cv.val; + else if (WT_STRING_MATCH("last", ck.str, ck.len)) + lsm_tree->last = (uint32_t)cv.val; + else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) { + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + chunk_sz = sizeof(*lsm_tree->chunk); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { + WT_ERR(__wt_strndup(session, + lv.str, lv.len, &chunk->bloom_uri)); + continue; + } + if (WT_STRING_MATCH("count", lk.str, lk.len)) { + chunk->count = lv.val; + continue; + } + if ((nchunks + 1) * chunk_sz > + lsm_tree->chunk_alloc) + WT_ERR(__wt_realloc(session, + &lsm_tree->chunk_alloc, + WT_MAX(10 * chunk_sz, + 2 * lsm_tree->chunk_alloc), + &lsm_tree->chunk)); + WT_ERR(__wt_calloc_def(session, 1, &chunk)); + lsm_tree->chunk[nchunks++] = chunk; + WT_ERR(__wt_strndup(session, + lk.str, lk.len, &chunk->uri)); + chunk->flags = WT_LSM_CHUNK_ONDISK; + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nchunks = nchunks; + } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) { + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + chunk_sz = sizeof(*lsm_tree->old_chunks); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if ((nchunks + 1) * chunk_sz > + lsm_tree->old_avail * chunk_sz) { + alloc = lsm_tree->old_alloc; + WT_ERR(__wt_realloc(session, + &lsm_tree->old_alloc, + chunk_sz * WT_MAX(10, + lsm_tree->nold_chunks + + 2 * nchunks), + &lsm_tree->old_chunks)); + lsm_tree->nold_chunks = (int) + (lsm_tree->old_alloc / chunk_sz); + lsm_tree->old_avail += (int) + ((lsm_tree->old_alloc - alloc) / + chunk_sz); + } + WT_ERR(__wt_calloc_def(session, 1, &chunk)); + lsm_tree->old_chunks[nchunks++] = chunk; + WT_ERR(__wt_strndup(session, + lk.str, lk.len, &chunk->uri)); + chunk->flags = WT_LSM_CHUNK_ONDISK; + --lsm_tree->old_avail; + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nold_chunks = nchunks; + } else + WT_ERR(__wt_illegal_value(session, "LSM metadata")); + + /* TODO: collator */ + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_free(session, config); + return (ret); +} + +/* + * __wt_lsm_meta_write -- + * Write the metadata for an LSM tree. + */ +int +__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + int first, i; + + WT_RET(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "file_config=(%s),key_format=%s,value_format=%s", + lsm_tree->file_config, + lsm_tree->key_format, lsm_tree->value_format)); + WT_ERR(__wt_buf_catfmt(session, buf, + ",last=%" PRIu32 ",threshold=%" PRIu64 + ",lsm_bloom_bit_count=%" PRIu32 ",lsm_bloom_hash_count=%" PRIu32, + lsm_tree->last, (uint64_t)lsm_tree->threshold, + lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count)); + WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=[")); + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + if (i > 0) + WT_ERR(__wt_buf_catfmt(session, buf, ",")); + WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri)); + if (chunk->bloom_uri != NULL) + WT_ERR(__wt_buf_catfmt( + session, buf, ",bloom=\"%s\"", chunk->bloom_uri)); + if (chunk->count != 0) + WT_ERR(__wt_buf_catfmt( + session, buf, ",count=%" PRIu64, chunk->count)); + } + WT_ERR(__wt_buf_catfmt(session, buf, "]")); + WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=[")); + first = 1; + for (i = 0; i < (int)lsm_tree->nold_chunks; i++) { + chunk = lsm_tree->old_chunks[i]; + if (chunk == NULL) + continue; + if (first) + first = 0; + else + WT_ERR(__wt_buf_catfmt(session, buf, ",")); + WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri)); + } + WT_ERR(__wt_buf_catfmt(session, buf, "]")); + WT_ERR(__wt_metadata_update(session, lsm_tree->name, buf->data)); + +err: __wt_scr_free(&buf); + return (ret); +} diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c new file mode 100644 index 00000000000..ca3f7a28c6b --- /dev/null +++ b/src/lsm/lsm_tree.c @@ -0,0 +1,523 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __lsm_tree_discard -- + * Free an LSM tree structure. + */ +static void +__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_LSM_CHUNK *chunk; + int i; + + TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); + __wt_spin_destroy(session, &lsm_tree->lock); + + __wt_free(session, lsm_tree->name); + for (i = 0; i < lsm_tree->nchunks; i++) { + if ((chunk = lsm_tree->chunk[i]) == NULL) + continue; + + __wt_free(session, chunk->bloom_uri); + __wt_free(session, chunk->uri); + __wt_free(session, chunk); + } + __wt_free(session, lsm_tree->chunk); + + for (i = 0; i < lsm_tree->nold_chunks; i++) { + if ((chunk = lsm_tree->old_chunks[i]) == NULL) + continue; + + __wt_free(session, chunk->bloom_uri); + __wt_free(session, chunk->uri); + __wt_free(session, chunk); + } + __wt_free(session, lsm_tree->old_chunks); + + __wt_free(session, lsm_tree); +} + +/* + * __lsm_tree_close -- + * Close an LSM tree structure. + */ +static int +__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_SESSION *wt_session; + + if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { + F_CLR(lsm_tree, WT_LSM_TREE_OPEN); + WT_TRET(__wt_thread_join(lsm_tree->worker_tid)); + } + + /* + * Close the session and free its hazard array (necessary because + * we set WT_SESSION_INTERNAL to simplify shutdown ordering. + * + * Do this in the main thread to avoid deadlocks. + */ + if (lsm_tree->worker_session != NULL) { + F_SET(lsm_tree->worker_session, + F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + + wt_session = &lsm_tree->worker_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + + /* + * This is safe after the close because session handles are + * not freed, but are managed by the connection. + */ + __wt_free(NULL, lsm_tree->worker_session->hazard); + } + + return (ret); +} + +/* + * __wt_lsm_tree_close_all -- + * Close an LSM tree structure. + */ +int +__wt_lsm_tree_close_all(WT_SESSION_IMPL *session) +{ + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) { + WT_TRET(__lsm_tree_close(session, lsm_tree)); + __lsm_tree_discard(session, lsm_tree); + } + + return (ret); +} + +/* + * __wt_lsm_tree_bloom_name -- + * Get the URI of the Bloom filter for a given chunk. + */ +int +__wt_lsm_tree_bloom_name( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, WT_ITEM *buf) +{ + WT_RET(__wt_buf_fmt(session, buf, "file:%s-%06d.bf", + lsm_tree->filename, i)); + return (0); +} + +/* + * __wt_lsm_tree_chunk_name -- + * Get the URI of the file for a given chunk. + */ +int +__wt_lsm_tree_chunk_name( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, WT_ITEM *buf) +{ + WT_RET(__wt_buf_fmt(session, buf, "file:%s-%06d.lsm", + lsm_tree->filename, i)); + return (0); +} + +/* + * __wt_lsm_tree_create_chunk -- + * Create a chunk of an LSM tree. + */ +int +__wt_lsm_tree_create_chunk( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, const char **urip) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf)); + WT_ERR(__wt_schema_create(session, + buf->data, lsm_tree->file_config)); + *urip = __wt_buf_steal(session, buf, NULL); + +err: __wt_scr_free(&buf); + return (ret); +} + +/* + * __wt_lsm_start_worker -- + * Start the worker thread for an LSM tree. + */ +static int +__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONNECTION *wt_conn; + WT_SESSION *wt_session; + + wt_conn = &S2C(session)->iface; + WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session)); + lsm_tree->worker_session = (WT_SESSION_IMPL *)wt_session; + F_SET(lsm_tree->worker_session, WT_SESSION_INTERNAL); + + WT_RET(__wt_thread_create( + &lsm_tree->worker_tid, __wt_lsm_worker, lsm_tree)); + F_SET(lsm_tree, WT_LSM_TREE_OPEN); + + return (0); +} + +/* + * __wt_lsm_tree_create -- + * Create an LSM tree structure for the given name. + */ +int +__wt_lsm_tree_create(WT_SESSION_IMPL *session, + const char *uri, int exclusive, const char *config) +{ + WT_CONFIG_ITEM cval; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + const char *cfg[] = API_CONF_DEFAULTS(session, create, config); + + /* If the tree is open, it already exists. */ + if ((ret = __wt_lsm_tree_get(session, uri, &lsm_tree)) == 0) + return (exclusive ? EEXIST : 0); + WT_RET_NOTFOUND_OK(ret); + + /* If the tree has metadata, it already exists. */ + if (__wt_metadata_read(session, uri, &config) == 0) { + __wt_free(session, config); + return (exclusive ? EEXIST : 0); + } + WT_RET_NOTFOUND_OK(ret); + + /* + * XXX this call should just insert the metadata: most of this should + * move to __wt_lsm_tree_open. + */ + WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + __wt_spin_init(session, &lsm_tree->lock); + TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); + + WT_RET(__wt_strdup(session, uri, &lsm_tree->name)); + lsm_tree->filename = lsm_tree->name + strlen("lsm:"); + + WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, + &lsm_tree->key_format)); + WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, + &lsm_tree->value_format)); + + WT_ERR(__wt_config_gets(session, cfg, "lsm_chunk_size", &cval)); + lsm_tree->threshold = (uint32_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_bit_count", &cval)); + lsm_tree->bloom_bit_count = (uint32_t)cval.val; + WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_hash_count", &cval)); + lsm_tree->bloom_hash_count = (uint32_t)cval.val; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "%s,key_format=u,value_format=u", config)); + lsm_tree->file_config = __wt_buf_steal(session, buf, NULL); + + /* Create the initial chunk. */ + WT_ERR(__wt_lsm_tree_switch(session, lsm_tree)); + + /* XXX This should definitely only happen when opening the tree. */ + WT_ERR(__lsm_tree_start_worker(session, lsm_tree)); + + if (0) { +err: __lsm_tree_discard(session, lsm_tree); + } + __wt_scr_free(&buf); + return (ret); +} + +/* + * __wt_lsm_tree_open -- + * Open an LSM tree structure. + */ +static int +__lsm_tree_open( + WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep) +{ + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + /* Try to open the tree. */ + WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + __wt_spin_init(session, &lsm_tree->lock); + WT_ERR(__wt_strdup(session, uri, &lsm_tree->name)); + lsm_tree->filename = lsm_tree->name + strlen("lsm:"); + TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); + + WT_ERR(__wt_lsm_meta_read(session, lsm_tree)); + + /* Set the generation number so cursors are opened on first usage. */ + lsm_tree->dsk_gen = 1; + + WT_ERR(__lsm_tree_start_worker(session, lsm_tree)); + *treep = lsm_tree; + + if (0) { +err: __lsm_tree_discard(session, lsm_tree); + } + return (ret); +} + +/* + * __wt_lsm_tree_get -- + * get an LSM tree structure for the given name. + */ +int +__wt_lsm_tree_get( + WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep) +{ + WT_DECL_RET; + WT_LSM_TREE *lsm_tree; + + TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) + if (strcmp(uri, lsm_tree->name) == 0) { + *treep = lsm_tree; + return (0); + } + + /* + * If we don't already hold the schema lock, get it now so that we + * can find and/or open the handle. + */ + WT_WITH_SCHEMA_LOCK_OPT(session, + ret = __lsm_tree_open(session, uri, treep)); + return (ret); +} + +/* + * __wt_lsm_tree_switch -- + * Switch to a new in-memory tree. + */ +int +__wt_lsm_tree_switch( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + + WT_VERBOSE_RET(session, lsm, + "Tree switch to: %d because %d > %d", lsm_tree->last + 1, + (lsm_tree->memsizep == NULL ? 0 : (int)*lsm_tree->memsizep), + (int)lsm_tree->threshold); + + lsm_tree->memsizep = NULL; + + if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) > + lsm_tree->chunk_alloc) + WT_ERR(__wt_realloc(session, + &lsm_tree->chunk_alloc, + WT_MAX(10 * sizeof(*lsm_tree->chunk), + 2 * lsm_tree->chunk_alloc), + &lsm_tree->chunk)); + + WT_ERR(__wt_calloc_def(session, 1, &chunk)); + lsm_tree->chunk[lsm_tree->nchunks++] = chunk; + WT_ERR(__wt_lsm_tree_create_chunk(session, + lsm_tree, WT_ATOMIC_ADD(lsm_tree->last, 1), + &chunk->uri)); + + ++lsm_tree->dsk_gen; + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + +err: /* TODO: mark lsm_tree bad on error(?) */ + return (ret); +} + +/* + * __wt_lsm_tree_drop -- + * Drop an LSM tree. + */ +int +__wt_lsm_tree_drop( + WT_SESSION_IMPL *session, const char *name, const char *cfg[]) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + int i; + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, name, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_RET(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. */ + WT_RET(__wt_spin_trylock(session, &lsm_tree->lock)); + + /* Drop the chunks. */ + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + WT_ERR(__wt_schema_drop(session, chunk->uri, cfg)); + if (chunk->bloom_uri != NULL) + WT_ERR( + __wt_schema_drop(session, chunk->bloom_uri, cfg)); + } + /* Drop any chunks on the obsolete list. */ + for (i = 0; i < lsm_tree->nold_chunks; i++) { + if ((chunk = lsm_tree->old_chunks[i]) == NULL) + continue; + WT_ERR(__wt_schema_drop(session, chunk->uri, cfg)); + if (chunk->bloom_uri != NULL) + WT_ERR( + __wt_schema_drop(session, chunk->bloom_uri, cfg)); + } + + __wt_spin_unlock(session, &lsm_tree->lock); + WT_ERR(__wt_metadata_remove(session, name)); + + if (0) { +err: __wt_spin_unlock(session, &lsm_tree->lock); + } + __lsm_tree_discard(session, lsm_tree); + return (ret); +} + +/* + * __wt_lsm_tree_rename -- + * Rename an LSM tree. + */ +int +__wt_lsm_tree_rename(WT_SESSION_IMPL *session, + const char *oldname, const char *newname, const char *cfg[]) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + const char *old; + int i; + + old = NULL; + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, oldname, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_RET(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. */ + WT_RET(__wt_spin_trylock(session, &lsm_tree->lock)); + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + + /* Set the new name. */ + __wt_free(session, lsm_tree->name); + WT_ERR(__wt_strdup(session, newname, &lsm_tree->name)); + + /* Rename the chunks. */ + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + old = chunk->uri; + chunk->uri = NULL; + + WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf)); + chunk->uri = __wt_buf_steal(session, buf, NULL); + WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg)); + __wt_free(session, old); + + if ((old = chunk->bloom_uri) != NULL) { + chunk->bloom_uri = NULL; + WT_ERR(__wt_lsm_tree_bloom_name( + session, lsm_tree, i, buf)); + chunk->bloom_uri = __wt_buf_steal(session, buf, NULL); + WT_ERR(__wt_schema_rename( + session, old, chunk->uri, cfg)); + __wt_free(session, old); + } + } + + __wt_spin_unlock(session, &lsm_tree->lock); + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + WT_ERR(__wt_metadata_remove(session, oldname)); + + if (0) { +err: __wt_spin_unlock(session, &lsm_tree->lock); + } + __wt_scr_free(&buf); + if (old != NULL) + __wt_free(session, old); + __lsm_tree_discard(session, lsm_tree); + return (ret); +} + +/* + * __wt_lsm_tree_truncate -- + * Truncate an LSM tree. + */ +int +__wt_lsm_tree_truncate( + WT_SESSION_IMPL *session, const char *name, const char *cfg[]) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + + WT_UNUSED(cfg); + + /* Get the LSM tree. */ + WT_RET(__wt_lsm_tree_get(session, name, &lsm_tree)); + + /* Shut down the LSM worker. */ + WT_RET(__lsm_tree_close(session, lsm_tree)); + + /* Prevent any new opens. */ + WT_RET(__wt_spin_trylock(session, &lsm_tree->lock)); + + /* Mark all chunks old. */ + WT_ERR(__wt_lsm_merge_update_tree( + session, lsm_tree, lsm_tree->nchunks, &chunk)); + + /* Create the new chunk. */ + WT_ERR(__wt_lsm_tree_create_chunk( + session, lsm_tree, WT_ATOMIC_ADD(lsm_tree->last, 1), &chunk->uri)); + + WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); + + WT_ERR(__lsm_tree_start_worker(session, lsm_tree)); + __wt_spin_unlock(session, &lsm_tree->lock); + + if (0) { +err: __wt_spin_unlock(session, &lsm_tree->lock); + __lsm_tree_discard(session, lsm_tree); + } + return (ret); +} + +/* + * __wt_lsm_tree_worker -- + * Run a schema worker operation on each level of a LSM tree. + */ +int +__wt_lsm_tree_worker(WT_SESSION_IMPL *session, + const char *uri, + int (*func)(WT_SESSION_IMPL *, const char *[]), + const char *cfg[], uint32_t open_flags) +{ + WT_LSM_CHUNK *chunk; + WT_LSM_TREE *lsm_tree; + int i; + + WT_RET(__wt_lsm_tree_get(session, uri, &lsm_tree)); + for (i = 0; i < lsm_tree->nchunks; i++) { + chunk = lsm_tree->chunk[i]; + if (func == __wt_checkpoint && + F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) + continue; + WT_RET(__wt_schema_worker( + session, chunk->uri, func, cfg, open_flags)); + } + return (0); +} diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c new file mode 100644 index 00000000000..19eb2352ff8 --- /dev/null +++ b/src/lsm/lsm_worker.c @@ -0,0 +1,166 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +static int +__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); + +/* + * __wt_lsm_worker -- + * The worker thread for an LSM tree, responsible for writing in-memory + * trees to disk and merging on-disk trees. + */ +void * +__wt_lsm_worker(void *arg) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk, **chunk_array; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + const char *cfg[] = { "name=,drop=", NULL }; + size_t chunk_alloc; + int i, nchunks, progress; + + lsm_tree = arg; + session = lsm_tree->worker_session; + + chunk_array = NULL; + chunk_alloc = 0; + + while (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { + progress = 0; + + __wt_spin_lock(session, &lsm_tree->lock); + if (!F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { + __wt_spin_unlock(session, &lsm_tree->lock); + break; + } + /* + * Take a copy of the current state of the LSM tree. Skip + * the last chunk - since it is the active one and not relevant + * to merge operations. + */ + for (nchunks = lsm_tree->nchunks - 1; + nchunks > 0 && lsm_tree->chunk[nchunks - 1]->ncursor > 0; + --nchunks) + ; + if (chunk_alloc < lsm_tree->chunk_alloc) + ret = __wt_realloc(session, + &chunk_alloc, lsm_tree->chunk_alloc, + &chunk_array); + if (ret == 0 && nchunks > 0) + memcpy(chunk_array, lsm_tree->chunk, + nchunks * sizeof(*lsm_tree->chunk)); + __wt_spin_unlock(session, &lsm_tree->lock); + WT_ERR(ret); + + /* + * Write checkpoints in all completed files, then find + * something to merge. + */ + for (i = 0; i < nchunks; i++) { + chunk = chunk_array[i]; + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) || + chunk->ncursor > 0) + continue; + + /* XXX durability: need to checkpoint the metadata? */ + /* + * NOTE: we pass a non-NULL config, because otherwise + * __wt_checkpoint thinks we're closing the file. + */ + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_schema_worker(session, chunk->uri, + __wt_checkpoint, cfg, 0)); + if (ret == 0) { + __wt_spin_lock(session, &lsm_tree->lock); + F_SET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK); + lsm_tree->dsk_gen++; + __wt_spin_unlock(session, &lsm_tree->lock); + progress = 1; + } + } + + /* Clear any state from previous worker thread iterations. */ + session->btree = NULL; + + if (nchunks > 0 && __wt_lsm_major_merge(session, lsm_tree) == 0) + progress = 1; + + /* Clear any state from previous worker thread iterations. */ + session->btree = NULL; + + if (lsm_tree->nold_chunks != lsm_tree->old_avail && + __lsm_free_chunks(session, lsm_tree) == 0) + progress = 1; + + if (!progress) + __wt_sleep(0, 10); + } + +err: __wt_free(session, chunk_array); + + return (NULL); +} + +static int +__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + const char *drop_cfg[] = { NULL }; + int found, i; + + found = 0; + for (i = 0; i < lsm_tree->nold_chunks; i++) { + if ((chunk = lsm_tree->old_chunks[i]) == NULL) + continue; + if (!found) { + found = 1; + /* TODO: Do we need the lsm_tree lock for all drops? */ + __wt_spin_lock(session, &lsm_tree->lock); + } + if (chunk->bloom_uri != NULL) { + WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop( + session, chunk->bloom_uri, drop_cfg)); + /* + * An EBUSY return is acceptable - a cursor may still + * be positioned on this old chunk. + */ + if (ret == 0) { + __wt_free(session, chunk->bloom_uri); + chunk->bloom_uri = NULL; + } else if (ret != EBUSY) + goto err; + } + if (chunk->uri != NULL) { + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_schema_drop(session, chunk->uri, drop_cfg)); + /* + * An EBUSY return is acceptable - a cursor may still + * be positioned on this old chunk. + */ + if (ret == 0) { + __wt_free(session, chunk->uri); + chunk->uri = NULL; + } else if (ret != EBUSY) + goto err; + } + + if (chunk->uri == NULL && chunk->bloom_uri == NULL) { + __wt_free(session, lsm_tree->old_chunks[i]); + ++lsm_tree->old_avail; + } + } + if (found) { +err: ret = __wt_lsm_meta_write(session, lsm_tree); + __wt_spin_unlock(session, &lsm_tree->lock); + } + /* Returning non-zero means there is no work to do. */ + return (found ? 0 : WT_NOTFOUND); +} diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index eefd36333e6..8777b2d3e32 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -11,14 +11,13 @@ static int __create_file(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { - WT_ITEM *val; + WT_DECL_ITEM(val); WT_DECL_RET; int is_metadata; const char *cfg[] = API_CONF_DEFAULTS(session, create, config); const char *filecfg[4] = API_CONF_DEFAULTS(file, meta, config); const char *filename, *treeconf; - val = NULL; treeconf = NULL; is_metadata = strcmp(uri, WT_METADATA_URI) == 0; @@ -32,7 +31,7 @@ __create_file(WT_SESSION_IMPL *session, __wt_metadata_read(session, uri, &treeconf)) != WT_NOTFOUND) { if (exclusive) WT_TRET(EEXIST); - return (ret); + goto err; } /* Create the file. */ @@ -67,10 +66,10 @@ __create_file(WT_SESSION_IMPL *session, */ WT_ERR(__wt_conn_btree_get( session, uri, NULL, cfg, WT_BTREE_EXCLUSIVE)); - if (!WT_META_TRACKING(session)) - WT_ERR(__wt_session_release_btree(session)); - else + if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_handle_lock(session, 1)); + else + WT_ERR(__wt_session_release_btree(session)); err: __wt_scr_free(&val); __wt_free(session, treeconf); @@ -399,7 +398,8 @@ __wt_schema_create( else if (WT_PREFIX_MATCH(name, "table:")) ret = __create_table(session, name, exclusive, config); else if ((ret = __wt_schema_get_source(session, name, &dsrc)) == 0) - ret = dsrc->create(dsrc, &session->iface, name, config); + ret = dsrc->create(dsrc, &session->iface, + name, exclusive, config); session->btree = NULL; WT_TRET(__wt_meta_track_off(session, ret != 0)); diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index 4ef8e8d0220..f16b76286e3 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -250,7 +250,7 @@ __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) else if (WT_PREFIX_MATCH(uri, "table:")) ret = __drop_table(session, uri, force, cfg); else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0) - ret = dsrc->drop(dsrc, &session->iface, uri, cfg[1]); + ret = dsrc->drop(dsrc, &session->iface, uri, cfg); /* * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index 9115e6cdc4b..f052be06fdc 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -225,7 +225,7 @@ __wt_schema_rename(WT_SESSION_IMPL *session, ret = __rename_table(session, oldname, newname); } else if ((ret = __wt_schema_get_source(session, oldname, &dsrc)) == 0) ret = dsrc->rename(dsrc, - &session->iface, oldname, newname, cfg[1]); + &session->iface, oldname, newname, cfg); WT_TRET(__wt_meta_track_off(session, ret != 0)); diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index c5492ca69cd..96066be25c7 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -104,7 +104,7 @@ __wt_schema_truncate( else if (WT_PREFIX_SKIP(tablename, "table:")) ret = __truncate_table(session, tablename); else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0) - ret = dsrc->truncate(dsrc, &session->iface, uri, cfg[1]); + ret = dsrc->truncate(dsrc, &session->iface, uri, cfg); /* If we didn't find a metadata entry, map that error to ENOENT. */ return (ret == WT_NOTFOUND ? ENOENT : ret); diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c index 934781c9d7d..8c5b2c74527 100644 --- a/src/schema/schema_util.c +++ b/src/schema/schema_util.c @@ -35,27 +35,24 @@ int __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri) { const char *name, *sep; + int skipped; /* * Check if name is somewhere in the WiredTiger name space: it would be - * "bad" if the application truncated the metadata file. We get passed - * both objects and simple strings, skip any leading URI prefix. + * "bad" if the application truncated the metadata file. Skip any + * leading URI prefix, check and then skip over a table name. */ name = uri; - if (WT_PREFIX_SKIP(name, "colgroup:") || - WT_PREFIX_SKIP(name, "index:")) { - /* These URIs normally reference a table name. */ - if ((sep = strchr(name, ':')) != NULL) - name = sep + 1; - } else if (!WT_PREFIX_SKIP(name, "table:") && - !WT_PREFIX_SKIP(name, "file:")) - return (__wt_bad_object_type(session, uri)); + for (skipped = 0; skipped < 2; skipped++) { + if ((sep = strchr(name, ':')) == NULL) + break; - if (WT_PREFIX_MATCH(name, "WiredTiger")) - WT_RET_MSG(session, EINVAL, - "%s: the \"WiredTiger\" name space may not be used by " - "applications", - name); + name = sep + 1; + if (WT_PREFIX_MATCH(name, "WiredTiger")) + WT_RET_MSG(session, EINVAL, + "%s: the \"WiredTiger\" name space may not be " + "used by applications", name); + } /* * Disallow JSON quoting characters -- the config string parsing code @@ -67,5 +64,6 @@ __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri) "%s: WiredTiger objects should not include grouping " "characters in their names", name); + return (0); } diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index ec9566d72ac..8bb62999d7b 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -38,6 +38,9 @@ __wt_schema_worker(WT_SESSION_IMPL *session, session, uri, strlen(uri), cfg, open_flags)); ret = func(session, cfg); WT_TRET(__wt_session_release_btree(session)); + } else if (WT_PREFIX_SKIP(tablename, "lsm:")) { + WT_RET(__wt_lsm_tree_worker( + session, uri, func, cfg, open_flags)); } else if (WT_PREFIX_SKIP(tablename, "table:")) { WT_RET(__wt_schema_get_table(session, tablename, strlen(tablename), 0, &table)); diff --git a/src/session/session_api.c b/src/session/session_api.c index f1e68c471b6..4d408f2aa37 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -25,17 +25,35 @@ __session_reset_cursors(WT_SESSION_IMPL *session) } /* + * __session_close_cache -- + * Close any cached handles in a session. Called holding the schema lock. + */ +static int +__session_close_cache(WT_SESSION_IMPL *session) +{ + WT_BTREE_SESSION *btree_session; + WT_DECL_RET; + + while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL) + WT_TRET(__wt_session_discard_btree(session, btree_session)); + + WT_TRET(__wt_schema_close_tables(session)); + + return (ret); +} + +/* * __session_close -- * WT_SESSION->close method. */ static int __session_close(WT_SESSION *wt_session, const char *config) { - WT_BTREE_SESSION *btree_session; WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + int tret; conn = (WT_CONNECTION_IMPL *)wt_session->connection; session = (WT_SESSION_IMPL *)wt_session; @@ -53,17 +71,15 @@ __session_close(WT_SESSION *wt_session, const char *config) WT_ASSERT(session, session->ncursors == 0); - /* Acquire the schema lock: we may be closing btree handles. */ - __wt_spin_lock(session, &S2C(session)->schema_lock); - F_SET(session, WT_SESSION_SCHEMA_LOCKED); - - while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL) - WT_TRET(__wt_session_discard_btree(session, btree_session)); - - WT_TRET(__wt_schema_close_tables(session)); - - F_CLR(session, WT_SESSION_SCHEMA_LOCKED); - __wt_spin_unlock(session, &S2C(session)->schema_lock); + /* + * Acquire the schema lock: we may be closing btree handles. + * + * Note that in some special cases, the schema may already be locked + * (e.g., if this session is an LSM tree worker and the tree is being + * dropped). + */ + WT_WITH_SCHEMA_LOCK_OPT(session, tret = __session_close_cache(session)); + WT_TRET(tret); /* Discard metadata tracking. */ __wt_meta_track_discard(session); @@ -144,7 +160,7 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) WT_ERR_MSG(session, EINVAL, "Database not configured for transactions"); - session->isolation = + session->isolation = session->txn.isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? TXN_ISO_SNAPSHOT : WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ? @@ -162,6 +178,7 @@ static int __session_open_cursor(WT_SESSION *wt_session, const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp) { + WT_DATA_SOURCE *dsrc; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -178,6 +195,7 @@ __session_open_cursor(WT_SESSION *wt_session, if (WT_PREFIX_MATCH(uri, "colgroup:") || WT_PREFIX_MATCH(uri, "index:") || WT_PREFIX_MATCH(uri, "file:") || + WT_PREFIX_MATCH(uri, "lsm:") || WT_PREFIX_MATCH(uri, "table:")) ret = __wt_cursor_dup(session, to_dup, config, cursorp); else @@ -196,8 +214,9 @@ __session_open_cursor(WT_SESSION *wt_session, ret = __wt_curstat_open(session, uri, cfg, cursorp); else if (WT_PREFIX_MATCH(uri, "table:")) ret = __wt_curtable_open(session, uri, cfg, cursorp); - else - ret = __wt_bad_object_type(session, uri); + else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0) + ret = dsrc->open_cursor(dsrc, &session->iface, + uri, cfg, cursorp); err: API_END_NOTFOUND_MAP(session, ret); } diff --git a/src/session/session_btree.c b/src/session/session_btree.c index 6c673474eac..0dc733a7ce8 100644 --- a/src/session/session_btree.c +++ b/src/session/session_btree.c @@ -191,7 +191,6 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_BTREE_SESSION *btree_session; WT_DECL_RET; - int needlock; btree = NULL; @@ -228,16 +227,8 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, * If we don't already hold the schema lock, get it now so that we * can find and/or open the handle. */ - needlock = !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED); - if (needlock) { - __wt_spin_lock(session, &S2C(session)->schema_lock); - F_SET(session, WT_SESSION_SCHEMA_LOCKED); - } - ret = __wt_conn_btree_get(session, uri, checkpoint, cfg, flags); - if (needlock) { - F_CLR(session, WT_SESSION_SCHEMA_LOCKED); - __wt_spin_unlock(session, &S2C(session)->schema_lock); - } + WT_WITH_SCHEMA_LOCK_OPT(session, + ret = __wt_conn_btree_get(session, uri, checkpoint, cfg, flags)); WT_RET(ret); if (btree_session == NULL) diff --git a/src/support/hazard.c b/src/support/hazard.c index c5d5b02e7fd..ded1598b587 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -30,8 +30,8 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp conn = S2C(session); *busyp = 0; - /* If a file cannot be evicted, hazard references aren't required. */ - if (btree->cache_resident) + /* If a file can never be evicted, hazard references aren't required. */ + if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) return (0); /* @@ -120,8 +120,8 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) btree = session->btree; conn = S2C(session); - /* If a file cannot be evicted, hazard references aren't required. */ - if (btree->cache_resident) + /* If a file can never be evicted, hazard references aren't required. */ + if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) return; /* diff --git a/src/support/scratch.c b/src/support/scratch.c index 57b779bd5c2..f9cff2d0343 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -249,7 +249,7 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) */ int __wt_scr_alloc_func(WT_SESSION_IMPL *session, - uint32_t size, WT_ITEM **scratchp + size_t size, WT_ITEM **scratchp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif diff --git a/src/utilities/util.h b/src/utilities/util.h index 7035ebdb18b..a736fdfcf85 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -10,11 +10,13 @@ #define UTIL_COLGROUP_OK 0x01 /* colgroup: prefix OK */ #define UTIL_FILE_OK 0x02 /* file: prefix OK */ #define UTIL_INDEX_OK 0x04 /* index: prefix OK */ -#define UTIL_TABLE_OK 0x08 /* table: prefix OK */ +#define UTIL_LSM_OK 0x04 /* lsm: prefix OK */ +#define UTIL_TABLE_OK 0x10 /* table: prefix OK */ /* all known prefixes OK */ #define UTIL_ALL_OK \ - (UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK | UTIL_TABLE_OK) + (UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK |\ + UTIL_LSM_OK | UTIL_TABLE_OK) typedef struct { void *mem; /* Managed memory chunk */ diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index ffb0b2b7743..073feb92399 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -85,8 +85,8 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((name = - util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((name = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) goto err; if (dump_config(session, name, hex) != 0) diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index 06c061ef87f..a535270bc3f 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -39,8 +39,8 @@ util_list(WT_SESSION *session, int argc, char *argv[]) case 0: break; case 1: - if ((name = util_name( - *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((name = util_name(*argv, "table", + UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); break; default: @@ -109,14 +109,9 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag) return (util_cerr("metadata", "get_key", ret)); /* - * If no object specified, show top-level objects (files and - * tables). + * If a name is specified, only show objects that match. */ - if (name == NULL) { - if (!WT_PREFIX_MATCH(key, "file:") && - !WT_PREFIX_MATCH(key, "table:")) - continue; - } else { + if (name != NULL) { if (!WT_PREFIX_MATCH(key, name)) continue; found = 1; diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index 444cdc64c22..30f07a96dfd 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -95,8 +95,9 @@ load_dump(WT_SESSION *session) * Single file dumps can only have two lines, the file name and * the configuration information. */ - if (list[0] == NULL || list[1] == NULL || list[2] != NULL || - !WT_PREFIX_MATCH(list[0], "file:")) + if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) || + (WT_PREFIX_MATCH(list[0], "file:") && + WT_PREFIX_MATCH(list[0], "lsm:"))) return (format()); entry = list; diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c index 6befcd82c47..b7f9fa2cafc 100644 --- a/src/utilities/util_loadtext.c +++ b/src/utilities/util_loadtext.c @@ -34,8 +34,8 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. */ if (argc != 1) return (usage()); - if ((uri = - util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((uri = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); return (text(session, uri)); diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index cd3077ebb49..d4be5a45500 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -255,6 +255,14 @@ util_name(const char *s, const char *type, u_int flags) return (NULL); } copy = 1; + } else if (WT_PREFIX_MATCH(s, "lsm:")) { + if (!(flags & UTIL_LSM_OK)) { + fprintf(stderr, + "%s: %s: \"lsm\" type not supported\n", + progname, command); + return (NULL); + } + copy = 1; } else if (WT_PREFIX_MATCH(s, "table:")) { if (!(flags & UTIL_TABLE_OK)) { fprintf(stderr, diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index 6f894e2e9c7..55a90f96b55 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -29,8 +29,8 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are the object uri and new name. */ if (argc != 2) return (usage()); - if ((uri = util_name( - *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((uri = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); newname = argv[1]; diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 1529d0b611c..7c4f2035fa2 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -29,8 +29,8 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name( - *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((name = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); if ((ret = session->upgrade(session, name, NULL)) != 0) { diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index 7ac74178217..fda2d413346 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -29,8 +29,8 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name( - *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((name = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); if ((ret = session->verify(session, name, NULL)) != 0) { diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index 95def2f652f..ad1f47b067d 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -45,8 +45,8 @@ util_write(WT_SESSION *session, int argc, char *argv[]) } else if (argc < 3 || ((argc - 1) % 2 != 0)) return (usage()); - if ((uri = - util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL) + if ((uri = util_name(*argv, + "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); /* Open the object. */ diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index 0333e7acb53..1b02b958718 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -138,7 +138,7 @@ int run(void) WT_BLOOM *bloomp; WT_ITEM item; WT_SESSION_IMPL *sess; - const char *uri = "table:my_bloom"; + const char *uri = "file:my_bloom.bf"; int ret; uint32_t fp, i; @@ -171,7 +171,8 @@ int run(void) die(ret, "__wt_bloom_close"); g.wt_session->checkpoint(g.wt_session, NULL); - if ((ret = __wt_bloom_open(sess, uri, g.c_factor, g.c_k, &bloomp)) != 0) + if ((ret = __wt_bloom_open( + sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0) die(ret, "__wt_bloom_open"); for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; diff --git a/test/fops/t.c b/test/fops/t.c index ad1d7811771..92719a1e109 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -28,7 +28,8 @@ main(int argc, char *argv[]) u_int nthreads; int ch, cnt, runs; char *config_open; - const char **objp, *objs[] = { "file:__wt", "table:__wt", NULL }; + const char **objp; + const char *objs[] = { "file:__wt", "table:__wt", "lsm:__wt", NULL }; if ((progname = strrchr(argv[0], '/')) == NULL) progname = argv[0]; diff --git a/test/format/config.c b/test/format/config.c index 796a7037ae4..659aea4552a 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -50,11 +50,9 @@ config_setup(void) case 1: config_single("data_source=table", 0); break; - /* LSM isn't implemented yet. case 2: config_single("data_source=lsm", 0); break; - */ } } @@ -88,6 +86,20 @@ config_setup(void) if (cp->flags & C_OPS) *cp->v = 0; + /* LSM trees are only compatible with row store tables. */ + if (g.c_file_type != ROW && + strncmp(g.c_data_source, "lsm", strlen("lsm")) == 0) { + cp = config_find("file_type", strlen("file_type")); + if (!(cp->flags & C_PERM)) + config_single("file_type=row", 0); + else { + fprintf(stderr, + "%s: LSM data source is only compatible with row file_type\n", + g.progname); + exit(EXIT_FAILURE); + } + } + /* Multi-threaded runs cannot be replayed. */ if (g.replay && g.c_threads != 1) { fprintf(stderr, diff --git a/test/format/util.c b/test/format/util.c index 60f09031dd6..503ae17dbdc 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -114,8 +114,10 @@ value_gen(uint8_t *val, uint32_t *sizep, uint64_t keyno) /* * WiredTiger doesn't store zero-length data items in row-store files, * test that by inserting a zero-length data item every so often. + * LSM doesn't support zero length items. */ - if (keyno % 63 == 0) { + if (keyno % 63 == 0 && + strncmp("lsm", g.c_data_source, strlen("lsm") != 0)) { val[0] = '\0'; *sizep = 0; return; diff --git a/test/format/wts_ops.c b/test/format/wts_ops.c index 8825c9fb85b..d4a2868769f 100644 --- a/test/format/wts_ops.c +++ b/test/format/wts_ops.c @@ -687,7 +687,12 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp) return; bdb_remove(keyno, ¬found); - (void)notfound_chk("row_remove", ret, notfound, keyno); + + /* LSM trees don't check for existence if "overwrite" is set. */ + if (strncmp(cursor->uri, "lsm:", 4) == 0) + *notfoundp = notfound; + else + (void)notfound_chk("row_remove", ret, notfound, keyno); } /* diff --git a/test/suite/test_cursor01.py b/test/suite/test_cursor01.py index 51c65ad9607..f4c448187c5 100644 --- a/test/suite/test_cursor01.py +++ b/test/suite/test_cursor01.py @@ -43,6 +43,7 @@ class test_cursor01(wttest.WiredTigerTestCase): ('file-col', dict(tablekind='col',uri='file')), ('file-fix', dict(tablekind='fix',uri='file')), ('file-row', dict(tablekind='row',uri='file')), + ('lsm-row', dict(tablekind='row',uri='lsm')), ('table-col', dict(tablekind='col',uri='table')), ('table-fix', dict(tablekind='fix',uri='table')), ('table-row', dict(tablekind='row',uri='table')) @@ -139,9 +140,13 @@ class test_cursor01(wttest.WiredTigerTestCase): value = cursor.get_value() self.assertEqual(key, self.genkey(i)) self.assertEqual(value, self.genvalue(i)) - i += 1 dupc = self.session.open_cursor(None, cursor, None) self.assertEquals(cursor.compare(dupc), 0) + key = dupc.get_key() + value = dupc.get_value() + self.assertEqual(key, self.genkey(i)) + self.assertEqual(value, self.genvalue(i)) + i += 1 cursor.close() cursor = dupc diff --git a/test/suite/test_cursor02.py b/test/suite/test_cursor02.py index 7786b902078..da49515dc52 100644 --- a/test/suite/test_cursor02.py +++ b/test/suite/test_cursor02.py @@ -38,13 +38,14 @@ class test_cursor02(TestCursorTracker): after inserts and removes. """ scenarios = [ - ('row', dict(tablekind='row')), - ('col', dict(tablekind='col')), + ('row', dict(tablekind='row', uri='table')), + ('lsm-row', dict(tablekind='row', uri='lsm')), + ('col', dict(tablekind='col', uri='table')), #('fix', dict(tablekind='fix')) ] def create_session_and_cursor(self, ninitialentries): - tablearg = "table:" + self.table_name1 + tablearg = self.uri + ":" + self.table_name1 if self.tablekind == 'row': keyformat = 'key_format=S' else: @@ -56,7 +57,7 @@ class test_cursor02(TestCursorTracker): create_args = keyformat + ',' + valformat + self.config_string() self.session_create(tablearg, create_args) self.pr('creating cursor') - self.cur_initial_conditions(self.table_name1, ninitialentries, self.tablekind, None, None) + self.cur_initial_conditions(self.table_name1, ninitialentries, self.tablekind, None, None, self.uri) return self.session.open_cursor(tablearg, None, 'append') def test_multiple_remove(self): diff --git a/test/suite/test_cursor03.py b/test/suite/test_cursor03.py index 3be68230cff..04002e1e521 100644 --- a/test/suite/test_cursor03.py +++ b/test/suite/test_cursor03.py @@ -40,19 +40,20 @@ class test_cursor03(TestCursorTracker): after inserts and removes. """ scenarios = multiply_scenarios('.', [ - ('row', dict(tablekind='row', keysize=None, valsize=None)), - ('col', dict(tablekind='col', keysize=None, valsize=None)), + ('row', dict(tablekind='row', keysize=None, valsize=None, uri='table')), + ('row', dict(tablekind='row', keysize=None, valsize=None, uri='lsm')), + ('col', dict(tablekind='col', keysize=None, valsize=None, uri='table')), #('fix', dict(tablekind='fix', keysize=None, valsize=None)) - ('row.val10k', dict(tablekind='row', keysize=None, valsize=[10, 10000])), - ('col.val10k', dict(tablekind='col', keysize=None, valsize=[10, 10000])), - ('row.keyval10k', dict(tablekind='row', keysize=[10,10000], valsize=[10, 10000])), + ('row.val10k', dict(tablekind='row', keysize=None, valsize=[10, 10000], uri='table')), + ('col.val10k', dict(tablekind='col', keysize=None, valsize=[10, 10000], uri='table')), + ('row.keyval10k', dict(tablekind='row', keysize=[10,10000], valsize=[10, 10000], uri='table')), ], [ ('count1000', dict(tablecount=1000,cache_size=20*1024*1024)), ('count10000', dict(tablecount=10000, cache_size=64*1024*1024)) ]) def create_session_and_cursor(self): - tablearg = "table:" + self.table_name1 + tablearg = self.uri + ":" + self.table_name1 if self.tablekind == 'row': keyformat = 'key_format=S' else: @@ -64,7 +65,7 @@ class test_cursor03(TestCursorTracker): create_args = keyformat + ',' + valformat + self.config_string() self.session_create(tablearg, create_args) self.pr('creating cursor') - self.cur_initial_conditions(self.table_name1, self.tablecount, self.tablekind, self.keysize, self.valsize) + self.cur_initial_conditions(self.table_name1, self.tablecount, self.tablekind, self.keysize, self.valsize, self.uri) return self.session.open_cursor(tablearg, None, 'append') def setUpConnectionOpen(self, dir): diff --git a/test/suite/test_cursor04.py b/test/suite/test_cursor04.py index 148ca4b72d9..24eb92a8c9c 100644 --- a/test/suite/test_cursor04.py +++ b/test/suite/test_cursor04.py @@ -37,9 +37,10 @@ class test_cursor04(wttest.WiredTigerTestCase): nentries = 20 scenarios = [ - ('row', dict(tablekind='row')), - ('col', dict(tablekind='col')), - ('fix', dict(tablekind='fix')) + ('row', dict(tablekind='row', uri='table')), + ('lsm-row', dict(tablekind='row', uri='lsm')), + ('col', dict(tablekind='col', uri='table')), + ('fix', dict(tablekind='fix', uri='table')) ] def config_string(self): @@ -60,7 +61,7 @@ class test_cursor04(wttest.WiredTigerTestCase): raise def create_session_and_cursor(self): - tablearg = "table:" + self.table_name1 + tablearg = self.uri + ":" + self.table_name1 if self.tablekind == 'row': keyformat = 'key_format=S' else: diff --git a/test/suite/test_cursor_tracker.py b/test/suite/test_cursor_tracker.py index 86b7cbfa27e..3e50b55cd98 100644 --- a/test/suite/test_cursor_tracker.py +++ b/test/suite/test_cursor_tracker.py @@ -136,7 +136,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase): self.encode_value = self.encode_value_fix self.decode_value = self.decode_value_fix - def cur_initial_conditions(self, tablename, npairs, tablekind, keysizes, valuesizes): + def cur_initial_conditions(self, tablename, npairs, tablekind, keysizes, valuesizes, uri="table"): if npairs >= 0xffffffff: raise Exception('cur_initial_conditions: npairs too big') self.tablekind = tablekind @@ -151,7 +151,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase): self.keysizes = keysizes self.valuesizes = valuesizes if tablekind != None: - cursor = self.session.open_cursor('table:' + tablename, None, 'append') + cursor = self.session.open_cursor(uri + ':' + tablename, None, 'append') for i in range(npairs): wtkey = self.encode_key(i << 32) wtval = self.encode_value(i << 32) diff --git a/test/suite/test_util11.py b/test/suite/test_util11.py index 12c64f68086..d24f2213794 100644 --- a/test/suite/test_util11.py +++ b/test/suite/test_util11.py @@ -71,15 +71,13 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess): self.populate(pfx + '3') # Construct what we think we'll find - filelist = '' tablelist = '' for i in range(1, 6): - filelist += 'file:' + pfx + str(i) + '.wt\n' tablelist += 'table:' + pfx + str(i) + '\n' outfile = "listout.txt" - self.runWt(["list"], outfilename=outfile) - self.check_file_content(outfile, filelist + tablelist) + self.runWt(["list", "table:"], outfilename=outfile) + self.check_file_content(outfile, tablelist) def test_list_drop(self): """ @@ -99,18 +97,11 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess): self.session.drop('table:' + pfx + '4', None) # Construct what we think we'll find - filelist = '' - tablelist = '' - filelist += 'file:' + pfx + '1.wt\n' - tablelist += 'table:' + pfx + '1\n' - filelist += 'file:' + pfx + '3.wt\n' - tablelist += 'table:' + pfx + '3\n' - filelist += 'file:' + pfx + '5.wt\n' - tablelist += 'table:' + pfx + '5\n' + tablelist = ''.join('table:' + pfx + str(i) + '\n' for i in (1, 3, 5)) outfile = "listout.txt" - self.runWt(["list"], outfilename=outfile) - self.check_file_content(outfile, filelist + tablelist) + self.runWt(["list", "table:"], outfilename=outfile) + self.check_file_content(outfile, tablelist) def test_list_drop_all(self): """ |