summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <mjc@wiredtiger.com>2012-09-14 06:56:21 -0700
committerMichael Cahill <mjc@wiredtiger.com>2012-09-14 06:56:21 -0700
commit0abd4252e3db7d623b25077fb5b0c1811cf454ed (patch)
tree67a14705d9a5e120a1f70a7489595e1c7a24b2f5
parent016a035d3acefda6ee14e5fbac215a1bd28bb98f (diff)
parent45f1893bd8ef5bc327893f7662861e73e23f4d9d (diff)
downloadmongo-0abd4252e3db7d623b25077fb5b0c1811cf454ed.tar.gz
Merge pull request #325 from wiredtiger/lsm
Add support for LSM trees closes #168
-rw-r--r--dist/api_data.py16
-rw-r--r--dist/filelist6
-rw-r--r--dist/s_funcs.list1
-rw-r--r--dist/s_string.ok2
-rw-r--r--examples/c/ex_all.c37
-rw-r--r--src/bloom/bloom.c48
-rw-r--r--src/btree/bt_evict.c19
-rw-r--r--src/btree/bt_handle.c43
-rw-r--r--src/btree/bt_sync.c12
-rw-r--r--src/config/config_def.c14
-rw-r--r--src/conn/conn_api.c10
-rw-r--r--src/conn/conn_btree.c5
-rw-r--r--src/conn/conn_handle.c2
-rw-r--r--src/cursor/cur_bulk.c4
-rw-r--r--src/cursor/cur_config.c2
-rw-r--r--src/cursor/cur_index.c2
-rw-r--r--src/docs/command-line.dox2
-rw-r--r--src/docs/cursors.dox7
-rw-r--r--src/docs/data_sources.dox4
-rw-r--r--src/docs/file-formats.dox2
-rw-r--r--src/docs/lsm.dox147
-rw-r--r--src/docs/programming.dox1
-rw-r--r--src/docs/spell.ok3
-rw-r--r--src/include/api.h24
-rw-r--r--src/include/btree.h11
-rw-r--r--src/include/cache.i9
-rw-r--r--src/include/extern.h62
-rw-r--r--src/include/lsm.h80
-rw-r--r--src/include/packing.i10
-rw-r--r--src/include/schema.h7
-rw-r--r--src/include/txn.i11
-rw-r--r--src/include/wiredtiger.in36
-rw-r--r--src/include/wt_internal.h9
-rw-r--r--src/lsm/lsm_cursor.c891
-rw-r--r--src/lsm/lsm_dsrc.c141
-rw-r--r--src/lsm/lsm_merge.c201
-rw-r--r--src/lsm/lsm_meta.c173
-rw-r--r--src/lsm/lsm_tree.c523
-rw-r--r--src/lsm/lsm_worker.c166
-rw-r--r--src/schema/schema_create.c14
-rw-r--r--src/schema/schema_drop.c2
-rw-r--r--src/schema/schema_rename.c2
-rw-r--r--src/schema/schema_truncate.c2
-rw-r--r--src/schema/schema_util.c28
-rw-r--r--src/schema/schema_worker.c3
-rw-r--r--src/session/session_api.c49
-rw-r--r--src/session/session_btree.c13
-rw-r--r--src/support/hazard.c8
-rw-r--r--src/support/scratch.c2
-rw-r--r--src/utilities/util.h6
-rw-r--r--src/utilities/util_dump.c4
-rw-r--r--src/utilities/util_list.c13
-rw-r--r--src/utilities/util_load.c5
-rw-r--r--src/utilities/util_loadtext.c4
-rw-r--r--src/utilities/util_main.c8
-rw-r--r--src/utilities/util_rename.c4
-rw-r--r--src/utilities/util_upgrade.c4
-rw-r--r--src/utilities/util_verify.c4
-rw-r--r--src/utilities/util_write.c4
-rw-r--r--test/bloom/test_bloom.c5
-rw-r--r--test/fops/t.c3
-rw-r--r--test/format/config.c16
-rw-r--r--test/format/util.c4
-rw-r--r--test/format/wts_ops.c7
-rw-r--r--test/suite/test_cursor01.py7
-rw-r--r--test/suite/test_cursor02.py9
-rw-r--r--test/suite/test_cursor03.py15
-rw-r--r--test/suite/test_cursor04.py9
-rw-r--r--test/suite/test_cursor_tracker.py4
-rw-r--r--test/suite/test_util11.py19
70 files changed, 2780 insertions, 240 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index 9d9804b3a94..143a031a7aa 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -79,8 +79,20 @@ format_meta = column_meta + [
type='format'),
]
+lsm_config = [
+ Config('lsm_chunk_size', '2MB', r'''
+ the maximum size of the in-memory chunk of an LSM tree''',
+ min='512K',max='500MB'),
+ Config('lsm_bloom_hash_count', '4', r'''
+ the number of hash values per item used for LSM bloom filters.''',
+ min='2',max='100'),
+ Config('lsm_bloom_bit_count', '8', r'''
+ the number of bits used per item for LSM bloom filters.''',
+ min='2',max='1000'),
+]
+
# Per-file configuration
-file_config = format_meta + [
+file_config = format_meta + lsm_config + [
Config('allocation_size', '512B', r'''
the file unit allocation size, in bytes, must a power-of-two;
smaller values decrease the file space required by overflow
@@ -215,6 +227,7 @@ connection_runtime_config = [
'evictserver',
'fileops',
'hazard',
+ 'lsm',
'mutex',
'read',
'readserver',
@@ -438,6 +451,7 @@ flags = {
'VERB_evict',
'VERB_evictserver',
'VERB_fileops',
+ 'VERB_lsm',
'VERB_hazard',
'VERB_mutex',
'VERB_read',
diff --git a/dist/filelist b/dist/filelist
index 1749cf047d7..e2ed4911683 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -66,6 +66,12 @@ src/cursor/cur_std.c
src/cursor/cur_table.c
src/log/log.c
src/log/log_desc.c
+src/lsm/lsm_cursor.c
+src/lsm/lsm_dsrc.c
+src/lsm/lsm_merge.c
+src/lsm/lsm_meta.c
+src/lsm/lsm_tree.c
+src/lsm/lsm_worker.c
src/meta/meta_api.c
src/meta/meta_apply.c
src/meta/meta_ckpt.c
diff --git a/dist/s_funcs.list b/dist/s_funcs.list
index b614aa5552d..574d97d42b6 100644
--- a/dist/s_funcs.list
+++ b/dist/s_funcs.list
@@ -2,6 +2,7 @@
WT_CURDUMP_PASS
__bit_ffs
__bit_nclr
+__wt_bloom_drop
__wt_bm_addr_stderr
__wt_btree_lex_compare
__wt_config_getone
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 678f31e7b1b..d3e661b4cfc 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -104,7 +104,6 @@ LRU
LSB
LSM
LSN
-LSN's
LSNs
LeafGreen
Llqr
@@ -271,6 +270,7 @@ ckptfrag
ckptlist
cksum
clr
+clsm
cmd
cmp
cnt
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index ba63970abf5..09a95d3aaee 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -523,12 +523,13 @@ transaction_ops(WT_CONNECTION *conn, WT_SESSION *session)
/*! [WT_DATA_SOURCE create] */
static int
my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config)
+ const char *name, int exclusive, const char *config)
{
/* Unused parameters */
(void)dsrc;
(void)session;
(void)name;
+ (void)exclusive;
(void)config;
return (0);
@@ -538,13 +539,13 @@ my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
/*! [WT_DATA_SOURCE drop] */
static int
my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config)
+ const char *name, const char *cfg[])
{
/* Unused parameters */
(void)dsrc;
(void)session;
(void)name;
- (void)config;
+ (void)cfg;
return (0);
}
@@ -553,16 +554,14 @@ my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
/*! [WT_DATA_SOURCE open_cursor] */
static int
my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *obj, WT_CURSOR *old_cursor, const char *config,
- WT_CURSOR **new_cursor)
+ const char *obj, const char *cfg[], WT_CURSOR **new_cursor)
{
/* Unused parameters */
(void)dsrc;
(void)session;
(void)obj;
- (void)old_cursor;
- (void)config;
+ (void)cfg;
(void)new_cursor;
return (0);
@@ -572,44 +571,29 @@ my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
/*! [WT_DATA_SOURCE rename] */
static int
my_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *oldname, const char *newname, const char *config)
+ const char *oldname, const char *newname, const char *cfg[])
{
/* Unused parameters */
(void)dsrc;
(void)session;
(void)oldname;
(void)newname;
- (void)config;
+ (void)cfg;
return (0);
}
/*! [WT_DATA_SOURCE rename] */
-/*! [WT_DATA_SOURCE sync] */
-static int
-my_sync(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config)
-{
- /* Unused parameters */
- (void)dsrc;
- (void)session;
- (void)name;
- (void)config;
-
- return (0);
-}
-/*! [WT_DATA_SOURCE sync] */
-
/*! [WT_DATA_SOURCE truncate] */
static int
my_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config)
+ const char *name, const char *cfg[])
{
/* Unused parameters */
(void)dsrc;
(void)session;
(void)name;
- (void)config;
+ (void)cfg;
return (0);
}
@@ -626,7 +610,6 @@ add_data_source(WT_CONNECTION *conn)
my_drop,
my_open_cursor,
my_rename,
- my_sync,
my_truncate
};
ret = conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL);
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index f21de05ac88..657cb56e04b 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -14,6 +14,10 @@ static int __bloom_init(
WT_SESSION_IMPL *, const char *, const char *, WT_BLOOM **);
static int __bloom_setup(WT_BLOOM *, uint64_t, uint64_t, uint32_t, uint32_t);
+/*
+ * __bloom_init --
+ * Allocate a WT_BLOOM handle.
+ */
static int
__bloom_init(WT_SESSION_IMPL *session,
const char *uri, const char *config, WT_BLOOM **bloomp)
@@ -51,10 +55,12 @@ err: if (bloom->uri != NULL)
}
/*
- * Populate the bloom structure.
- * Setup is passed in either the count of items expected (n), or the length
- * of the bitstring (m). Depends on whether the function is called via create
- * or open.
+ * __bloom_setup --
+ * Populate the bloom structure.
+ *
+ * Setup is passed in either the count of items expected (n), or the length of
+ * the bitstring (m). Depends on whether the function is called via create or
+ * open.
*/
static int
__bloom_setup(
@@ -77,8 +83,8 @@ __bloom_setup(
/*
* __wt_bloom_create --
*
- * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory
- * to use while populating the bloom filter.
+ * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to
+ * use while populating the bloom filter.
*
* count - is the expected number of inserted items
* factor - is the number of bits to use per inserted item
@@ -101,28 +107,28 @@ __wt_bloom_create(
/*
* __wt_bloom_open --
- * Open a Bloom filter object for use by a single session. The filter must have
- * been created and finalized.
+ * Open a Bloom filter object for use by a single session. The filter must
+ * have been created and finalized.
*/
int
__wt_bloom_open(WT_SESSION_IMPL *session,
- const char *uri, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
+ const char *uri, uint32_t factor, uint32_t k,
+ WT_CURSOR *owner, WT_BLOOM **bloomp)
{
WT_BLOOM *bloom;
WT_CURSOR *c;
- WT_SESSION *wt_session;
+ const char *cfg[] = API_CONF_DEFAULTS(session, open_cursor, NULL);
uint64_t size;
- wt_session = (WT_SESSION *)session;
-
WT_RET(__bloom_init(session, uri, NULL, &bloom));
/* Find the largest key, to get the size of the filter. */
- WT_RET(wt_session->open_cursor(wt_session, bloom->uri, NULL, NULL, &c));
+ cfg[1] = bloom->config;
+ WT_RET(__wt_curfile_open(session, bloom->uri, owner, cfg, &c));
WT_RET(c->prev(c));
WT_RET(c->get_key(c, &size));
- WT_RET(c->close(c));
+ bloom->c = c;
WT_RET(__bloom_setup(bloom, 0, size, factor, k));
*bloomp = bloom;
@@ -131,7 +137,7 @@ __wt_bloom_open(WT_SESSION_IMPL *session,
/*
* __wt_bloom_insert --
- * Adds the given key to the Bloom filter.
+ * Adds the given key to the Bloom filter.
*/
int
__wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
@@ -149,8 +155,8 @@ __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
/*
* __wt_bloom_finalize --
- * Writes the Bloom filter to stable storage. After calling finalize, only
- * read operations can be performed on the bloom filter.
+ * Writes the Bloom filter to stable storage. After calling finalize, only
+ * read operations can be performed on the bloom filter.
*/
int
__wt_bloom_finalize(WT_BLOOM *bloom)
@@ -183,8 +189,8 @@ __wt_bloom_finalize(WT_BLOOM *bloom)
/*
* __wt_bloom_get --
- * Tests whether the given key is in the Bloom filter.
- * Returns zero if found, WT_NOTFOUND if not.
+ * Tests whether the given key is in the Bloom filter.
+ * Returns zero if found, WT_NOTFOUND if not.
*/
int
__wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
@@ -243,7 +249,7 @@ err: /* Don't return WT_NOTFOUND from a failed search. */
/*
* __wt_bloom_close --
- * Close the Bloom filter, release any resources.
+ * Close the Bloom filter, release any resources.
*/
int
__wt_bloom_close(WT_BLOOM *bloom)
@@ -265,7 +271,7 @@ __wt_bloom_close(WT_BLOOM *bloom)
/*
* __wt_bloom_drop --
- * Drop a Bloom filter, release any resources.
+ * Drop a Bloom filter, release any resources.
*/
int
__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index a9baa586bb3..34c2f7d3e3d 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -576,6 +576,14 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop)
session, page, WT_REC_SINGLE));
break;
case WT_SYNC_DISCARD_NOWRITE:
+ /*
+ * When we discard the root page, clear the reference
+ * from the btree handle. It is important to do this
+ * here, so that future eviction doesn't see root_page
+ * pointing to freed memory.
+ */
+ if (WT_PAGE_IS_ROOT(page))
+ session->btree->root_page = NULL;
__wt_page_out(session, &page, 0);
break;
}
@@ -734,13 +742,18 @@ __evict_walk(WT_SESSION_IMPL *session)
i = WT_EVICT_WALK_BASE;
TAILQ_FOREACH(btree, &conn->btqh, q) {
/*
- * Skip files marked as cache-resident, and files involved (or
- * potentially involved), in a bulk load. The real problem is
+ * Skip files that aren't open or don't have a root page.
+ *
+ * Also skip files marked as cache-resident, and files
+ * potentially involved in a bulk load. The real problem is
* eviction doesn't want to be walking the file as it converts
* to a bulk-loaded object, and empty trees aren't worth trying
* to evict, anyway.
*/
- if (btree->cache_resident || btree->bulk_load_ok)
+ if (!F_ISSET(btree, WT_BTREE_OPEN) ||
+ btree->root_page == NULL ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
+ btree->bulk_load_ok)
continue;
/* Reference the correct WT_BTREE handle. */
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 24e7e50bf75..a1602173498 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -210,11 +210,14 @@ __btree_conf(WT_SESSION_IMPL *session)
/* Eviction; the metadata file is never evicted. */
if (strcmp(btree->name, WT_METADATA_URI) == 0)
- btree->cache_resident = 1;
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
else {
WT_RET(__wt_config_getones(
session, config, "cache_resident", &cval));
- btree->cache_resident = cval.val ? 1 : 0;
+ if (cval.val)
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+ else
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
}
/* Huffman encoding */
@@ -403,6 +406,42 @@ __wt_btree_leaf_create(
}
/*
+ * __wt_btree_get_memsize --
+ * Access the size of an in-memory tree with a single leaf page.
+ */
+int
+__wt_btree_get_memsize(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, uint32_t **memsizep)
+{
+ WT_PAGE *root, *child;
+
+ WT_UNUSED(session);
+ root = btree->root_page;
+ child = root->u.intl.t->page;
+
+ if (root->entries != 1 || child == NULL) {
+ *memsizep = NULL;
+ return (WT_ERROR);
+ }
+
+ *memsizep = &child->memory_footprint;
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ return (0);
+}
+
+/*
+ * __wt_btree_release_memsize --
+ * Release a cache-resident tree.
+ */
+int
+__wt_btree_release_memsize(WT_SESSION_IMPL *session, WT_BTREE *btree)
+{
+ WT_UNUSED(session);
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+ return (0);
+}
+
+/*
* __btree_get_last_recno --
* Set the last record number for a column-store.
*/
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 321d05608d2..8184bdaf058 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -70,20 +70,10 @@ __wt_bt_cache_flush(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
case WT_SYNC:
break;
case WT_SYNC_DISCARD:
+ case WT_SYNC_DISCARD_NOWRITE:
/* If discarding the tree, the root page should be gone. */
WT_ASSERT(session, btree->root_page == NULL);
break;
- case WT_SYNC_DISCARD_NOWRITE:
- /*
- * XXX
- * I'm not sure this is the right place to do this, but it's
- * the point in the btree engine where we know the root page
- * is gone. Unlike WT_SYNC_DISCARD, which writes, evicts and
- * discards the root page, WT_SYNC_DISCARD_NOWRITE simply
- * discards the pages, which means "eviction" never happens.
- */
- btree->root_page = NULL;
- break;
}
return (0);
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 3fb6d54e39b..4b26ecd97b5 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -92,7 +92,7 @@ __wt_confchk_connection_reconfigure[] = {
{ "eviction_target", "int", "min=10,max=99" },
{ "eviction_trigger", "int", "min=10,max=99" },
{ "verbose", "list", "choices=[\"block\",\"ckpt\",\"evict\","
- "\"evictserver\",\"fileops\",\"hazard\",\"mutex\",\"read\","
+ "\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\","
"\"readserver\",\"reconcile\",\"salvage\",\"verify\",\"write\"]" },
{ NULL, NULL, NULL }
};
@@ -112,7 +112,8 @@ __wt_confdfl_file_meta =
"checksum=,collator=,columns=,dictionary=0,huffman_key=,"
"huffman_value=,internal_item_max=0,internal_key_truncate=,"
"internal_page_max=2KB,key_format=u,key_gap=10,leaf_item_max=0,"
- "leaf_page_max=1MB,prefix_compression=,split_pct=75,type=btree,"
+ "leaf_page_max=1MB,lsm_bloom_bit_count=8,lsm_bloom_hash_count=4,"
+ "lsm_chunk_size=2MB,prefix_compression=,split_pct=75,type=btree,"
"value_format=u,version=(major=0,minor=0)";
WT_CONFIG_CHECK
@@ -134,6 +135,9 @@ __wt_confchk_file_meta[] = {
{ "key_gap", "int", "min=0" },
{ "leaf_item_max", "int", "min=0" },
{ "leaf_page_max", "int", "min=512B,max=512MB" },
+ { "lsm_bloom_bit_count", "int", "min=2,max=1000" },
+ { "lsm_bloom_hash_count", "int", "min=2,max=100" },
+ { "lsm_chunk_size", "int", "min=512K,max=500MB" },
{ "prefix_compression", "boolean", NULL },
{ "split_pct", "int", "min=25,max=100" },
{ "type", "string", "choices=[\"btree\"]" },
@@ -208,6 +212,7 @@ __wt_confdfl_session_create =
"filename=,huffman_key=,huffman_value=,internal_item_max=0,"
"internal_key_truncate=,internal_page_max=2KB,key_format=u,"
"key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=1MB,"
+ "lsm_bloom_bit_count=8,lsm_bloom_hash_count=4,lsm_chunk_size=2MB,"
"prefix_compression=,split_pct=75,type=btree,value_format=u,"
"value_format=u";
@@ -234,6 +239,9 @@ __wt_confchk_session_create[] = {
{ "key_gap", "int", "min=0" },
{ "leaf_item_max", "int", "min=0" },
{ "leaf_page_max", "int", "min=512B,max=512MB" },
+ { "lsm_bloom_bit_count", "int", "min=2,max=1000" },
+ { "lsm_bloom_hash_count", "int", "min=2,max=100" },
+ { "lsm_chunk_size", "int", "min=512K,max=500MB" },
{ "prefix_compression", "boolean", NULL },
{ "split_pct", "int", "min=25,max=100" },
{ "type", "string", "choices=[\"btree\"]" },
@@ -394,7 +402,7 @@ __wt_confchk_wiredtiger_open[] = {
{ "transactional", "boolean", NULL },
{ "use_environment_priv", "boolean", NULL },
{ "verbose", "list", "choices=[\"block\",\"ckpt\",\"evict\","
- "\"evictserver\",\"fileops\",\"hazard\",\"mutex\",\"read\","
+ "\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\","
"\"readserver\",\"reconcile\",\"salvage\",\"verify\",\"write\"]" },
{ NULL, NULL, NULL }
};
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index f69f0771b81..1133c428fa6 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -319,6 +319,9 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config)
if (!F_ISSET(s, WT_SESSION_INTERNAL))
__wt_free(session, s->hazard);
+ /* Clean up open LSM handles. */
+ WT_ERR(__wt_lsm_cleanup(&conn->iface));
+
/* Close open btree handles. */
WT_TRET(__wt_conn_btree_discard(conn));
@@ -730,6 +733,7 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "evictserver",WT_VERB_evictserver },
{ "fileops", WT_VERB_fileops },
{ "hazard", WT_VERB_hazard },
+ { "lsm", WT_VERB_lsm },
{ "mutex", WT_VERB_mutex },
{ "read", WT_VERB_read },
{ "readserver", WT_VERB_readserver },
@@ -929,6 +933,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
/* If there's a hot-backup file, load it. */
WT_ERR(__wt_metadata_load_backup(session));
+ /*
+ * XXX LSM initialization.
+ * This is structured so that it could be moved to an extension.
+ */
+ WT_ERR(__wt_lsm_init(&conn->iface, NULL));
+
STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
*wt_connp = &conn->iface;
diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c
index 116ee404ccc..c39d0fd423f 100644
--- a/src/conn/conn_btree.c
+++ b/src/conn/conn_btree.c
@@ -244,10 +244,7 @@ __wt_conn_btree_get(WT_SESSION_IMPL *session,
WT_STAT_INCR(conn->stats, file_open);
- if (session->btree != NULL)
- WT_RET(__conn_btree_open_lock(session, flags));
- else
- WT_RET(__conn_btree_get(session, name, ckpt, flags));
+ WT_RET(__conn_btree_get(session, name, ckpt, flags));
btree = session->btree;
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 0434feae0d1..4c44d6fe9cf 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -25,6 +25,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
TAILQ_INIT(&conn->collqh); /* Collator list */
TAILQ_INIT(&conn->compqh); /* Compressor list */
+ TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */
+
/* Statistics. */
WT_RET(__wt_stat_alloc_connection_stats(session, &conn->stats));
diff --git a/src/cursor/cur_bulk.c b/src/cursor/cur_bulk.c
index ed14beded58..1c6c54783a6 100644
--- a/src/cursor/cur_bulk.c
+++ b/src/cursor/cur_bulk.c
@@ -48,8 +48,10 @@ __curbulk_close(WT_CURSOR *cursor)
CURSOR_API_CALL_NOCONF(cursor, session, close, btree);
WT_TRET(__wt_bulk_end(cbulk));
- if (session->btree != NULL)
+ if (btree != NULL) {
+ WT_ASSERT(session, session->btree == btree);
WT_TRET(__wt_session_release_btree(session));
+ }
/* The URI is owned by the btree handle. */
cursor->uri = NULL;
WT_TRET(__wt_cursor_close(cursor));
diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c
index db6660e1ba8..5dd91d33c16 100644
--- a/src/cursor/cur_config.c
+++ b/src/cursor/cur_config.c
@@ -68,7 +68,7 @@ __wt_curconfig_open(WT_SESSION_IMPL *session,
/* __wt_cursor_init is last so we don't have to clean up on error. */
STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);
- WT_ERR(__wt_cursor_init(cursor, uri, 0, cfg, cursorp));
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
if (0) {
err: __wt_free(session, cconfig);
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 4838c33c9d8..01545c97d2f 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -435,7 +435,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
WT_ERR(__curindex_open_colgroups(session, cindex, cfg));
/* __wt_cursor_init is last so we don't have to clean up on error. */
- WT_ERR(__wt_cursor_init(cursor, cursor->uri, 0, cfg, cursorp));
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, NULL, cfg, cursorp));
if (0) {
err: (void)__curindex_close(cursor);
diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox
index def8d3864a0..34b703246a2 100644
--- a/src/docs/command-line.dox
+++ b/src/docs/command-line.dox
@@ -45,7 +45,7 @@ The following are command-specific options for the \c backup command:
@par <code>-t uri</code>
By default, the \c backup command does a hot backup of the entire
database; the \c -t option changes the \c backup command to do a hot
-backup of only the listed \c file: and \c table: objects.
+backup of only the named objects.
<hr>
@section util_create wt create
diff --git a/src/docs/cursors.dox b/src/docs/cursors.dox
index 302dabcf5d2..faf78c1a446 100644
--- a/src/docs/cursors.dox
+++ b/src/docs/cursors.dox
@@ -63,6 +63,8 @@ table cursor (key=table key\, value=table value)}
file cursor (key=file key\, value=file value)}
@row{<tt>index:\<tablename\>.\<indexname\></tt>,
index cursor (key=index key\, value=table value)}
+ @row{<tt>lsm:\<name\></tt>,
+LSM cursor (key=LSM key\, value=LSM value), see @ref lsm}
@row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
database or file statistics (key=(int)\,
value=(string)description\, (string)value\, (uint64_t)value)}
@@ -97,4 +99,9 @@ WT_CURSOR::set_key and WT_CURSOR::set_value in raw mode, the WT_ITEM
should be equivalent to calling ::wiredtiger_struct_pack for the
cursor's \c key_format or \c value_format, respectively.
+@section cursor_random Random lookup
+
+Cursors can be configured to return pseudo-random records from row-store
+objects. See @subpage cursor_random for details.
+
*/
diff --git a/src/docs/data_sources.dox b/src/docs/data_sources.dox
index 3a879a2cb94..ec2cd682f2f 100644
--- a/src/docs/data_sources.dox
+++ b/src/docs/data_sources.dox
@@ -32,12 +32,14 @@ file cursor (key=file key\, value=file value)}
index cursor (key=index key\, value=table value)}
@row{<tt>join:\<cursor1\>\&\<cursor2\>[&\<cursor3\>...]</tt>,
join cursor @notyet{join cursors}}
+ @row{<tt>lsm:\<name\></tt>,
+LSM cursor (key=LSM key\, value=LSM value), see @ref lsm}
@row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
database or file statistics (key=(int)\,
value=(string)description\, (string)value\, (uint64_t)value)}
</table>
-@subsection hot_backup_cursors Hot backup cursors
+@subsection data_backup Hot backup cursors
See @ref hot_backup for more information.
diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox
index ee9022247b7..0b4c0f21592 100644
--- a/src/docs/file-formats.dox
+++ b/src/docs/file-formats.dox
@@ -61,7 +61,7 @@ additional CPU and memory use when searching the in-memory tree (if keys
are encoded), and additional CPU and memory use when returning values
from the in-memory tree and when writing pages to disk. Note the
additional CPU cost of Huffman encoding can be high, and should be
-considered. (See @ref huffman for details)
+considered. (See @subpage huffman for details)
- Stream compression reduces the size requirement of on-disk objects by
compressing blocks of the backing object's file. The cost is additional
diff --git a/src/docs/lsm.dox b/src/docs/lsm.dox
new file mode 100644
index 00000000000..356f6777988
--- /dev/null
+++ b/src/docs/lsm.dox
@@ -0,0 +1,147 @@
+/*! @page lsm Log-Structured Merge Trees
+
+@section lsm_background Background
+
+A common requirement is sustained throughput under a workload that consists
+of random inserts, where either the key range is chosen so that inserts are
+very unlikely to conflict (e.g., 128-bit hashes), or where inserts are
+expected to overwrite existing values.
+
+With traditional btree variants, inserts are very fast while the data set
+remains in cache, but once the tree overflows the cache, performance drops
+significantly. There are two factors involved:
+
+1. once the data fills the cache, new inserts have some probability of going
+to a page that is not in cache, requiring a read; and
+2. the cache is full of dirty pages, so pages have to be written to free
+space in the cache before the read can be satisfied.
+
+@section lsm_description Description of LSM trees
+
+Log-Structured Merge Trees were described in this paper by Patrick O'Neil1 ,
+Edward Cheng, Dieter Gawlick and Elizabeth O'Neil1:
+http://www.cs.umb.edu/~poneil/lsmtree.pdf
+
+A logical tree is split into several physical pieces so that the
+most-recently-updated portion of data is in a tree that fits entirely in
+memory. The size of the in-memory chunk can be configured with the
+\c "lsm_chunk_size" configuration key to WT_SESSION::create.
+
+Once the in-memory tree reaches a threshold size, a new in-memory tree is
+created and the old tree synced to disk. Once written to disk, trees are
+read-only, though they are merged in the background with other on-disk
+trees to reduce the cost of reads.
+
+With this structure, "blind writes" can be performed entirely on the
+in-memory tree. Deletes are implemented by inserting a special "tombstone"
+record into the in-memory tree.
+
+@section lsm_api Interface to LSM trees
+
+An LSM tree can be created as follows, in much the same way as a
+WiredTiger btree file:
+
+@code
+session->create(session, "lsm:bucket", "key_format=S,value_format=S");
+@endcode
+
+Once created, the LSM tree can be accessed using the same cursor interface
+as other data sources in WiredTiger:
+
+@code
+WT_CURSOR *c;
+
+session->open_cursor(session, "lsm:bucket", NULL, NULL, &c);
+for(;;) {
+ c->set_key("key");
+ c->set_value("data");
+ c->insert();
+}
+@endcode
+
+Unlike ordinary file cursors, LSM cursors default to \c overwrite mode, where:
+
+- WT_CURSOR::insert will update existing values without checking;
+- WT_CURSOR::update will insert values regardless of whether they exist; and
+- WT_CURSOR::remove will succeed regardless of whether the specified record
+ exists.
+
+This behavior can be disabled by passing \c "overwrite=false" to
+WT_SESSION::open_cursor, but the result will be a search through the levels
+of the LSM tree before every modification.
+
+@section lsm_merge Merging
+
+A background thread is opened for each active LSM tree. This thread is
+responsible for both writing old chunks to stable storage, and for merging
+multiple chunks together so that reads can be satisfied from a small number
+of files. There is currently no way to configure merges: they are performed
+automatically by the background thread.
+
+@section lsm_bloom Bloom filters
+
+WiredTiger creates a Bloom filter when merging. This is an additional file
+that contains a configurable number of bits per key (default 8). Keys are
+hashed a configurable number of times (default 4), and the corresponding
+bits set. The Bloom filter is used to avoid reading from a chunk if the key
+cannot be present.
+
+With the defaults, they Bloom filter only requires one byte per key, so it
+usually fits in cache. The Bloom parameters can be configured with
+\c "lsm_bloom_bit_count" and \c "lsm_bloom_hash_count" configuration keys to
+WT_SESSION::create.
+
+@section lsm_caveats Caveats
+
+@subsection lsm_hazard Hazard configuration
+
+Reads from an LSM cursor may need to position a cursor in each active chunk.
+The number of chunks depends on the chunk size, and how many chunks have
+been merged. There must be at least as many hazard references available as
+there are chunks in the tree. The number of hazard references is configured
+with the \c "hazard_max" configuration key to ::wiredtiger_open.
+
+@subsection lsm_schema Creating tables using LSM trees
+
+It is not yet possible to create tables or indices using LSM trees for
+storage. This will be addressed in a future release of WiredTiger.
+
+Schema support will be provided for LSM as with an extension to the
+WT_SESSION::create method:
+
+@code
+session->create(session, "table:T", "type=lsm");
+@endcode
+
+The default type for all schema objects will continue to be btree.
+
+@subsection lsm_tombstones Empty values
+
+Internally, WiredTiger's LSM trees use an empty value to represent a
+record that has been removed (also known as a "tombstone"). For this
+reason, applications cannot store records in LSM trees with empty values.
+
+@subsection lsm_txn Transactional access
+
+There are currently some significant limitations in transactional access to
+data stored in LSM trees:
+
+- if an update transaction runs for long enough that the chunk it started
+ writing to has been replaced, its data may not be included in the
+ checkpoint when that chunk is swapped to "on disk" mode, and thus may not
+ become visible to readers of the tree immediately when the transaction
+ commits;
+
+- update transactions running at snapshot isolation may be permitted to
+ commit if they conflict with an update in a concurrent transaction and
+ the current chunk has been replaced;
+
+- read-only transactions running at snapshot isolation may read newer
+ changes after a chunk is written to stable storage;
+
+- named checkpoints are not fully supported on LSM trees: recent updates to
+ the tree may not appear in the checkpoint;
+
+We intend to address these limitations in future releases.
+
+ */
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index 6e84632aad8..c2eb1f5c196 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -11,6 +11,7 @@ WiredTiger applications:
- @subpage schema
- @subpage file_formats
+- @subpage lsm
- @subpage transactions
- @subpage checkpoints
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 1d707526ab9..5004c29958d 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -4,6 +4,7 @@ Atomicity
BLOBs
CFLAGS
CPPFLAGS
+Cheng
DB's
DBTs
DbCursor
@@ -12,6 +13,7 @@ DbMultiple
EB
EmpId
FreeBSD
+Gawlick
GCC
GitHub
IEC
@@ -158,6 +160,7 @@ loadtext
logc
lookup
lrtf
+lsm
lsn
lt
mailto
diff --git a/src/include/api.h b/src/include/api.h
index d2d588f031c..1beccb5c45c 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -201,10 +201,11 @@ struct __wt_connection_impl {
WT_FH *lock_fh; /* Lock file handle */
pthread_t cache_evict_tid; /* Cache eviction server thread ID */
- pthread_t cache_read_tid; /* Cache read server thread ID */
/* Locked: btree list */
TAILQ_HEAD(__wt_btree_qh, __wt_btree) btqh;
+ /* Locked: LSM handle list. */
+ TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
/* Locked: file list */
TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
@@ -230,10 +231,8 @@ struct __wt_connection_impl {
uint32_t session_cnt; /* Session count */
/*
- * WiredTiger allocates space for 15 hazard references in each thread of
- * control, by default. There's no code path that requires more than 15
- * pages at a time (and if we find one, the right change is to increase
- * the default).
+ * WiredTiger allocates space for a fixed number of hazard references
+ * in each thread of control.
*/
uint32_t hazard_size; /* Hazard array size */
@@ -318,7 +317,7 @@ struct __wt_connection_impl {
#define CURSOR_API_CALL_NOCONF(cur, s, n, bt) \
(s) = (WT_SESSION_IMPL *)(cur)->session; \
- API_CALL_NOCONF(s, cursor, n, cur, bt); \
+ API_CALL_NOCONF(s, cursor, n, cur, bt)
/*******************************************
* Global variables.
@@ -341,12 +340,13 @@ extern WT_PROCESS __wt_process;
#define WT_SESSION_INTERNAL 0x00000004
#define WT_SESSION_SALVAGE_QUIET_ERR 0x00000002
#define WT_SESSION_SCHEMA_LOCKED 0x00000001
-#define WT_VERB_block 0x00001000
-#define WT_VERB_ckpt 0x00000800
-#define WT_VERB_evict 0x00000400
-#define WT_VERB_evictserver 0x00000200
-#define WT_VERB_fileops 0x00000100
-#define WT_VERB_hazard 0x00000080
+#define WT_VERB_block 0x00002000
+#define WT_VERB_ckpt 0x00001000
+#define WT_VERB_evict 0x00000800
+#define WT_VERB_evictserver 0x00000400
+#define WT_VERB_fileops 0x00000200
+#define WT_VERB_hazard 0x00000100
+#define WT_VERB_lsm 0x00000080
#define WT_VERB_mutex 0x00000040
#define WT_VERB_read 0x00000020
#define WT_VERB_readserver 0x00000010
diff --git a/src/include/btree.h b/src/include/btree.h
index 8a049de541c..7211e8b0704 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -118,7 +118,6 @@ struct __wt_btree {
WT_PAGE *evict_page; /* Eviction thread's location */
volatile uint32_t lru_count; /* Count of threads in LRU eviction */
- int cache_resident; /* If no eviction on this object */
WT_BTREE_STATS *stats; /* Btree statistics */
@@ -126,10 +125,12 @@ struct __wt_btree {
#define WT_BTREE_DISCARD 0x0002 /* Discard on release */
#define WT_BTREE_EXCLUSIVE 0x0004 /* Need exclusive access to handle */
#define WT_BTREE_LOCK_ONLY 0x0008 /* Handle is only needed for locking */
-#define WT_BTREE_OPEN 0x0020 /* Handle is open */
-#define WT_BTREE_SALVAGE 0x0040 /* Handle is for salvage */
-#define WT_BTREE_UPGRADE 0x0080 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x0100 /* Handle is for verify */
+#define WT_BTREE_NO_EVICTION 0x0010 /* Disable eviction */
+#define WT_BTREE_NO_HAZARD 0x0020 /* Disable hazard references */
+#define WT_BTREE_OPEN 0x0040 /* Handle is open */
+#define WT_BTREE_SALVAGE 0x0080 /* Handle is for salvage */
+#define WT_BTREE_UPGRADE 0x0100 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x0200 /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/include/cache.i b/src/include/cache.i
index 3ed1a660c63..79b99653a78 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -48,8 +48,13 @@ __wt_eviction_page_check(WT_SESSION_IMPL *session, WT_PAGE *page)
conn = S2C(session);
mod = page->modify;
- /* Root pages and clean pages are never forcibly evicted. */
- if (WT_PAGE_IS_ROOT(page) || !__wt_page_is_modified(page))
+ /*
+ * Root pages and clean pages are never forcibly evicted.
+ * Nor are pages from files that are purely cache resident.
+ */
+ if (WT_PAGE_IS_ROOT(page) ||
+ !__wt_page_is_modified(page) ||
+ F_ISSET(session->btree, WT_BTREE_NO_EVICTION))
return (0);
/* Check the page's memory footprint. */
diff --git a/src/include/extern.h b/src/include/extern.h
index ea1cb091fe7..403069a0920 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -234,6 +234,7 @@ extern int __wt_bloom_open(WT_SESSION_IMPL *session,
const char *uri,
uint32_t factor,
uint32_t k,
+ WT_CURSOR *owner,
WT_BLOOM **bloomp);
extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key);
extern int __wt_bloom_finalize(WT_BLOOM *bloom);
@@ -307,6 +308,11 @@ extern int __wt_btree_leaf_create( WT_SESSION_IMPL *session,
WT_PAGE *parent,
WT_REF *ref,
WT_PAGE **pagep);
+extern int __wt_btree_get_memsize( WT_SESSION_IMPL *session,
+ WT_BTREE *btree,
+ uint32_t **memsizep);
+extern int __wt_btree_release_memsize(WT_SESSION_IMPL *session,
+ WT_BTREE *btree);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session,
const char *config);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
@@ -659,6 +665,60 @@ extern int __wt_log_printf(WT_SESSION_IMPL *session,
2,
3)));
extern WT_LOGREC_DESC __wt_logdesc_debug;
+extern int __wt_clsm_init_merge(WT_CURSOR *cursor, int nchunks);
+extern int __wt_clsm_open(WT_SESSION_IMPL *session,
+ const char *uri,
+ const char *cfg[],
+ WT_CURSOR **cursorp);
+extern int __wt_lsm_init(WT_CONNECTION *wt_conn, const char *config);
+extern int __wt_lsm_cleanup(WT_CONNECTION *wt_conn);
+extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree,
+ int nchunks,
+ WT_LSM_CHUNK **chunkp);
+extern int __wt_lsm_major_merge(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session);
+extern int __wt_lsm_tree_bloom_name( WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree,
+ int i,
+ WT_ITEM *buf);
+extern int __wt_lsm_tree_chunk_name( WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree,
+ int i,
+ WT_ITEM *buf);
+extern int __wt_lsm_tree_create_chunk( WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree,
+ int i,
+ const char **urip);
+extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session,
+ const char *uri,
+ int exclusive,
+ const char *config);
+extern int __wt_lsm_tree_get( WT_SESSION_IMPL *session,
+ const char *uri,
+ WT_LSM_TREE **treep);
+extern int __wt_lsm_tree_switch( WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session,
+ const char *name,
+ const char *cfg[]);
+extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
+ const char *oldname,
+ const char *newname,
+ const char *cfg[]);
+extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session,
+ const char *name,
+ const char *cfg[]);
+extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*func)(WT_SESSION_IMPL *,
+ const char *[]),
+ const char *cfg[],
+ uint32_t open_flags);
+extern void *__wt_lsm_worker(void *arg);
extern int __wt_metadata_get(WT_SESSION *session,
const char *uri,
const char **valuep);
@@ -1091,7 +1151,7 @@ extern int __wt_buf_catfmt(WT_SESSION_IMPL *session,
4)));
extern int
__wt_scr_alloc_func(WT_SESSION_IMPL *session,
- uint32_t size, WT_ITEM **scratchp
+ size_t size, WT_ITEM **scratchp
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
diff --git a/src/include/lsm.h b/src/include/lsm.h
new file mode 100644
index 00000000000..2308b6c9558
--- /dev/null
+++ b/src/include/lsm.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_cursor_lsm {
+ WT_CURSOR iface;
+
+ WT_LSM_TREE *lsm_tree;
+ uint64_t dsk_gen;
+
+ int nchunks;
+ WT_BLOOM **blooms;
+ WT_CURSOR **cursors;
+ WT_CURSOR *current; /* The current cursor for iteration */
+
+ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk. */
+
+#define WT_CLSM_ITERATE_NEXT 0x01 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x02 /* Backward iteration */
+#define WT_CLSM_MERGE 0x04 /* Merge cursor, don't update. */
+#define WT_CLSM_MULTIPLE 0x08 /* Multiple cursors have values for the
+ current key */
+#define WT_CLSM_UPDATED 0x10 /* Cursor has done updates */
+ uint32_t flags;
+};
+
+struct __wt_lsm_chunk {
+ const char *uri; /* Data source for this chunk. */
+ const char *bloom_uri; /* URI of Bloom filter, if any. */
+ uint64_t count; /* Approximate count of records. */
+
+ uint32_t ncursor; /* Cursors with the chunk as primary. */
+#define WT_LSM_CHUNK_ONDISK 0x01
+ uint32_t flags;
+};
+
+struct __wt_lsm_tree {
+ const char *name, *config, *filename;
+ const char *key_format, *value_format, *file_config;
+
+ WT_COLLATOR *collator;
+
+ WT_RWLOCK *rwlock;
+ TAILQ_ENTRY(__wt_lsm_tree) q;
+
+ WT_SPINLOCK lock;
+ uint64_t dsk_gen;
+ uint32_t *memsizep;
+
+ /* Configuration parameters */
+ uint32_t threshold;
+ uint32_t bloom_bit_count;
+ uint32_t bloom_hash_count;
+
+ WT_SESSION_IMPL *worker_session;/* Passed to thread_create */
+ pthread_t worker_tid; /* LSM worker thread */
+
+ int nchunks; /* Number of active chunks */
+ int last; /* Last allocated ID. */
+ WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */
+ size_t chunk_alloc; /* Space allocated for chunks */
+ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
+ size_t old_alloc; /* Space allocated for old chunks */
+ int nold_chunks; /* Number of old chunks */
+ int old_avail; /* Available old chunk slots */
+
+#define WT_LSM_TREE_OPEN 0x01
+ uint32_t flags;
+};
+
+struct __wt_lsm_data_source {
+ WT_DATA_SOURCE iface;
+
+ WT_RWLOCK *rwlock;
+
+ TAILQ_HEAD(__trees, __wt_lsm_tree) trees;
+};
diff --git a/src/include/packing.i b/src/include/packing.i
index 99f097fbcdd..5896ef5ca3f 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -117,6 +117,7 @@ next: if (pack->cur == pack->end)
}
#define WT_PACK_GET(session, pv, ap) do { \
+ WT_ITEM *__item; \
switch (pv.type) { \
case 'x': \
break; \
@@ -126,7 +127,9 @@ next: if (pack->cur == pack->end)
break; \
case 'U': \
case 'u': \
- pv.u.item = *va_arg(ap, WT_ITEM *); \
+ __item = va_arg(ap, WT_ITEM *); \
+ pv.u.item.data = __item->data; \
+ pv.u.item.size = __item->size; \
break; \
case 'b': \
case 'h': \
@@ -390,6 +393,7 @@ __unpack_read(WT_SESSION_IMPL *session,
}
#define WT_UNPACK_PUT(session, pv, ap) do { \
+ WT_ITEM *__item; \
switch (pv.type) { \
case 'x': \
break; \
@@ -399,7 +403,9 @@ __unpack_read(WT_SESSION_IMPL *session,
break; \
case 'U': \
case 'u': \
- *va_arg(ap, WT_ITEM *) = pv.u.item; \
+ __item = va_arg(ap, WT_ITEM *); \
+ __item->data = pv.u.item.data; \
+ __item->size = pv.u.item.size; \
break; \
case 'b': \
*va_arg(ap, int8_t *) = (int8_t)pv.u.i; \
diff --git a/src/include/schema.h b/src/include/schema.h
index 8b2c7871a22..a3c2c32de2b 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -69,3 +69,10 @@ struct __wt_table {
F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
__wt_spin_unlock(session, &S2C(session)->schema_lock); \
} while (0)
+
+#define WT_WITH_SCHEMA_LOCK_OPT(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) \
+ (op); \
+ else \
+ WT_WITH_SCHEMA_LOCK(session, op); \
+} while (0)
diff --git a/src/include/txn.i b/src/include/txn.i
index e11d810316b..ce2e369e461 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -5,6 +5,9 @@
* See the file LICENSE for redistribution information.
*/
+static inline void __wt_txn_read_first(WT_SESSION_IMPL *session);
+static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
+
/*
* __wt_txn_getid --
* Get a transaction ID for a non-transactional operation.
@@ -34,6 +37,14 @@ __wt_txn_getid(WT_SESSION_IMPL *session)
id == WT_TXN_NONE || id == WT_TXN_ABORTED);
txn->id = id;
+
+ /*
+ * We allocated a new transaction ID for updates without an explicit
+ * transaction. To ensure that our previous updates are visible,
+ * update our transaction context (if required).
+ */
+ __wt_txn_read_last(session);
+ __wt_txn_read_first(session);
}
/*
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index c7cb00fb2ba..367b52a55cd 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -657,6 +657,12 @@ struct __wt_session {
* significant for applications wanting to maximize sequential data
* transfer from a storage device.,an integer between 512B and 512MB;
* default \c 1MB.}
+ * @config{lsm_bloom_bit_count, the number of bits used per item for LSM
+ * bloom filters..,an integer between 2 and 1000; default \c 8.}
+ * @config{lsm_bloom_hash_count, the number of hash values per item used
+ * for LSM bloom filters..,an integer between 2 and 100; default \c 4.}
+ * @config{lsm_chunk_size, the maximum size of the in-memory chunk of an
+ * LSM tree.,an integer between 512K and 500MB; default \c 2MB.}
* @config{prefix_compression, configure row-store format key prefix
* compression.,a boolean flag; default \c true.}
* @config{split_pct, the Btree page split size as a percentage of the
@@ -964,8 +970,8 @@ struct __wt_connection {
* given as a list\, such as
* <code>"verbose=[evictserver\,read]"</code>.,a list\, with values
* chosen from the following options: \c "block"\, \c "ckpt"\, \c
- * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c
- * "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c
+ * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "lsm"\,
+ * \c "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c
* "salvage"\, \c "verify"\, \c "write"; default empty.}
* @configend
* @errors
@@ -1160,9 +1166,9 @@ struct __wt_connection {
* @config{verbose, enable messages for various events. Options are given as a
* list\, such as <code>"verbose=[evictserver\,read]"</code>.,a list\, with
* values chosen from the following options: \c "block"\, \c "ckpt"\, \c
- * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "mutex"\, \c
- * "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c "verify"\, \c
- * "write"; default empty.}
+ * "evict"\, \c "evictserver"\, \c "fileops"\, \c "hazard"\, \c "lsm"\, \c
+ * "mutex"\, \c "read"\, \c "readserver"\, \c "reconcile"\, \c "salvage"\, \c
+ * "verify"\, \c "write"; default empty.}
* @configend
* Additionally, if a file named \c WiredTiger.config appears in the WiredTiger
* home directory, it is read for configuration values (see @ref config_file
@@ -1538,43 +1544,35 @@ struct __wt_data_source {
* @snippet ex_all.c WT_DATA_SOURCE create
*/
int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config);
+ const char *name, int exclusive, const char *config);
/*! Callback to drop an object.
*
* @snippet ex_all.c WT_DATA_SOURCE drop
*/
int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config);
+ const char *name, const char *cfg[]);
/*! Callback to initialize a cursor.
*
* @snippet ex_all.c WT_DATA_SOURCE open_cursor
*/
int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *obj, WT_CURSOR *old_cursor,
- const char *config, WT_CURSOR **new_cursor);
+ const char *obj, const char *cfg[], WT_CURSOR **new_cursor);
/*! Callback to rename an object.
*
- * @snippet ex_all.c WT_DATA_SOURCE sync
+ * @snippet ex_all.c WT_DATA_SOURCE rename
*/
int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *oldname, const char *newname, const char *config);
-
- /*! Callback to sync an object.
- *
- * @snippet ex_all.c WT_DATA_SOURCE sync
- */
- int (*sync)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config);
+ const char *oldname, const char *newname, const char *cfg[]);
/*! Callback to truncate an object.
*
* @snippet ex_all.c WT_DATA_SOURCE truncate
*/
int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
- const char *name, const char *config);
+ const char *name, const char *cfg[]);
};
/*!
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index d95b1dbef05..b0e58ea1682 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -103,6 +103,8 @@ struct __wt_cursor_dump;
typedef struct __wt_cursor_dump WT_CURSOR_DUMP;
struct __wt_cursor_index;
typedef struct __wt_cursor_index WT_CURSOR_INDEX;
+struct __wt_cursor_lsm;
+ typedef struct __wt_cursor_lsm WT_CURSOR_LSM;
struct __wt_cursor_stat;
typedef struct __wt_cursor_stat WT_CURSOR_STAT;
struct __wt_cursor_table;
@@ -127,6 +129,12 @@ struct __wt_insert;
typedef struct __wt_insert WT_INSERT;
struct __wt_insert_head;
typedef struct __wt_insert_head WT_INSERT_HEAD;
+struct __wt_lsm_chunk;
+ typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
+struct __wt_lsm_data_source;
+ typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
+struct __wt_lsm_tree;
+ typedef struct __wt_lsm_tree WT_LSM_TREE;
struct __wt_named_collator;
typedef struct __wt_named_collator WT_NAMED_COLLATOR;
struct __wt_named_compressor;
@@ -195,6 +203,7 @@ struct __wt_update;
#include "api.h"
#include "cursor.h"
+#include "lsm.h"
#include "meta.h"
#include "schema.h"
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
new file mode 100644
index 00000000000..10b84a317d4
--- /dev/null
+++ b/src/lsm/lsm_cursor.c
@@ -0,0 +1,891 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define FORALL_CURSORS(clsm, c, i) \
+ for ((i) = (clsm)->nchunks - 1; (i) >= 0; (i)--) \
+ if (((c) = (clsm)->cursors[i]) != NULL)
+
+#define WT_LSM_CMP(s, lsm_tree, k1, k2, cmp) \
+ (((lsm_tree)->collator == NULL) ? \
+ (((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \
+ (lsm_tree)->collator->compare((lsm_tree)->collator, \
+ &(s)->iface, (k1), (k2), &(cmp)))
+
+#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
+ WT_LSM_CMP(s, lsm_tree, &(c1)->key, &(c2)->key, cmp)
+
+/*
+ * LSM API enter/leave: check that the cursor is in sync with the tree.
+ */
+#define WT_LSM_ENTER(clsm, cursor, session, n) \
+ clsm = (WT_CURSOR_LSM *)cursor; \
+ CURSOR_API_CALL_NOCONF(cursor, session, n, NULL); \
+ WT_ERR(__clsm_enter(clsm))
+
+#define WT_LSM_END(clsm, session) \
+ API_END(session)
+
+static int __clsm_open_cursors(WT_CURSOR_LSM *);
+static int __clsm_search(WT_CURSOR *);
+
+static inline int
+__clsm_enter(WT_CURSOR_LSM *clsm)
+{
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
+ clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ WT_RET(__clsm_open_cursors(clsm));
+
+ return (0);
+}
+
+/*
+ * TODO: use something other than an empty value as a tombstone: we need
+ * to support empty values from the application.
+ */
+static WT_ITEM __lsm_tombstone = { "", 0, 0, NULL, 0 };
+
+#define WT_LSM_NEEDVALUE(c) do { \
+ WT_CURSOR_NEEDVALUE(c); \
+ if (__clsm_deleted(&(c)->value)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \
+} while (0)
+
+/*
+ * __clsm_deleted --
+ * Check whether the current value is a tombstone.
+ */
+static inline int
+__clsm_deleted(WT_ITEM *item)
+{
+ return (item->size == 0);
+}
+
+/*
+ * __clsm_close_cursors --
+ * Close all of the btree cursors currently open.
+ */
+static int
+__clsm_close_cursors(WT_CURSOR_LSM *clsm)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ int i;
+
+ if (clsm->cursors == NULL)
+ return (0);
+
+ /* Detach from our old primary. */
+ if (clsm->primary_chunk != NULL) {
+ WT_ATOMIC_SUB(clsm->primary_chunk->ncursor, 1);
+ clsm->primary_chunk = NULL;
+ }
+
+ FORALL_CURSORS(clsm, c, i) {
+ clsm->cursors[i] = NULL;
+ WT_RET(c->close(c));
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ clsm->blooms[i] = NULL;
+ WT_RET(__wt_bloom_close(bloom));
+ }
+ }
+
+ clsm->current = NULL;
+ return (0);
+}
+
+/*
+ * __clsm_open_cursors --
+ * Open cursors for the current set of files.
+ */
+static int
+__clsm_open_cursors(WT_CURSOR_LSM *clsm)
+{
+ WT_CURSOR *c, **cp;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ const char *ckpt_cfg[] = { "checkpoint=WiredTigerCheckpoint", NULL };
+ int i, nchunks;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+ lsm_tree = clsm->lsm_tree;
+ c = &clsm->iface;
+
+ /* Copy the key, so we don't lose the cursor position. */
+ if (F_ISSET(c, WT_CURSTD_KEY_SET)) {
+ if (c->key.data != c->key.mem)
+ WT_RET(__wt_buf_set(
+ session, &c->key, c->key.data, c->key.size));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+ }
+
+ WT_RET(__clsm_close_cursors(clsm));
+
+ __wt_spin_lock(session, &lsm_tree->lock);
+ /* Merge cursors have already figured out how many chunks they need. */
+ if (F_ISSET(clsm, WT_CLSM_MERGE))
+ nchunks = clsm->nchunks;
+ else
+ nchunks = lsm_tree->nchunks;
+
+ if (clsm->cursors == NULL || nchunks > clsm->nchunks) {
+ WT_ERR(__wt_realloc(session, NULL,
+ nchunks * sizeof(WT_BLOOM *), &clsm->blooms));
+ WT_ERR(__wt_realloc(session, NULL,
+ nchunks * sizeof(WT_CURSOR *), &clsm->cursors));
+ }
+ clsm->nchunks = nchunks;
+
+ for (i = 0, cp = clsm->cursors; i != clsm->nchunks; i++, cp++) {
+ /*
+ * Read from the checkpoint if the file has been written.
+ * Once all cursors switch, the in-memory tree can be evicted.
+ */
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_curfile_open(session,
+ chunk->uri, &clsm->iface,
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? ckpt_cfg : NULL, cp));
+ if (chunk->bloom_uri != NULL && !F_ISSET(clsm, WT_CLSM_MERGE))
+ WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
+ lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count,
+ c, &clsm->blooms[i]));
+
+ /* Child cursors always use overwrite and raw mode. */
+ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
+ }
+
+ /* The last chunk is our new primary. */
+ WT_ASSERT(session,
+ !F_ISSET(clsm, WT_CLSM_UPDATED) ||
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK));
+
+ clsm->primary_chunk = chunk;
+ WT_ATOMIC_ADD(clsm->primary_chunk->ncursor, 1);
+
+ /* Peek into the btree layer to track the in-memory size. */
+ if (lsm_tree->memsizep == NULL)
+ (void)__wt_btree_get_memsize(
+ session, session->btree, &lsm_tree->memsizep);
+
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+err: __wt_spin_unlock(session, &lsm_tree->lock);
+ return (ret);
+}
+
+/* __wt_clsm_init_merge --
+ * Initialize an LSM cursor for a (major) merge.
+ */
+int
+__wt_clsm_init_merge(WT_CURSOR *cursor, int nchunks)
+{
+ WT_CURSOR_LSM *clsm;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ F_SET(clsm, WT_CLSM_MERGE);
+ clsm->nchunks = nchunks;
+
+ return (__clsm_open_cursors(clsm));
+}
+
+/*
+ * __clsm_get_current --
+ * Find the smallest / largest of the cursors and copy its key/value.
+ */
+static int
+__clsm_get_current(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp)
+{
+ WT_CURSOR *c, *current;
+ int i;
+ int cmp, multiple;
+
+ current = NULL;
+ FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET))
+ continue;
+ if (current == NULL) {
+ cmp = (smallest ? -1 : 1);
+ } else
+ WT_RET(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, current, cmp));
+ if (smallest ? cmp < 0 : cmp > 0) {
+ current = c;
+ multiple = 0;
+ } else if (cmp == 0)
+ multiple = 1;
+ }
+
+ c = &clsm->iface;
+ if ((clsm->current = current) == NULL) {
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ return (WT_NOTFOUND);
+ }
+
+ if (multiple)
+ F_SET(clsm, WT_CLSM_MULTIPLE);
+ else
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+
+ WT_RET(current->get_key(current, &c->key));
+ WT_RET(current->get_value(current, &c->value));
+
+ if ((*deletedp = __clsm_deleted(&c->value)) == 0)
+ F_SET(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ return (0);
+}
+
+/*
+ * __clsm_compare --
+ * WT_CURSOR->compare implementation for the LSM cursor type.
+ */
+static int
+__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LSM *alsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int cmp;
+
+ /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
+ alsm = (WT_CURSOR_LSM *)a;
+ CURSOR_API_CALL_NOCONF(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare the keys.
+ */
+ if (strcmp(a->uri, b->uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+
+ WT_ERR(WT_CURSOR_NEEDKEY(a));
+ WT_ERR(WT_CURSOR_NEEDKEY(b));
+
+ WT_ERR(WT_LSM_CMP(session, alsm->lsm_tree, &a->key, &b->key, cmp));
+ *cmpp = cmp;
+
+err: API_END(session);
+ return (ret);
+}
+
+/*
+ * __clsm_next --
+ * WT_CURSOR->next method for the LSM cursor type.
+ */
+static int
+__clsm_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int check, cmp, deleted, i;
+
+ WT_LSM_ENTER(clsm, cursor, session, next);
+
+ /* If we aren't positioned for a forward scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->next(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp < 0)
+ ret = c->next(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_NEXT);
+ F_CLR(clsm, WT_CLSM_ITERATE_PREV);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * forward.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c,
+ WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the smallest cursor forward. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+
+ /* Find the cursor(s) with the smallest key. */
+ if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_prev --
+ * WT_CURSOR->prev method for the LSM cursor type.
+ */
+static int
+__clsm_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int check, cmp, deleted, i;
+
+ WT_LSM_ENTER(clsm, cursor, session, next);
+
+ /* If we aren't positioned for a reverse scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->prev(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp > 0)
+ ret = c->prev(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_PREV);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * backwards.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c,
+ WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the smallest cursor backwards. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+
+ /* Find the cursor(s) with the largest key. */
+ if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_reset --
+ * WT_CURSOR->reset method for the LSM cursor type.
+ */
+static int
+__clsm_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_LSM_ENTER(clsm, cursor, session, reset);
+ if ((c = clsm->current) != NULL) {
+ ret = c->reset(c);
+ clsm->current = NULL;
+ }
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_search --
+ * WT_CURSOR->search method for the LSM cursor type.
+ */
+static int
+__clsm_search(WT_CURSOR *cursor)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int i;
+
+ WT_LSM_ENTER(clsm, cursor, session, search);
+ WT_CURSOR_NEEDKEY(cursor);
+ FORALL_CURSORS(clsm, c, i) {
+ /* If there is a Bloom filter, see if we can skip the read. */
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ ret = __wt_bloom_get(bloom, &cursor->key);
+ if (ret == WT_NOTFOUND)
+ continue;
+ WT_RET(ret);
+ }
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search(c)) == 0) {
+ WT_ERR(c->get_key(c, &cursor->key));
+ WT_ERR(c->get_value(c, &cursor->value));
+ clsm->current = c;
+ if (__clsm_deleted(&cursor->value))
+ ret = WT_NOTFOUND;
+ goto done;
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+ ret = WT_NOTFOUND;
+
+done:
+err: WT_LSM_END(clsm, session);
+ if (ret == 0)
+ F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ return (ret);
+}
+
+/*
+ * __clsm_search_near --
+ * WT_CURSOR->search_near method for the LSM cursor type.
+ */
+static int
+__clsm_search_near(WT_CURSOR *cursor, int *exactp)
+{
+ WT_CURSOR *c, *larger, *smaller;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM v;
+ WT_SESSION_IMPL *session;
+ int cmp, deleted, i;
+
+ WT_LSM_ENTER(clsm, cursor, session, search_near);
+ WT_CURSOR_NEEDKEY(cursor);
+
+ /*
+ * search_near is somewhat fiddly: we can't just return a nearby key
+ * from the in-memory chunk because there could be a closer key on
+ * disk.
+ *
+ * As we search down the chunks, we stop as soon as we find an exact
+ * match. Otherwise, we maintain the smallest cursor larger than the
+ * search key and the largest cursor smaller than the search key. At
+ * the bottom, if one of those is set, we use it, otherwise we return
+ * WT_NOTFOUND.
+ */
+ larger = smaller = NULL;
+ FORALL_CURSORS(clsm, c, i) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
+ ret = 0;
+ continue;
+ } else if (ret != 0)
+ goto err;
+
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(&v);
+
+ if (cmp == 0 && !deleted) {
+ clsm->current = c;
+ *exactp = 0;
+ goto done;
+ }
+
+ /*
+ * If we land on a deleted item, try going forwards or
+ * backwards to find one that isn't deleted.
+ */
+ while (deleted && (ret = c->next(c)) == 0) {
+ cmp = 1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(&v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ while (deleted && (ret = c->prev(c)) == 0) {
+ cmp = -1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(&v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if (deleted)
+ continue;
+ if (cmp > 0) {
+ if (larger == NULL)
+ larger = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, larger, cmp));
+ if (cmp < 0)
+ larger = c;
+ }
+ } else {
+ if (smaller == NULL)
+ smaller = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, smaller, cmp));
+ if (cmp > 0)
+ smaller = c;
+ }
+ }
+ }
+
+ if (smaller != NULL) {
+ clsm->current = smaller;
+ *exactp = -1;
+ } else if (larger != NULL) {
+ clsm->current = larger;
+ *exactp = 1;
+ } else
+ ret = WT_NOTFOUND;
+
+done:
+err: WT_LSM_END(clsm, session);
+ if (ret == 0) {
+ c = clsm->current;
+ WT_TRET(c->get_key(c, &cursor->key));
+ WT_TRET(c->get_value(c, &cursor->value));
+ }
+ if (ret == 0)
+ F_SET(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ return (ret);
+}
+
+/*
+ * __clsm_put --
+ * Put an entry into the in-memory tree, trigger a file switch if
+ * necessary.
+ */
+static inline int
+__clsm_put(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, WT_ITEM *key, WT_ITEM *value)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *primary;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ uint32_t *memsizep;
+
+ lsm_tree = clsm->lsm_tree;
+
+ /*
+ * If this is the first update in this cursor, check if a new in-memory
+ * chunk is needed.
+ */
+ if (!F_ISSET(clsm, WT_CLSM_UPDATED)) {
+ __wt_spin_lock(session, &lsm_tree->lock);
+ if (clsm->dsk_gen == lsm_tree->dsk_gen)
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_switch(session, lsm_tree));
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ WT_RET(ret);
+ F_SET(clsm, WT_CLSM_UPDATED);
+
+ /* We changed the structure, or someone else did: update. */
+ WT_RET(__clsm_enter(clsm));
+ }
+
+ primary = clsm->cursors[clsm->nchunks - 1];
+ primary->set_key(primary, key);
+ primary->set_value(primary, value);
+ WT_RET(primary->insert(primary));
+
+ /*
+ * The count is in a shared structure, but it's only approximate, so
+ * don't worry about protecting access.
+ */
+ ++clsm->primary_chunk->count;
+
+ /*
+ * Set the position for future scans. If we were already positioned in
+ * a non-primary chunk, we may now have multiple cursors matching the
+ * key.
+ */
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+ clsm->current = primary;
+
+ if ((memsizep = lsm_tree->memsizep) != NULL &&
+ *memsizep > lsm_tree->threshold) {
+ /*
+ * Close our cursors: if we are the only open cursor, this
+ * means the btree handle is unlocked.
+ *
+ * XXX this is insufficient if multiple cursors are open, need
+ * to move some operations (such as clearing the
+ * "cache_resident" flag) into the worker thread.
+ */
+ btree = ((WT_CURSOR_BTREE *)primary)->btree;
+ WT_RET(__wt_btree_release_memsize(session, btree));
+
+ /*
+ * Take the LSM lock first: we can't acquire it while
+ * holding the schema lock, or we will deadlock.
+ */
+ __wt_spin_lock(session, &lsm_tree->lock);
+ /* Make sure we don't race. */
+ if (clsm->dsk_gen == lsm_tree->dsk_gen)
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_switch(session, lsm_tree));
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * __clsm_insert --
+ * WT_CURSOR->insert method for the LSM cursor type.
+ */
+static int
+__clsm_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_LSM_ENTER(clsm, cursor, session, insert);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_LSM_NEEDVALUE(cursor);
+
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (ret = __clsm_search(cursor)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ return (ret);
+ }
+
+ ret = __clsm_put(session, clsm, &cursor->key, &cursor->value);
+
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_update --
+ * WT_CURSOR->update method for the LSM cursor type.
+ */
+static int
+__clsm_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_LSM_ENTER(clsm, cursor, session, update);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_LSM_NEEDVALUE(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_search(cursor)) == 0)
+ ret = __clsm_put(session, clsm, &cursor->key, &cursor->value);
+
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_remove --
+ * WT_CURSOR->remove method for the LSM cursor type.
+ */
+static int
+__clsm_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_LSM_ENTER(clsm, cursor, session, remove);
+ WT_CURSOR_NEEDKEY(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_search(cursor)) == 0)
+ ret = __clsm_put(session, clsm, &cursor->key, &__lsm_tombstone);
+
+err: WT_LSM_END(clsm, session);
+
+ return (ret);
+}
+
+/*
+ * __clsm_close --
+ * WT_CURSOR->close method for the LSM cursor type.
+ */
+static int
+__clsm_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when
+ * closing, and the cursor may never have been used.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL_NOCONF(cursor, session, close, NULL);
+ WT_TRET(__clsm_close_cursors(clsm));
+ __wt_free(session, clsm->blooms);
+ __wt_free(session, clsm->cursors);
+ /* The WT_LSM_TREE owns the URI. */
+ cursor->uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+ API_END(session);
+
+ return (ret);
+}
+
+/*
+ * __wt_clsm_open --
+ * WT_SESSION->open_cursor method for LSM cursors.
+ */
+int
+__wt_clsm_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ static WT_CURSOR iface = {
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ __clsm_compare,
+ __clsm_next,
+ __clsm_prev,
+ __clsm_reset,
+ __clsm_search,
+ __clsm_search_near,
+ __clsm_insert,
+ __clsm_update,
+ __clsm_remove,
+ __clsm_close,
+ { NULL, NULL }, /* TAILQ_ENTRY q */
+ 0, /* recno key */
+ { 0 }, /* raw recno buffer */
+ { NULL, 0, 0, NULL, 0 },/* WT_ITEM key */
+ { NULL, 0, 0, NULL, 0 },/* WT_ITEM value */
+ 0, /* int saved_err */
+ 0 /* uint32_t flags */
+ };
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ clsm = NULL;
+
+ if (!WT_PREFIX_MATCH(uri, "lsm:"))
+ return (EINVAL);
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, uri, &lsm_tree));
+
+ WT_RET(__wt_calloc_def(session, 1, &clsm));
+
+ cursor = &clsm->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->uri = lsm_tree->name;
+ cursor->key_format = lsm_tree->key_format;
+ cursor->value_format = lsm_tree->value_format;
+
+ clsm->lsm_tree = lsm_tree;
+
+ /*
+ * The tree's dsk_gen starts at one, so starting the cursor on zero
+ * will force a call into open_cursors on the first operation.
+ */
+ clsm->dsk_gen = 0;
+
+ STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, NULL, cfg, cursorp));
+
+ /*
+ * LSM cursors default to overwrite: if no setting was supplied, turn
+ * it on.
+ */
+ if (cfg[1] != NULL || __wt_config_getones(
+ session, cfg[1], "overwrite", &cval) == WT_NOTFOUND)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+
+ if (0) {
+err: (void)__clsm_close(cursor);
+ }
+
+ return (ret);
+}
diff --git a/src/lsm/lsm_dsrc.c b/src/lsm/lsm_dsrc.c
new file mode 100644
index 00000000000..ed627ff9d4d
--- /dev/null
+++ b/src/lsm/lsm_dsrc.c
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_create --
+ * Implementation of the create operation for LSM trees.
+ */
+static int
+__lsm_create(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(dsrc);
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ return (__wt_lsm_tree_create(session, uri, exclusive, config));
+}
+
+/*
+ * __lsm_drop --
+ * Implementation of the drop operation for LSM trees.
+ */
+static int
+__lsm_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session,
+ const char *uri, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(dsrc);
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ return (__wt_lsm_tree_drop(session, uri, cfg));
+}
+
+/*
+ * __lsm_open_cursor --
+ * Implementation of the open_cursor operation for LSM trees.
+ */
+static int
+__lsm_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session,
+ const char *obj, const char *cfg[], WT_CURSOR **new_cursor)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ WT_UNUSED(dsrc);
+
+ return (__wt_clsm_open(session, obj, cfg, new_cursor));
+}
+
+/*
+ * __lsm_rename --
+ * Implementation of the rename operation for LSM trees.
+ */
+static int
+__lsm_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session,
+ const char *oldname, const char *newname, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(dsrc);
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (!WT_PREFIX_MATCH(newname, "lsm:"))
+ WT_RET_MSG(session, EINVAL,
+ "rename target type must match URI: %s to %s",
+ oldname, newname);
+
+ return (__wt_lsm_tree_rename(session, oldname, newname, cfg));
+}
+
+/*
+ * __lsm_truncate --
+ * Implementation of the truncate operation for LSM trees.
+ */
+static int
+__lsm_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *wt_session,
+ const char *uri, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(dsrc);
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ return (__wt_lsm_tree_truncate(session, uri, cfg));
+}
+
+/*
+ * __wt_lsm_init --
+ * Initialize LSM structures during wiredtiger_open.
+ */
+int
+__wt_lsm_init(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ static WT_LSM_DATA_SOURCE *lsm_dsrc;
+ WT_SESSION_IMPL *session;
+ static WT_DATA_SOURCE iface = {
+ __lsm_create,
+ __lsm_drop,
+ __lsm_open_cursor,
+ __lsm_rename,
+ __lsm_truncate
+ };
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session = conn->default_session;
+
+ WT_RET(__wt_calloc_def(session, 1, &lsm_dsrc));
+
+ lsm_dsrc->iface = iface;
+ WT_RET(
+ __wt_rwlock_alloc(session, "lsm data source", &lsm_dsrc->rwlock));
+ TAILQ_INIT(&lsm_dsrc->trees);
+
+ return (wt_conn->add_data_source(wt_conn,
+ "lsm:", &lsm_dsrc->iface, config));
+}
+
+/*
+ * __wt_lsm_cleanup --
+ * Clean up LSM structures during connection close.
+ */
+int
+__wt_lsm_cleanup(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session = conn->default_session;
+
+ return (__wt_lsm_tree_close_all(session));
+}
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
new file mode 100644
index 00000000000..db780193e1e
--- /dev/null
+++ b/src/lsm/lsm_merge.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_merge_update_tree --
+ * Merge a set of chunks and create a new one.
+ * Must be called with the LSM lock held.
+ */
+int
+__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, int nchunks, WT_LSM_CHUNK **chunkp)
+{
+ WT_LSM_CHUNK *chunk;
+ size_t chunk_sz;
+ int i, j;
+
+ /* Setup the array of obsolete chunks. */
+ if (nchunks > lsm_tree->old_avail) {
+ chunk_sz = sizeof(*lsm_tree->old_chunks);
+ WT_RET(__wt_realloc(session,
+ &lsm_tree->old_alloc,
+ chunk_sz * WT_MAX(10, lsm_tree->nold_chunks + 2 * nchunks),
+ &lsm_tree->old_chunks));
+ lsm_tree->old_avail += (int)(lsm_tree->old_alloc / chunk_sz) -
+ lsm_tree->nold_chunks;
+ lsm_tree->nold_chunks = (int)(lsm_tree->old_alloc / chunk_sz);
+ }
+ /* Copy entries one at a time, so we can reuse gaps in the list. */
+ for (i = j = 0; j < nchunks && i < lsm_tree->nold_chunks; i++) {
+ if (lsm_tree->old_chunks[i] == NULL) {
+ lsm_tree->old_chunks[i] = lsm_tree->chunk[j++];
+ --lsm_tree->old_avail;
+ }
+ }
+
+ WT_ASSERT(session, j == nchunks);
+
+ /* Update the current chunk list. */
+ memmove(lsm_tree->chunk + 1, lsm_tree->chunk + nchunks,
+ (lsm_tree->nchunks - nchunks) * sizeof(*lsm_tree->chunk));
+ lsm_tree->nchunks -= nchunks - 1;
+ memset(lsm_tree->chunk + lsm_tree->nchunks, 0,
+ (nchunks - 1) * sizeof(*lsm_tree->chunk));
+ WT_RET(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->chunk[0] = chunk;
+ lsm_tree->dsk_gen++;
+
+ *chunkp = chunk;
+ return (0);
+}
+
+/*
+ * __wt_lsm_major_merge --
+ * Merge a set of chunks of an LSM tree including the oldest.
+ */
+int
+__wt_lsm_major_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *src, *dest;
+ WT_DECL_ITEM(bbuf);
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_LSM_CHUNK *chunk;
+ WT_SESSION *wt_session;
+ const char *dest_uri;
+ uint64_t insert_count, record_count;
+ int dest_id, i, nchunks;
+
+ src = dest = NULL;
+ dest_uri = NULL;
+ bloom = NULL;
+
+ /*
+ * Take a copy of the latest chunk id. This value needs to be atomically
+ * read. We need a copy, since other threads may alter the chunk count
+ * while we are doing a merge.
+ */
+ nchunks = lsm_tree->nchunks - 1;
+
+ /*
+ * If there aren't any chunks to merge, or some of the chunks aren't
+ * yet written, we're done. A non-zero error indicates that the worker
+ * should assume there is no work to do: if there are unwritten chunks,
+ * the worker should write them immediately.
+ */
+ if (nchunks <= 1)
+ return (WT_NOTFOUND);
+
+ /*
+ * We have a limited number of hazard references, and we want to bound
+ * the amount of work in the merge.
+ *
+ * Use the lsm_tree lock to read the chunks (so no switches occur), but
+ * avoid holding it while the merge is in progress: that may take a
+ * long time.
+ */
+ nchunks = WT_MIN((int)S2C(session)->hazard_size / 2, nchunks);
+ __wt_spin_lock(session, &lsm_tree->lock);
+ while (nchunks > 1 &&
+ (!F_ISSET(lsm_tree->chunk[nchunks - 1], WT_LSM_CHUNK_ONDISK) ||
+ lsm_tree->chunk[nchunks - 1]->ncursor > 0))
+ --nchunks;
+ for (record_count = 0, i = 0; i < nchunks; i++)
+ record_count += lsm_tree->chunk[i]->count;
+ __wt_spin_unlock(session, &lsm_tree->lock);
+
+ if (nchunks <= 1)
+ return (0);
+
+ /* Allocate an ID for the merge. */
+ dest_id = WT_ATOMIC_ADD(lsm_tree->last, 1);
+
+ WT_VERBOSE_RET(session, lsm,
+ "Merging first %d chunks into %d\n", nchunks, dest_id);
+
+ if (record_count != 0) {
+ WT_RET(__wt_scr_alloc(session, 0, &bbuf));
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, dest_id, bbuf));
+
+ WT_ERR(__wt_bloom_create(session, bbuf->data,
+ NULL, record_count, lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count, &bloom));
+ }
+
+ /*
+ * Special setup for the merge cursor:
+ * first, reset to open the dependent cursors;
+ * then restrict the cursor to a specific number of chunks;
+ * then set MERGE so the cursor doesn't track updates to the tree.
+ */
+ wt_session = &session->iface;
+ WT_ERR(wt_session->open_cursor(
+ wt_session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, nchunks));
+
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_create_chunk(
+ session, lsm_tree, dest_id, &dest_uri));
+ WT_ERR(ret);
+ WT_ERR(wt_session->open_cursor(
+ wt_session, dest_uri, NULL, "raw,bulk", &dest));
+
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ WT_ERR(src->get_key(src, &key));
+ dest->set_key(dest, &key);
+ WT_ERR(src->get_value(src, &value));
+ dest->set_value(dest, &value);
+ WT_ERR(dest->insert(dest));
+ if (bloom != NULL)
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_VERBOSE_ERR(session, lsm,
+ "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+ record_count, insert_count);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* We've successfully created the new chunk. Now install it. */
+ WT_TRET(src->close(src));
+ WT_TRET(dest->close(dest));
+ src = dest = NULL;
+ if (bloom != NULL) {
+ WT_TRET(__wt_bloom_finalize(bloom));
+ WT_TRET(__wt_bloom_close(bloom));
+ bloom = NULL;
+ }
+ WT_ERR(ret);
+
+ __wt_spin_lock(session, &lsm_tree->lock);
+ ret = __wt_lsm_merge_update_tree(session, lsm_tree, nchunks, &chunk);
+
+ chunk->uri = dest_uri;
+ dest_uri = NULL;
+ if (bloom != NULL)
+ chunk->bloom_uri = __wt_buf_steal(session, bbuf, 0);
+ chunk->count = insert_count;
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ __wt_spin_unlock(session, &lsm_tree->lock);
+
+err: if (src != NULL)
+ WT_TRET(src->close(src));
+ if (dest != NULL)
+ WT_TRET(dest->close(dest));
+ if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ __wt_scr_free(&bbuf);
+ __wt_free(session, dest_uri);
+ if (ret != 0)
+ WT_VERBOSE_VOID(session, lsm,
+ "Merge failed with %s\n", wiredtiger_strerror(ret));
+ return (ret);
+}
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
new file mode 100644
index 00000000000..d731b5356fe
--- /dev/null
+++ b/src/lsm/lsm_meta.c
@@ -0,0 +1,173 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_meta_read --
+ * Read the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG cparser, lparser;
+ WT_CONFIG_ITEM ck, cv, lk, lv;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ const char *config;
+ int nchunks;
+ size_t chunk_sz, alloc;
+
+ WT_RET(__wt_metadata_read(session, lsm_tree->name, &config));
+ WT_ERR(__wt_config_init(session, &cparser, config));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->file_config);
+ /* Don't include the brackets. */
+ WT_ERR(__wt_strndup(session,
+ cv.str + 1, cv.len - 2, &lsm_tree->file_config));
+ } else if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->key_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->key_format));
+ } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->value_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->value_format));
+ } else if (WT_STRING_MATCH(
+ "lsm_bloom_bit_count", ck.str, ck.len))
+ lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH(
+ "lsm_bloom_hash_count", ck.str, ck.len))
+ lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("threshold", ck.str, ck.len))
+ lsm_tree->threshold = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("last", ck.str, ck.len))
+ lsm_tree->last = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ chunk_sz = sizeof(*lsm_tree->chunk);
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+ WT_ERR(__wt_strndup(session,
+ lv.str, lv.len, &chunk->bloom_uri));
+ continue;
+ }
+ if (WT_STRING_MATCH("count", lk.str, lk.len)) {
+ chunk->count = lv.val;
+ continue;
+ }
+ if ((nchunks + 1) * chunk_sz >
+ lsm_tree->chunk_alloc)
+ WT_ERR(__wt_realloc(session,
+ &lsm_tree->chunk_alloc,
+ WT_MAX(10 * chunk_sz,
+ 2 * lsm_tree->chunk_alloc),
+ &lsm_tree->chunk));
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->chunk[nchunks++] = chunk;
+ WT_ERR(__wt_strndup(session,
+ lk.str, lk.len, &chunk->uri));
+ chunk->flags = WT_LSM_CHUNK_ONDISK;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nchunks = nchunks;
+ } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ chunk_sz = sizeof(*lsm_tree->old_chunks);
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if ((nchunks + 1) * chunk_sz >
+ lsm_tree->old_avail * chunk_sz) {
+ alloc = lsm_tree->old_alloc;
+ WT_ERR(__wt_realloc(session,
+ &lsm_tree->old_alloc,
+ chunk_sz * WT_MAX(10,
+ lsm_tree->nold_chunks +
+ 2 * nchunks),
+ &lsm_tree->old_chunks));
+ lsm_tree->nold_chunks = (int)
+ (lsm_tree->old_alloc / chunk_sz);
+ lsm_tree->old_avail += (int)
+ ((lsm_tree->old_alloc - alloc) /
+ chunk_sz);
+ }
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->old_chunks[nchunks++] = chunk;
+ WT_ERR(__wt_strndup(session,
+ lk.str, lk.len, &chunk->uri));
+ chunk->flags = WT_LSM_CHUNK_ONDISK;
+ --lsm_tree->old_avail;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nold_chunks = nchunks;
+ } else
+ WT_ERR(__wt_illegal_value(session, "LSM metadata"));
+
+ /* TODO: collator */
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_meta_write --
+ * Write the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ int first, i;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "file_config=(%s),key_format=%s,value_format=%s",
+ lsm_tree->file_config,
+ lsm_tree->key_format, lsm_tree->value_format));
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",last=%" PRIu32 ",threshold=%" PRIu64
+ ",lsm_bloom_bit_count=%" PRIu32 ",lsm_bloom_hash_count=%" PRIu32,
+ lsm_tree->last, (uint64_t)lsm_tree->threshold,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (i > 0)
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+ if (chunk->bloom_uri != NULL)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
+ if (chunk->count != 0)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",count=%" PRIu64, chunk->count));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
+ first = 1;
+ for (i = 0; i < (int)lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ if (chunk == NULL)
+ continue;
+ if (first)
+ first = 0;
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ WT_ERR(__wt_metadata_update(session, lsm_tree->name, buf->data));
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
new file mode 100644
index 00000000000..ca3f7a28c6b
--- /dev/null
+++ b/src/lsm/lsm_tree.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_tree_discard --
+ * Free an LSM tree structure.
+ */
+static void
+__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_CHUNK *chunk;
+ int i;
+
+ TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+ __wt_spin_destroy(session, &lsm_tree->lock);
+
+ __wt_free(session, lsm_tree->name);
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ if ((chunk = lsm_tree->chunk[i]) == NULL)
+ continue;
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->chunk);
+
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->old_chunks);
+
+ __wt_free(session, lsm_tree);
+}
+
+/*
+ * __lsm_tree_close --
+ * Close an LSM tree structure.
+ */
+static int
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
+ F_CLR(lsm_tree, WT_LSM_TREE_OPEN);
+ WT_TRET(__wt_thread_join(lsm_tree->worker_tid));
+ }
+
+ /*
+ * Close the session and free its hazard array (necessary because
+ * we set WT_SESSION_INTERNAL to simplify shutdown ordering.
+ *
+ * Do this in the main thread to avoid deadlocks.
+ */
+ if (lsm_tree->worker_session != NULL) {
+ F_SET(lsm_tree->worker_session,
+ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ wt_session = &lsm_tree->worker_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ /*
+ * This is safe after the close because session handles are
+ * not freed, but are managed by the connection.
+ */
+ __wt_free(NULL, lsm_tree->worker_session->hazard);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_close_all --
+ * Close an LSM tree structure.
+ */
+int
+__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) {
+ WT_TRET(__lsm_tree_close(session, lsm_tree));
+ __lsm_tree_discard(session, lsm_tree);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_bloom_name --
+ * Get the URI of the Bloom filter for a given chunk.
+ */
+int
+__wt_lsm_tree_bloom_name(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, WT_ITEM *buf)
+{
+ WT_RET(__wt_buf_fmt(session, buf, "file:%s-%06d.bf",
+ lsm_tree->filename, i));
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_chunk_name --
+ * Get the URI of the file for a given chunk.
+ */
+int
+__wt_lsm_tree_chunk_name(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, WT_ITEM *buf)
+{
+ WT_RET(__wt_buf_fmt(session, buf, "file:%s-%06d.lsm",
+ lsm_tree->filename, i));
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_create_chunk --
+ * Create a chunk of an LSM tree.
+ */
+int
+__wt_lsm_tree_create_chunk(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int i, const char **urip)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf));
+ WT_ERR(__wt_schema_create(session,
+ buf->data, lsm_tree->file_config));
+ *urip = __wt_buf_steal(session, buf, NULL);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_start_worker --
+ * Start the worker thread for an LSM tree.
+ */
+static int
+__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONNECTION *wt_conn;
+ WT_SESSION *wt_session;
+
+ wt_conn = &S2C(session)->iface;
+ WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session));
+ lsm_tree->worker_session = (WT_SESSION_IMPL *)wt_session;
+ F_SET(lsm_tree->worker_session, WT_SESSION_INTERNAL);
+
+ WT_RET(__wt_thread_create(
+ &lsm_tree->worker_tid, __wt_lsm_worker, lsm_tree));
+ F_SET(lsm_tree, WT_LSM_TREE_OPEN);
+
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_create --
+ * Create an LSM tree structure for the given name.
+ */
+int
+__wt_lsm_tree_create(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
+
+ /* If the tree is open, it already exists. */
+ if ((ret = __wt_lsm_tree_get(session, uri, &lsm_tree)) == 0)
+ return (exclusive ? EEXIST : 0);
+ WT_RET_NOTFOUND_OK(ret);
+
+ /* If the tree has metadata, it already exists. */
+ if (__wt_metadata_read(session, uri, &config) == 0) {
+ __wt_free(session, config);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * XXX this call should just insert the metadata: most of this should
+ * move to __wt_lsm_tree_open.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ __wt_spin_init(session, &lsm_tree->lock);
+ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
+
+ WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+
+ WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len,
+ &lsm_tree->key_format));
+ WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len,
+ &lsm_tree->value_format));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm_chunk_size", &cval));
+ lsm_tree->threshold = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_bit_count", &cval));
+ lsm_tree->bloom_bit_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_hash_count", &cval));
+ lsm_tree->bloom_hash_count = (uint32_t)cval.val;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "%s,key_format=u,value_format=u", config));
+ lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);
+
+ /* Create the initial chunk. */
+ WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
+
+ /* XXX This should definitely only happen when opening the tree. */
+ WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
+
+ if (0) {
+err: __lsm_tree_discard(session, lsm_tree);
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_open --
+ * Open an LSM tree structure.
+ */
+static int
+__lsm_tree_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ /* Try to open the tree. */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ __wt_spin_init(session, &lsm_tree->lock);
+ WT_ERR(__wt_strdup(session, uri, &lsm_tree->name));
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
+
+ WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
+
+ /* Set the generation number so cursors are opened on first usage. */
+ lsm_tree->dsk_gen = 1;
+
+ WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
+ *treep = lsm_tree;
+
+ if (0) {
+err: __lsm_tree_discard(session, lsm_tree);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_get --
+ * get an LSM tree structure for the given name.
+ */
+int
+__wt_lsm_tree_get(
+ WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /*
+ * If we don't already hold the schema lock, get it now so that we
+ * can find and/or open the handle.
+ */
+ WT_WITH_SCHEMA_LOCK_OPT(session,
+ ret = __lsm_tree_open(session, uri, treep));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_switch --
+ * Switch to a new in-memory tree.
+ */
+int
+__wt_lsm_tree_switch(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+
+ WT_VERBOSE_RET(session, lsm,
+ "Tree switch to: %d because %d > %d", lsm_tree->last + 1,
+ (lsm_tree->memsizep == NULL ? 0 : (int)*lsm_tree->memsizep),
+ (int)lsm_tree->threshold);
+
+ lsm_tree->memsizep = NULL;
+
+ if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
+ lsm_tree->chunk_alloc)
+ WT_ERR(__wt_realloc(session,
+ &lsm_tree->chunk_alloc,
+ WT_MAX(10 * sizeof(*lsm_tree->chunk),
+ 2 * lsm_tree->chunk_alloc),
+ &lsm_tree->chunk));
+
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
+ WT_ERR(__wt_lsm_tree_create_chunk(session,
+ lsm_tree, WT_ATOMIC_ADD(lsm_tree->last, 1),
+ &chunk->uri));
+
+ ++lsm_tree->dsk_gen;
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+err: /* TODO: mark lsm_tree bad on error(?) */
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_drop --
+ * Drop an LSM tree.
+ */
+int
+__wt_lsm_tree_drop(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int i;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_RET(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_RET(__wt_spin_trylock(session, &lsm_tree->lock));
+
+ /* Drop the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (chunk->bloom_uri != NULL)
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+ /* Drop any chunks on the obsolete list. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (chunk->bloom_uri != NULL)
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ WT_ERR(__wt_metadata_remove(session, name));
+
+ if (0) {
+err: __wt_spin_unlock(session, &lsm_tree->lock);
+ }
+ __lsm_tree_discard(session, lsm_tree);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_rename --
+ * Rename an LSM tree.
+ */
+int
+__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
+ const char *oldname, const char *newname, const char *cfg[])
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ const char *old;
+ int i;
+
+ old = NULL;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, oldname, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_RET(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_RET(__wt_spin_trylock(session, &lsm_tree->lock));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Set the new name. */
+ __wt_free(session, lsm_tree->name);
+ WT_ERR(__wt_strdup(session, newname, &lsm_tree->name));
+
+ /* Rename the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ old = chunk->uri;
+ chunk->uri = NULL;
+
+ WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf));
+ chunk->uri = __wt_buf_steal(session, buf, NULL);
+ WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+
+ if ((old = chunk->bloom_uri) != NULL) {
+ chunk->bloom_uri = NULL;
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, i, buf));
+ chunk->bloom_uri = __wt_buf_steal(session, buf, NULL);
+ WT_ERR(__wt_schema_rename(
+ session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+ }
+ }
+
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ WT_ERR(__wt_metadata_remove(session, oldname));
+
+ if (0) {
+err: __wt_spin_unlock(session, &lsm_tree->lock);
+ }
+ __wt_scr_free(&buf);
+ if (old != NULL)
+ __wt_free(session, old);
+ __lsm_tree_discard(session, lsm_tree);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_truncate --
+ * Truncate an LSM tree.
+ */
+int
+__wt_lsm_tree_truncate(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+
+ WT_UNUSED(cfg);
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_RET(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_RET(__wt_spin_trylock(session, &lsm_tree->lock));
+
+ /* Mark all chunks old. */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, lsm_tree->nchunks, &chunk));
+
+ /* Create the new chunk. */
+ WT_ERR(__wt_lsm_tree_create_chunk(
+ session, lsm_tree, WT_ATOMIC_ADD(lsm_tree->last, 1), &chunk->uri));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
+ __wt_spin_unlock(session, &lsm_tree->lock);
+
+ if (0) {
+err: __wt_spin_unlock(session, &lsm_tree->lock);
+ __lsm_tree_discard(session, lsm_tree);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_worker --
+ * Run a schema worker operation on each level of a LSM tree.
+ */
+int
+__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*func)(WT_SESSION_IMPL *, const char *[]),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int i;
+
+ WT_RET(__wt_lsm_tree_get(session, uri, &lsm_tree));
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (func == __wt_checkpoint &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ continue;
+ WT_RET(__wt_schema_worker(
+ session, chunk->uri, func, cfg, open_flags));
+ }
+ return (0);
+}
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
new file mode 100644
index 00000000000..19eb2352ff8
--- /dev/null
+++ b/src/lsm/lsm_worker.c
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int
+__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+
+/*
+ * __wt_lsm_worker --
+ * The worker thread for an LSM tree, responsible for writing in-memory
+ * trees to disk and merging on-disk trees.
+ */
+void *
+__wt_lsm_worker(void *arg)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk, **chunk_array;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ const char *cfg[] = { "name=,drop=", NULL };
+ size_t chunk_alloc;
+ int i, nchunks, progress;
+
+ lsm_tree = arg;
+ session = lsm_tree->worker_session;
+
+ chunk_array = NULL;
+ chunk_alloc = 0;
+
+ while (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
+ progress = 0;
+
+ __wt_spin_lock(session, &lsm_tree->lock);
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) {
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ break;
+ }
+ /*
+ * Take a copy of the current state of the LSM tree. Skip
+ * the last chunk - since it is the active one and not relevant
+ * to merge operations.
+ */
+ for (nchunks = lsm_tree->nchunks - 1;
+ nchunks > 0 && lsm_tree->chunk[nchunks - 1]->ncursor > 0;
+ --nchunks)
+ ;
+ if (chunk_alloc < lsm_tree->chunk_alloc)
+ ret = __wt_realloc(session,
+ &chunk_alloc, lsm_tree->chunk_alloc,
+ &chunk_array);
+ if (ret == 0 && nchunks > 0)
+ memcpy(chunk_array, lsm_tree->chunk,
+ nchunks * sizeof(*lsm_tree->chunk));
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ WT_ERR(ret);
+
+ /*
+ * Write checkpoints in all completed files, then find
+ * something to merge.
+ */
+ for (i = 0; i < nchunks; i++) {
+ chunk = chunk_array[i];
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
+ chunk->ncursor > 0)
+ continue;
+
+ /* XXX durability: need to checkpoint the metadata? */
+ /*
+ * NOTE: we pass a non-NULL config, because otherwise
+ * __wt_checkpoint thinks we're closing the file.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_schema_worker(session, chunk->uri,
+ __wt_checkpoint, cfg, 0));
+ if (ret == 0) {
+ __wt_spin_lock(session, &lsm_tree->lock);
+ F_SET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK);
+ lsm_tree->dsk_gen++;
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ progress = 1;
+ }
+ }
+
+ /* Clear any state from previous worker thread iterations. */
+ session->btree = NULL;
+
+ if (nchunks > 0 && __wt_lsm_major_merge(session, lsm_tree) == 0)
+ progress = 1;
+
+ /* Clear any state from previous worker thread iterations. */
+ session->btree = NULL;
+
+ if (lsm_tree->nold_chunks != lsm_tree->old_avail &&
+ __lsm_free_chunks(session, lsm_tree) == 0)
+ progress = 1;
+
+ if (!progress)
+ __wt_sleep(0, 10);
+ }
+
+err: __wt_free(session, chunk_array);
+
+ return (NULL);
+}
+
+static int
+__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ const char *drop_cfg[] = { NULL };
+ int found, i;
+
+ found = 0;
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ if (!found) {
+ found = 1;
+ /* TODO: Do we need the lsm_tree lock for all drops? */
+ __wt_spin_lock(session, &lsm_tree->lock);
+ }
+ if (chunk->bloom_uri != NULL) {
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop(
+ session, chunk->bloom_uri, drop_cfg));
+ /*
+ * An EBUSY return is acceptable - a cursor may still
+ * be positioned on this old chunk.
+ */
+ if (ret == 0) {
+ __wt_free(session, chunk->bloom_uri);
+ chunk->bloom_uri = NULL;
+ } else if (ret != EBUSY)
+ goto err;
+ }
+ if (chunk->uri != NULL) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_schema_drop(session, chunk->uri, drop_cfg));
+ /*
+ * An EBUSY return is acceptable - a cursor may still
+ * be positioned on this old chunk.
+ */
+ if (ret == 0) {
+ __wt_free(session, chunk->uri);
+ chunk->uri = NULL;
+ } else if (ret != EBUSY)
+ goto err;
+ }
+
+ if (chunk->uri == NULL && chunk->bloom_uri == NULL) {
+ __wt_free(session, lsm_tree->old_chunks[i]);
+ ++lsm_tree->old_avail;
+ }
+ }
+ if (found) {
+err: ret = __wt_lsm_meta_write(session, lsm_tree);
+ __wt_spin_unlock(session, &lsm_tree->lock);
+ }
+ /* Returning non-zero means there is no work to do. */
+ return (found ? 0 : WT_NOTFOUND);
+}
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index eefd36333e6..8777b2d3e32 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -11,14 +11,13 @@ static int
__create_file(WT_SESSION_IMPL *session,
const char *uri, int exclusive, const char *config)
{
- WT_ITEM *val;
+ WT_DECL_ITEM(val);
WT_DECL_RET;
int is_metadata;
const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
const char *filecfg[4] = API_CONF_DEFAULTS(file, meta, config);
const char *filename, *treeconf;
- val = NULL;
treeconf = NULL;
is_metadata = strcmp(uri, WT_METADATA_URI) == 0;
@@ -32,7 +31,7 @@ __create_file(WT_SESSION_IMPL *session,
__wt_metadata_read(session, uri, &treeconf)) != WT_NOTFOUND) {
if (exclusive)
WT_TRET(EEXIST);
- return (ret);
+ goto err;
}
/* Create the file. */
@@ -67,10 +66,10 @@ __create_file(WT_SESSION_IMPL *session,
*/
WT_ERR(__wt_conn_btree_get(
session, uri, NULL, cfg, WT_BTREE_EXCLUSIVE));
- if (!WT_META_TRACKING(session))
- WT_ERR(__wt_session_release_btree(session));
- else
+ if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_handle_lock(session, 1));
+ else
+ WT_ERR(__wt_session_release_btree(session));
err: __wt_scr_free(&val);
__wt_free(session, treeconf);
@@ -399,7 +398,8 @@ __wt_schema_create(
else if (WT_PREFIX_MATCH(name, "table:"))
ret = __create_table(session, name, exclusive, config);
else if ((ret = __wt_schema_get_source(session, name, &dsrc)) == 0)
- ret = dsrc->create(dsrc, &session->iface, name, config);
+ ret = dsrc->create(dsrc, &session->iface,
+ name, exclusive, config);
session->btree = NULL;
WT_TRET(__wt_meta_track_off(session, ret != 0));
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index 4ef8e8d0220..f16b76286e3 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -250,7 +250,7 @@ __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
else if (WT_PREFIX_MATCH(uri, "table:"))
ret = __drop_table(session, uri, force, cfg);
else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
- ret = dsrc->drop(dsrc, &session->iface, uri, cfg[1]);
+ ret = dsrc->drop(dsrc, &session->iface, uri, cfg);
/*
* Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index 9115e6cdc4b..f052be06fdc 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -225,7 +225,7 @@ __wt_schema_rename(WT_SESSION_IMPL *session,
ret = __rename_table(session, oldname, newname);
} else if ((ret = __wt_schema_get_source(session, oldname, &dsrc)) == 0)
ret = dsrc->rename(dsrc,
- &session->iface, oldname, newname, cfg[1]);
+ &session->iface, oldname, newname, cfg);
WT_TRET(__wt_meta_track_off(session, ret != 0));
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index c5492ca69cd..96066be25c7 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -104,7 +104,7 @@ __wt_schema_truncate(
else if (WT_PREFIX_SKIP(tablename, "table:"))
ret = __truncate_table(session, tablename);
else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
- ret = dsrc->truncate(dsrc, &session->iface, uri, cfg[1]);
+ ret = dsrc->truncate(dsrc, &session->iface, uri, cfg);
/* If we didn't find a metadata entry, map that error to ENOENT. */
return (ret == WT_NOTFOUND ? ENOENT : ret);
diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c
index 934781c9d7d..8c5b2c74527 100644
--- a/src/schema/schema_util.c
+++ b/src/schema/schema_util.c
@@ -35,27 +35,24 @@ int
__wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri)
{
const char *name, *sep;
+ int skipped;
/*
* Check if name is somewhere in the WiredTiger name space: it would be
- * "bad" if the application truncated the metadata file. We get passed
- * both objects and simple strings, skip any leading URI prefix.
+ * "bad" if the application truncated the metadata file. Skip any
+ * leading URI prefix, check and then skip over a table name.
*/
name = uri;
- if (WT_PREFIX_SKIP(name, "colgroup:") ||
- WT_PREFIX_SKIP(name, "index:")) {
- /* These URIs normally reference a table name. */
- if ((sep = strchr(name, ':')) != NULL)
- name = sep + 1;
- } else if (!WT_PREFIX_SKIP(name, "table:") &&
- !WT_PREFIX_SKIP(name, "file:"))
- return (__wt_bad_object_type(session, uri));
+ for (skipped = 0; skipped < 2; skipped++) {
+ if ((sep = strchr(name, ':')) == NULL)
+ break;
- if (WT_PREFIX_MATCH(name, "WiredTiger"))
- WT_RET_MSG(session, EINVAL,
- "%s: the \"WiredTiger\" name space may not be used by "
- "applications",
- name);
+ name = sep + 1;
+ if (WT_PREFIX_MATCH(name, "WiredTiger"))
+ WT_RET_MSG(session, EINVAL,
+ "%s: the \"WiredTiger\" name space may not be "
+ "used by applications", name);
+ }
/*
* Disallow JSON quoting characters -- the config string parsing code
@@ -67,5 +64,6 @@ __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri)
"%s: WiredTiger objects should not include grouping "
"characters in their names",
name);
+
return (0);
}
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index ec9566d72ac..8bb62999d7b 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -38,6 +38,9 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
session, uri, strlen(uri), cfg, open_flags));
ret = func(session, cfg);
WT_TRET(__wt_session_release_btree(session));
+ } else if (WT_PREFIX_SKIP(tablename, "lsm:")) {
+ WT_RET(__wt_lsm_tree_worker(
+ session, uri, func, cfg, open_flags));
} else if (WT_PREFIX_SKIP(tablename, "table:")) {
WT_RET(__wt_schema_get_table(session,
tablename, strlen(tablename), 0, &table));
diff --git a/src/session/session_api.c b/src/session/session_api.c
index f1e68c471b6..4d408f2aa37 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -25,17 +25,35 @@ __session_reset_cursors(WT_SESSION_IMPL *session)
}
/*
+ * __session_close_cache --
+ * Close any cached handles in a session. Called holding the schema lock.
+ */
+static int
+__session_close_cache(WT_SESSION_IMPL *session)
+{
+ WT_BTREE_SESSION *btree_session;
+ WT_DECL_RET;
+
+ while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL)
+ WT_TRET(__wt_session_discard_btree(session, btree_session));
+
+ WT_TRET(__wt_schema_close_tables(session));
+
+ return (ret);
+}
+
+/*
* __session_close --
* WT_SESSION->close method.
*/
static int
__session_close(WT_SESSION *wt_session, const char *config)
{
- WT_BTREE_SESSION *btree_session;
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int tret;
conn = (WT_CONNECTION_IMPL *)wt_session->connection;
session = (WT_SESSION_IMPL *)wt_session;
@@ -53,17 +71,15 @@ __session_close(WT_SESSION *wt_session, const char *config)
WT_ASSERT(session, session->ncursors == 0);
- /* Acquire the schema lock: we may be closing btree handles. */
- __wt_spin_lock(session, &S2C(session)->schema_lock);
- F_SET(session, WT_SESSION_SCHEMA_LOCKED);
-
- while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL)
- WT_TRET(__wt_session_discard_btree(session, btree_session));
-
- WT_TRET(__wt_schema_close_tables(session));
-
- F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
- __wt_spin_unlock(session, &S2C(session)->schema_lock);
+ /*
+ * Acquire the schema lock: we may be closing btree handles.
+ *
+ * Note that in some special cases, the schema may already be locked
+ * (e.g., if this session is an LSM tree worker and the tree is being
+ * dropped).
+ */
+ WT_WITH_SCHEMA_LOCK_OPT(session, tret = __session_close_cache(session));
+ WT_TRET(tret);
/* Discard metadata tracking. */
__wt_meta_track_discard(session);
@@ -144,7 +160,7 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config)
WT_ERR_MSG(session, EINVAL,
"Database not configured for transactions");
- session->isolation =
+ session->isolation = session->txn.isolation =
WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
TXN_ISO_SNAPSHOT :
WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
@@ -162,6 +178,7 @@ static int
__session_open_cursor(WT_SESSION *wt_session,
const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
{
+ WT_DATA_SOURCE *dsrc;
WT_DECL_RET;
WT_SESSION_IMPL *session;
@@ -178,6 +195,7 @@ __session_open_cursor(WT_SESSION *wt_session,
if (WT_PREFIX_MATCH(uri, "colgroup:") ||
WT_PREFIX_MATCH(uri, "index:") ||
WT_PREFIX_MATCH(uri, "file:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
WT_PREFIX_MATCH(uri, "table:"))
ret = __wt_cursor_dup(session, to_dup, config, cursorp);
else
@@ -196,8 +214,9 @@ __session_open_cursor(WT_SESSION *wt_session,
ret = __wt_curstat_open(session, uri, cfg, cursorp);
else if (WT_PREFIX_MATCH(uri, "table:"))
ret = __wt_curtable_open(session, uri, cfg, cursorp);
- else
- ret = __wt_bad_object_type(session, uri);
+ else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
+ ret = dsrc->open_cursor(dsrc, &session->iface,
+ uri, cfg, cursorp);
err: API_END_NOTFOUND_MAP(session, ret);
}
diff --git a/src/session/session_btree.c b/src/session/session_btree.c
index 6c673474eac..0dc733a7ce8 100644
--- a/src/session/session_btree.c
+++ b/src/session/session_btree.c
@@ -191,7 +191,6 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
WT_BTREE *btree;
WT_BTREE_SESSION *btree_session;
WT_DECL_RET;
- int needlock;
btree = NULL;
@@ -228,16 +227,8 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
* If we don't already hold the schema lock, get it now so that we
* can find and/or open the handle.
*/
- needlock = !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED);
- if (needlock) {
- __wt_spin_lock(session, &S2C(session)->schema_lock);
- F_SET(session, WT_SESSION_SCHEMA_LOCKED);
- }
- ret = __wt_conn_btree_get(session, uri, checkpoint, cfg, flags);
- if (needlock) {
- F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
- __wt_spin_unlock(session, &S2C(session)->schema_lock);
- }
+ WT_WITH_SCHEMA_LOCK_OPT(session,
+ ret = __wt_conn_btree_get(session, uri, checkpoint, cfg, flags));
WT_RET(ret);
if (btree_session == NULL)
diff --git a/src/support/hazard.c b/src/support/hazard.c
index c5d5b02e7fd..ded1598b587 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -30,8 +30,8 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
conn = S2C(session);
*busyp = 0;
- /* If a file cannot be evicted, hazard references aren't required. */
- if (btree->cache_resident)
+ /* If a file can never be evicted, hazard references aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
return (0);
/*
@@ -120,8 +120,8 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
btree = session->btree;
conn = S2C(session);
- /* If a file cannot be evicted, hazard references aren't required. */
- if (btree->cache_resident)
+ /* If a file can never be evicted, hazard references aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
return;
/*
diff --git a/src/support/scratch.c b/src/support/scratch.c
index 57b779bd5c2..f9cff2d0343 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -249,7 +249,7 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
*/
int
__wt_scr_alloc_func(WT_SESSION_IMPL *session,
- uint32_t size, WT_ITEM **scratchp
+ size_t size, WT_ITEM **scratchp
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
diff --git a/src/utilities/util.h b/src/utilities/util.h
index 7035ebdb18b..a736fdfcf85 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -10,11 +10,13 @@
#define UTIL_COLGROUP_OK 0x01 /* colgroup: prefix OK */
#define UTIL_FILE_OK 0x02 /* file: prefix OK */
#define UTIL_INDEX_OK 0x04 /* index: prefix OK */
-#define UTIL_TABLE_OK 0x08 /* table: prefix OK */
+#define UTIL_LSM_OK 0x04 /* lsm: prefix OK */
+#define UTIL_TABLE_OK 0x10 /* table: prefix OK */
/* all known prefixes OK */
#define UTIL_ALL_OK \
- (UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK | UTIL_TABLE_OK)
+ (UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK |\
+ UTIL_LSM_OK | UTIL_TABLE_OK)
typedef struct {
void *mem; /* Managed memory chunk */
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index ffb0b2b7743..073feb92399 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -85,8 +85,8 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((name =
- util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((name = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
goto err;
if (dump_config(session, name, hex) != 0)
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index 06c061ef87f..a535270bc3f 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -39,8 +39,8 @@ util_list(WT_SESSION *session, int argc, char *argv[])
case 0:
break;
case 1:
- if ((name = util_name(
- *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((name = util_name(*argv, "table",
+ UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
break;
default:
@@ -109,14 +109,9 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
return (util_cerr("metadata", "get_key", ret));
/*
- * If no object specified, show top-level objects (files and
- * tables).
+ * If a name is specified, only show objects that match.
*/
- if (name == NULL) {
- if (!WT_PREFIX_MATCH(key, "file:") &&
- !WT_PREFIX_MATCH(key, "table:"))
- continue;
- } else {
+ if (name != NULL) {
if (!WT_PREFIX_MATCH(key, name))
continue;
found = 1;
diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c
index 444cdc64c22..30f07a96dfd 100644
--- a/src/utilities/util_load.c
+++ b/src/utilities/util_load.c
@@ -95,8 +95,9 @@ load_dump(WT_SESSION *session)
* Single file dumps can only have two lines, the file name and
* the configuration information.
*/
- if (list[0] == NULL || list[1] == NULL || list[2] != NULL ||
- !WT_PREFIX_MATCH(list[0], "file:"))
+ if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) ||
+ (WT_PREFIX_MATCH(list[0], "file:") &&
+ WT_PREFIX_MATCH(list[0], "lsm:")))
return (format());
entry = list;
diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c
index 6befcd82c47..b7f9fa2cafc 100644
--- a/src/utilities/util_loadtext.c
+++ b/src/utilities/util_loadtext.c
@@ -34,8 +34,8 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the uri. */
if (argc != 1)
return (usage());
- if ((uri =
- util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((uri = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
return (text(session, uri));
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index cd3077ebb49..d4be5a45500 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -255,6 +255,14 @@ util_name(const char *s, const char *type, u_int flags)
return (NULL);
}
copy = 1;
+ } else if (WT_PREFIX_MATCH(s, "lsm:")) {
+ if (!(flags & UTIL_LSM_OK)) {
+ fprintf(stderr,
+ "%s: %s: \"lsm\" type not supported\n",
+ progname, command);
+ return (NULL);
+ }
+ copy = 1;
} else if (WT_PREFIX_MATCH(s, "table:")) {
if (!(flags & UTIL_TABLE_OK)) {
fprintf(stderr,
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index 6f894e2e9c7..55a90f96b55 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -29,8 +29,8 @@ util_rename(WT_SESSION *session, int argc, char *argv[])
/* The remaining arguments are the object uri and new name. */
if (argc != 2)
return (usage());
- if ((uri = util_name(
- *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((uri = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
newname = argv[1];
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 1529d0b611c..7c4f2035fa2 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -29,8 +29,8 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(
- *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((name = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
if ((ret = session->upgrade(session, name, NULL)) != 0) {
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index 7ac74178217..fda2d413346 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -29,8 +29,8 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
/* The remaining argument is the table name. */
if (argc != 1)
return (usage());
- if ((name = util_name(
- *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((name = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
if ((ret = session->verify(session, name, NULL)) != 0) {
diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c
index 95def2f652f..ad1f47b067d 100644
--- a/src/utilities/util_write.c
+++ b/src/utilities/util_write.c
@@ -45,8 +45,8 @@ util_write(WT_SESSION *session, int argc, char *argv[])
} else
if (argc < 3 || ((argc - 1) % 2 != 0))
return (usage());
- if ((uri =
- util_name(*argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+ if ((uri = util_name(*argv,
+ "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
/* Open the object. */
diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c
index 0333e7acb53..1b02b958718 100644
--- a/test/bloom/test_bloom.c
+++ b/test/bloom/test_bloom.c
@@ -138,7 +138,7 @@ int run(void)
WT_BLOOM *bloomp;
WT_ITEM item;
WT_SESSION_IMPL *sess;
- const char *uri = "table:my_bloom";
+ const char *uri = "file:my_bloom.bf";
int ret;
uint32_t fp, i;
@@ -171,7 +171,8 @@ int run(void)
die(ret, "__wt_bloom_close");
g.wt_session->checkpoint(g.wt_session, NULL);
- if ((ret = __wt_bloom_open(sess, uri, g.c_factor, g.c_k, &bloomp)) != 0)
+ if ((ret = __wt_bloom_open(
+ sess, uri, g.c_factor, g.c_k, NULL, &bloomp)) != 0)
die(ret, "__wt_bloom_open");
for (i = 0; i < g.c_ops; i++) {
item.data = g.entries[i];
diff --git a/test/fops/t.c b/test/fops/t.c
index ad1d7811771..92719a1e109 100644
--- a/test/fops/t.c
+++ b/test/fops/t.c
@@ -28,7 +28,8 @@ main(int argc, char *argv[])
u_int nthreads;
int ch, cnt, runs;
char *config_open;
- const char **objp, *objs[] = { "file:__wt", "table:__wt", NULL };
+ const char **objp;
+ const char *objs[] = { "file:__wt", "table:__wt", "lsm:__wt", NULL };
if ((progname = strrchr(argv[0], '/')) == NULL)
progname = argv[0];
diff --git a/test/format/config.c b/test/format/config.c
index 796a7037ae4..659aea4552a 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -50,11 +50,9 @@ config_setup(void)
case 1:
config_single("data_source=table", 0);
break;
- /* LSM isn't implemented yet.
case 2:
config_single("data_source=lsm", 0);
break;
- */
}
}
@@ -88,6 +86,20 @@ config_setup(void)
if (cp->flags & C_OPS)
*cp->v = 0;
+ /* LSM trees are only compatible with row store tables. */
+ if (g.c_file_type != ROW &&
+ strncmp(g.c_data_source, "lsm", strlen("lsm")) == 0) {
+ cp = config_find("file_type", strlen("file_type"));
+ if (!(cp->flags & C_PERM))
+ config_single("file_type=row", 0);
+ else {
+ fprintf(stderr,
+ "%s: LSM data source is only compatible with row file_type\n",
+ g.progname);
+ exit(EXIT_FAILURE);
+ }
+ }
+
/* Multi-threaded runs cannot be replayed. */
if (g.replay && g.c_threads != 1) {
fprintf(stderr,
diff --git a/test/format/util.c b/test/format/util.c
index 60f09031dd6..503ae17dbdc 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -114,8 +114,10 @@ value_gen(uint8_t *val, uint32_t *sizep, uint64_t keyno)
/*
* WiredTiger doesn't store zero-length data items in row-store files,
* test that by inserting a zero-length data item every so often.
+ * LSM doesn't support zero length items.
*/
- if (keyno % 63 == 0) {
+ if (keyno % 63 == 0 &&
+ strncmp("lsm", g.c_data_source, strlen("lsm") != 0)) {
val[0] = '\0';
*sizep = 0;
return;
diff --git a/test/format/wts_ops.c b/test/format/wts_ops.c
index 8825c9fb85b..d4a2868769f 100644
--- a/test/format/wts_ops.c
+++ b/test/format/wts_ops.c
@@ -687,7 +687,12 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
return;
bdb_remove(keyno, &notfound);
- (void)notfound_chk("row_remove", ret, notfound, keyno);
+
+ /* LSM trees don't check for existence if "overwrite" is set. */
+ if (strncmp(cursor->uri, "lsm:", 4) == 0)
+ *notfoundp = notfound;
+ else
+ (void)notfound_chk("row_remove", ret, notfound, keyno);
}
/*
diff --git a/test/suite/test_cursor01.py b/test/suite/test_cursor01.py
index 51c65ad9607..f4c448187c5 100644
--- a/test/suite/test_cursor01.py
+++ b/test/suite/test_cursor01.py
@@ -43,6 +43,7 @@ class test_cursor01(wttest.WiredTigerTestCase):
('file-col', dict(tablekind='col',uri='file')),
('file-fix', dict(tablekind='fix',uri='file')),
('file-row', dict(tablekind='row',uri='file')),
+ ('lsm-row', dict(tablekind='row',uri='lsm')),
('table-col', dict(tablekind='col',uri='table')),
('table-fix', dict(tablekind='fix',uri='table')),
('table-row', dict(tablekind='row',uri='table'))
@@ -139,9 +140,13 @@ class test_cursor01(wttest.WiredTigerTestCase):
value = cursor.get_value()
self.assertEqual(key, self.genkey(i))
self.assertEqual(value, self.genvalue(i))
- i += 1
dupc = self.session.open_cursor(None, cursor, None)
self.assertEquals(cursor.compare(dupc), 0)
+ key = dupc.get_key()
+ value = dupc.get_value()
+ self.assertEqual(key, self.genkey(i))
+ self.assertEqual(value, self.genvalue(i))
+ i += 1
cursor.close()
cursor = dupc
diff --git a/test/suite/test_cursor02.py b/test/suite/test_cursor02.py
index 7786b902078..da49515dc52 100644
--- a/test/suite/test_cursor02.py
+++ b/test/suite/test_cursor02.py
@@ -38,13 +38,14 @@ class test_cursor02(TestCursorTracker):
after inserts and removes.
"""
scenarios = [
- ('row', dict(tablekind='row')),
- ('col', dict(tablekind='col')),
+ ('row', dict(tablekind='row', uri='table')),
+ ('lsm-row', dict(tablekind='row', uri='lsm')),
+ ('col', dict(tablekind='col', uri='table')),
#('fix', dict(tablekind='fix'))
]
def create_session_and_cursor(self, ninitialentries):
- tablearg = "table:" + self.table_name1
+ tablearg = self.uri + ":" + self.table_name1
if self.tablekind == 'row':
keyformat = 'key_format=S'
else:
@@ -56,7 +57,7 @@ class test_cursor02(TestCursorTracker):
create_args = keyformat + ',' + valformat + self.config_string()
self.session_create(tablearg, create_args)
self.pr('creating cursor')
- self.cur_initial_conditions(self.table_name1, ninitialentries, self.tablekind, None, None)
+ self.cur_initial_conditions(self.table_name1, ninitialentries, self.tablekind, None, None, self.uri)
return self.session.open_cursor(tablearg, None, 'append')
def test_multiple_remove(self):
diff --git a/test/suite/test_cursor03.py b/test/suite/test_cursor03.py
index 3be68230cff..04002e1e521 100644
--- a/test/suite/test_cursor03.py
+++ b/test/suite/test_cursor03.py
@@ -40,19 +40,20 @@ class test_cursor03(TestCursorTracker):
after inserts and removes.
"""
scenarios = multiply_scenarios('.', [
- ('row', dict(tablekind='row', keysize=None, valsize=None)),
- ('col', dict(tablekind='col', keysize=None, valsize=None)),
+ ('row', dict(tablekind='row', keysize=None, valsize=None, uri='table')),
+ ('row', dict(tablekind='row', keysize=None, valsize=None, uri='lsm')),
+ ('col', dict(tablekind='col', keysize=None, valsize=None, uri='table')),
#('fix', dict(tablekind='fix', keysize=None, valsize=None))
- ('row.val10k', dict(tablekind='row', keysize=None, valsize=[10, 10000])),
- ('col.val10k', dict(tablekind='col', keysize=None, valsize=[10, 10000])),
- ('row.keyval10k', dict(tablekind='row', keysize=[10,10000], valsize=[10, 10000])),
+ ('row.val10k', dict(tablekind='row', keysize=None, valsize=[10, 10000], uri='table')),
+ ('col.val10k', dict(tablekind='col', keysize=None, valsize=[10, 10000], uri='table')),
+ ('row.keyval10k', dict(tablekind='row', keysize=[10,10000], valsize=[10, 10000], uri='table')),
], [
('count1000', dict(tablecount=1000,cache_size=20*1024*1024)),
('count10000', dict(tablecount=10000, cache_size=64*1024*1024))
])
def create_session_and_cursor(self):
- tablearg = "table:" + self.table_name1
+ tablearg = self.uri + ":" + self.table_name1
if self.tablekind == 'row':
keyformat = 'key_format=S'
else:
@@ -64,7 +65,7 @@ class test_cursor03(TestCursorTracker):
create_args = keyformat + ',' + valformat + self.config_string()
self.session_create(tablearg, create_args)
self.pr('creating cursor')
- self.cur_initial_conditions(self.table_name1, self.tablecount, self.tablekind, self.keysize, self.valsize)
+ self.cur_initial_conditions(self.table_name1, self.tablecount, self.tablekind, self.keysize, self.valsize, self.uri)
return self.session.open_cursor(tablearg, None, 'append')
def setUpConnectionOpen(self, dir):
diff --git a/test/suite/test_cursor04.py b/test/suite/test_cursor04.py
index 148ca4b72d9..24eb92a8c9c 100644
--- a/test/suite/test_cursor04.py
+++ b/test/suite/test_cursor04.py
@@ -37,9 +37,10 @@ class test_cursor04(wttest.WiredTigerTestCase):
nentries = 20
scenarios = [
- ('row', dict(tablekind='row')),
- ('col', dict(tablekind='col')),
- ('fix', dict(tablekind='fix'))
+ ('row', dict(tablekind='row', uri='table')),
+ ('lsm-row', dict(tablekind='row', uri='lsm')),
+ ('col', dict(tablekind='col', uri='table')),
+ ('fix', dict(tablekind='fix', uri='table'))
]
def config_string(self):
@@ -60,7 +61,7 @@ class test_cursor04(wttest.WiredTigerTestCase):
raise
def create_session_and_cursor(self):
- tablearg = "table:" + self.table_name1
+ tablearg = self.uri + ":" + self.table_name1
if self.tablekind == 'row':
keyformat = 'key_format=S'
else:
diff --git a/test/suite/test_cursor_tracker.py b/test/suite/test_cursor_tracker.py
index 86b7cbfa27e..3e50b55cd98 100644
--- a/test/suite/test_cursor_tracker.py
+++ b/test/suite/test_cursor_tracker.py
@@ -136,7 +136,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase):
self.encode_value = self.encode_value_fix
self.decode_value = self.decode_value_fix
- def cur_initial_conditions(self, tablename, npairs, tablekind, keysizes, valuesizes):
+ def cur_initial_conditions(self, tablename, npairs, tablekind, keysizes, valuesizes, uri="table"):
if npairs >= 0xffffffff:
raise Exception('cur_initial_conditions: npairs too big')
self.tablekind = tablekind
@@ -151,7 +151,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase):
self.keysizes = keysizes
self.valuesizes = valuesizes
if tablekind != None:
- cursor = self.session.open_cursor('table:' + tablename, None, 'append')
+ cursor = self.session.open_cursor(uri + ':' + tablename, None, 'append')
for i in range(npairs):
wtkey = self.encode_key(i << 32)
wtval = self.encode_value(i << 32)
diff --git a/test/suite/test_util11.py b/test/suite/test_util11.py
index 12c64f68086..d24f2213794 100644
--- a/test/suite/test_util11.py
+++ b/test/suite/test_util11.py
@@ -71,15 +71,13 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
self.populate(pfx + '3')
# Construct what we think we'll find
- filelist = ''
tablelist = ''
for i in range(1, 6):
- filelist += 'file:' + pfx + str(i) + '.wt\n'
tablelist += 'table:' + pfx + str(i) + '\n'
outfile = "listout.txt"
- self.runWt(["list"], outfilename=outfile)
- self.check_file_content(outfile, filelist + tablelist)
+ self.runWt(["list", "table:"], outfilename=outfile)
+ self.check_file_content(outfile, tablelist)
def test_list_drop(self):
"""
@@ -99,18 +97,11 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
self.session.drop('table:' + pfx + '4', None)
# Construct what we think we'll find
- filelist = ''
- tablelist = ''
- filelist += 'file:' + pfx + '1.wt\n'
- tablelist += 'table:' + pfx + '1\n'
- filelist += 'file:' + pfx + '3.wt\n'
- tablelist += 'table:' + pfx + '3\n'
- filelist += 'file:' + pfx + '5.wt\n'
- tablelist += 'table:' + pfx + '5\n'
+ tablelist = ''.join('table:' + pfx + str(i) + '\n' for i in (1, 3, 5))
outfile = "listout.txt"
- self.runWt(["list"], outfilename=outfile)
- self.check_file_content(outfile, filelist + tablelist)
+ self.runWt(["list", "table:"], outfilename=outfile)
+ self.check_file_content(outfile, tablelist)
def test_list_drop_all(self):
"""