diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2014-02-07 18:16:22 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2014-02-07 18:16:22 +1100 |
commit | 3bcd2a96e6546419a871dba4a35a2e2a3453adb9 (patch) | |
tree | d93f25e4d576e47adbf78b352c910e7354d68639 /src | |
parent | 3b6d36874f716625c3f8c867f9185c829931472e (diff) | |
parent | 0f319b1107960bdeb7d617d1797dd992029bb1df (diff) | |
download | mongo-3bcd2a96e6546419a871dba4a35a2e2a3453adb9.tar.gz |
Merge branch 'develop' into checkpoint-directio
Diffstat (limited to 'src')
-rw-r--r-- | src/btree/bt_evict.c | 149 | ||||
-rw-r--r-- | src/btree/bt_handle.c | 37 | ||||
-rw-r--r-- | src/btree/rec_evict.c | 30 | ||||
-rw-r--r-- | src/btree/rec_merge.c | 45 | ||||
-rw-r--r-- | src/btree/rec_track.c | 53 | ||||
-rw-r--r-- | src/btree/rec_write.c | 22 | ||||
-rw-r--r-- | src/docs/compression.dox | 83 | ||||
-rw-r--r-- | src/docs/helium.dox | 125 | ||||
-rw-r--r-- | src/docs/hot_backup.dox | 10 | ||||
-rw-r--r-- | src/docs/memrata.dox | 129 | ||||
-rw-r--r-- | src/docs/programming.dox | 2 | ||||
-rw-r--r-- | src/docs/spell.ok | 12 | ||||
-rw-r--r-- | src/docs/top/Doxyfile | 2 | ||||
-rw-r--r-- | src/docs/top/main.dox | 6 | ||||
-rw-r--r-- | src/docs/upgrading.dox | 20 | ||||
-rw-r--r-- | src/include/btmem.h | 5 | ||||
-rw-r--r-- | src/include/btree.i | 78 | ||||
-rw-r--r-- | src/include/stat.h | 4 | ||||
-rw-r--r-- | src/include/txn.i | 2 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 108 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 11 | ||||
-rw-r--r-- | src/lsm/lsm_tree.c | 7 | ||||
-rw-r--r-- | src/support/stat.c | 12 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 6 |
24 files changed, 534 insertions, 424 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index f5b0180b2a6..d57162c06a9 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -246,7 +246,7 @@ __evict_worker(WT_SESSION_IMPL *session) "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64 " Internal: %s", bytes_max, bytes_inuse, dirty_inuse, - F_ISSET(cache, WT_EVICT_INTERNAL) ? "yes" : "no"); + LF_ISSET(WT_EVICT_PASS_INTERNAL) ? "yes" : "no"); /* * When the cache is full, track whether pages are being @@ -858,7 +858,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) btree->evict_page->ref->state == WT_REF_EVICT_WALK); walk_flags = WT_TREE_EVICT; - if (F_ISSET(cache, WT_EVICT_INTERNAL)) + if (LF_ISSET(WT_EVICT_PASS_INTERNAL)) walk_flags |= WT_TREE_SKIP_LEAF; /* * Get some more eviction candidate pages. @@ -887,7 +887,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) if (WT_PAGE_IS_ROOT(page)) continue; - /* Look for a split-merge (grand)parent page to merge. */ + /* + * Look for a split-merge (grand)parent page to merge. + * + * Only look for a parent at exactly the right height above: if + * the stack is deep enough, we'll find it eventually, and we + * don't want to do too much work on every level. + */ levels = 0; if (__wt_btree_mergeable(page)) for (levels = 1; @@ -900,85 +906,96 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) continue; /* - * Only look for a parent at exactly the right height above: if - * the stack is deep enough, we'll find it eventually, and we - * don't want to do too much work on every level. - * + * Use the EVICT_LRU flag to avoid putting pages onto the list + * multiple times. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + continue; + + /* * !!! - * Don't restrict ourselves to only the top-most page (that is, - * don't require that page->parent is not mergeable). 
If there - * is a big, busy enough split-merge tree, the top-level merge - * will only happen if we can lock the whole subtree - * exclusively. Consider smaller merges in case locking the - * whole tree fails. + * In normal operation, don't restrict ourselves to only the + * top-most page (that is, don't require that page->parent is + * not mergeable). If there is a big, busy enough split-merge + * tree, the top-level merge will only happen if we can lock + * the whole subtree exclusively. Consider smaller merges in + * case locking the whole tree fails. */ - if (levels != 0 && levels != WT_MERGE_STACK_MIN) + if (levels != 0) { + if (levels < WT_MERGE_STACK_MIN) + continue; + + /* + * Concentrate near the top of a stack -- with forced + * eviction, stacks of split-merge pages can get very + * deep, and merging near the bottom isn't helpful. + */ + if (LF_ISSET(WT_EVICT_PASS_INTERNAL) && + __wt_btree_mergeable(page->parent) && + __wt_btree_mergeable(page->parent->parent)) + continue; + + /* The remaining checks don't apply to merges. */ + goto add; + } else if (LF_ISSET(WT_EVICT_PASS_INTERNAL)) continue; /* - * If this page has never been considered for eviction, set its - * read generation to a little bit in the future and move on, - * give readers a chance to start updating the read generation. + * If this page has never been considered for eviction, + * set its read generation to a little bit in the + * future and move on, give readers a chance to start + * updating the read generation. */ if (page->read_gen == WT_READ_GEN_NOTSET) { - page->read_gen = __wt_cache_read_gen_set(session); + page->read_gen = + __wt_cache_read_gen_set(session); continue; } /* - * Use the EVICT_LRU flag to avoid putting pages onto the list - * multiple times. + * If the file is being checkpointed, there's a period + * of time where we can't discard any page with a + * modification structure because it might race with + * the checkpointing thread. 
+ * + * During this phase, there is little point trying to + * evict dirty pages: we might be lucky and find an + * internal page that has not yet been checkpointed, + * but much more likely is that we will waste effort + * considering dirty leaf pages that cannot be evicted + * because they have modifications more recent than the + * checkpoint. */ - if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + modified = __wt_page_is_modified(page); + if (modified && btree->checkpointing) continue; - /* The following checks apply to eviction but not merges. */ - if (levels == 0) { - /* - * If the file is being checkpointed, there's a period - * of time where we can't discard any page with a - * modification structure because it might race with - * the checkpointing thread. - * - * During this phase, there is little point trying to - * evict dirty pages: we might be lucky and find an - * internal page that has not yet been checkpointed, - * but much more likely is that we will waste effort - * considering dirty leaf pages that cannot be evicted - * because they have modifications more recent than the - * checkpoint. - */ - modified = __wt_page_is_modified(page); - if (modified && btree->checkpointing) - continue; - - /* Optionally ignore clean pages. */ - if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) - continue; + /* Optionally ignore clean pages. */ + if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) + continue; - /* - * If the oldest transaction hasn't changed since the - * last time this page was written, it's unlikely that - * we can make progress. Similarly, if the most recent - * update on the page is not yet globally visible, - * eviction will fail. These heuristics attempt to - * avoid repeated attempts to evict the same page. - * - * That said, if eviction is stuck, or the file is - * being checkpointed, try anyway: maybe a transaction - * that were running last time we wrote the page has - * since rolled back, or we can help get the checkpoint - * completed sooner. 
- */ - if (modified && !F_ISSET(cache, WT_EVICT_STUCK) && - (page->modify->disk_snap_min == - S2C(session)->txn_global.oldest_id || - !__wt_txn_visible_all(session, - page->modify->update_txn))) - continue; - } + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, it's unlikely that + * we can make progress. Similarly, if the most recent + * update on the page is not yet globally visible, + * eviction will fail. These heuristics attempt to + * avoid repeated attempts to evict the same page. + * + * That said, if eviction is stuck, or the file is + * being checkpointed, try anyway: maybe a transaction + * that were running last time we wrote the page has + * since rolled back, or we can help get the checkpoint + * completed sooner. + */ + if (modified && !F_ISSET(cache, WT_EVICT_STUCK) && + (page->modify->disk_snap_min == + S2C(session)->txn_global.oldest_id || + !__wt_txn_visible_all(session, + page->modify->update_txn))) + continue; - WT_ASSERT(session, evict->page == NULL); +add: WT_ASSERT(session, evict->page == NULL); __evict_init_candidate(session, evict, page); ++evict; diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 5c1f45a8030..f6cc4cc6fb3 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -11,7 +11,7 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt); static int __btree_get_last_recno(WT_SESSION_IMPL *); static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); -static int __btree_tree_open_empty(WT_SESSION_IMPL *, int); +static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int); static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t); static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int); @@ -102,7 +102,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) ckpt.raw.data, ckpt.raw.size, root_addr, &root_addr_size, readonly)); if (creation || root_addr_size == 0) - 
WT_ERR(__btree_tree_open_empty(session, creation)); + WT_ERR(__btree_tree_open_empty( + session, creation, readonly)); else { WT_ERR(__wt_btree_tree_open( session, root_addr, root_addr_size)); @@ -355,7 +356,7 @@ err: __wt_buf_free(session, &dsk); * Create an empty in-memory tree. */ static int -__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) +__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) { WT_BTREE *btree; WT_DECL_RET; @@ -423,23 +424,31 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) * the root page dirty to force a write, and without reconciling the * leaf page we won't realize there's no records to write, we'll write * a root page, which isn't correct for an empty tree. - * Earlier versions of this code kept the leaf page clean, but with - * the "empty" flag set in the leaf page's modification structure; in - * that case, checkpoints works (forced reconciliation of a root with - * a single "empty" page wouldn't write any blocks). That version had + * + * Earlier versions of this code kept the leaf page clean, but with the + * "empty" flag set in the leaf page's modification structure; in that + * case, checkpoints works (forced reconciliation of a root with a + * single "empty" page wouldn't write any blocks). That version had * memory leaks because the eviction code didn't correctly handle pages * that were "clean" (and so never reconciled), yet "modified" with an * "empty" flag. The goal of this code is to mimic a real tree that * simply has no records, for whatever reason, and trust reconciliation * to figure out it's empty and not write any blocks. - * We do not set the tree's modified flag because the checkpoint code - * skips unmodified files in closing checkpoints (checkpoints that don't - * require a write unless the file is actually dirty). There's no need - * to reconcile this file unless the application does a real checkpoint - * or it's actually modified. 
+ * + * We do not set the tree's modified flag because the checkpoint code + * skips unmodified files in closing checkpoints (checkpoints that + * don't require a write unless the file is actually dirty). There's + * no need to reconcile this file unless the application does a real + * checkpoint or it's actually modified. + * + * Only do this for a live tree, not for checkpoints. If we open an + * empty checkpoint, the leaf page cannot be dirty or eviction may try + * to write it, which will fail because checkpoints are read-only. */ - WT_ERR(__wt_page_modify_init(session, leaf)); - __wt_page_only_modify_set(session, leaf); + if (!readonly) { + WT_ERR(__wt_page_modify_init(session, leaf)); + __wt_page_only_modify_set(session, leaf); + } btree->root_page = root; diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index f7d146c42e2..0713989af58 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -441,32 +441,32 @@ ckpt: WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); */ if (__wt_page_is_modified(page) && !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) { - ret = __wt_rec_write(session, page, - NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT); - - /* - * Update the page's modification reference, reconciliation - * might have changed it. - */ - mod = page->modify; - /* - * If reconciliation failed due to active modifications and - * the page is a lot larger than the maximum allowed, it is - * likely that we are having trouble reconciling it due to - * contention, attempt to split the page in memory. + * If the page is larger than the maximum allowed, attempt to + * split the page in memory before evicting it. The in-memory + * split checks for left and right splits, and prevents the + * tree deepening unnecessarily. 
* * Note, we won't be here if recursively descending a tree of * pages: dirty row-store leaf pages can't be merged into their * parents, which means if top wasn't true in this test, we'd * have returned busy before attempting reconciliation. */ - if (ret == EBUSY && - page->type == WT_PAGE_ROW_LEAF && + if (page->type == WT_PAGE_ROW_LEAF && + !F_ISSET_ATOMIC(page, WT_PAGE_WAS_SPLIT) && __wt_eviction_force_check(session, page)) { *inmem_split = 1; return (0); } + + ret = __wt_rec_write(session, page, + NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT); + + /* + * Update the page's modification reference, reconciliation + * might have changed it. + */ + mod = page->modify; if (ret == EBUSY) { /* Give up if there are unwritten changes */ WT_VERBOSE_RET(session, evict, diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c index 7599fa8cb84..cf8ef88c5ac 100644 --- a/src/btree/rec_merge.c +++ b/src/btree/rec_merge.c @@ -307,14 +307,9 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) if (visit_state.maxdepth < WT_MERGE_STACK_MIN) return (EBUSY); - /* - * Don't allow split merges to generate arbitrarily large pages. - * Ideally we would choose a size based on the internal_page_max - * setting for the btree, but we don't have the correct btree handle - * available. - */ - if (visit_state.refcnt > WT_MERGE_MAX_REFS) - return (EBUSY); + /* Pages cannot grow larger than 2**32, but that should never happen. */ + if (visit_state.refcnt > UINT32_MAX) + return (ENOMEM); /* * Now we either collapse the internal pages into one split-merge page, @@ -332,17 +327,19 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) * In the normal case where there are live children spread * through the subtree, create two child pages. * - * Handle the case where the only live child is first / last - * specially: put the live child into the top-level page. 
+ * Handle the case where the live children are all near the + * beginning / end specially: put the last live child into the + * top-level page, to avoid getting much deeper during + * append-only workloads. * * Set SPLIT_MERGE on the internal pages if there are any live * children: they can't be evicted, so there is no point * permanently deepening the tree. */ - if (visit_state.first_live == visit_state.last_live && - (visit_state.first_live == 0 || - visit_state.first_live == refcnt - 1)) - split = (visit_state.first_live == 0) ? 1 : refcnt - 1; + if (visit_state.last_live <= refcnt / 10) + split = 1; + else if (visit_state.first_live >= (9 * refcnt) / 10) + split = refcnt - 1; else split = (refcnt + 1) / 2; @@ -370,7 +367,7 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) else { WT_ERR(__wt_btree_new_modified_page( session, page_type, split, - visit_state.first_live < split, &lchild)); + split < WT_MERGE_FULL_PAGE, &lchild)); visit_state.first = lchild; } @@ -380,8 +377,8 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) visit_state.second_ref = &newtop->u.intl.t[1]; } else { WT_ERR(__wt_btree_new_modified_page( - session, page_type, - refcnt - split, visit_state.last_live >= split, + session, page_type, refcnt - split, + refcnt - split < WT_MERGE_FULL_PAGE, &rchild)); visit_state.second = rchild; visit_state.second_ref = @@ -389,17 +386,15 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) } } else { /* - * Create a new split-merge page for small merges, or if the - * page above is a split merge page. When we do a big enough - * merge, we create a real page at the top and don't consider - * it as a merge candidate again. Over time with an insert - * workload the tree will grow deeper, but that's inevitable, - * and this keeps individual merges small. + * Create a new split-merge page for small merges. When we do + * a big enough merge, we create a real page at the top and + * don't consider it as a merge candidate again. 
Over time + * with an insert workload the tree will grow deeper, but + * that's inevitable, and this keeps individual merges small. */ WT_ERR(__wt_btree_new_modified_page( session, page_type, refcnt, - refcnt < WT_MERGE_FULL_PAGE || - __wt_btree_mergeable(top->parent), + refcnt < WT_MERGE_FULL_PAGE, &newtop)); visit_state.first = newtop; diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c index 1ea5c1093d5..99e9aebc14f 100644 --- a/src/btree/rec_track.c +++ b/src/btree/rec_track.c @@ -382,13 +382,13 @@ __ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __ovfl_reuse_skip_search -- - * Return the first matching value in the overflow reuse list. + * Return the first, not in-use, matching value in the overflow reuse list. */ static WT_OVFL_REUSE * __ovfl_reuse_skip_search( WT_OVFL_REUSE **head, const void *value, size_t value_size) { - WT_OVFL_REUSE **e; + WT_OVFL_REUSE **e, *next; size_t len; int cmp, i; @@ -404,13 +404,29 @@ __ovfl_reuse_skip_search( } /* - * Return any exact matches: we don't care in what search level - * we found a match. + * Values are not unique, and it's possible to have long lists + * of identical overflow items. (We've seen it in benchmarks.) + * Move through a list of identical items at the current level + * as long as the next one is in-use, otherwise, drop down a + * level. When at the bottom level, return items if reusable, + * else NULL. */ len = WT_MIN((*e)->value_size, value_size); cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); - if (cmp == 0 && (*e)->value_size == value_size) - return (*e); + if (cmp == 0 && (*e)->value_size == value_size) { + if (i == 0) + return (F_ISSET(*e, + WT_OVFL_REUSE_INUSE) ? 
NULL : *e); + if ((next = (*e)->next[i]) == NULL || + !F_ISSET(next, WT_OVFL_REUSE_INUSE) || + next->value_size != len || memcmp( + WT_OVFL_REUSE_VALUE(next), value, len) != 0) { + --i; /* Drop down a level */ + --e; + } else /* Keep going at this level */ + e = &(*e)->next[i]; + continue; + } /* * If the skiplist value is larger than the search value, or @@ -612,28 +628,19 @@ __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, head = page->modify->ovfl_track->ovfl_reuse; /* - * The search function returns the first matching record in the list, - * which may be the first of many, overflow records may be identical. - * Find one without the in-use flag set and put it back into service. + * The search function returns the first matching record in the list + * which does not have the in-use flag set, or NULL. */ if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL) return (0); - do { - if (!F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { - *addrp = WT_OVFL_REUSE_ADDR(reuse); - *addr_sizep = reuse->addr_size; - F_SET(reuse, WT_OVFL_REUSE_INUSE); - if (WT_VERBOSE_ISSET(session, overflow)) - WT_RET(__ovfl_reuse_verbose( - session, page, reuse, "reclaim")); - return (1); - } - } while ((reuse = reuse->next[0]) != NULL && - reuse->value_size == value_size && - memcmp(WT_OVFL_REUSE_VALUE(reuse), value, value_size) == 0); + *addrp = WT_OVFL_REUSE_ADDR(reuse); + *addr_sizep = reuse->addr_size; + F_SET(reuse, WT_OVFL_REUSE_INUSE); - return (0); + if (WT_VERBOSE_ISSET(session, overflow)) + WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim")); + return (1); } /* diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 66ce4c089e8..81a4ec7a025 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -1616,10 +1616,10 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final) * We can't compress the first 64B of the block (it must be * written without compression), and a possible split point * may appear in that 
64B; keep it simple, ignore the first - * 1KB of data, anybody splitting a smaller than 1KB piece - * (as calculated before compression), is doing us wrong. + * allocation size of data, anybody splitting smaller than + * that (as calculated before compression), is doing it wrong. */ - if ((len = WT_PTRDIFF(cell, dsk)) > 1024) + if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize) r->raw_offsets[++slots] = WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP); @@ -1677,12 +1677,19 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final) * compression function. */ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP); - WT_ERR(compressor->compress_raw(compressor, wt_session, + ret = compressor->compress_raw(compressor, wt_session, r->page_size_max, btree->split_pct, WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets, slots, (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, - result_len, final, &result_len, &result_slots)); + result_len, final, &result_len, &result_slots); + if (ret == EAGAIN) { + ret = 0; + if (!final) + goto more_rows; + result_slots = 0; + } + WT_ERR(ret); dst->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; if (result_slots != 0) { @@ -1701,11 +1708,14 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final) * There may be a remnant in the working buffer that didn't get * compressed; copy it down to the start of the working buffer * and update the starting record number, free space and so on. + * !!! + * Note use of memmove, the source and destination buffers can + * overlap. 
*/ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk + r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP); dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memcpy(dsk_start, (uint8_t *)r->first_free - len, len); + (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len); r->entries -= r->raw_entries[result_slots - 1]; r->first_free = dsk_start + len; diff --git a/src/docs/compression.dox b/src/docs/compression.dox index 59f03f4e8ef..92f5c27f25e 100644 --- a/src/docs/compression.dox +++ b/src/docs/compression.dox @@ -1,48 +1,36 @@ /*! @page compression Compressors This section explains how to configure WiredTiger's builtin support for -the bzip2 and snappy compression engines. +the snappy and bzip2 compression engines. -@section compression_bzip2 Using bzip2 compression +@section compression_zlib Using zlib compression -To use the builtin support for -<a href="http://www.bzip.org/">Julian Seward's bzip2</a> -compression, first check that bzip2 is installed in include and library -directories searched by the compiler. Once bzip2 is installed, you can -enable bzip2 using the \c --enable-bzip2 option to configure. +To use the builtin support for Greg Roelofs' and Mark Adler's +<a href="http://www.zlib.net/">zlib</a> +compression, first check that zlib is installed in include and library +directories searched by the compiler. Once zlib is installed, you can +enable zlib using the \c --enable-zlib option to configure. -If bzip2 is installed in a location not normally searched by the -compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS -to indicate these locations. For example, with the bzip2 includes and +If zlib is installed in a location not normally searched by the compiler +toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS to +indicate these locations. 
For example, with the zlib includes and libraries installed in \c /usr/local/include and \c /usr/local/lib, you -should run configure as follows: +would run configure with the following additional arguments: @code -cd build_posix -../configure --enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include" +--enable-zlib CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" @endcode -When opening the WiredTiger database, load the bzip2 shared library as -an extension. For example, with the bzip2 library installed in +When opening the WiredTiger database, load the zlib shared library as +an extension. For example, with the WiredTiger library installed in \c /usr/local/lib, you would use the following extension: -@snippet ex_all.c Configure bzip2 extension +@snippet ex_all.c Configure zlib extension Finally, when creating the WiredTiger object, set \c block_compressor -to \c bzip2: - -@snippet ex_all.c Create a bzip2 compressed table - -If necessary, you can confirm the compressor is working by running the -compression part of the test suite: +to \c zlib: -@code -cd build_posix -python ../test/suite/run.py compress -@endcode - -Review the test output to verify the bzip2 part of the test passes and -was not skipped. +@snippet ex_all.c Create a zlib compressed table @section compression_snappy Using snappy compression @@ -56,15 +44,14 @@ If snappy is installed in a location not normally searched by the compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS to indicate these locations. 
For example, with the snappy includes and libraries installed in \c /usr/local/include and \c /usr/local/lib, you -should run configure as follows: +would run configure with the following additional arguments: @code -cd build_posix -../configure --enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include" +--enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" @endcode When opening the WiredTiger database, load the snappy shared library as -an extension. For example, with the snappy library installed in +an extension. For example, with the WiredTiger library installed in \c /usr/local/lib, you would use the following extension: @snippet ex_all.c Configure snappy extension @@ -74,16 +61,34 @@ to \c snappy: @snippet ex_all.c Create a snappy compressed table -If necessary, you can confirm the compressor is working by running the -compression part of the test suite: +@section compression_bzip2 Using bzip2 compression + +To use the builtin support for +<a href="http://www.bzip.org/">Julian Seward's bzip2</a> +compression, first check that bzip2 is installed in include and library +directories searched by the compiler. Once bzip2 is installed, you can +enable bzip2 using the \c --enable-bzip2 option to configure. + +If bzip2 is installed in a location not normally searched by the +compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS +to indicate these locations. For example, with the bzip2 includes and +libraries installed in \c /usr/local/include and \c /usr/local/lib, you +would run configure with the following additional arguments: @code -cd build_posix -python ../test/suite/run.py compress +--enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" @endcode -Review the test output to verify the snappy part of the test passes and -was not skipped. +When opening the WiredTiger database, load the bzip2 shared library as +an extension. 
For example, with the WiredTiger library installed in +\c /usr/local/lib, you would use the following extension: + +@snippet ex_all.c Configure bzip2 extension + +Finally, when creating the WiredTiger object, set \c block_compressor +to \c bzip2: + +@snippet ex_all.c Create a bzip2 compressed table @section compression_upgrading Upgrading compression engines diff --git a/src/docs/helium.dox b/src/docs/helium.dox new file mode 100644 index 00000000000..cd6b47fb968 --- /dev/null +++ b/src/docs/helium.dox @@ -0,0 +1,125 @@ +/*! @page helium WiredTiger Helium support + +WiredTiger supports Levyx Inc., Helium Data Store volumes as a data-source. + +To configure one or more Helium volumes as WiredTiger data sources, take +the following steps. + +@section helium_build Building the WiredTiger Helium Support + +To build the Helium support, use the configuration option \c --with-helium=DIR. +For example: + +@code +% cd wiredtiger +% ls /usr/local/lib/Helium +Helium Programmer's Reference.pdf libhe.a +README.TXT libhe.so +he.h +% ./configure --with-helium=/usr/local/lib/Helium && make +@endcode + +@section helium_load Loading the WiredTiger Helium Support + +Next, add code to your application to load the Helium shared library. + +The following example loads the Helium shared library, configuring and +naming two separate Helium volumes. The first volume is named \c dev1, +the second volume is named \c dev2. Volume \c dev1 has two underlying +physical Helium devices, \c /dev/disk3s1 and \c /dev/disk4s1. Volume +\c dev2 has a single underlying physical Helium device, \c /dev/disk5s1. 
+ +@code +#define HELIUM_LIBRARY_PATH "test/helium/.libs/libwiredtiger_helium.so" +ret = connection->load_extension(connection, HELIUM_LIBRARY_PATH, + "config=[" + "dev1=[helium_devices=[\"he://.//dev/disk3s1,/dev/disk4s1\"]," + "helium_o_volume_truncate=1]," + "dev2=[helium_devices=[\"he://.//dev/disk5s1\"]," + "helium_o_volume_truncate=1]]"); +@endcode + +The \c helium_devices configuration string takes a WiredTiger string +which is a comma-separated list of Helium devices. (Note the quoting +required for that to be possible.) + +In this example, both Helium volumes are configured to be truncated when +first opened, and all previously existing contents discarded. + +When configuring a Helium volume, the following non-standard configuration +strings are supported: + +<table> +@hrow{String, Type, Meaning} +@row{helium_devices, list, WiredTiger URI to Helium volume mapping} +@row{helium_env_read_cache_size, int, struct he_env read_cache_size value} +@row{helium_env_write_cache_size, int, struct he_env write_cache_size value} +@row{helium_o_volume_truncate, boolean, HE_O_VOLUME_TRUNCATE flag} +</table> + +With the exception of the configuration string \c helium_devices (which +is WiredTiger specific), see the Helium documentation for details on +their use. + +@section helium_objects Creating WiredTiger objects on Helium volumes + +When creating WiredTiger objects on Helium volumes, the volume names are +used as part of the URI specified to WiredTiger methods such as +WT_SESSION::create or WT_SESSION::rename, separated from the object name +by a single slash character. + +Additionally, the \c helium \c type configuration string must be included. + +The following example creates a table named \c access on the Helium +volume \c dev1, and then opens a cursor on the table: + +@code +WT_CURSOR *cursor; +WT_SESSION *session; + +/* Create the access table. 
*/ +ret = session->create( + session, "table:dev1/access", "key_format=S,value_format=S,type=helium"); + +/* Open a cursor on the access table. */ +ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor); +@endcode + +When calling WT_SESSION::create to create an object on a Helium volume, +the following additional configuration strings are supported: + +<table> +@hrow{String, Type, Meaning} +@row{helium_o_compress, boolean, HE_I_COMPRESS flag} +@row{helium_o_truncate, boolean, HE_O_TRUNCATE flag} +</table> + +See the Helium device documentation for details on their use. + +For example, creating and truncating a table could be done as follows: + +@code +WT_SESSION *session; + +/* Create and truncate the access table. */ +ret = session->create(session, "table:dev1/access", + "key_format=S,value_format=S,type=helium,helium_open_o_truncate=1"); +@endcode + +@section helium_notes Helium notes + +- Helium volumes do not support hot backup. +- Helium volumes do not support named checkpoints. +- Helium volumes do not support compression of any kind. +- Helium volumes do not support bulk load as a special case, and configuring +cursors for bulk load has no effect. +- Inserting a new record after the current maximum record in a fixed-length +bit field column-store (that is, a store with an 'r' type key and 't' type +value) does not implicitly create the missing records. + +@section helium_limitations Helium limitations + +- WiredTiger transactions cannot include operations on both Helium volumes +and other stores; this will be corrected in a future release. + +*/ diff --git a/src/docs/hot_backup.dox b/src/docs/hot_backup.dox index 0971eca948a..9c0326bcb17 100644 --- a/src/docs/hot_backup.dox +++ b/src/docs/hot_backup.dox @@ -10,15 +10,15 @@ To perform a hot backup: 1. Open a cursor on the backup data source, which begins the process of a hot backup. -2. Copy each file returned by the WT_CURSOR::next method into a -different directory. +2. 
Copy each file returned by the WT_CURSOR::next method to the hot +backup location, for example, a different directory. 3. Close the cursor; the cursor must not be closed until all of the files have been copied. -The directory to which the files are copied may subsequently be -specified as an directory to the ::wiredtiger_open function and accessed -as a WiredTiger database home. +A directory to which the files are copied may subsequently be specified +as a directory to the ::wiredtiger_open function and accessed as a +WiredTiger database home. Notes: diff --git a/src/docs/memrata.dox b/src/docs/memrata.dox deleted file mode 100644 index c915f0c59ea..00000000000 --- a/src/docs/memrata.dox +++ /dev/null @@ -1,129 +0,0 @@ -/*! @page memrata WiredTiger Memrata support - -WiredTiger supports Memrata KVS devices as a data-source. - -To configure one or more Memrata KVS devices as WiredTiger data sources, -take the following steps. - -@section memrata_build Building the WiredTiger Memrata Support - -To build the Memrata support, add a link in the WiredTiger build -directory to the installed location of the Memrata software. For -example: - -@code -% cd wiredtiger -% ls /usr/local/memrata -kvs.h libkvs.a libkvs.so -kvs.h.4.2 libkvs.a.4.2 libkvs.so.4.2 -% ln -s /usr/local/memrata memrata -% ./configure && make -@endcode - -@section memrata_load Loading the WiredTiger Memrata Support - -Second, change your application to load the Memrata shared library. The -following example loads the Memrata shared library, configuring and -naming two separate Memrata device pools. The first device pool is -named \c dev1, the second device pool is named \c dev2. Device pool \c -dev1 has two underlying Memrata devices, \c /dev/ssd0 and \c /dev/ssd1. -Device pool \c dev2 has a single underlying Memrata device, \c -/dev/ssd2. 
- -@code -#define MEMRATA_LIBRARY_PATH "test/memrata/.libs/libwiredtiger_memrata.so"" -ret = connection->load_extension(connection, MEMRATA_LIBRARY_PATH, - "config=[" - "dev1=[kvs_devices=[/dev/ssd0,/dev/ssd1],kvs_open_o_truncate=1]," - "dev2=[kvs_devices=[/dev/ssd2],kvs_open_o_truncate=1]]"); -@endcode - -The \c kvs_devices configuration string takes a WiredTiger configuration -list, that is, a comma-separated list of Memrata devices. - -In this example, both device pools are configured to be truncated (that -is, all previously existing contents discarded), when they are configured. - -When loading a Memrata device, the following additional configuration strings -are supported: - -<table> -@hrow{String, Type} -@row{kvs_devices, list of lists} -@row{kvs_parallelism, int} -@row{kvs_granularity, int} -@row{kvs_avg_key_len, int} -@row{kvs_avg_val_len, int} -@row{kvs_write_bufs, int} -@row{kvs_read_bufs, int} -@row{kvs_commit_timeout, int} -@row{kvs_reclaim_threshold, int} -@row{kvs_reclaim_period, int} -@row{kvs_open_o_debug, boolean} -@row{kvs_open_o_truncate, boolean} -</table> - -With the exception of the configuration string \c kvs_devices (which is -WiredTiger specific), see the Memrata device documentation for details -on their use. - -@section memrata_objects Creating Memrata-backed objects - -The device pool names are used as part of the URI specified to WiredTiger -methods such as WT_SESSION::create or WT_SESSION::rename, separated from -the object name by a single slash character. - -Additionally, the \c memrata \c type configuration string must be included. - -The following example creates a Memrata table named \c access in the -device pool \c dev1, and then opens a cursor on the table: - -@code -WT_CURSOR *cursor; -WT_SESSION *session; - -/* Create the access table. */ -ret = session->create( - session, "table:dev1/access", "key_format=S,value_format=S,type=memrata"); - -/* Open a cursor on the access table. 
*/ -ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor); -@endcode - -When creating a Memrata-backed object with the WT_SESSION::create method, -the following additional configuration strings are supported: - -<table> -@hrow{String, Type} -@row{kvs_open_o_debug, boolean} -@row{kvs_open_o_truncate, boolean} -</table> - -See the Memrata device documentation for details on their use. - -For example, creating and truncating a table could be done as follows: - -@code -WT_SESSION *session; - -/* Create and truncate the access table. */ -ret = session->create(session, "table:dev1/access", - "key_format=S,value_format=S,type=memrata,kvs_open_o_truncate=1"); -@endcode - -@section memrata_notes Memrata notes - -- Memrata devices do not support named checkpoints. -- Inserting a new record after the current maximum record in a fixed-length -bit field column-store (that is, a store with an 'r' type key and 't' type -value) does not implicitly create the missing records. -- Memrata devices do not support bulk load as a special case, and configuring -cursors for bulk load has no effect. -- Memrata devices do not support compression of any kind. - -@section memrata_limitations Memrata limitations - -- WiredTiger transactions cannot include operations on both Memrata devices -and other stores. 
- -*/ diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 5bf5d965afc..54e641fa3a4 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -33,7 +33,7 @@ WiredTiger applications: @section programming_extending Extending WiredTiger - @subpage custom_data_sources -- @subpage memrata +- @subpage helium @section programming_admin Administering a WiredTiger database diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 1012eef1f93..6d24c474e19 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -1,5 +1,6 @@ personal_ws-1.1 en 200 APIs +Adler's Atomicity BLOBs CFLAGS @@ -12,6 +13,7 @@ DbCursor DbEnv DbMultiple EB +EAGAIN EBUSY EINVAL EmpId @@ -28,18 +30,21 @@ LIBS LSB LSM Lameter +Levyx MERCHANTABILITY MVCC's Makefiles -Memrata Mewhort NOTFOUND NUMA NoSQL +README RepMgr +Roelofs Rrx Seward's SiS +TXT URIs Vv WiredTiger @@ -87,6 +92,7 @@ command's comparator cond config +configurign conn const control's @@ -146,6 +152,7 @@ firstname fnv fput freelist +fsync gcc gdbm getopt @@ -181,6 +188,7 @@ lastname len li libdir +libhe libkvs libtool libwiredtiger @@ -207,7 +215,6 @@ maxleafpage memalloc memfree memp -memrata metadata minkey mkdir @@ -356,3 +363,4 @@ writelocks wrlock xa yieldcpu +zlib diff --git a/src/docs/top/Doxyfile b/src/docs/top/Doxyfile index 59a3667b169..ed4f2eb8c3b 100644 --- a/src/docs/top/Doxyfile +++ b/src/docs/top/Doxyfile @@ -2,7 +2,7 @@ PROJECT_NUMBER = "Developer Site" OUTPUT_DIRECTORY = ../../docs/top -INPUT = top license.dox +INPUT = top community.dox license.dox EXCLUDE = GENERATE_TREEVIEW = NO diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index 821f22102d3..5481d2deae5 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,9 +6,9 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 2.0.1</b> (current), - <a href="releases/wiredtiger-2.0.1.tar.bz2"><b>[Release package]</b></a>, - <a 
href="2.0.1/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 2.1.0</b> (current), + <a href="releases/wiredtiger-2.1.0.tar.bz2"><b>[Release package]</b></a>, + <a href="2.1.0/index.html"><b>[Documentation]</b></a>} @row{<b>WiredTiger 1.6.6</b> (previous), <a href="releases/wiredtiger-1.6.6.tar.bz2"><b>[Release package]</b></a>, <a href="1.6.6/index.html"><b>[Documentation]</b></a>} diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 9c250824fee..e59b031a1ff 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -10,6 +10,26 @@ In the 2.1 release of WiredTiger WT_ITEM::size type has changed from resolve compile-time errors. </dd> +<dt>WT_COMPRESSOR::compress_raw signature</dt> +<dd> +In the 2.1 release of WiredTiger, the behavior of the compress_raw +callback has changed so that it will only be retried if it returns +\c EAGAIN. If it returns zero and sets \c result_slots to zero, +WiredTiger will assume that raw compression has failed and will fall +back to calling WT_COMPRESSOR::compress. +</dd> + +<dt>Transaction sync default setting</dt> +<dd> +In the 2.1 release of WiredTiger the ::wiredtiger_open \c transaction_sync +configuration setting default value has changed from "dsync" to "fsync". +This is due to enhancements to the group commit implementation in +WiredTiger - which mean that greater throughput can be achieved with +explicit "fsync" calls than by enabling "dsync" on a file handle. +Applications that don't execute concurrent transactions may see better +throughput with transaction_sync set to "dsync". +</dd> + @section version_20 Upgrading to Version 2.0 <dl> diff --git a/src/include/btmem.h b/src/include/btmem.h index e4b30f03ab9..7f0bf280d5c 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -521,14 +521,9 @@ struct __wt_ref { * WT_MERGE_FULL_PAGE -- * When the result of a merge contains more than this number of keys, it is * considered "done" and will not be merged again. 
- * - * WT_MERGE_MAX_REFS -- - * Don't complete merges that contain more than this number of keys, they tend - * to generate pathological trees. */ #define WT_MERGE_STACK_MIN 3 #define WT_MERGE_FULL_PAGE 100 -#define WT_MERGE_MAX_REFS 1000 /* * WT_ROW -- diff --git a/src/include/btree.i b/src/include/btree.i index fc9a73f4d9d..f09d05178ab 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -528,6 +528,41 @@ __wt_ref_info(WT_SESSION_IMPL *session, WT_PAGE *page, } /* + * __wt_eviction_force_check -- + * Check if a page matches the criteria for forced eviction. + */ +static inline int +__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->maxmempage) + return (0); + + /* Leaf pages only. */ + if (page->type != WT_PAGE_COL_FIX && + page->type != WT_PAGE_COL_VAR && + page->type != WT_PAGE_ROW_LEAF) + return (0); + + /* Eviction may be turned off, although that's rare. */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + return (0); + + /* + * It's hard to imagine a page with a huge memory footprint that has + * never been modified, but check to be sure. + */ + if (page->modify == NULL) + return (0); + + return (1); +} + +/* * __wt_page_release -- * Release a reference to a page. */ @@ -557,7 +592,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page) return (ret); } - ret = __wt_evict_page(session, page); + WT_TRET(__wt_evict_page(session, page)); if (ret == 0) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -642,43 +677,8 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page) } /* - * __wt_eviction_force_check -- - * Check if a page matches the criteria for forced eviction. - */ -static inline int -__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_BTREE *btree; - - btree = S2BT(session); - - /* Pages are usually small enough, check that first. 
*/ - if (page->memory_footprint < btree->maxmempage) - return (0); - - /* Leaf pages only. */ - if (page->type != WT_PAGE_COL_FIX && - page->type != WT_PAGE_COL_VAR && - page->type != WT_PAGE_ROW_LEAF) - return (0); - - /* Eviction may be turned off, although that's rare. */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) - return (0); - - /* - * It's hard to imagine a page with a huge memory footprint that has - * never been modified, but check to be sure. - */ - if (page->modify == NULL) - return (0); - - return (1); -} - -/* * __wt_eviction_force -- - * Check if the current transaction permits forced eviction of a page. + * Check if the current transaction permits forced eviction of a page. */ static inline int __wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -702,7 +702,7 @@ __wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_eviction_force -- - * Forcefully evict a page, if possible. + * Forcefully evict a page, if possible. */ static inline int __wt_eviction_force(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -852,7 +852,7 @@ __wt_lex_compare_skip( /* * __wt_btree_mergeable -- - * Determines whether the given page is a candidate for merging. + * Determines whether the given page is a candidate for merging. 
*/ static inline int __wt_btree_mergeable(WT_PAGE *page) diff --git a/src/include/stat.h b/src/include/stat.h index 6717b4d081f..ea2a4068f96 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -182,6 +182,8 @@ struct __wt_connection_stats { WT_STATS log_slot_transitions; WT_STATS log_sync; WT_STATS log_writes; + WT_STATS lsm_checkpoint_throttle; + WT_STATS lsm_merge_throttle; WT_STATS lsm_rows_merged; WT_STATS memory_allocation; WT_STATS memory_free; @@ -275,9 +277,11 @@ struct __wt_dsrc_stats { WT_STATS cursor_search_near; WT_STATS cursor_update; WT_STATS cursor_update_bytes; + WT_STATS lsm_checkpoint_throttle; WT_STATS lsm_chunk_count; WT_STATS lsm_generation_max; WT_STATS lsm_lookup_no_bloom; + WT_STATS lsm_merge_throttle; WT_STATS rec_dictionary; WT_STATS rec_overflow_key_internal; WT_STATS rec_overflow_key_leaf; diff --git a/src/include/txn.i b/src/include/txn.i index fc0a4d2317f..cdfe697ee51 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -18,6 +18,8 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp) WT_TXN *txn; txn = &session->txn; + *opp = NULL; + WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING)); WT_RET(__wt_realloc_def(session, &txn->mod_alloc, txn->mod_count + 1, &txn->mod)); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index be4474ed14f..b5634c9d205 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -855,7 +855,9 @@ struct __wt_session { * value can be created. Must be larger than chunk_size., an integer * between 100MB and 10TB; default \c 5GB.} * @config{ chunk_size, the maximum size of the - * in-memory chunk of an LSM tree., an integer between 512K and 500MB; + * in-memory chunk of an LSM tree. This limit is soft - it is possible + * for chunks to be temporarily larger than this value. 
This overrides + * the \c memory_page_max setting., an integer between 512K and 500MB; * default \c 10MB.} * @config{ merge_max, the * maximum number of chunks to include in a merge operation., an integer @@ -872,7 +874,8 @@ struct __wt_session { * memory before being reconciled to disk. The specified size will be * adjusted to a lower bound of <code>50 * leaf_page_max</code>. This * limit is soft - it is possible for pages to be temporarily larger - * than this value., an integer between 512B and 10TB; default \c 5MB.} + * than this value. This setting is ignored for LSM trees\, see \c + * chunk_size., an integer between 512B and 10TB; default \c 5MB.} * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\, * in bytes. If non-zero\, schedule writes for dirty blocks belonging * to this object in the system buffer cache after that many bytes from @@ -2133,14 +2136,19 @@ struct __wt_compressor { * set \c result_slotsp to the number of byte strings encoded and * \c result_lenp to the bytes needed for the encoded representation. * - * WiredTiger repeatedly calls the callback function until all rows on - * the page have been encoded. There is no requirement the callback - * encode any or all of the byte strings passed by WiredTiger. If the - * callback does not encode any of the byte strings, the callback must - * set \c result_slotsp to 0. In this case, WiredTiger will accumulate - * more rows and repeat the call; if there are no more rows to - * accumulate, WiredTiger writes the remaining rows without further - * calls to the callback. + * There is no requirement the callback encode any or all of the byte + * strings passed by WiredTiger. If the callback does not encode any + * of the byte strings and compression should not be retried, the + * callback should set \c result_slotsp to 0. 
+ * + * If the callback does not encode any of the byte strings and + * compression should be retried with additional byte strings, the + * callback must return \c EAGAIN. In that case, WiredTiger will + * accumulate more rows and repeat the call. + * + * If there are no more rows to accumulate or the callback indicates + * that it cannot be retried, WiredTiger writes the remaining rows + * using \c WT_COMPRESSOR::compress. * * On entry, \c final is zero if there are more rows to be written as * part of this page (if there will be additional data provided to the @@ -2580,42 +2588,46 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_SYNC 1063 /*! log: log write operations */ #define WT_STAT_CONN_LOG_WRITES 1064 +/*! sleep for LSM checkpoint throttle */ +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1065 +/*! sleep for LSM merge throttle */ +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1066 /*! rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1065 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1067 /*! memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1066 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1068 /*! memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1067 +#define WT_STAT_CONN_MEMORY_FREE 1069 /*! memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1068 +#define WT_STAT_CONN_MEMORY_GROW 1070 /*! total read I/Os */ -#define WT_STAT_CONN_READ_IO 1069 +#define WT_STAT_CONN_READ_IO 1071 /*! page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1070 +#define WT_STAT_CONN_REC_PAGES 1072 /*! page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1071 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1073 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1072 +#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1074 /*! 
pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1073 +#define WT_STAT_CONN_RWLOCK_READ 1075 /*! pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1074 +#define WT_STAT_CONN_RWLOCK_WRITE 1076 /*! open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1075 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1077 /*! transactions */ -#define WT_STAT_CONN_TXN_BEGIN 1076 +#define WT_STAT_CONN_TXN_BEGIN 1078 /*! transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1077 +#define WT_STAT_CONN_TXN_CHECKPOINT 1079 /*! transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1078 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1080 /*! transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1079 +#define WT_STAT_CONN_TXN_COMMIT 1081 /*! transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1080 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1082 /*! transactions rolled-back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1081 +#define WT_STAT_CONN_TXN_ROLLBACK 1083 /*! total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1082 +#define WT_STAT_CONN_WRITE_IO 1084 /*! * @} @@ -2759,43 +2771,47 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CURSOR_UPDATE 2066 /*! cursor-update value bytes updated */ #define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2067 +/*! sleep for LSM checkpoint throttle */ +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2068 /*! chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2068 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2069 /*! highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2069 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2070 /*! queries that could have benefited from a Bloom filter that did not * exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2070 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2071 +/*! 
sleep for LSM merge throttle */ +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2072 /*! reconciliation dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2071 +#define WT_STAT_DSRC_REC_DICTIONARY 2073 /*! reconciliation internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2072 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074 /*! reconciliation leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2073 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075 /*! reconciliation overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2074 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076 /*! reconciliation pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2075 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2077 /*! reconciliation pages merged */ -#define WT_STAT_DSRC_REC_PAGE_MERGE 2076 +#define WT_STAT_DSRC_REC_PAGE_MERGE 2078 /*! page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2077 +#define WT_STAT_DSRC_REC_PAGES 2079 /*! page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2078 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2079 +#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2081 /*! reconciliation internal pages split */ -#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2080 +#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2082 /*! reconciliation leaf pages split */ -#define WT_STAT_DSRC_REC_SPLIT_LEAF 2081 +#define WT_STAT_DSRC_REC_SPLIT_LEAF 2083 /*! reconciliation maximum splits for a page */ -#define WT_STAT_DSRC_REC_SPLIT_MAX 2082 +#define WT_STAT_DSRC_REC_SPLIT_MAX 2084 /*! object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2083 +#define WT_STAT_DSRC_SESSION_COMPACT 2085 /*! open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2086 /*! 
update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2087 /*! @} */ /* * Statistics section: END diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 618257469ee..c50380b91b9 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1115,9 +1115,18 @@ __clsm_put(WT_SESSION_IMPL *session, * don't worry about protecting access. */ if (++clsm->primary_chunk->count % 100 == 0 && - lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) + lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) { + WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, + lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle); + WT_STAT_FAST_CONN_INCRV(session, + lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle); + WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, + lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle); + WT_STAT_FAST_CONN_INCRV(session, + lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle); __wt_sleep(0, lsm_tree->ckpt_throttle + lsm_tree->merge_throttle); + } /* * In LSM there are multiple btrees active at one time. The tree diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 3aec49da252..a830295908f 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -407,6 +407,10 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * Set up the config for each chunk. If possible, avoid high latencies * from fsync by flushing the cache every 8MB (will be overridden by * any application setting). + * + * Also make the memory_page_max double the chunk size, so application + * threads don't immediately try to force evict the chunk when the + * worker thread clears the NO_EVICTION flag. 
*/ tmpconfig = ""; #ifdef HAVE_SYNC_FILE_RANGE @@ -415,7 +419,8 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, #endif WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, - "%s%s,key_format=u,value_format=u", tmpconfig, config)); + "%s%s,key_format=u,value_format=u,memory_page_max=%" PRIu64, + tmpconfig, config, 2 * lsm_tree->chunk_max)); lsm_tree->file_config = __wt_buf_steal(session, buf); /* Create the first chunk and flush the metadata. */ diff --git a/src/support/stat.c b/src/support/stat.c index 621c79220a4..c0caecbe606 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -93,11 +93,14 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) stats->cursor_search_near.desc = "cursor search near calls"; stats->cursor_update.desc = "cursor update calls"; stats->cursor_update_bytes.desc = "cursor-update value bytes updated"; + stats->lsm_checkpoint_throttle.desc = + "sleep for LSM checkpoint throttle"; stats->lsm_chunk_count.desc = "chunks in the LSM tree"; stats->lsm_generation_max.desc = "highest merge generation in the LSM tree"; stats->lsm_lookup_no_bloom.desc = "queries that could have benefited from a Bloom filter that did not exist"; + stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle"; stats->rec_dictionary.desc = "reconciliation dictionary matches"; stats->rec_overflow_key_internal.desc = "reconciliation internal-page overflow keys"; @@ -194,9 +197,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg) stats->cursor_search_near.v = 0; stats->cursor_update.v = 0; stats->cursor_update_bytes.v = 0; + stats->lsm_checkpoint_throttle.v = 0; stats->lsm_chunk_count.v = 0; stats->lsm_generation_max.v = 0; stats->lsm_lookup_no_bloom.v = 0; + stats->lsm_merge_throttle.v = 0; stats->rec_dictionary.v = 0; stats->rec_overflow_key_internal.v = 0; stats->rec_overflow_key_leaf.v = 0; @@ -280,9 +285,11 @@ __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent) p->cursor_search_near.v += c->cursor_search_near.v; 
p->cursor_update.v += c->cursor_update.v; p->cursor_update_bytes.v += c->cursor_update_bytes.v; + p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v; if (c->lsm_generation_max.v > p->lsm_generation_max.v) p->lsm_generation_max.v = c->lsm_generation_max.v; p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v; + p->lsm_merge_throttle.v += c->lsm_merge_throttle.v; p->rec_dictionary.v += c->rec_dictionary.v; p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v; p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v; @@ -389,6 +396,9 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "log: consolidated slot join transitions"; stats->log_sync.desc = "log: log sync operations"; stats->log_writes.desc = "log: log write operations"; + stats->lsm_checkpoint_throttle.desc = + "sleep for LSM checkpoint throttle"; + stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle"; stats->lsm_rows_merged.desc = "rows merged in an LSM tree"; stats->memory_allocation.desc = "memory allocations"; stats->memory_free.desc = "memory frees"; @@ -479,6 +489,8 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_slot_transitions.v = 0; stats->log_sync.v = 0; stats->log_writes.v = 0; + stats->lsm_checkpoint_throttle.v = 0; + stats->lsm_merge_throttle.v = 0; stats->lsm_rows_merged.v = 0; stats->memory_allocation.v = 0; stats->memory_free.v = 0; diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 8cafc78c11f..f4cd3a94a15 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -24,7 +24,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) /* * This function exists as a place for this comment: named checkpoints - * are only supported on file objects, and not on LSM trees or Memrata + * are only supported on file objects, and not on LSM trees or Helium * devices. 
If a target list is configured for the checkpoint, this * function is called with each target list entry; check the entry to * make sure it's backed by a file. If no target list is configured, @@ -148,11 +148,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) WT_DATA_SOURCE *dsrc; /* - * A place-holder, to support Memrata devices: we assume calling the + * A place-holder, to support Helium devices: we assume calling the * underlying data-source session checkpoint function is sufficient to * checkpoint all objects in the data source, open or closed, and we * don't attempt to optimize the checkpoint of individual targets. - * Those assumptions is correct for the Memrata device, but it's not + * Those assumptions are correct for the Helium device, but it's not * necessarily going to be true for other data sources. * * It's not difficult to support data-source checkpoints of individual |