diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-08-07 15:12:18 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-08-07 15:12:18 +1000 |
commit | b8602c086ff469967bedc82b14d63d4a236d092c (patch) | |
tree | 8f732b2d259b59b01a891979f6a255e2c28f677c /src | |
parent | f659b76958858056cdb7558032cb70bdd53c57b3 (diff) | |
download | mongo-b8602c086ff469967bedc82b14d63d4a236d092c.tar.gz |
Import wiredtiger: 19fd4ed45b3a2a8c119b68015113630b0c060e5f from branch mongodb-4.4
ref: c29f4c6030..19fd4ed45b
for: 4.3.1
WT-4460 Optimize for in-order, non-overlapping modifications
WT-4956 Handle the case where 4 billion updates are made to a page without eviction
Diffstat (limited to 'src')
17 files changed, 404 insertions, 189 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 6ce1ad16a5f..c752a93e7b0 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1325,6 +1325,7 @@ unmodify unordered unpackv unpadded +unreconciled unreferenced unregister unsized diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 882ae91c078..8bc6565102b 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "c29f4c6030e37794b8c2c1195829ba2d14954677", + "commit": "19fd4ed45b3a2a8c119b68015113630b0c060e5f", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.4" diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 4d94bcdb23e..d045405f85a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1506,8 +1506,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) || !F_ISSET(cursor, WT_CURSTD_VALUE_INT)) WT_ERR(__wt_btcur_search(cbt)); + + WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries)); + orig = cursor->value.size; - WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries)); + WT_ERR(__wt_modify_apply(cursor, modify->data)); new = cursor->value.size; WT_ERR(__cursor_size_chk(session, &cursor->value)); @@ -1527,8 +1530,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) F_CLR(cursor, WT_CURSTD_OVERWRITE); if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt)) ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD); - else if ((ret = - __wt_modify_pack(session, &modify, entries, nentries)) == 0) + else ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY); if (overwrite) F_SET(cursor, WT_CURSTD_OVERWRITE); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 685fb983718..2eb71967799 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -889,7 +889,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) if (split_gen != 0) WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen)); if (mod != NULL) - WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen)); + WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state)); WT_RET(ds->f(ds, ", memory-size %" WT_SIZET_FMT, page->memory_footprint)); WT_RET(ds->f(ds, "\n")); diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 52277efb85d..d41f76c6442 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -250,7 +250,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, * updates. */ while (i > 0) - WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data)); + WT_ERR(__wt_modify_apply(cursor, listp[--i]->data)); err: if (allocated_bytes != 0) __wt_free(session, listp); diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c index 073df6eaaf6..22d067ef90e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_std.c +++ b/src/third_party/wiredtiger/src/cursor/cur_std.c @@ -931,7 +931,7 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) /* Get the current value, apply the modifications. */ WT_ERR(cursor->search(cursor)); - WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries)); + WT_ERR(__wt_modify_apply_api(cursor, entries, nentries)); /* We know both key and value are set, "overwrite" doesn't matter. */ ret = cursor->update(cursor); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index a7c289a7b7f..03643f473e1 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -489,10 +489,21 @@ struct __wt_page_modify { WT_SPINLOCK page_lock; /* Page's spinlock */ /* - * The write generation is incremented when a page is modified, a page - * is clean if the write generation is 0. + * The page state is incremented when a page is modified. + * + * WT_PAGE_CLEAN -- + * The page is clean. + * WT_PAGE_DIRTY_FIRST -- + * The page is in this state after the first operation that marks a + * page dirty, or when reconciliation is checking to see if it has + * done enough work to be able to mark the page clean. + * WT_PAGE_DIRTY -- + * Two or more updates have been added to the page. */ - uint32_t write_gen; +#define WT_PAGE_CLEAN 0 +#define WT_PAGE_DIRTY_FIRST 1 +#define WT_PAGE_DIRTY 2 + uint32_t page_state; #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index d0679a9fb38..3fa5d60f1f1 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page) static inline bool __wt_page_evict_clean(WT_PAGE *page) { - return (page->modify == NULL || (page->modify->write_gen == 0 && + return (page->modify == NULL || + (page->modify->page_state == WT_PAGE_CLEAN && page->modify->rec_result == 0)); } @@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page) static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0); + return (page->modify != NULL && + page->modify->page_state != WT_PAGE_CLEAN); } /* @@ -496,19 +498,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)); last_running = 0; - if (page->modify->write_gen == 0) + if (page->modify->page_state == WT_PAGE_CLEAN) last_running = S2C(session)->txn_global.last_running; /* - * We depend on atomic-add being a write barrier, that is, a barrier to - * ensure all changes to the page are flushed before updating the page - * write generation and/or marking the tree dirty, otherwise checkpoints + * We depend on the atomic operation being a write barrier, that is, a + * barrier to ensure all changes to the page are flushed before updating + * the page state and/or marking the tree dirty, otherwise checkpoints * and/or page reconciliation might be looking at a clean page/tree. * * Every time the page transitions from clean to dirty, update the cache * and transactional information. + * + * The page state can only ever be incremented above dirty by the number + * of concurrently running threads, so the counter will never approach + * the point where it would wrap. */ - if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) { + if (page->modify->page_state < WT_PAGE_DIRTY && + __wt_atomic_add32(&page->modify->page_state, 1) == + WT_PAGE_DIRTY_FIRST) { __wt_cache_dirty_incr(session, page); /* @@ -579,7 +587,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) * Allow the call to be made on clean pages. */ if (__wt_page_is_modified(page)) { - page->modify->write_gen = 0; + /* + * The only part where ordering matters is during + * reconciliation where updates on other threads are performing + * writes to the page state that need to be visible to the + * reconciliation thread. + * + * Since clearing of the page state is not going to be happening + * during reconciliation on a separate thread, there's no write + * barrier needed here. + */ + page->modify->page_state = WT_PAGE_CLEAN; __wt_cache_dirty_decr(session, page); } } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 80046127d3f..843cf562c66 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -503,9 +503,9 @@ extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char extern int __wt_metadata_set_base_write_gen(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_turtle_rewrite(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_apply_api(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_apply(WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index be6440c27bc..c3c46ec11c5 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -20,12 +20,6 @@ struct __wt_reconcile { uint32_t flags; /* Caller's configuration */ /* - * Track start/stop write generation to decide if all changes to the - * page are written. - */ - uint32_t orig_write_gen; - - /* * Track start/stop checkpoint generations to decide if lookaside table * records are correct. */ diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 1c67a84adbf..701f73df84f 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -7,29 +7,6 @@ */ /* - * __page_write_gen_wrapped_check -- - * Confirm the page's write generation number won't wrap. - */ -static inline int -__page_write_gen_wrapped_check(WT_PAGE *page) -{ - /* - * Check to see if the page's write generation is about to wrap (wildly - * unlikely as it implies 4B updates between clean page reconciliations, - * but technically possible), and fail the update. - * - * The check is outside of the serialization mutex because the page's - * write generation is going to be a hot cache line, so technically it's - * possible for the page's write generation to wrap between the test and - * our subsequent modification of it. However, the test is (4B-1M), and - * there cannot be a million threads that have done the test but not yet - * completed their modification. - */ - return (page->modify->write_gen > - UINT32_MAX - WT_MILLION ? WT_RESTART : 0); -} - -/* * __insert_simple_func -- * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ @@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * Acquire the page's spinlock unless we already have exclusive access. * Then call the worker function. @@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, new_ins = *new_insp; *new_insp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - simple = true; for (i = 0; i < skipdepth; i++) if (new_ins->next[i] == NULL) @@ -272,9 +243,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, upd = *updp; *updp = NULL; - /* Check for page write generation wrap. */ - WT_RET(__page_write_gen_wrapped_check(page)); - /* * All structure setup must be flushed before the structure is entered * into the list. We need a write barrier here, our callers depend on diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index ce58f9f7301..e2141518e3a 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -844,6 +844,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) return (WT_VISIBLE_TRUE); } + /* * __wt_txn_upd_durable -- * Can the current transaction make the given update durable. diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f7eeada4c8e..477894bcf14 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -456,14 +456,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. + * We set the page state to mark it as having been dirtied for + * the first time prior to reconciliation. A failed atomic cas + * indicates that an update has taken place during + * reconciliation. * - * If the write generation changed, the page has been written - * since reconciliation started and remains dirty (that can't - * happen when evicting, the page is exclusively locked). + * The page only might be clean; if the page state is unchanged + * since reconciliation started, it's clean. + * + * If the page state changed, the page has been written since + * reconciliation started and remains dirty (that can't happen + * when evicting, the page is exclusively locked). */ - if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + if (__wt_atomic_cas32( + &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN)) __wt_cache_dirty_decr(session, page); else WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); @@ -602,13 +608,22 @@ __rec_init(WT_SESSION_IMPL *session, r->page = page; /* - * Save the page's write generation before reading the page. * Save the transaction generations before reading the page. * These are all ordered reads, but we only need one. */ r->orig_btree_checkpoint_gen = btree->checkpoint_gen; r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT); - WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Update the page state to indicate that all currently installed + * updates will be included in this reconciliation if it would mark the + * page clean. + * + * Add a write barrier to make it more likely that a thread adding an + * update will see this state change. + */ + page->modify->page_state = WT_PAGE_DIRTY_FIRST; + WT_FULL_BARRIER(); /* * Cache the oldest running transaction ID. This is used to check diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c index be1b1970da6..e8260cb41b6 100644 --- a/src/third_party/wiredtiger/src/support/modify.c +++ b/src/third_party/wiredtiger/src/support/modify.c @@ -8,28 +8,60 @@ #include "wt_internal.h" +#define WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) do { \ + const size_t *__p = p; \ + const uint8_t *__data = \ + (const uint8_t *)(__p + (size_t)(nentries) * 3); \ + int __i; \ + for (__i = 0; __i < (nentries); ++__i) { \ + memcpy(&(mod).data.size, __p++, sizeof(size_t)); \ + memcpy(&(mod).offset, __p++, sizeof(size_t)); \ + memcpy(&(mod).size, __p++, sizeof(size_t)); \ + (mod).data.data = __data; \ + __data += (mod).data.size; \ + if (__i < (napplied)) \ + continue; + +#define WT_MODIFY_FOREACH_REVERSE(mod, p, nentries, napplied, datasz) do {\ + const size_t *__p = (p) + (size_t)(nentries) * 3; \ + const uint8_t *__data = (const uint8_t *)__p + datasz; \ + int __i; \ + for (__i = (napplied); __i < (nentries); ++__i) { \ + memcpy(&(mod).size, --__p, sizeof(size_t)); \ + memcpy(&(mod).offset, --__p, sizeof(size_t)); \ + memcpy(&(mod).data.size, --__p, sizeof(size_t)); \ + (mod).data.data = (__data -= (mod).data.size); + +#define WT_MODIFY_FOREACH_END \ + } \ +} while (0) + /* * __wt_modify_pack -- * Pack a modify structure into a buffer. */ int -__wt_modify_pack(WT_SESSION_IMPL *session, +__wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) { WT_ITEM *modify; - size_t len, *p; + WT_SESSION_IMPL *session; + size_t diffsz, len, *p; uint8_t *data; int i; + session = (WT_SESSION_IMPL *)cursor->session; + /* * Build the in-memory modify value. It's the entries count, followed * by the modify structure offsets written in order, followed by the * data (data at the end to minimize unaligned reads/writes). */ len = sizeof(size_t); /* nentries */ - for (i = 0; i < nentries; ++i) { + for (i = 0, diffsz = 0; i < nentries; ++i) { len += 3 * sizeof(size_t); /* WT_MODIFY fields */ len += entries[i].data.size; /* data */ + diffsz += entries[i].size; /* bytes touched */ } WT_RET(__wt_scr_alloc(session, len, &modify)); @@ -48,6 +80,18 @@ __wt_modify_pack(WT_SESSION_IMPL *session, } modify->size = WT_PTRDIFF(data, modify->data); *modifyp = modify; + + /* + * Update statistics. This is the common path called by + * WT_CURSOR::modify implementations. + */ + WT_STAT_CONN_INCR(session, cursor_modify); + WT_STAT_DATA_INCR(session, cursor_modify); + WT_STAT_CONN_INCRV(session, cursor_modify_bytes, cursor->value.size); + WT_STAT_DATA_INCRV(session, cursor_modify_bytes, cursor->value.size); + WT_STAT_CONN_INCRV(session, cursor_modify_bytes_touch, diffsz); + WT_STAT_DATA_INCRV(session, cursor_modify_bytes_touch, diffsz); + return (0); } @@ -56,51 +100,46 @@ __wt_modify_pack(WT_SESSION_IMPL *session, * Apply a single modify structure change to the buffer. */ static int -__modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, - size_t data_size, size_t offset, size_t size, const uint8_t *data) +__modify_apply_one( + WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify, bool sformat) { - WT_ITEM *value; - size_t len; - uint8_t *from, *to; - bool sformat; + size_t data_size, item_offset, offset, size; + const uint8_t *data, *from; + uint8_t *to; - value = &cursor->value; - sformat = cursor->value_format[0] == 'S'; + data = modify->data.data; + data_size = modify->data.size; + offset = modify->offset; + size = modify->size; /* * Grow the buffer to the maximum size we'll need. This is pessimistic * because it ignores replacement bytes, but it's a simpler calculation. * - * Grow the buffer before we fast-path the expected case. This function - * is often called using a cursor buffer referencing on-page memory and - * it's easy to overwrite a page. A side-effect of growing the buffer is - * to ensure the buffer's value is in buffer-local memory. + * Grow the buffer first. This function is often called using a cursor + * buffer referencing on-page memory and it's easy to overwrite a page. + * A side-effect of growing the buffer is to ensure the buffer's value + * is in buffer-local memory. * * Because the buffer may reference an overflow item, the data may not * start at the start of the buffer's memory and we have to correct for * that. */ - len = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0; - WT_RET(__wt_buf_grow(session, value, - len + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0))); + item_offset = + WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0; + WT_RET(__wt_buf_grow(session, value, item_offset + + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0))); /* - * Fast-path the expected case, where we're overwriting a set of bytes + * Fast-path the common case, where we're overwriting a set of bytes * that already exist in the buffer. */ if (value->size > offset + data_size && data_size == size) { - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); return (0); } /* - * Decrement the size to discard the trailing nul (done after growing - * the buffer to ensure it can be restored without further checking). - */ - if (sformat) - --value->size; - - /* * If appending bytes past the end of the value, initialize gap bytes * and copy the new bytes into place. */ @@ -108,12 +147,8 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, if (value->size < offset) memset((uint8_t *)value->data + value->size, sformat ? ' ' : 0, offset - value->size); - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); value->size = offset + data_size; - - /* Restore the trailing nul. */ - if (sformat) - ((char *)value->data)[value->size++] = '\0'; return (0); } @@ -125,9 +160,12 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, if (value->size < offset + size) size = value->size - offset; + WT_ASSERT(session, value->size + (data_size - size) + + (sformat ? 1 : 0) <= value->memsize); + if (data_size == size) { /* Overwrite */ /* Copy in the new data. */ - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); /* * The new data must overlap the buffer's end (else, we'd use @@ -137,7 +175,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, value->size = offset + data_size; } else { /* Shrink or grow */ /* Move trailing data forward/backward to its new location. */ - from = (uint8_t *)value->data + (offset + size); + from = (const uint8_t *)value->data + (offset + size); WT_ASSERT(session, WT_DATA_IN_ITEM(value) && from + (value->size - (offset + size)) <= (uint8_t *)value->mem + value->memsize); @@ -148,7 +186,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, memmove(to, from, value->size - (offset + size)); /* Copy in the new data. */ - memmove((uint8_t *)value->data + offset, data, data_size); + memcpy((uint8_t *)value->data + offset, data, data_size); /* * Correct the size. This works because of how the C standard @@ -165,49 +203,134 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor, value->size += (data_size - size); } - /* Restore the trailing nul. */ - if (sformat) - ((char *)value->data)[value->size++] = '\0'; - return (0); } /* - * __wt_modify_apply_api -- - * Apply a single set of WT_MODIFY changes to a buffer, the cursor API - * interface. + * __modify_fast_path -- + * Process a set of modifications, applying any that can be made in place, + * and check if the remaining ones are sorted and non-overlapping. */ -int -__wt_modify_apply_api(WT_SESSION_IMPL *session, - WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +static void +__modify_fast_path( + WT_ITEM *value, const size_t *p, int nentries, + int *nappliedp, bool *overlapp, size_t *dataszp, size_t *destszp) { - size_t modified; - int i; + WT_MODIFY current, prev; + size_t datasz, destoff; + bool fastpath, first; + + *overlapp = true; + + datasz = destoff = 0; + WT_CLEAR(current); + WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */ - for (modified = 0, i = 0; i < nentries; ++i) { - modified += entries[i].size; - WT_RET(__modify_apply_one(session, cursor, entries[i].data.size, - entries[i].offset, entries[i].size, entries[i].data.data)); - } /* - * This API is used by some external test functions with a NULL - * session pointer - they don't expect statistics to be incremented. + * If the modifications are sorted and don't overlap in the old or new + * values, we can do a fast application of all the modifications + * modifications in a single pass. + * + * The requirement for ordering is unfortunate, but modifications are + * performed in order, and applications specify byte offsets based on + * that. In other words, byte offsets are cumulative, modifications + * that shrink or grow the data affect subsequent modification's byte + * offsets. */ - if (session != NULL) { - WT_STAT_CONN_INCR(session, cursor_modify); - WT_STAT_DATA_INCR(session, cursor_modify); - WT_STAT_CONN_INCRV(session, - cursor_modify_bytes, cursor->value.size); - WT_STAT_DATA_INCRV(session, - cursor_modify_bytes, cursor->value.size); - WT_STAT_CONN_INCRV(session, - cursor_modify_bytes_touch, modified); - WT_STAT_DATA_INCRV(session, - cursor_modify_bytes_touch, modified); - } + fastpath = first = true; + *nappliedp = 0; + WT_MODIFY_FOREACH_BEGIN(current, p, nentries, 0) { + datasz += current.data.size; - return (0); + if (fastpath && current.data.size == current.size && + current.offset + current.size <= value->size) { + memcpy((uint8_t *)value->data + current.offset, + current.data.data, current.data.size); + ++(*nappliedp); + continue; + } + fastpath = false; + + /* Step over the bytes before the current block. */ + if (first) + destoff = current.offset; + else { + /* Check that entries are sorted and non-overlapping. */ + if (current.offset < prev.offset + prev.size || + current.offset < prev.offset + prev.data.size) + return; + destoff += current.offset - (prev.offset + prev.size); + } + + /* + * If the source is past the end of the current value, we have + * to deal with padding bytes. Don't try to fast-path padding + * bytes; it's not common and adds branches to the loop + * applying the changes. + */ + if (current.offset + current.size > value->size) + return; + + /* + * If copying this block overlaps with the next one, we can't + * build the value in reverse order. + */ + if (current.size != current.data.size && + current.offset + current.size > destoff) + return; + + /* Step over the current modification. */ + destoff += current.data.size; + + prev = current; + first = false; + } WT_MODIFY_FOREACH_END; + + /* Step over the final unmodified block. */ + destoff += value->size - (current.offset + current.size); + + *overlapp = false; + *dataszp = datasz; + *destszp = destoff; + return; +} + +/* + * __modify_apply_no_overlap -- + * Apply a single set of WT_MODIFY changes to a buffer, where the changes + * are in sorted order and none of the changes overlap. + */ +static void +__modify_apply_no_overlap(WT_SESSION_IMPL *session, WT_ITEM *value, + const size_t *p, int nentries, int napplied, size_t datasz, size_t destsz) +{ + WT_MODIFY current; + size_t sz; + const uint8_t *from; + uint8_t *to; + + from = (const uint8_t *)value->data + value->size; + to = (uint8_t *)value->data + destsz; + WT_MODIFY_FOREACH_REVERSE(current, p, nentries, napplied, datasz) { + /* Move the current unmodified block into place if necessary. */ + sz = WT_PTRDIFF(to, value->data) - + (current.offset + current.data.size); + from -= sz; + to -= sz; + WT_ASSERT(session, from >= (const uint8_t *)value->data && + to >= (uint8_t *)value->data); + WT_ASSERT(session, + from + sz <= (const uint8_t *)value->data + value->size); + + if (to != from) + memmove(to, from, sz); + + from -= current.size; + to -= current.data.size; + memcpy(to, current.data.data, current.data.size); + } WT_MODIFY_FOREACH_END; + + value->size = destsz; } /* @@ -215,31 +338,91 @@ __wt_modify_apply_api(WT_SESSION_IMPL *session, * Apply a single set of WT_MODIFY changes to a buffer. */ int -__wt_modify_apply( - WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) +__wt_modify_apply(WT_CURSOR *cursor, const void *modify) { - size_t data_size, nentries, offset, size; + WT_ITEM *value; + WT_MODIFY mod; + WT_SESSION_IMPL *session; + size_t datasz, destsz, item_offset, tmp; const size_t *p; - const uint8_t *data; + int napplied, nentries; + bool overlap, sformat; + + session = (WT_SESSION_IMPL *)cursor->session; + sformat = cursor->value_format[0] == 'S'; + value = &cursor->value; /* - * Get the number of entries, and set a second pointer to reference the - * change data. The modify string isn't necessarily aligned for size_t - * access, copy to be sure. + * Get the number of modify entries and set a second pointer to + * reference the replacement data. */ p = modify; - memcpy(&nentries, p++, sizeof(size_t)); - data = (uint8_t *)modify + - sizeof(size_t) + (nentries * 3 * sizeof(size_t)); - - /* Step through the list of entries, applying them in order. */ - for (; nentries-- > 0; data += data_size) { - memcpy(&data_size, p++, sizeof(size_t)); - memcpy(&offset, p++, sizeof(size_t)); - memcpy(&size, p++, sizeof(size_t)); - WT_RET(__modify_apply_one( - session, cursor, data_size, offset, size, data)); + memcpy(&tmp, p++, sizeof(size_t)); + nentries = (int)tmp; + + /* + * Grow the buffer first. This function is often called using a cursor + * buffer referencing on-page memory and it's easy to overwrite a page. + * A side-effect of growing the buffer is to ensure the buffer's value + * is in buffer-local memory. + * + * Because the buffer may reference an overflow item, the data may not + * start at the start of the buffer's memory and we have to correct for + * that. + */ + item_offset = WT_DATA_IN_ITEM(value) ? + WT_PTRDIFF(value->data, value->mem) : 0; + WT_RET(__wt_buf_grow(session, value, item_offset + value->size)); + + /* + * Decrement the size to discard the trailing nul (done after growing + * the buffer to ensure it can be restored without further checking). + */ + if (sformat) + --value->size; + + __modify_fast_path( + value, p, nentries, &napplied, &overlap, &datasz, &destsz); + + if (napplied == nentries) + goto done; + + if (!overlap) { + /* Grow the buffer first, correcting for the data offset. */ + WT_RET(__wt_buf_grow(session, value, item_offset + + WT_MAX(destsz, value->size) + (sformat ? 1 : 0))); + + __modify_apply_no_overlap( + session, value, p, nentries, napplied, datasz, destsz); + goto done; } + WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) { + WT_RET(__modify_apply_one(session, value, &mod, sformat)); + } WT_MODIFY_FOREACH_END; + +done: /* Restore the trailing nul. */ + if (sformat) + ((char *)value->data)[value->size++] = '\0'; + return (0); } + +/* + * __wt_modify_apply_api -- + * Apply a single set of WT_MODIFY changes to a buffer, the cursor API + * interface. + */ +int +__wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + WT_DECL_ITEM(modify); + WT_DECL_RET; + + WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries)); + WT_ERR(__wt_modify_apply(cursor, modify->data)); + +err: __wt_scr_free((WT_SESSION_IMPL *)cursor->session, &modify); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index a5e3e139178..504b2c0e8b4 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -155,7 +155,7 @@ __txn_op_apply( * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ - WT_ERR(__wt_modify_apply(session, cursor, value.data)); + WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; @@ -222,7 +222,7 @@ __txn_op_apply( * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ - WT_ERR(__wt_modify_apply(session, cursor, value.data)); + WT_ERR(__wt_modify_apply(cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; diff --git a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c index 879a8e96c6a..5a413c0df3b 100644 --- a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c @@ -37,7 +37,7 @@ #define DATASIZE 1024 #define MAX_MODIFY_ENTRIES 37 /* Maximum modify vectors */ -static WT_MODIFY entries[1000]; /* Entries vector */ +static WT_MODIFY entries[MAX_MODIFY_ENTRIES]; /* Entries vector */ static int nentries; /* Entries count */ /* @@ -50,7 +50,6 @@ static char modify_repl[MAX_REPL_BYTES * 2]; /* Replacement bytes */ static WT_RAND_STATE rnd; /* RNG state */ -#if DEBUG /* * show -- * Dump out a buffer. @@ -62,15 +61,10 @@ show(WT_ITEM *buf, const char *tag) const uint8_t *a; fprintf(stderr, "%s: %" WT_SIZET_FMT " bytes\n\t", tag, buf->size); - for (a = buf->data, i = 0; i < buf->size; ++i, ++a) { - if (isprint(*a)) - fprintf(stderr, " %c", *a); - else - fprintf(stderr, " %#x", *a); - } + for (a = buf->data, i = 0; i < buf->size; ++i, ++a) + fprintf(stderr, " %c", isprint(*a) ? *a : '.'); fprintf(stderr, "\n"); } -#endif /* * modify_repl_init -- @@ -82,7 +76,7 @@ modify_repl_init(void) size_t i; for (i = 0; i < sizeof(modify_repl); ++i) - modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26]; + modify_repl[i] = 'Z' - (i % 26); } /* @@ -95,13 +89,13 @@ modify_build(void) int i; /* Mess up the entries. */ - memset(entries, 0xff, MAX_MODIFY_ENTRIES * sizeof(entries[0])); + memset(entries, 0xff, sizeof(entries)); /* * Randomly select a number of byte changes, offsets and lengths. * Allow a value of 0, the API should accept it. */ - nentries = (int)(__wt_random(&rnd) % MAX_MODIFY_ENTRIES); + nentries = (int)(__wt_random(&rnd) % (MAX_MODIFY_ENTRIES + 1)); for (i = 0; i < nentries; ++i) { entries[i].data.data = modify_repl + __wt_random(&rnd) % MAX_REPL_BYTES; @@ -115,7 +109,7 @@ modify_build(void) printf( "%d: {%.*s} %" WT_SIZET_FMT " bytes replacing %" WT_SIZET_FMT " bytes @ %" WT_SIZET_FMT "\n", - i, (int)entries[i].data.size, entries[i].data.data, + i, (int)entries[i].data.size, (char *)entries[i].data.data, entries[i].data.size, entries[i].size, entries[i].offset); #endif } @@ -217,16 +211,29 @@ slow_apply_api(WT_ITEM *orig) * Compare two results. */ static void -compare(WT_ITEM *local, WT_ITEM *library) +compare(WT_ITEM *orig, WT_ITEM *local, WT_ITEM *library) { -#if DEBUG + size_t i, max; + const uint8_t *p, *t; + + max = WT_MIN(local->size, library->size); if (local->size != library->size || memcmp(local->data, library->data, local->size) != 0) { - fprintf(stderr, "results differ\n"); + for (i = 0, + p = local->data, t = library->data; i < max; ++i, ++p, ++t) + if (*p != *t) + break; + fprintf(stderr, "results differ: "); + if (max == 0) + fprintf(stderr, + "identical up to %" WT_SIZET_FMT " bytes\n", max); + else + fprintf(stderr, + "first mismatch at offset %" WT_SIZET_FMT "\n", i); + show(orig, "original"); show(local, "local results"); show(library, "library results"); } -#endif testutil_assert( local->size == library->size && memcmp( local->data, library->data, local->size) == 0); @@ -250,16 +257,22 @@ compare(WT_ITEM *local, WT_ITEM *library) * calculate-modify API. */ static void -modify_run(bool verbose) +modify_run(TEST_OPTS *opts) { WT_CURSOR *cursor, _cursor; WT_DECL_RET; WT_ITEM *localA, _localA, *localB, _localB; + WT_SESSION_IMPL *session; size_t len; int i, j; + u_char *p; + bool verbose; + + session = (WT_SESSION_IMPL *)opts->session; + verbose = opts->verbose; /* Initialize the RNG. */ - __wt_random_init_seed(NULL, &rnd); + __wt_random_init_seed(session, &rnd); /* Set up replacement information. */ modify_repl_init(); @@ -271,18 +284,24 @@ modify_run(bool verbose) memset(&_localB, 0, sizeof(_localB)); cursor = &_cursor; memset(&_cursor, 0, sizeof(_cursor)); + cursor->session = (WT_SESSION *)session; cursor->value_format = "u"; #define NRUNS 10000 for (i = 0; i < NRUNS; ++i) { /* Create an initial value. */ len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES); - testutil_check(__wt_buf_set(NULL, localA, modify_repl, len)); + testutil_check(__wt_buf_set(session, localA, modify_repl, len)); for (j = 0; j < 1000; ++j) { + /* Make lower case so modifications are easy to see. */ + for (p = localA->mem; + WT_PTRDIFF(p, localA->mem) < localA->size; p++) + *p = __wt_tolower(*p); + /* Copy the current value into the second item. */ testutil_check(__wt_buf_set( - NULL, localB, localA->data, localA->size)); + session, localB, localA->data, localA->size)); /* * Create a random set of modify vectors, run the @@ -291,12 +310,12 @@ modify_run(bool verbose) * of modify. */ modify_build(); - testutil_check(__wt_buf_set( - NULL, &cursor->value, localA->data, localA->size)); + testutil_check(__wt_buf_set(session, + &cursor->value, localA->data, localA->size)); testutil_check(__wt_modify_apply_api( - NULL, cursor, entries, nentries)); + cursor, entries, nentries)); slow_apply_api(localA); - compare(localA, &cursor->value); + compare(localB, localA, &cursor->value); /* * Call the WiredTiger function to build a modification @@ -305,18 +324,18 @@ modify_run(bool verbose) * against our implementation of modify. */ nentries = WT_ELEMENTS(entries); - ret = wiredtiger_calc_modify(NULL, + ret = wiredtiger_calc_modify(opts->session, localB, localA, WT_MAX(localB->size, localA->size) + 100, entries, &nentries); if (ret == WT_NOTFOUND) continue; testutil_check(ret); - testutil_check(__wt_buf_set( - NULL, &cursor->value, localB->data, localB->size)); + testutil_check(__wt_buf_set(session, + &cursor->value, localB->data, localB->size)); testutil_check(__wt_modify_apply_api( - NULL, cursor, entries, nentries)); - compare(localA, &cursor->value); + cursor, entries, nentries)); + compare(localB, localA, &cursor->value); } if (verbose) { printf("%d (%d%%)\r", i, (i * 100) / NRUNS); @@ -326,9 +345,9 @@ modify_run(bool verbose) if (verbose) printf("%d (100%%)\n", i); - __wt_buf_free(NULL, localA); - __wt_buf_free(NULL, localB); - __wt_buf_free(NULL, &cursor->value); + __wt_buf_free(session, localA); + __wt_buf_free(session, localB); + __wt_buf_free(session, &cursor->value); } int @@ -342,9 +361,11 @@ main(int argc, char *argv[]) testutil_make_work_dir(opts->home); testutil_check( wiredtiger_open(opts->home, NULL, "create", &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &opts->session)); /* Run the test. */ - modify_run(opts->verbose); + modify_run(opts); testutil_cleanup(opts); return (EXIT_SUCCESS); diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index 9442f1fdeb3..e425624ab5d 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -61,6 +61,7 @@ snap_track(TINFO *tinfo, thread_op op) snap->ts = WT_TS_NONE; snap->repeatable = false; snap->last = op == TRUNCATE ? tinfo->last : 0; + snap->ksize = snap->vsize = 0; if (op == INSERT && g.type == ROW) { ip = tinfo->key; |