summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2019-08-07 15:12:18 +1000
committerLuke Chen <luke.chen@mongodb.com>2019-08-07 15:12:18 +1000
commitb8602c086ff469967bedc82b14d63d4a236d092c (patch)
tree8f732b2d259b59b01a891979f6a255e2c28f677c /src
parentf659b76958858056cdb7558032cb70bdd53c57b3 (diff)
downloadmongo-b8602c086ff469967bedc82b14d63d4a236d092c.tar.gz
Import wiredtiger: 19fd4ed45b3a2a8c119b68015113630b0c060e5f from branch mongodb-4.4
ref: c29f4c6030..19fd4ed45b for: 4.3.1 WT-4460 Optimize for in-order, non-overlapping modifications WT-4956 Handle the case where 4 billion updates are made to a page without eviction
Diffstat (limited to 'src')
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok1
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ret.c2
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c2
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h17
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i34
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h6
-rw-r--r--src/third_party/wiredtiger/src/include/reconcile.h6
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i32
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i1
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c31
-rw-r--r--src/third_party/wiredtiger/src/support/modify.c355
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c4
-rw-r--r--src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c89
-rw-r--r--src/third_party/wiredtiger/test/format/snap.c1
17 files changed, 404 insertions, 189 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 6ce1ad16a5f..c752a93e7b0 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1325,6 +1325,7 @@ unmodify
unordered
unpackv
unpadded
+unreconciled
unreferenced
unregister
unsized
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 882ae91c078..8bc6565102b 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "c29f4c6030e37794b8c2c1195829ba2d14954677",
+ "commit": "19fd4ed45b3a2a8c119b68015113630b0c060e5f",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.4"
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 4d94bcdb23e..d045405f85a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -1506,8 +1506,11 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
if (!F_ISSET(cursor, WT_CURSTD_KEY_INT) ||
!F_ISSET(cursor, WT_CURSTD_VALUE_INT))
WT_ERR(__wt_btcur_search(cbt));
+
+ WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries));
+
orig = cursor->value.size;
- WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
+ WT_ERR(__wt_modify_apply(cursor, modify->data));
new = cursor->value.size;
WT_ERR(__cursor_size_chk(session, &cursor->value));
@@ -1527,8 +1530,7 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
F_CLR(cursor, WT_CURSTD_OVERWRITE);
if (cursor->value.size <= 64 || __cursor_chain_exceeded(cbt))
ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
- else if ((ret =
- __wt_modify_pack(session, &modify, entries, nentries)) == 0)
+ else
ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFY);
if (overwrite)
F_SET(cursor, WT_CURSTD_OVERWRITE);
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 685fb983718..2eb71967799 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -889,7 +889,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
if (split_gen != 0)
WT_RET(ds->f(ds, ", split-gen=%" PRIu64, split_gen));
if (mod != NULL)
- WT_RET(ds->f(ds, ", write-gen=%" PRIu32, mod->write_gen));
+ WT_RET(ds->f(ds, ", page-state=%" PRIu32, mod->page_state));
WT_RET(ds->f(ds,
", memory-size %" WT_SIZET_FMT, page->memory_footprint));
WT_RET(ds->f(ds, "\n"));
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 52277efb85d..d41f76c6442 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -250,7 +250,7 @@ __wt_value_return_upd(WT_SESSION_IMPL *session,
* updates.
*/
while (i > 0)
- WT_ERR(__wt_modify_apply(session, cursor, listp[--i]->data));
+ WT_ERR(__wt_modify_apply(cursor, listp[--i]->data));
err: if (allocated_bytes != 0)
__wt_free(session, listp);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
index 073df6eaaf6..22d067ef90e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_std.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -931,7 +931,7 @@ __cursor_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
/* Get the current value, apply the modifications. */
WT_ERR(cursor->search(cursor));
- WT_ERR(__wt_modify_apply_api(session, cursor, entries, nentries));
+ WT_ERR(__wt_modify_apply_api(cursor, entries, nentries));
/* We know both key and value are set, "overwrite" doesn't matter. */
ret = cursor->update(cursor);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index a7c289a7b7f..03643f473e1 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -489,10 +489,21 @@ struct __wt_page_modify {
WT_SPINLOCK page_lock; /* Page's spinlock */
/*
- * The write generation is incremented when a page is modified, a page
- * is clean if the write generation is 0.
+ * The page state is incremented when a page is modified.
+ *
+ * WT_PAGE_CLEAN --
+ * The page is clean.
+ * WT_PAGE_DIRTY_FIRST --
+ * The page is in this state after the first operation that marks a
+ * page dirty, or when reconciliation is checking to see if it has
+ * done enough work to be able to mark the page clean.
+ * WT_PAGE_DIRTY --
+ * Two or more updates have been added to the page.
*/
- uint32_t write_gen;
+#define WT_PAGE_CLEAN 0
+#define WT_PAGE_DIRTY_FIRST 1
+#define WT_PAGE_DIRTY 2
+ uint32_t page_state;
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index d0679a9fb38..3fa5d60f1f1 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page)
static inline bool
__wt_page_evict_clean(WT_PAGE *page)
{
- return (page->modify == NULL || (page->modify->write_gen == 0 &&
+ return (page->modify == NULL ||
+ (page->modify->page_state == WT_PAGE_CLEAN &&
page->modify->rec_result == 0));
}
@@ -45,7 +46,8 @@ __wt_page_evict_clean(WT_PAGE *page)
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0);
+ return (page->modify != NULL &&
+ page->modify->page_state != WT_PAGE_CLEAN);
}
/*
@@ -496,19 +498,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
last_running = 0;
- if (page->modify->write_gen == 0)
+ if (page->modify->page_state == WT_PAGE_CLEAN)
last_running = S2C(session)->txn_global.last_running;
/*
- * We depend on atomic-add being a write barrier, that is, a barrier to
- * ensure all changes to the page are flushed before updating the page
- * write generation and/or marking the tree dirty, otherwise checkpoints
+ * We depend on the atomic operation being a write barrier, that is, a
+ * barrier to ensure all changes to the page are flushed before updating
+ * the page state and/or marking the tree dirty, otherwise checkpoints
* and/or page reconciliation might be looking at a clean page/tree.
*
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
+ *
+ * The page state can only ever be incremented above dirty by the number
+ * of concurrently running threads, so the counter will never approach
+ * the point where it would wrap.
*/
- if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
+ if (page->modify->page_state < WT_PAGE_DIRTY &&
+ __wt_atomic_add32(&page->modify->page_state, 1) ==
+ WT_PAGE_DIRTY_FIRST) {
__wt_cache_dirty_incr(session, page);
/*
@@ -579,7 +587,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
* Allow the call to be made on clean pages.
*/
if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
+ /*
+ * The only part where ordering matters is during
+ * reconciliation where updates on other threads are performing
+ * writes to the page state that need to be visible to the
+ * reconciliation thread.
+ *
+ * Since clearing of the page state is not going to be happening
+ * during reconciliation on a separate thread, there's no write
+ * barrier needed here.
+ */
+ page->modify->page_state = WT_PAGE_CLEAN;
__wt_cache_dirty_decr(session, page);
}
}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 80046127d3f..843cf562c66 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -503,9 +503,9 @@ extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char
extern int __wt_metadata_set_base_write_gen(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_turtle_rewrite(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_apply( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_apply_api(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_modify_pack(WT_SESSION_IMPL *session, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply(WT_CURSOR *cursor, const void *modify) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_modify_pack(WT_CURSOR *cursor, WT_ITEM **modifyp, WT_MODIFY *entries, int nentries) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index be6440c27bc..c3c46ec11c5 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -20,12 +20,6 @@ struct __wt_reconcile {
uint32_t flags; /* Caller's configuration */
/*
- * Track start/stop write generation to decide if all changes to the
- * page are written.
- */
- uint32_t orig_write_gen;
-
- /*
* Track start/stop checkpoint generations to decide if lookaside table
* records are correct.
*/
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 1c67a84adbf..701f73df84f 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -7,29 +7,6 @@
*/
/*
- * __page_write_gen_wrapped_check --
- * Confirm the page's write generation number won't wrap.
- */
-static inline int
-__page_write_gen_wrapped_check(WT_PAGE *page)
-{
- /*
- * Check to see if the page's write generation is about to wrap (wildly
- * unlikely as it implies 4B updates between clean page reconciliations,
- * but technically possible), and fail the update.
- *
- * The check is outside of the serialization mutex because the page's
- * write generation is going to be a hot cache line, so technically it's
- * possible for the page's write generation to wrap between the test and
- * our subsequent modification of it. However, the test is (4B-1M), and
- * there cannot be a million threads that have done the test but not yet
- * completed their modification.
- */
- return (page->modify->write_gen >
- UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
-}
-
-/*
* __insert_simple_func --
* Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
@@ -163,9 +140,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* Acquire the page's spinlock unless we already have exclusive access.
* Then call the worker function.
@@ -215,9 +189,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
new_ins = *new_insp;
*new_insp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
simple = true;
for (i = 0; i < skipdepth; i++)
if (new_ins->next[i] == NULL)
@@ -272,9 +243,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
upd = *updp;
*updp = NULL;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/*
* All structure setup must be flushed before the structure is entered
* into the list. We need a write barrier here, our callers depend on
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index ce58f9f7301..e2141518e3a 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -844,6 +844,7 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
return (WT_VISIBLE_TRUE);
}
+
/*
* __wt_txn_upd_durable --
* Can the current transaction make the given update durable.
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index f7eeada4c8e..477894bcf14 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -456,14 +456,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean.
+ * We set the page state to mark it as having been dirtied for
+ * the first time prior to reconciliation. A failed atomic cas
+ * indicates that an update has taken place during
+ * reconciliation.
*
- * If the write generation changed, the page has been written
- * since reconciliation started and remains dirty (that can't
- * happen when evicting, the page is exclusively locked).
+ * The page only might be clean; if the page state is unchanged
+ * since reconciliation started, it's clean.
+ *
+ * If the page state changed, the page has been written since
+ * reconciliation started and remains dirty (that can't happen
+ * when evicting, the page is exclusively locked).
*/
- if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ if (__wt_atomic_cas32(
+ &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN))
__wt_cache_dirty_decr(session, page);
else
WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
@@ -602,13 +608,22 @@ __rec_init(WT_SESSION_IMPL *session,
r->page = page;
/*
- * Save the page's write generation before reading the page.
* Save the transaction generations before reading the page.
* These are all ordered reads, but we only need one.
*/
r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
r->orig_txn_checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Update the page state to indicate that all currently installed
+ * updates will be included in this reconciliation if it would mark the
+ * page clean.
+ *
+ * Add a write barrier to make it more likely that a thread adding an
+ * update will see this state change.
+ */
+ page->modify->page_state = WT_PAGE_DIRTY_FIRST;
+ WT_FULL_BARRIER();
/*
* Cache the oldest running transaction ID. This is used to check
diff --git a/src/third_party/wiredtiger/src/support/modify.c b/src/third_party/wiredtiger/src/support/modify.c
index be1b1970da6..e8260cb41b6 100644
--- a/src/third_party/wiredtiger/src/support/modify.c
+++ b/src/third_party/wiredtiger/src/support/modify.c
@@ -8,28 +8,60 @@
#include "wt_internal.h"
+#define WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) do { \
+ const size_t *__p = p; \
+ const uint8_t *__data = \
+ (const uint8_t *)(__p + (size_t)(nentries) * 3); \
+ int __i; \
+ for (__i = 0; __i < (nentries); ++__i) { \
+ memcpy(&(mod).data.size, __p++, sizeof(size_t)); \
+ memcpy(&(mod).offset, __p++, sizeof(size_t)); \
+ memcpy(&(mod).size, __p++, sizeof(size_t)); \
+ (mod).data.data = __data; \
+ __data += (mod).data.size; \
+ if (__i < (napplied)) \
+ continue;
+
+#define WT_MODIFY_FOREACH_REVERSE(mod, p, nentries, napplied, datasz) do {\
+ const size_t *__p = (p) + (size_t)(nentries) * 3; \
+ const uint8_t *__data = (const uint8_t *)__p + datasz; \
+ int __i; \
+ for (__i = (napplied); __i < (nentries); ++__i) { \
+ memcpy(&(mod).size, --__p, sizeof(size_t)); \
+ memcpy(&(mod).offset, --__p, sizeof(size_t)); \
+ memcpy(&(mod).data.size, --__p, sizeof(size_t)); \
+ (mod).data.data = (__data -= (mod).data.size);
+
+#define WT_MODIFY_FOREACH_END \
+ } \
+} while (0)
+
/*
* __wt_modify_pack --
* Pack a modify structure into a buffer.
*/
int
-__wt_modify_pack(WT_SESSION_IMPL *session,
+__wt_modify_pack(WT_CURSOR *cursor,
WT_ITEM **modifyp, WT_MODIFY *entries, int nentries)
{
WT_ITEM *modify;
- size_t len, *p;
+ WT_SESSION_IMPL *session;
+ size_t diffsz, len, *p;
uint8_t *data;
int i;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
/*
* Build the in-memory modify value. It's the entries count, followed
* by the modify structure offsets written in order, followed by the
* data (data at the end to minimize unaligned reads/writes).
*/
len = sizeof(size_t); /* nentries */
- for (i = 0; i < nentries; ++i) {
+ for (i = 0, diffsz = 0; i < nentries; ++i) {
len += 3 * sizeof(size_t); /* WT_MODIFY fields */
len += entries[i].data.size; /* data */
+ diffsz += entries[i].size; /* bytes touched */
}
WT_RET(__wt_scr_alloc(session, len, &modify));
@@ -48,6 +80,18 @@ __wt_modify_pack(WT_SESSION_IMPL *session,
}
modify->size = WT_PTRDIFF(data, modify->data);
*modifyp = modify;
+
+ /*
+ * Update statistics. This is the common path called by
+ * WT_CURSOR::modify implementations.
+ */
+ WT_STAT_CONN_INCR(session, cursor_modify);
+ WT_STAT_DATA_INCR(session, cursor_modify);
+ WT_STAT_CONN_INCRV(session, cursor_modify_bytes, cursor->value.size);
+ WT_STAT_DATA_INCRV(session, cursor_modify_bytes, cursor->value.size);
+ WT_STAT_CONN_INCRV(session, cursor_modify_bytes_touch, diffsz);
+ WT_STAT_DATA_INCRV(session, cursor_modify_bytes_touch, diffsz);
+
return (0);
}
@@ -56,51 +100,46 @@ __wt_modify_pack(WT_SESSION_IMPL *session,
* Apply a single modify structure change to the buffer.
*/
static int
-__modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
- size_t data_size, size_t offset, size_t size, const uint8_t *data)
+__modify_apply_one(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_MODIFY *modify, bool sformat)
{
- WT_ITEM *value;
- size_t len;
- uint8_t *from, *to;
- bool sformat;
+ size_t data_size, item_offset, offset, size;
+ const uint8_t *data, *from;
+ uint8_t *to;
- value = &cursor->value;
- sformat = cursor->value_format[0] == 'S';
+ data = modify->data.data;
+ data_size = modify->data.size;
+ offset = modify->offset;
+ size = modify->size;
/*
* Grow the buffer to the maximum size we'll need. This is pessimistic
* because it ignores replacement bytes, but it's a simpler calculation.
*
- * Grow the buffer before we fast-path the expected case. This function
- * is often called using a cursor buffer referencing on-page memory and
- * it's easy to overwrite a page. A side-effect of growing the buffer is
- * to ensure the buffer's value is in buffer-local memory.
+ * Grow the buffer first. This function is often called using a cursor
+ * buffer referencing on-page memory and it's easy to overwrite a page.
+ * A side-effect of growing the buffer is to ensure the buffer's value
+ * is in buffer-local memory.
*
* Because the buffer may reference an overflow item, the data may not
* start at the start of the buffer's memory and we have to correct for
* that.
*/
- len = WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
- WT_RET(__wt_buf_grow(session, value,
- len + WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0)));
+ item_offset =
+ WT_DATA_IN_ITEM(value) ? WT_PTRDIFF(value->data, value->mem) : 0;
+ WT_RET(__wt_buf_grow(session, value, item_offset +
+ WT_MAX(value->size, offset) + data_size + (sformat ? 1 : 0)));
/*
- * Fast-path the expected case, where we're overwriting a set of bytes
+ * Fast-path the common case, where we're overwriting a set of bytes
* that already exist in the buffer.
*/
if (value->size > offset + data_size && data_size == size) {
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
return (0);
}
/*
- * Decrement the size to discard the trailing nul (done after growing
- * the buffer to ensure it can be restored without further checking).
- */
- if (sformat)
- --value->size;
-
- /*
* If appending bytes past the end of the value, initialize gap bytes
* and copy the new bytes into place.
*/
@@ -108,12 +147,8 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
if (value->size < offset)
memset((uint8_t *)value->data + value->size,
sformat ? ' ' : 0, offset - value->size);
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
value->size = offset + data_size;
-
- /* Restore the trailing nul. */
- if (sformat)
- ((char *)value->data)[value->size++] = '\0';
return (0);
}
@@ -125,9 +160,12 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
if (value->size < offset + size)
size = value->size - offset;
+ WT_ASSERT(session, value->size + (data_size - size) +
+ (sformat ? 1 : 0) <= value->memsize);
+
if (data_size == size) { /* Overwrite */
/* Copy in the new data. */
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
/*
* The new data must overlap the buffer's end (else, we'd use
@@ -137,7 +175,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
value->size = offset + data_size;
} else { /* Shrink or grow */
/* Move trailing data forward/backward to its new location. */
- from = (uint8_t *)value->data + (offset + size);
+ from = (const uint8_t *)value->data + (offset + size);
WT_ASSERT(session, WT_DATA_IN_ITEM(value) &&
from + (value->size - (offset + size)) <=
(uint8_t *)value->mem + value->memsize);
@@ -148,7 +186,7 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
memmove(to, from, value->size - (offset + size));
/* Copy in the new data. */
- memmove((uint8_t *)value->data + offset, data, data_size);
+ memcpy((uint8_t *)value->data + offset, data, data_size);
/*
* Correct the size. This works because of how the C standard
@@ -165,49 +203,134 @@ __modify_apply_one(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
value->size += (data_size - size);
}
- /* Restore the trailing nul. */
- if (sformat)
- ((char *)value->data)[value->size++] = '\0';
-
return (0);
}
/*
- * __wt_modify_apply_api --
- * Apply a single set of WT_MODIFY changes to a buffer, the cursor API
- * interface.
+ * __modify_fast_path --
+ * Process a set of modifications, applying any that can be made in place,
+ * and check if the remaining ones are sorted and non-overlapping.
*/
-int
-__wt_modify_apply_api(WT_SESSION_IMPL *session,
- WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+static void
+__modify_fast_path(
+ WT_ITEM *value, const size_t *p, int nentries,
+ int *nappliedp, bool *overlapp, size_t *dataszp, size_t *destszp)
{
- size_t modified;
- int i;
+ WT_MODIFY current, prev;
+ size_t datasz, destoff;
+ bool fastpath, first;
+
+ *overlapp = true;
+
+ datasz = destoff = 0;
+ WT_CLEAR(current);
+ WT_CLEAR(prev); /* [-Werror=maybe-uninitialized] */
- for (modified = 0, i = 0; i < nentries; ++i) {
- modified += entries[i].size;
- WT_RET(__modify_apply_one(session, cursor, entries[i].data.size,
- entries[i].offset, entries[i].size, entries[i].data.data));
- }
/*
- * This API is used by some external test functions with a NULL
- * session pointer - they don't expect statistics to be incremented.
+ * If the modifications are sorted and don't overlap in the old or new
+ * values, we can do a fast application of all the modifications
+ * modifications in a single pass.
+ *
+ * The requirement for ordering is unfortunate, but modifications are
+ * performed in order, and applications specify byte offsets based on
+ * that. In other words, byte offsets are cumulative, modifications
+ * that shrink or grow the data affect subsequent modification's byte
+ * offsets.
*/
- if (session != NULL) {
- WT_STAT_CONN_INCR(session, cursor_modify);
- WT_STAT_DATA_INCR(session, cursor_modify);
- WT_STAT_CONN_INCRV(session,
- cursor_modify_bytes, cursor->value.size);
- WT_STAT_DATA_INCRV(session,
- cursor_modify_bytes, cursor->value.size);
- WT_STAT_CONN_INCRV(session,
- cursor_modify_bytes_touch, modified);
- WT_STAT_DATA_INCRV(session,
- cursor_modify_bytes_touch, modified);
- }
+ fastpath = first = true;
+ *nappliedp = 0;
+ WT_MODIFY_FOREACH_BEGIN(current, p, nentries, 0) {
+ datasz += current.data.size;
- return (0);
+ if (fastpath && current.data.size == current.size &&
+ current.offset + current.size <= value->size) {
+ memcpy((uint8_t *)value->data + current.offset,
+ current.data.data, current.data.size);
+ ++(*nappliedp);
+ continue;
+ }
+ fastpath = false;
+
+ /* Step over the bytes before the current block. */
+ if (first)
+ destoff = current.offset;
+ else {
+ /* Check that entries are sorted and non-overlapping. */
+ if (current.offset < prev.offset + prev.size ||
+ current.offset < prev.offset + prev.data.size)
+ return;
+ destoff += current.offset - (prev.offset + prev.size);
+ }
+
+ /*
+ * If the source is past the end of the current value, we have
+ * to deal with padding bytes. Don't try to fast-path padding
+ * bytes; it's not common and adds branches to the loop
+ * applying the changes.
+ */
+ if (current.offset + current.size > value->size)
+ return;
+
+ /*
+ * If copying this block overlaps with the next one, we can't
+ * build the value in reverse order.
+ */
+ if (current.size != current.data.size &&
+ current.offset + current.size > destoff)
+ return;
+
+ /* Step over the current modification. */
+ destoff += current.data.size;
+
+ prev = current;
+ first = false;
+ } WT_MODIFY_FOREACH_END;
+
+ /* Step over the final unmodified block. */
+ destoff += value->size - (current.offset + current.size);
+
+ *overlapp = false;
+ *dataszp = datasz;
+ *destszp = destoff;
+ return;
+}
+
+/*
+ * __modify_apply_no_overlap --
+ * Apply a single set of WT_MODIFY changes to a buffer, where the changes
+ * are in sorted order and none of the changes overlap.
+ */
+static void
+__modify_apply_no_overlap(WT_SESSION_IMPL *session, WT_ITEM *value,
+ const size_t *p, int nentries, int napplied, size_t datasz, size_t destsz)
+{
+ WT_MODIFY current;
+ size_t sz;
+ const uint8_t *from;
+ uint8_t *to;
+
+ from = (const uint8_t *)value->data + value->size;
+ to = (uint8_t *)value->data + destsz;
+ WT_MODIFY_FOREACH_REVERSE(current, p, nentries, napplied, datasz) {
+ /* Move the current unmodified block into place if necessary. */
+ sz = WT_PTRDIFF(to, value->data) -
+ (current.offset + current.data.size);
+ from -= sz;
+ to -= sz;
+ WT_ASSERT(session, from >= (const uint8_t *)value->data &&
+ to >= (uint8_t *)value->data);
+ WT_ASSERT(session,
+ from + sz <= (const uint8_t *)value->data + value->size);
+
+ if (to != from)
+ memmove(to, from, sz);
+
+ from -= current.size;
+ to -= current.data.size;
+ memcpy(to, current.data.data, current.data.size);
+ } WT_MODIFY_FOREACH_END;
+
+ value->size = destsz;
}
/*
@@ -215,31 +338,91 @@ __wt_modify_apply_api(WT_SESSION_IMPL *session,
* Apply a single set of WT_MODIFY changes to a buffer.
*/
int
-__wt_modify_apply(
- WT_SESSION_IMPL *session, WT_CURSOR *cursor, const void *modify)
+__wt_modify_apply(WT_CURSOR *cursor, const void *modify)
{
- size_t data_size, nentries, offset, size;
+ WT_ITEM *value;
+ WT_MODIFY mod;
+ WT_SESSION_IMPL *session;
+ size_t datasz, destsz, item_offset, tmp;
const size_t *p;
- const uint8_t *data;
+ int napplied, nentries;
+ bool overlap, sformat;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ sformat = cursor->value_format[0] == 'S';
+ value = &cursor->value;
/*
- * Get the number of entries, and set a second pointer to reference the
- * change data. The modify string isn't necessarily aligned for size_t
- * access, copy to be sure.
+ * Get the number of modify entries and set a second pointer to
+ * reference the replacement data.
*/
p = modify;
- memcpy(&nentries, p++, sizeof(size_t));
- data = (uint8_t *)modify +
- sizeof(size_t) + (nentries * 3 * sizeof(size_t));
-
- /* Step through the list of entries, applying them in order. */
- for (; nentries-- > 0; data += data_size) {
- memcpy(&data_size, p++, sizeof(size_t));
- memcpy(&offset, p++, sizeof(size_t));
- memcpy(&size, p++, sizeof(size_t));
- WT_RET(__modify_apply_one(
- session, cursor, data_size, offset, size, data));
+ memcpy(&tmp, p++, sizeof(size_t));
+ nentries = (int)tmp;
+
+ /*
+ * Grow the buffer first. This function is often called using a cursor
+ * buffer referencing on-page memory and it's easy to overwrite a page.
+ * A side-effect of growing the buffer is to ensure the buffer's value
+ * is in buffer-local memory.
+ *
+ * Because the buffer may reference an overflow item, the data may not
+ * start at the start of the buffer's memory and we have to correct for
+ * that.
+ */
+ item_offset = WT_DATA_IN_ITEM(value) ?
+ WT_PTRDIFF(value->data, value->mem) : 0;
+ WT_RET(__wt_buf_grow(session, value, item_offset + value->size));
+
+ /*
+ * Decrement the size to discard the trailing nul (done after growing
+ * the buffer to ensure it can be restored without further checking).
+ */
+ if (sformat)
+ --value->size;
+
+ __modify_fast_path(
+ value, p, nentries, &napplied, &overlap, &datasz, &destsz);
+
+ if (napplied == nentries)
+ goto done;
+
+ if (!overlap) {
+ /* Grow the buffer first, correcting for the data offset. */
+ WT_RET(__wt_buf_grow(session, value, item_offset +
+ WT_MAX(destsz, value->size) + (sformat ? 1 : 0)));
+
+ __modify_apply_no_overlap(
+ session, value, p, nentries, napplied, datasz, destsz);
+ goto done;
}
+ WT_MODIFY_FOREACH_BEGIN(mod, p, nentries, napplied) {
+ WT_RET(__modify_apply_one(session, value, &mod, sformat));
+ } WT_MODIFY_FOREACH_END;
+
+done: /* Restore the trailing nul. */
+ if (sformat)
+ ((char *)value->data)[value->size++] = '\0';
+
return (0);
}
+
+/*
+ * __wt_modify_apply_api --
+ * Apply a single set of WT_MODIFY changes to a buffer, the cursor API
+ * interface.
+ */
+int
+__wt_modify_apply_api(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+{
+ WT_DECL_ITEM(modify);
+ WT_DECL_RET;
+
+ WT_ERR(__wt_modify_pack(cursor, &modify, entries, nentries));
+ WT_ERR(__wt_modify_apply(cursor, modify->data));
+
+err: __wt_scr_free((WT_SESSION_IMPL *)cursor->session, &modify);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index a5e3e139178..504b2c0e8b4 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -155,7 +155,7 @@ __txn_op_apply(
* than using cursor modify to create a partial update
* (for no particular reason than simplicity).
*/
- WT_ERR(__wt_modify_apply(session, cursor, value.data));
+ WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
}
break;
@@ -222,7 +222,7 @@ __txn_op_apply(
* than using cursor modify to create a partial update
* (for no particular reason than simplicity).
*/
- WT_ERR(__wt_modify_apply(session, cursor, value.data));
+ WT_ERR(__wt_modify_apply(cursor, value.data));
WT_ERR(cursor->insert(cursor));
}
break;
diff --git a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
index 879a8e96c6a..5a413c0df3b 100644
--- a/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt3338_partial_update/main.c
@@ -37,7 +37,7 @@
#define DATASIZE 1024
#define MAX_MODIFY_ENTRIES 37 /* Maximum modify vectors */
-static WT_MODIFY entries[1000]; /* Entries vector */
+static WT_MODIFY entries[MAX_MODIFY_ENTRIES]; /* Entries vector */
static int nentries; /* Entries count */
/*
@@ -50,7 +50,6 @@ static char modify_repl[MAX_REPL_BYTES * 2]; /* Replacement bytes */
static WT_RAND_STATE rnd; /* RNG state */
-#if DEBUG
/*
* show --
* Dump out a buffer.
@@ -62,15 +61,10 @@ show(WT_ITEM *buf, const char *tag)
const uint8_t *a;
fprintf(stderr, "%s: %" WT_SIZET_FMT " bytes\n\t", tag, buf->size);
- for (a = buf->data, i = 0; i < buf->size; ++i, ++a) {
- if (isprint(*a))
- fprintf(stderr, " %c", *a);
- else
- fprintf(stderr, " %#x", *a);
- }
+ for (a = buf->data, i = 0; i < buf->size; ++i, ++a)
+ fprintf(stderr, " %c", isprint(*a) ? *a : '.');
fprintf(stderr, "\n");
}
-#endif
/*
* modify_repl_init --
@@ -82,7 +76,7 @@ modify_repl_init(void)
size_t i;
for (i = 0; i < sizeof(modify_repl); ++i)
- modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26];
+ modify_repl[i] = 'Z' - (i % 26);
}
/*
@@ -95,13 +89,13 @@ modify_build(void)
int i;
/* Mess up the entries. */
- memset(entries, 0xff, MAX_MODIFY_ENTRIES * sizeof(entries[0]));
+ memset(entries, 0xff, sizeof(entries));
/*
* Randomly select a number of byte changes, offsets and lengths.
* Allow a value of 0, the API should accept it.
*/
- nentries = (int)(__wt_random(&rnd) % MAX_MODIFY_ENTRIES);
+ nentries = (int)(__wt_random(&rnd) % (MAX_MODIFY_ENTRIES + 1));
for (i = 0; i < nentries; ++i) {
entries[i].data.data =
modify_repl + __wt_random(&rnd) % MAX_REPL_BYTES;
@@ -115,7 +109,7 @@ modify_build(void)
printf(
"%d: {%.*s} %" WT_SIZET_FMT " bytes replacing %"
WT_SIZET_FMT " bytes @ %" WT_SIZET_FMT "\n",
- i, (int)entries[i].data.size, entries[i].data.data,
+ i, (int)entries[i].data.size, (char *)entries[i].data.data,
entries[i].data.size, entries[i].size, entries[i].offset);
#endif
}
@@ -217,16 +211,29 @@ slow_apply_api(WT_ITEM *orig)
* Compare two results.
*/
static void
-compare(WT_ITEM *local, WT_ITEM *library)
+compare(WT_ITEM *orig, WT_ITEM *local, WT_ITEM *library)
{
-#if DEBUG
+ size_t i, max;
+ const uint8_t *p, *t;
+
+ max = WT_MIN(local->size, library->size);
if (local->size != library->size ||
memcmp(local->data, library->data, local->size) != 0) {
- fprintf(stderr, "results differ\n");
+ for (i = 0,
+ p = local->data, t = library->data; i < max; ++i, ++p, ++t)
+ if (*p != *t)
+ break;
+ fprintf(stderr, "results differ: ");
+ if (max == 0)
+ fprintf(stderr,
+ "identical up to %" WT_SIZET_FMT " bytes\n", max);
+ else
+ fprintf(stderr,
+ "first mismatch at offset %" WT_SIZET_FMT "\n", i);
+ show(orig, "original");
show(local, "local results");
show(library, "library results");
}
-#endif
testutil_assert(
local->size == library->size && memcmp(
local->data, library->data, local->size) == 0);
@@ -250,16 +257,22 @@ compare(WT_ITEM *local, WT_ITEM *library)
* calculate-modify API.
*/
static void
-modify_run(bool verbose)
+modify_run(TEST_OPTS *opts)
{
WT_CURSOR *cursor, _cursor;
WT_DECL_RET;
WT_ITEM *localA, _localA, *localB, _localB;
+ WT_SESSION_IMPL *session;
size_t len;
int i, j;
+ u_char *p;
+ bool verbose;
+
+ session = (WT_SESSION_IMPL *)opts->session;
+ verbose = opts->verbose;
/* Initialize the RNG. */
- __wt_random_init_seed(NULL, &rnd);
+ __wt_random_init_seed(session, &rnd);
/* Set up replacement information. */
modify_repl_init();
@@ -271,18 +284,24 @@ modify_run(bool verbose)
memset(&_localB, 0, sizeof(_localB));
cursor = &_cursor;
memset(&_cursor, 0, sizeof(_cursor));
+ cursor->session = (WT_SESSION *)session;
cursor->value_format = "u";
#define NRUNS 10000
for (i = 0; i < NRUNS; ++i) {
/* Create an initial value. */
len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
- testutil_check(__wt_buf_set(NULL, localA, modify_repl, len));
+ testutil_check(__wt_buf_set(session, localA, modify_repl, len));
for (j = 0; j < 1000; ++j) {
+ /* Make lower case so modifications are easy to see. */
+ for (p = localA->mem;
+ WT_PTRDIFF(p, localA->mem) < localA->size; p++)
+ *p = __wt_tolower(*p);
+
/* Copy the current value into the second item. */
testutil_check(__wt_buf_set(
- NULL, localB, localA->data, localA->size));
+ session, localB, localA->data, localA->size));
/*
* Create a random set of modify vectors, run the
@@ -291,12 +310,12 @@ modify_run(bool verbose)
* of modify.
*/
modify_build();
- testutil_check(__wt_buf_set(
- NULL, &cursor->value, localA->data, localA->size));
+ testutil_check(__wt_buf_set(session,
+ &cursor->value, localA->data, localA->size));
testutil_check(__wt_modify_apply_api(
- NULL, cursor, entries, nentries));
+ cursor, entries, nentries));
slow_apply_api(localA);
- compare(localA, &cursor->value);
+ compare(localB, localA, &cursor->value);
/*
* Call the WiredTiger function to build a modification
@@ -305,18 +324,18 @@ modify_run(bool verbose)
* against our implementation of modify.
*/
nentries = WT_ELEMENTS(entries);
- ret = wiredtiger_calc_modify(NULL,
+ ret = wiredtiger_calc_modify(opts->session,
localB, localA,
WT_MAX(localB->size, localA->size) + 100,
entries, &nentries);
if (ret == WT_NOTFOUND)
continue;
testutil_check(ret);
- testutil_check(__wt_buf_set(
- NULL, &cursor->value, localB->data, localB->size));
+ testutil_check(__wt_buf_set(session,
+ &cursor->value, localB->data, localB->size));
testutil_check(__wt_modify_apply_api(
- NULL, cursor, entries, nentries));
- compare(localA, &cursor->value);
+ cursor, entries, nentries));
+ compare(localB, localA, &cursor->value);
}
if (verbose) {
printf("%d (%d%%)\r", i, (i * 100) / NRUNS);
@@ -326,9 +345,9 @@ modify_run(bool verbose)
if (verbose)
printf("%d (100%%)\n", i);
- __wt_buf_free(NULL, localA);
- __wt_buf_free(NULL, localB);
- __wt_buf_free(NULL, &cursor->value);
+ __wt_buf_free(session, localA);
+ __wt_buf_free(session, localB);
+ __wt_buf_free(session, &cursor->value);
}
int
@@ -342,9 +361,11 @@ main(int argc, char *argv[])
testutil_make_work_dir(opts->home);
testutil_check(
wiredtiger_open(opts->home, NULL, "create", &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &opts->session));
/* Run the test. */
- modify_run(opts->verbose);
+ modify_run(opts);
testutil_cleanup(opts);
return (EXIT_SUCCESS);
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index 9442f1fdeb3..e425624ab5d 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -61,6 +61,7 @@ snap_track(TINFO *tinfo, thread_op op)
snap->ts = WT_TS_NONE;
snap->repeatable = false;
snap->last = op == TRUNCATE ? tinfo->last : 0;
+ snap->ksize = snap->vsize = 0;
if (op == INSERT && g.type == ROW) {
ip = tinfo->key;