Import wiredtiger: 2e9744d11a65c63ba7445060dc78371250f04051 from branch mongodb-3.6

ref: 6173a98979..2e9744d11a
for: 3.5.11

WT-2309 Add yields and/or sleeps in #DIAGNOSTIC mode
WT-3047 Add mode aimed at uncovering race conditions in split code
WT-3308 Add statistics tracking around yield loops
WT-3316 Add new engineering section to reference guide documentation
WT-3338 Optimize cursor modify
WT-3380 Special case 8-byte timestamps
WT-3387 Add support for a stable timestamp
WT-3389 Restructure split code to hold a split generation for the entire operation.
WT-3406 Reconciliation is choosing reserved records for writing.
WT-3410 Add developer documentation for table rename
WT-3412 Add backoff logic to the btree delete and walk yield loops
WT-3418 block manager object race
WT-3422 WiredTiger upgrading documents out of date
WT-3432 workgen needs braces around an "if" body
WT-3433 session->alter method should not be supported in read-only mode
WT-3439 lint/cleanup
WT-3440 Add a log record when starting a checkpoint
WT-3442 Coverity 1378213: false positive on diagnostic assignment.
WT-3446 Temporarily disable timestamp testing in test/checkpoint
WT-3447 test_stat_log02 can assert before table stats are printed
WT-3461 Avoid long sleeps when the system clock is adjusted
WT-3463 Add recovery of backup to test_timestamp03.py
WT-3466 Track the first commit timestamp for each transaction
WT-3467 Minor lint/cleanup
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-08-01 16:42:49 +1000
committer: Alex Gorrod <alexander.gorrod@mongodb.com> 2017-08-01 16:42:49 +1000
commit: 835bfb21d8e67663d84a40aa4f7370a4403725a9 (patch)
tree: 4f5edb231524f95272f834e31461ba4e17e52903 /src/third_party/wiredtiger/src/btree
parent: 6300b3bd4ad9cd238a02bdb8ca681a447913f1af (diff)
download: mongo-835bfb21d8e67663d84a40aa4f7370a4403725a9.tar.gz
13 files changed, 685 insertions, 328 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index cb59bff8f75..eb8a258d475 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -15,12 +15,10 @@
 static inline int
 __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
 {
-	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	val = &cbt->iface.value;
 
 	if (newpage) {
 		if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
@@ -59,10 +57,10 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
 	if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
 	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
 		cbt->v = 0;
-		val->data = &cbt->v;
+		cbt->iface.value.data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(upd);
-	val->size = 1;
+		cbt->iface.value.data = upd->data;
+	cbt->iface.value.size = 1;
 	return (0);
 }
 
@@ -74,7 +72,6 @@ static inline int
 __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
 {
 	WT_BTREE *btree;
-	WT_ITEM *val;
 	WT_PAGE *page;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
@@ -82,7 +79,6 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	btree = S2BT(session);
 	page = cbt->ref->page;
-	val = &cbt->iface.value;
 
 	/* Initialize for each new page. */
 	if (newpage) {
@@ -108,10 +104,10 @@ new_page:
 	upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
 	if (upd == NULL) {
 		cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
-		val->data = &cbt->v;
+		cbt->iface.value.data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(upd);
-	val->size = 1;
+		cbt->iface.value.data = upd->data;
+	cbt->iface.value.size = 1;
 	return (0);
 }
 
@@ -122,12 +118,10 @@ new_page:
 static inline int
 __cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
 {
-	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	val = &cbt->iface.value;
 
 	if (newpage) {
 		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
@@ -147,9 +141,7 @@ new_page:	if (cbt->ins == NULL)
 				++cbt->page_deleted_count;
 			continue;
 		}
-		val->data = WT_UPDATE_DATA(upd);
-		val->size = upd->size;
-		return (0);
+		return (__wt_value_return(session, cbt, upd));
 	}
 	/* NOTREACHED */
 }
@@ -164,7 +156,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
 	WT_CELL *cell;
 	WT_CELL_UNPACK unpack;
 	WT_COL *cip;
-	WT_ITEM *val;
 	WT_INSERT *ins;
 	WT_PAGE *page;
 	WT_SESSION_IMPL *session;
@@ -173,7 +164,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	page = cbt->ref->page;
-	val = &cbt->iface.value;
 
 	rle_start = 0;			/* -Werror=maybe-uninitialized */
 
@@ -210,10 +200,7 @@ new_page:	/* Find the matching WT_COL slot. */
 					++cbt->page_deleted_count;
 				continue;
 			}
-
-			val->data = WT_UPDATE_DATA(upd);
-			val->size = upd->size;
-			return (0);
+			return (__wt_value_return(session, cbt, upd));
 		}
 
 		/*
@@ -267,8 +254,8 @@ new_page:	/* Find the matching WT_COL slot. */
 
 			cbt->cip_saved = cip;
 		}
-		val->data = cbt->tmp->data;
-		val->size = cbt->tmp->size;
+		cbt->iface.value.data = cbt->tmp->data;
+		cbt->iface.value.size = cbt->tmp->size;
 		return (0);
 	}
 	/* NOTREACHED */
@@ -282,7 +269,7 @@ static inline int
 __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
 {
 	WT_INSERT *ins;
-	WT_ITEM *key, *val;
+	WT_ITEM *key;
 	WT_PAGE *page;
 	WT_ROW *rip;
 	WT_SESSION_IMPL *session;
@@ -291,7 +278,6 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	page = cbt->ref->page;
 	key = &cbt->iface.key;
-	val = &cbt->iface.value;
 
 	/*
 	 * For row-store pages, we need a single item that tells us the part
@@ -332,9 +318,7 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 			}
 			key->data = WT_INSERT_KEY(ins);
 			key->size = WT_INSERT_KEY_SIZE(ins);
-			val->data = WT_UPDATE_DATA(upd);
-			val->size = upd->size;
-			return (0);
+			return (__wt_value_return(session, cbt, upd));
 		}
 
 		/* Check for the end of the page. */
@@ -363,7 +347,6 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 				++cbt->page_deleted_count;
 			continue;
 		}
-
 		return (__cursor_row_slot_return(cbt, rip, upd));
 	}
 	/* NOTREACHED */
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 6e49f4df68c..c1395ea9008 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -127,12 +127,10 @@ restart:
 static inline int
 __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 {
-	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	val = &cbt->iface.value;
 
 	if (newpage) {
 		if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
@@ -205,10 +203,10 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 	    cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
 	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
 		cbt->v = 0;
-		val->data = &cbt->v;
+		cbt->iface.value.data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(upd);
-	val->size = 1;
+		cbt->iface.value.data = upd->data;
+	cbt->iface.value.size = 1;
 	return (0);
 }
 
@@ -220,7 +218,6 @@ static inline int
 __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 {
 	WT_BTREE *btree;
-	WT_ITEM *val;
 	WT_PAGE *page;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
@@ -228,7 +225,6 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	page = cbt->ref->page;
 	btree = S2BT(session);
-	val = &cbt->iface.value;
 
 	/* Initialize for each new page. */
 	if (newpage) {
@@ -254,10 +250,10 @@ new_page:
 	upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
 	if (upd == NULL) {
 		cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
-		val->data = &cbt->v;
+		cbt->iface.value.data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(upd);
-	val->size = 1;
+		cbt->iface.value.data = upd->data;
+	cbt->iface.value.size = 1;
 	return (0);
 }
 
@@ -268,12 +264,10 @@ new_page:
 static inline int
 __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 {
-	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
-	val = &cbt->iface.value;
 
 	if (newpage) {
 		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
@@ -293,9 +287,7 @@ new_page:	if (cbt->ins == NULL)
 				++cbt->page_deleted_count;
 			continue;
 		}
-		val->data = WT_UPDATE_DATA(upd);
-		val->size = upd->size;
-		return (0);
+		return (__wt_value_return(session, cbt, upd));
 	}
 	/* NOTREACHED */
 }
@@ -311,7 +303,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 	WT_CELL_UNPACK unpack;
 	WT_COL *cip;
 	WT_INSERT *ins;
-	WT_ITEM *val;
 	WT_PAGE *page;
 	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
@@ -319,7 +310,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	page = cbt->ref->page;
-	val = &cbt->iface.value;
 
 	rle_start = 0;			/* -Werror=maybe-uninitialized */
 
@@ -357,10 +347,7 @@ new_page:	if (cbt->recno < cbt->ref->ref_recno)
 					++cbt->page_deleted_count;
 				continue;
 			}
-
-			val->data = WT_UPDATE_DATA(upd);
-			val->size = upd->size;
-			return (0);
+			return (__wt_value_return(session, cbt, upd));
 		}
 
 		/*
@@ -413,8 +400,8 @@ new_page:	if (cbt->recno < cbt->ref->ref_recno)
 
 			cbt->cip_saved = cip;
 		}
-		val->data = cbt->tmp->data;
-		val->size = cbt->tmp->size;
+		cbt->iface.value.data = cbt->tmp->data;
+		cbt->iface.value.size = cbt->tmp->size;
 		return (0);
 	}
 	/* NOTREACHED */
@@ -428,7 +415,7 @@ static inline int
 __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 {
 	WT_INSERT *ins;
-	WT_ITEM *key, *val;
+	WT_ITEM *key;
 	WT_PAGE *page;
 	WT_ROW *rip;
 	WT_SESSION_IMPL *session;
@@ -437,7 +424,6 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	page = cbt->ref->page;
 	key = &cbt->iface.key;
-	val = &cbt->iface.value;
 
 	/*
 	 * For row-store pages, we need a single item that tells us the part
@@ -489,9 +475,7 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 			}
 			key->data = WT_INSERT_KEY(ins);
 			key->size = WT_INSERT_KEY_SIZE(ins);
-			val->data = WT_UPDATE_DATA(upd);
-			val->size = upd->size;
-			return (0);
+			return (__wt_value_return(session, cbt, upd));
 		}
 
 		/* Check for the beginning of the page. */
@@ -522,7 +506,6 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 				++cbt->page_deleted_count;
 			continue;
 		}
-
 		return (__cursor_row_slot_return(cbt, rip, upd));
 	}
 	/* NOTREACHED */
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 52435eeefed..d58dc78fbed 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -308,8 +308,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
 }
 
 /*
+ * __cursor_kv_return --
+ *	Return a page referenced key/value pair to the application.
+ */
+static inline int
+__cursor_kv_return(
+    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+	WT_RET(__wt_key_return(session, cbt));
+	WT_RET(__wt_value_return(session, cbt, upd));
+
+	return (0);
+}
+
+/*
  * __cursor_col_search --
- *	Column-store search from an application cursor.
+ *	Column-store search from a cursor.
  */
 static inline int
 __cursor_col_search(
@@ -324,7 +338,7 @@ __cursor_col_search(
 
 /*
  * __cursor_row_search --
- *	Row-store search from an application cursor.
+ *	Row-store search from a cursor.
  */
 static inline int
 __cursor_row_search(
@@ -338,8 +352,32 @@ __cursor_row_search(
 }
 
 /*
+ * __cursor_col_modify_v --
+ *	Column-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_col_modify_v(WT_SESSION_IMPL *session,
+    WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+	return (__wt_col_modify(session, cbt,
+	    cbt->iface.recno, value, NULL, modify_type, false));
+}
+
+/*
+ * __cursor_row_modify_v --
+ *	Row-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_row_modify_v(WT_SESSION_IMPL *session,
+    WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+	return (__wt_row_modify(session, cbt,
+	    &cbt->iface.key, value, NULL, modify_type, false));
+}
+
+/*
  * __cursor_col_modify --
- *	Column-store delete, insert, and update from an application cursor.
+ *	Column-store modify from a cursor.
  */
 static inline int
 __cursor_col_modify(
@@ -351,7 +389,7 @@ __cursor_col_modify(
 
 /*
  * __cursor_row_modify --
- *	Row-store insert, update and delete from an application cursor.
+ *	Row-store modify from a cursor.
  */
 static inline int
 __cursor_row_modify(
@@ -442,7 +480,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
 	}
 
 	if (valid)
-		ret = __wt_kv_return(session, cbt, upd);
+		ret = __cursor_kv_return(session, cbt, upd);
 	else if (__cursor_fix_implicit(btree, cbt)) {
 		/*
 		 * Creating a record past the end of the tree in a fixed-length
@@ -564,7 +602,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
 	 */
 	if (valid) {
 		exact = cbt->compare;
-		ret = __wt_kv_return(session, cbt, upd);
+		ret = __cursor_kv_return(session, cbt, upd);
 	} else if (__cursor_fix_implicit(btree, cbt)) {
 		cbt->recno = cursor->recno;
 		cbt->v = 0;
@@ -582,7 +620,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
 		    __cursor_col_search(session, cbt, NULL));
 		if (__wt_cursor_valid(cbt, &upd)) {
 			exact = cbt->compare;
-			ret = __wt_kv_return(session, cbt, upd);
+			ret = __cursor_kv_return(session, cbt, upd);
 		} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
 			exact = -1;
 	}
@@ -987,7 +1025,7 @@ done:	/*
  *	Update a record in the tree.
  */
 static int
-__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
+__btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
 {
 	WT_BTREE *btree;
 	WT_CURFILE_STATE state;
@@ -1015,6 +1053,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
 	 */
 	if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
 		WT_ERR(__wt_txn_autocommit_check(session));
+
 		/*
 		 * The cursor position may not be exact (the cursor's comparison
 		 * value not equal to zero). Correct to an exact match so we can
@@ -1022,8 +1061,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
 		 */
 		cbt->compare = 0;
 		ret = btree->type == BTREE_ROW ?
-		    __cursor_row_modify(session, cbt, modify_type) :
-		    __cursor_col_modify(session, cbt, modify_type);
+		    __cursor_row_modify_v(session, cbt, value, modify_type) :
+		    __cursor_col_modify_v(session, cbt, value, modify_type);
 		if (ret == 0)
 			goto done;
 
@@ -1052,6 +1091,7 @@ retry:	WT_ERR(__cursor_func_init(cbt, true));
 
 	if (btree->type == BTREE_ROW) {
 		WT_ERR(__cursor_row_search(session, cbt, NULL, true));
+
 		/*
 		 * If not overwriting, check for conflicts and fail if the key
 		 * does not exist.
@@ -1061,7 +1101,7 @@ retry:	WT_ERR(__cursor_func_init(cbt, true));
 			if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
 				WT_ERR(WT_NOTFOUND);
 		}
-		ret = __cursor_row_modify(session, cbt, modify_type);
+		ret = __cursor_row_modify_v(session, cbt, value, modify_type);
 	} else {
 		WT_ERR(__cursor_col_search(session, cbt, NULL));
 
@@ -1080,7 +1120,7 @@ retry:	WT_ERR(__cursor_func_init(cbt, true));
 			    !__cursor_fix_implicit(btree, cbt))
 				WT_ERR(WT_NOTFOUND);
 		}
-		ret = __cursor_col_modify(session, cbt, modify_type);
+		ret = __cursor_col_modify_v(session, cbt, value, modify_type);
 	}
 
 err:	if (ret == WT_RESTART) {
@@ -1097,14 +1137,33 @@ err:	if (ret == WT_RESTART) {
 	 * To make this work, we add a field to the btree cursor to pass back a
 	 * pointer to the modify function's allocated update structure.
 	 */
-done:	if (ret == 0) {
-		if (modify_type == WT_UPDATE_RESERVED) {
+done:	if (ret == 0)
+		switch (modify_type) {
+		case WT_UPDATE_STANDARD:
+			/*
+			 * WT_CURSOR.update returns a key and a value.
+			 */
+			WT_TRET(__cursor_kv_return(
+			    session, cbt, cbt->modify_update));
+			break;
+		case WT_UPDATE_RESERVED:
+			/*
+			 * WT_CURSOR.reserve doesn't return any value.
+			 */
 			F_CLR(cursor, WT_CURSTD_VALUE_SET);
+			/* FALLTHROUGH */
+		case WT_UPDATE_MODIFIED:
+			/*
+			 * WT_CURSOR.modify has already created the return value
+			 * and our job is to leave it untouched.
+			 */
 			WT_TRET(__wt_key_return(session, cbt));
-		} else
-			WT_TRET(
-			    __wt_kv_return(session, cbt, cbt->modify_update));
-	}
+			break;
+		case WT_UPDATE_DELETED:
+		default:
+			WT_TRET(__wt_illegal_value(session, NULL));
+			break;
+		}
 
 	if (ret != 0) {
 		WT_TRET(__cursor_reset(cbt));
@@ -1115,6 +1174,121 @@ done:	if (ret == 0) {
 }
 
 /*
+ * __cursor_chain_exceeded --
+ *	Return if the update chain has exceeded the limit. Deleted or standard
+ * updates are anticipated to be sufficient to base the modify (although that's
+ * not guaranteed, they may not be visible or might abort before we read them).
+ * Also, this is not a hard limit, threads can race modifying updates.
+ */
+static bool
+__cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
+{
+	WT_PAGE *page;
+	WT_UPDATE *upd;
+	int i;
+
+	page = cbt->ref->page;
+
+	upd = NULL;
+	if (cbt->ins != NULL)
+		upd = cbt->ins->upd;
+	else if (cbt->btree->type == BTREE_ROW &&
+	    page->modify != NULL && page->modify->mod_row_update != NULL)
+		upd = page->modify->mod_row_update[cbt->slot];
+
+	for (i = 0; upd != NULL; ++i, upd = upd->next) {
+		if (upd->type == WT_UPDATE_DELETED ||
+		    upd->type == WT_UPDATE_STANDARD)
+			return (false);
+		if (i >= WT_MAX_MODIFY_UPDATE)
+			return (true);
+	}
+	return (false);
+}
+
+/*
+ * __wt_btcur_modify --
+ *     Modify a record in the tree.
+ */
+int
+__wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
+{
+	WT_CURFILE_STATE state;
+	WT_CURSOR *cursor;
+	WT_DECL_ITEM(modify);
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	size_t orig, new;
+	bool chain_exceeded, overwrite;
+
+	cursor = &cbt->iface;
+	session = (WT_SESSION_IMPL *)cursor->session;
+
+	WT_STAT_CONN_INCR(session, cursor_modify);
+	WT_STAT_DATA_INCR(session, cursor_modify);
+
+	/* Save the cursor state. */
+	__cursor_state_save(cursor, &state);
+
+	/*
+	 * Get the current value and apply the modification to it, for a few
+	 * reasons: first, we set the updated value so the application can
+	 * retrieve the cursor's value; second, we use the updated value as
+	 * the update if the update chain is too long; third, there's a check
+	 * if the updated value is too large to store; fourth, to simplify the
+	 * count of bytes being added/removed; fifth, we can get into serious
+	 * trouble if we attempt to modify a value that doesn't exist. For the
+	 * fifth reason, verify we're not in a read-uncommitted transaction,
+	 * that implies a value that might disappear out from under us.
+	 */
+	if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+		WT_ERR_MSG(session, ENOTSUP,
+		    "not supported in read-uncommitted transactions");
+
+	WT_ERR(__wt_btcur_search(cbt));
+	orig = cursor->value.size;
+	WT_ERR(__wt_modify_apply_api(
+	    session, &cursor->value, entries, nentries));
+	new = cursor->value.size;
+	WT_ERR(__cursor_size_chk(session, &cursor->value));
+	if (new > orig)
+		WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig);
+	else
+		WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new);
+
+	/*
+	 * WT_CURSOR.modify is update-without-overwrite.
+	 *
+	 * Use the modify buffer as the update if under the limit, else use the
+	 * complete value.
+	 */
+	overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
+	F_CLR(cursor, WT_CURSTD_OVERWRITE);
+	chain_exceeded = __cursor_chain_exceeded(cbt);
+	if (chain_exceeded)
+		ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
+	else if ((ret =
+	    __wt_modify_pack(session, &modify, entries, nentries)) == 0)
+		ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFIED);
+	if (overwrite)
+	       F_SET(cursor, WT_CURSTD_OVERWRITE);
+
+	/*
+	 * We have our own cursor state restoration because we've modified the
+	 * cursor before calling the underlying cursor update function and we
+	 * need to restore it to its original state. This means multiple calls
+	 * to reset the cursor, but that shouldn't be a problem.
+	 */
+	if (ret != 0) {
+err:		WT_TRET(__cursor_reset(cbt));
+		__cursor_state_restore(cursor, &state);
+	}
+
+	__wt_scr_free(session, &modify);
+	return (ret);
+}
+
+/*
  * __wt_btcur_reserve --
  *     Reserve a record in the tree.
  */
@@ -1135,7 +1309,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
 	/* WT_CURSOR.reserve is update-without-overwrite and a special value. */
 	overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
 	F_CLR(cursor, WT_CURSTD_OVERWRITE);
-	ret = __btcur_update(cbt, WT_UPDATE_RESERVED);
+	ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVED);
 	if (overwrite)
 	       F_SET(cursor, WT_CURSTD_OVERWRITE);
 	return (ret);
@@ -1164,7 +1338,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
 		WT_RET(__cursor_size_chk(session, &cursor->key));
 	WT_RET(__cursor_size_chk(session, &cursor->value));
 
-	return (__btcur_update(cbt, WT_UPDATE_STANDARD));
+	return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD));
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index c0aaf3f42d9..b8d11be7b3e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -986,6 +986,35 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
 }
 
 /*
+ * __debug_modified --
+ *	Dump a modified update.
+ */
+static int
+__debug_modified(WT_DBG *ds, WT_UPDATE *upd)
+{
+	const size_t *p;
+	int nentries;
+	const uint8_t *data;
+	void *modify;
+
+	modify = upd->data;
+
+	p = modify;
+	nentries = (int)*p++;
+	data = (uint8_t *)modify +
+	    sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
+
+	WT_RET(ds->f(ds, "%d: ", nentries));
+	for (; nentries-- > 0; data += p[0], p += 3)
+		WT_RET(ds->f(ds,
+		    "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT
+		    ", %.*s}%s", p[0], p[1], p[2],
+		    (int)p[2], data, nentries == 0 ? "" : ", "));
+
+	return (0);
+}
+
+/*
  * __debug_update --
  *	Dump an update list.
  */
@@ -993,37 +1022,46 @@ static int
 __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
 {
 	for (; upd != NULL; upd = upd->next) {
-		if (upd->type == WT_UPDATE_DELETED)
+		switch (upd->type) {
+		case WT_UPDATE_DELETED:
 			WT_RET(ds->f(ds, "\tvalue {deleted}\n"));
-		else if (upd->type == WT_UPDATE_RESERVED)
-			WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
-		else if (hexbyte) {
-			WT_RET(ds->f(ds, "\t{"));
-			WT_RET(__debug_hex_byte(ds,
-			    *(uint8_t *)WT_UPDATE_DATA(upd)));
+			break;
+		case WT_UPDATE_MODIFIED:
+			WT_RET(ds->f(ds, "\tvalue {modified: "));
+			WT_RET(__debug_modified(ds, upd));
 			WT_RET(ds->f(ds, "}\n"));
-		} else
-			WT_RET(__debug_item(ds,
-			    "value", WT_UPDATE_DATA(upd), upd->size));
-		WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
+			break;
+		case WT_UPDATE_RESERVED:
+			WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
+			break;
+		case WT_UPDATE_STANDARD:
+			if (hexbyte) {
+				WT_RET(ds->f(ds, "\t{"));
+				WT_RET(__debug_hex_byte(ds, *upd->data));
+				WT_RET(ds->f(ds, "}\n"));
+			} else
+				WT_RET(__debug_item(ds,
+				    "value", upd->data, upd->size));
+			break;
+		}
+		if (upd->txnid == WT_TXN_ABORTED)
+			WT_RET(ds->f(ds, "\t" "txn aborted"));
+		else
+			WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
 
 #ifdef HAVE_TIMESTAMPS
-		if (!__wt_timestamp_iszero(upd->timestamp)) {
+		if (!__wt_timestamp_iszero(
+		    WT_TIMESTAMP_NULL(&upd->timestamp))) {
 #if WT_TIMESTAMP_SIZE == 8
-			{
-			uint64_t ts;
-			__wt_timestamp_set(
-			    (uint8_t *)&ts, (uint8_t *)&upd->timestamp[0]);
-			ts = __wt_bswap64(ts);
-			WT_RET(ds->f(ds, ", stamp %" PRIu64, ts));
-			}
+			WT_RET(ds->f(ds,
+			    ", stamp %" PRIu64, upd->timestamp.val));
 #else
-			{
 			int i;
+
 			WT_RET(ds->f(ds, ", stamp 0x"));
 			for (i = 0; i < WT_TIMESTAMP_SIZE; ++i)
-				WT_RET(ds->f(ds, "%" PRIx8, upd->timestamp[i]));
-			}
+				WT_RET(ds->f(ds,
+				    "%" PRIx8, upd->timestamp.ts[i]));
 #endif
 		}
 #endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index eac8994a5a4..093192dbaa0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -153,6 +153,7 @@ void
 __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
 {
 	WT_UPDATE **upd;
+	uint64_t sleep_count, yield_count;
 
 	/*
 	 * If the page is still "deleted", it's as we left it, reset the state
@@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
 	 * instantiated or being instantiated.  Loop because it's possible for
 	 * the page to return to the deleted state if instantiation fails.
 	 */
-	for (;; __wt_yield())
+	for (sleep_count = yield_count = 0;;) {
 		switch (ref->state) {
 		case WT_REF_DISK:
 		case WT_REF_READING:
@@ -205,6 +206,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
 			__wt_free(session, ref->page_del);
 			return;
 		}
+		/*
+		 * We wait for the change in page state, yield before retrying,
+		 * and if we've yielded enough times, start sleeping so we don't
+		 * burn CPU to no purpose.
+		 */
+		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+		WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
+		    sleep_count);
+	}
 }
 
 /*
@@ -242,10 +252,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
 		return (false);
 
 	skip = ref->page_del == NULL || (visible_all ?
-	    __wt_txn_visible_all(session,
-		ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)):
-	    __wt_txn_visible(session,
-		ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)));
+	    __wt_txn_visible_all(session, ref->page_del->txnid,
+		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)):
+	    __wt_txn_visible(session, ref->page_del->txnid,
+		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)));
 
 	/*
 	 * The page_del structure can be freed as soon as the delete is stable:
@@ -254,8 +264,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
 	 * no longer need synchronization to check the ref.
 	 */
 	if (skip && ref->page_del != NULL && (visible_all ||
-	    __wt_txn_visible_all(session,
-	    ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)))) {
+	    __wt_txn_visible_all(session, ref->page_del->txnid,
+		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
 		__wt_free(session, ref->page_del->update_list);
 		__wt_free(session, ref->page_del);
 	}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index a0b1ff65006..f933245eaef 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -45,13 +45,15 @@ __ovfl_read(WT_SESSION_IMPL *session,
  */
 int
 __wt_ovfl_read(WT_SESSION_IMPL *session,
-    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
 {
 	WT_DECL_RET;
 	WT_OVFL_TRACK *track;
 	WT_UPDATE *upd;
 	size_t i;
 
+	*decoded = false;
+
 	/*
 	 * If no page specified, there's no need to lock and there's no cache
 	 * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
@@ -78,8 +80,9 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
 				break;
 			}
 		WT_ASSERT(session, i < track->remove_next);
-		store->data = WT_UPDATE_DATA(upd);
+		store->data = upd->data;
 		store->size = upd->size;
+		*decoded = true;
 	} else
 		ret = __ovfl_read(session, unpack->data, unpack->size, store);
 	__wt_readunlock(session, &S2BT(session)->ovfl_lock);
@@ -147,7 +150,7 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
 
 	/* Read the overflow value. */
 	WT_RET(__wt_scr_alloc(session, 1024, &tmp));
-	WT_ERR(__ovfl_read(session, unpack->data, unpack->size, tmp));
+	WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp));
 
 	/*
 	 * Create an update entry with no transaction ID to ensure global
@@ -159,10 +162,23 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
 	 * involves atomic operations which will act as our barrier. Regardless,
 	 * we update the page footprint as part of this operation, which acts as
 	 * a barrier as well.
+	 *
+	 * The update transaction ID choice is tricky, to work around an issue
+	 * in variable-length column store. Imagine an overflow value with an
+	 * RLE greater than 1. We append a copy to the end of an update chain,
+	 * but it's possible it's the overflow value for more than one record,
+	 * and appending it to the end of one record's update chain means a
+	 * subsequent enter of a globally visible value to one of the records
+	 * would allow the truncation of the overflow chain that leaves other
+	 * records without a value. If appending such an overflow record, set
+	 * the transaction ID to the first possible transaction ID. That ID is
+	 * old enough to be globally visible, but we can use it as a flag if an
+	 * update record cannot be discarded when truncating an update chain.
 	 */
 	WT_ERR(__wt_update_alloc(
 	    session, tmp, &append, &size, WT_UPDATE_STANDARD));
-	append->txnid = WT_TXN_NONE;
+	append->txnid = page->type == WT_PAGE_COL_VAR &&
+	    __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE;
 	for (upd = upd_list; upd->next != NULL; upd = upd->next)
 		;
 	WT_PUBLISH(upd->next, append);
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 1bdf0fd1c8b..f28c4e10594 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -417,9 +417,10 @@ random_page_entry:
 	 * the next entry, if that doesn't work, move to the previous entry.
 	 */
 	WT_ERR(__wt_row_random_leaf(session, cbt));
-	if (__wt_cursor_valid(cbt, &upd))
-		WT_ERR(__wt_kv_return(session, cbt, upd));
-	else {
+	if (__wt_cursor_valid(cbt, &upd)) {
+		WT_ERR(__wt_key_return(session, cbt));
+		WT_ERR(__wt_value_return(session, cbt, upd));
+	} else {
 		if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
 			ret = __wt_btcur_prev(cbt, false);
 		WT_ERR(ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 6a89f505c31..91c1499840e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -194,7 +194,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
 		upd->txnid = upd_txnid;
 #ifdef HAVE_TIMESTAMPS
 		WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
-		__wt_timestamp_set(upd->timestamp, las_timestamp.data);
+		__wt_timestamp_set(&upd->timestamp, las_timestamp.data);
 #endif
 
 		switch (page->type) {
@@ -487,7 +487,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
 	WT_BTREE *btree;
 	WT_DECL_RET;
 	WT_PAGE *page;
-	u_int sleep_cnt, wait_cnt;
+	uint64_t sleep_cnt, wait_cnt;
 	bool busy, cache_work, evict_soon, stalled;
 	int force_attempts;
 
@@ -672,9 +672,8 @@ skip_evict:
 			if (cache_work)
 				continue;
 		}
-		sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
+		__wt_ref_state_yield_sleep(&wait_cnt, &sleep_cnt);
 		WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt);
-		__wt_sleep(0, sleep_cnt);
 	}
 }
 
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 7212de72d6e..4452e6eb0c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -75,10 +75,10 @@ __key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
 
 /*
  * __value_return --
- *	Change the cursor to reference an internal return value.
+ *	Change the cursor to reference an internal original-page return value.
  */
 static inline int
-__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
 {
 	WT_BTREE *btree;
 	WT_CELL *cell;
@@ -93,13 +93,6 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
 	page = cbt->ref->page;
 	cursor = &cbt->iface;
 
-	/* If the cursor references a WT_UPDATE item, return it. */
-	if (upd != NULL) {
-		cursor->value.data = WT_UPDATE_DATA(upd);
-		cursor->value.size = upd->size;
-		return (0);
-	}
-
 	if (page->type == WT_PAGE_ROW_LEAF) {
 		rip = &page->pg_row[cbt->slot];
 
@@ -136,6 +129,99 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
 }
 
 /*
+ * __value_return_upd --
+ *	Change the cursor to reference an internal update structure return
+ * value.
+ */
+static inline int
+__value_return_upd(
+    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_UPDATE **listp, *list[WT_MAX_MODIFY_UPDATE];
+	u_int i;
+	size_t allocated_bytes;
+
+	cursor = &cbt->iface;
+	allocated_bytes = 0;
+
+	/*
+	 * We're passed a "standard" or "modified" update that's visible to us.
+	 * Our caller should have already checked for deleted items (we're too
+	 * far down the call stack to return not-found).
+	 *
+	 * Fast path if it's a standard item, assert our caller's behavior.
+	 */
+	if (upd->type == WT_UPDATE_STANDARD) {
+		cursor->value.data = upd->data;
+		cursor->value.size = upd->size;
+		return (0);
+	}
+	WT_ASSERT(session, upd->type == WT_UPDATE_MODIFIED);
+
+	/*
+	 * Find a complete update that's visible to us, tracking modifications
+	 * that are visible to us.
+	 */
+	for (i = 0, listp = list; upd != NULL; upd = upd->next) {
+		if (!__wt_txn_upd_visible(session, upd))
+			continue;
+
+		if (WT_UPDATE_DATA_VALUE(upd))
+			break;
+
+		if (upd->type == WT_UPDATE_MODIFIED) {
+			/*
+			 * Update lists are expected to be short, but it's not
+			 * guaranteed. There's sufficient room on the stack to
+			 * avoid memory allocation in normal cases, but we have
+			 * to handle the edge cases too.
+			 */
+			if (i >= WT_MAX_MODIFY_UPDATE) {
+				if (i == WT_MAX_MODIFY_UPDATE)
+					listp = NULL;
+				WT_ERR(__wt_realloc_def(
+				    session, &allocated_bytes, i + 1, &listp));
+				if (i == WT_MAX_MODIFY_UPDATE)
+					memcpy(listp, list, sizeof(list));
+			}
+			listp[i++] = upd;
+		}
+	}
+
+	/*
+	 * If we hit the end of the chain, roll forward from the update item we
+	 * found, otherwise, from the original page's value.
+	 */
+	if (upd == NULL) {
+		/*
+		 * Callers of this function set the cursor slot to an impossible
+		 * value to check we're not trying to return on-page values when
+		 * the update list should have been sufficient (which happens,
+		 * for example, if an update list was truncated, deleting some
+		 * standard update required by a previous modify update). Assert
+		 * the case.
+		 */
+		WT_ASSERT(session, cbt->slot != UINT32_MAX);
+
+		WT_ERR(__value_return(session, cbt));
+	} else if (upd->type == WT_UPDATE_DELETED)
+		WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
+	else
+		WT_ERR(__wt_buf_set(session,
+		    &cursor->value, upd->data, upd->size));
+
+	while (i > 0)
+		WT_ERR(__wt_modify_apply(
+		    session, &cursor->value, listp[--i]->data));
+
+err:	if (allocated_bytes)
+		__wt_free(session, listp);
+	return (ret);
+}
+
+/*
  * __wt_key_return --
  *	Change the cursor to reference an internal return key.
  */
@@ -164,21 +250,22 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
 }
 
 /*
- * __wt_kv_return --
- *	Return a page referenced key/value pair to the application.
+ * __wt_value_return --
+ *	Change the cursor to reference an internal return value.
  */
 int
-__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__wt_value_return(
+    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
 {
 	WT_CURSOR *cursor;
 
 	cursor = &cbt->iface;
 
-	WT_RET(__wt_key_return(session, cbt));
-
 	F_CLR(cursor, WT_CURSTD_VALUE_EXT);
-	WT_RET(__value_return(session, cbt, upd));
+	if (upd == NULL)
+		WT_RET(__value_return(session, cbt));
+	else
+		WT_RET(__value_return_upd(session, cbt, upd));
 	F_SET(cursor, WT_CURSTD_VALUE_INT);
-
 	return (0);
 }
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index c1b7b6c4001..2862c7fb6d7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -31,6 +31,24 @@ typedef enum {
 } WT_SPLIT_ERROR_PHASE;
 
 /*
+ * __page_split_timing_stress --
+ *	Optionally sleep to encourage race conditions in page split; this is a
+ * debugging aid intended to uncover those races.
+ */
+static void
+__page_split_timing_stress(WT_SESSION_IMPL *session,
+    uint32_t flag, uint64_t micro_seconds)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+
+	/* Only sleep when the caller's timing-stress flag is set. */
+	if (FLD_ISSET(conn->timing_stress_flags, flag))
+		__wt_sleep(0, micro_seconds);
+}
+
+/*
  * __split_safe_free --
  *	Free a buffer if we can be sure no thread is accessing it, or schedule
  *	it to be freed otherwise.
@@ -308,8 +326,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
  *	Prepare a set of WT_REFs for a move.
  */
 static void
-__split_ref_prepare(WT_SESSION_IMPL *session,
-    WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
+__split_ref_prepare(
+    WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
 {
 	WT_PAGE *child;
 	WT_REF *child_ref, *ref;
@@ -331,40 +349,12 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
 		ref = pindex->index[i];
 		child = ref->page;
 
-		/*
-		 * Block eviction in newly created pages.
-		 *
-		 * Once the split is live, newly created internal pages might be
-		 * evicted and their WT_REF structures freed. If that happened
-		 * before all threads exit the index of the page that previously
-		 * "owned" the WT_REF, a thread might see a freed WT_REF. To
-		 * ensure that doesn't happen, the newly created page contains
-		 * the current split generation and can't be evicted until
-		 * all readers have left the old generation.
-		 *
-		 * Historic, we also blocked splits in newly created pages
-		 * because we didn't update the WT_REF.home field until after
-		 * the split was live, so the WT_REF.home fields being updated
-		 * could split again before the update, there's a race between
-		 * splits as to which would update them first. The current code
-		 * updates the WT_REF.home fields before going live (in this
-		 * function), this isn't an issue.
-		 */
-		child->pg_intl_split_gen = split_gen;
-
-		/*
-		 * We use a page flag to prevent the child from splitting from
-		 * underneath us, but the split-generation error checks don't
-		 * know about that flag; use the standard macros to ensure that
-		 * reading the child's page index structure is safe.
-		 */
+		/* Switch the WT_REF's to their new page. */
 		j = 0;
-		WT_ENTER_PAGE_INDEX(session);
 		WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
 			child_ref->home = child;
 			child_ref->pindex_hint = j++;
 		} WT_INTL_FOREACH_END;
-		WT_LEAVE_PAGE_INDEX(session);
 
 #ifdef HAVE_DIAGNOSTIC
 		WT_WITH_PAGE_INDEX(session,
@@ -447,6 +437,18 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
 		WT_ERR(__wt_calloc_one(session, alloc_refp));
 	root_incr += children * sizeof(WT_REF);
 
+	/*
+	 * Once the split is live, newly created internal pages might be evicted
+	 * and their WT_REF structures freed. If that happens before all threads
+	 * exit the index of the page that previously "owned" the WT_REF, a
+	 * thread might see a freed WT_REF. To ensure that doesn't happen, the
+	 * created pages are set to the current split generation and so can't be
+	 * evicted until all readers have left the old generation.
+	 *
+	 * Our thread has a stable split generation, get a copy.
+	 */
+	split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
 	/* Allocate child pages, and connect them into the new page index. */
 	for (root_refp = pindex->index,
 	    alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
@@ -471,10 +473,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
 			ref->ref_recno = (*root_refp)->ref_recno;
 		ref->state = WT_REF_MEM;
 
-		/* Initialize the child page. */
+		/*
+		 * Initialize the child page.
+		 * Block eviction in newly created pages and mark them dirty.
+		 */
 		child->pg_intl_parent_ref = ref;
-
-		/* Mark it dirty. */
+		child->pg_intl_split_gen = split_gen;
 		WT_ERR(__wt_page_modify_init(session, child));
 		__wt_page_modify_set(session, child);
 
@@ -504,13 +508,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
 	/* Start making real changes to the tree, errors are fatal. */
 	complete = WT_ERR_PANIC;
 
-	/*
-	 * Prepare the WT_REFs for the move: this requires a stable split
-	 * generation to block splits in newly created pages, so get one.
-	 */
-	WT_ENTER_PAGE_INDEX(session);
-	__split_ref_prepare(session, alloc_index,
-	    __wt_session_gen(session, WT_GEN_SPLIT), false);
+	/* Prepare the WT_REFs for the move. */
+	__split_ref_prepare(session, alloc_index, false);
+
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
 
 	/*
 	 * Confirm the root page's index hasn't moved, then update it, which
@@ -520,12 +523,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
 	WT_INTL_INDEX_SET(root, alloc_index);
 	alloc_index = NULL;
 
-	WT_LEAVE_PAGE_INDEX(session);
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
 
 	/*
 	 * Get a generation for this split, mark the root page.  This must be
 	 * after the new index is swapped into place in order to know that no
 	 * readers are looking at the old index.
+	 *
+	 * Note: as the root page cannot currently be evicted, the root split
+	 * generation isn't ever used. That said, it future-proofs eviction
+	 * and isn't expensive enough to special-case.
+	 *
+	 * Getting a new split generation implies a full barrier, no additional
+	 * barrier is needed.
 	 */
 	split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
 	root->pg_intl_split_gen = split_gen;
@@ -700,6 +712,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
 	/* Start making real changes to the tree, errors are fatal. */
 	complete = WT_ERR_PANIC;
 
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
 	/*
 	 * Confirm the parent page's index hasn't moved then update it, which
 	 * makes the split visible to threads descending the tree.
@@ -708,10 +724,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
 	WT_INTL_INDEX_SET(parent, alloc_index);
 	alloc_index = NULL;
 
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
 	/*
 	 * Get a generation for this split, mark the page.  This must be after
 	 * the new index is swapped into place in order to know that no readers
 	 * are looking at the old index.
+	 *
+	 * Getting a new split generation implies a full barrier, no additional
+	 * barrier is needed.
 	 */
 	split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
 	parent->pg_intl_split_gen = split_gen;
@@ -760,7 +783,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
 	 * Swapping in the new page index released the page for eviction, we can
 	 * no longer look inside the page.
 	 */
-
 	if (ref->page == NULL)
 		__wt_verbose(session, WT_VERB_SPLIT,
 		    "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -779,8 +801,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
 	/*
 	 * The new page index is in place, free the WT_REF we were splitting and
 	 * any deleted WT_REFs we found, modulo the usual safe free semantics.
-	 *
-	 * Acquire a new split generation.
 	 */
 	for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
 		next_ref = pindex->index[deleted_refs[i]];
@@ -976,6 +996,18 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
 		WT_ERR(__wt_calloc_one(session, alloc_refp));
 	parent_incr += children * sizeof(WT_REF);
 
+	/*
+	 * Once the split is live, newly created internal pages might be evicted
+	 * and their WT_REF structures freed. If that happens before all threads
+	 * exit the index of the page that previously "owned" the WT_REF, a
+	 * thread might see a freed WT_REF. To ensure that doesn't happen, the
+	 * created pages are set to the current split generation and so can't be
+	 * evicted until all readers have left the old generation.
+	 *
+	 * Our thread has a stable split generation, get a copy.
+	 */
+	split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
 	/* Allocate child pages, and connect them into the new page index. */
 	WT_ASSERT(session, page_refp == pindex->index + chunk);
 	for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
@@ -1000,10 +1032,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
 			ref->ref_recno = (*page_refp)->ref_recno;
 		ref->state = WT_REF_MEM;
 
-		/* Initialize the child page. */
+		/*
+		 * Initialize the child page.
+		 * Block eviction in newly created pages and mark them dirty.
+		 */
 		child->pg_intl_parent_ref = ref;
-
-		/* Mark it dirty. */
+		child->pg_intl_split_gen = split_gen;
 		WT_ERR(__wt_page_modify_init(session, child));
 		__wt_page_modify_set(session, child);
 
@@ -1033,32 +1067,35 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
 	/* Start making real changes to the tree, errors are fatal. */
 	complete = WT_ERR_PANIC;
 
-	/*
-	 * Prepare the WT_REFs for the move: this requires a stable split
-	 * generation to block splits in newly created pages, so get one.
-	 */
-	WT_ENTER_PAGE_INDEX(session);
-	__split_ref_prepare(session, alloc_index,
-	    __wt_session_gen(session, WT_GEN_SPLIT), true);
+	/* Prepare the WT_REFs for the move. */
+	__split_ref_prepare(session, alloc_index, true);
+
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
 
 	/* Split into the parent. */
-	if ((ret = __split_parent(session, page_ref, alloc_index->index,
-	    alloc_index->entries, parent_incr, false, false)) == 0) {
-		/*
-		 * Confirm the page's index hasn't moved, then update it, which
-		 * makes the split visible to threads descending the tree.
-		 */
-		WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
-		WT_INTL_INDEX_SET(page, replace_index);
-	}
+	WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+	    alloc_index->entries, parent_incr, false, false));
 
-	WT_LEAVE_PAGE_INDEX(session);
-	WT_ERR(ret);
+	/*
+	 * Confirm the page's index hasn't moved, then update it, which
+	 * makes the split visible to threads descending the tree.
+	 */
+	WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+	WT_INTL_INDEX_SET(page, replace_index);
+
+	/* Encourage a race */
+	__page_split_timing_stress(session,
+	    WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
 
 	/*
 	 * Get a generation for this split, mark the parent page.  This must be
 	 * after the new index is swapped into place in order to know that no
 	 * readers are looking at the old index.
+	 *
+	 * Getting a new split generation implies a full barrier, no additional
+	 * barrier is needed.
 	 */
 	split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
 	page->pg_intl_split_gen = split_gen;
@@ -1122,18 +1159,15 @@ err:	switch (complete) {
 }
 
 /*
- * __split_internal_lock_worker --
+ * __split_internal_lock --
  *	Lock an internal page.
  */
 static int
-__split_internal_lock_worker(WT_SESSION_IMPL *session,
-    WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
+__split_internal_lock(
+    WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp)
 {
-	WT_DECL_RET;
 	WT_PAGE *parent;
-	WT_REF *parent_ref;
 
-	*hazardp = false;
 	*parentp = NULL;
 
 	/*
@@ -1166,10 +1200,11 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
 	for (;;) {
 		parent = ref->home;
 
-		/*
-		 * The page will be marked dirty, and we can only lock a page
-		 * with a modify structure.
-		 */
+		/* Encourage a race */
+		__page_split_timing_stress(session,
+		    WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND);
+
+		/* Page locks live in the modify structure. */
 		WT_RET(__wt_page_modify_init(session, parent));
 
 		if (trylock)
@@ -1182,69 +1217,28 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
 	}
 
 	/*
-	 * We have exclusive access to split the parent, and at this point, the
-	 * child prevents the parent from being evicted.  However, once we
+	 * This child has exclusive access to split its parent and the child's
+	 * existence prevents the parent from being evicted. However, once we
 	 * update the parent's index, it may no longer refer to the child, and
-	 * could conceivably be evicted.  Get a hazard pointer on the parent
-	 * now, so that we can safely access it after updating the index.
-	 *
-	 * Take care getting the page doesn't trigger eviction work: we could
-	 * block trying to split a different child of our parent and deadlock
-	 * or we could be the eviction server relied upon by other threads to
-	 * populate the eviction queue.
-	 */
-	if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
-		WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
-		*hazardp = true;
-	}
+	 * could conceivably be evicted. If the parent page is dirty, our page
+	 * lock prevents eviction because reconciliation is blocked. However,
+	 * if the page were clean, it could be evicted without encountering our
+	 * page lock. That isn't possible because you cannot move a child page
+	 * and still leave the parent page clean.
+	 */
 
 	*parentp = parent;
 	return (0);
-
-err:	WT_PAGE_UNLOCK(session, parent);
-	return (ret);
-}
-
-/*
- * __split_internal_lock --
- *	Lock an internal page.
- */
-static int
-__split_internal_lock(WT_SESSION_IMPL *session,
-    WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
-{
-	WT_DECL_RET;
-
-	/*
-	 * There's no lock on our parent page and we're about to acquire one,
-	 * which implies using the WT_REF.home field to reference our parent
-	 * page. As a child of the parent page, we prevent its eviction, but
-	 * that's a weak guarantee. If the parent page splits, and our WT_REF
-	 * were to move with the split, the WT_REF.home field might change
-	 * underneath us and we could race, and end up attempting to access
-	 * an evicted page. Set the session page-index generation so if the
-	 * parent splits, it still can't be evicted.
-	 */
-	WT_WITH_PAGE_INDEX(session,
-	    ret = __split_internal_lock_worker(
-	    session, ref, trylock, parentp, hazardp));
-	return (ret);
 }
 
 /*
  * __split_internal_unlock --
  *	Unlock the parent page.
  */
-static int
-__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+static void
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent)
 {
-	WT_DECL_RET;
-
-	if (hazard)
-		ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
-
 	WT_PAGE_UNLOCK(session, parent);
-	return (ret);
 }
 
 /*
@@ -1297,13 +1291,12 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
  *	Check if we should split up the tree.
  */
 static int
-__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_BTREE *btree;
 	WT_DECL_RET;
 	WT_PAGE *parent;
 	WT_REF *ref;
-	bool parent_hazard;
 
 	btree = S2BT(session);
 
@@ -1317,8 +1310,10 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
 	 * split chunk, but we'll write it upon finding it in a different part
 	 * of the tree.
 	 */
-	if (btree->checkpointing != WT_CKPT_OFF)
-		return (__split_internal_unlock(session, page, page_hazard));
+	if (btree->checkpointing != WT_CKPT_OFF) {
+		__split_internal_unlock(session, page);
+		return (0);
+	}
 
 	/*
 	 * Page splits trickle up the tree, that is, as leaf pages grow large
@@ -1340,7 +1335,6 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
 	 */
 	for (;;) {
 		parent = NULL;
-		parent_hazard = false;
 		ref = page->pg_intl_parent_ref;
 
 		/* If we don't need to split the page, we're done. */
@@ -1360,22 +1354,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
 		 * Lock the parent and split into it, then swap the parent/page
 		 * locks, lock-coupling up the tree.
 		 */
-		WT_ERR(__split_internal_lock(
-		    session, ref, true, &parent, &parent_hazard));
+		WT_ERR(__split_internal_lock(session, ref, true, &parent));
 		ret = __split_internal(session, parent, page);
-		WT_TRET(__split_internal_unlock(session, page, page_hazard));
+		__split_internal_unlock(session, page);
 
 		page = parent;
-		page_hazard = parent_hazard;
 		parent = NULL;
-		parent_hazard = false;
 		WT_ERR(ret);
 	}
 
 err:	if (parent != NULL)
-		WT_TRET(
-		    __split_internal_unlock(session, parent, parent_hazard));
-	WT_TRET(__split_internal_unlock(session, page, page_hazard));
+		__split_internal_unlock(session, parent);
+	__split_internal_unlock(session, page);
 
 	/* A page may have been busy, in which case return without error. */
 	WT_RET_BUSY_OK(ret);
@@ -1462,11 +1452,11 @@ __split_multi_inmem(
 		case WT_PAGE_ROW_LEAF:
 			/* Build a key. */
 			if (supd->ins == NULL) {
-				slot = WT_ROW_SLOT(orig, supd->rip);
+				slot = WT_ROW_SLOT(orig, supd->ripcip);
 				upd = orig->modify->mod_row_update[slot];
 
 				WT_ERR(__wt_row_leaf_key(
-				    session, orig, supd->rip, key, false));
+				    session, orig, supd->ripcip, key, false));
 			} else {
 				upd = supd->ins->upd;
 
@@ -1530,7 +1520,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
 			break;
 		case WT_PAGE_ROW_LEAF:
 			if (supd->ins == NULL) {
-				slot = WT_ROW_SLOT(orig, supd->rip);
+				slot = WT_ROW_SLOT(orig, supd->ripcip);
 				orig->modify->mod_row_update[slot] = NULL;
 			} else
 				supd->ins->upd = NULL;
@@ -1986,21 +1976,19 @@ err:	if (split_ref[0] != NULL) {
 }
 
 /*
- * __wt_split_insert --
- *	Lock, then split.
+ * __split_insert_lock --
+ *	Lock the parent, then split off the page's last insert list entries.
  */
-int
-__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+static int
+__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref)
 {
 	WT_DECL_RET;
 	WT_PAGE *parent;
-	bool hazard;
-
-	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
 
-	WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
+	/* Lock the parent page, then proceed with the insert split. */
+	WT_RET(__split_internal_lock(session, ref, true, &parent));
 	if ((ret = __split_insert(session, ref)) != 0) {
-		WT_TRET(__split_internal_unlock(session, parent, hazard));
+		__split_internal_unlock(session, parent);
 		return (ret);
 	}
 
@@ -2009,7 +1997,27 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
 	 * parent page locked, note the functions we call are responsible for
 	 * releasing that lock.
 	 */
-	return (__split_parent_climb(session, parent, hazard));
+	return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_insert --
+ *	Split a page's last insert list entries into a separate page.
+ */
+int
+__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+	WT_DECL_RET;
+
+	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
+
+	/*
+	 * Set the session split generation to ensure underlying code isn't
+	 * surprised by internal page eviction, then proceed with the insert
+	 * split.
+	 */
+	WT_WITH_PAGE_INDEX(session, ret = __split_insert_lock(session, ref));
+	return (ret);
 }
 
 /*
@@ -2077,21 +2085,19 @@ err:		for (i = 0; i < new_entries; ++i)
 }
 
 /*
- * __wt_split_multi --
- *	Lock, then split.
+ * __split_multi_lock --
+ *	Lock the parent page, then split a page into multiple pages.
  */
-int
-__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+static int
+__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
 {
 	WT_DECL_RET;
 	WT_PAGE *parent;
-	bool hazard;
 
-	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
-
-	WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
+	/* Lock the parent page, then proceed with the split. */
+	WT_RET(__split_internal_lock(session, ref, false, &parent));
 	if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
-		WT_TRET(__split_internal_unlock(session, parent, hazard));
+		__split_internal_unlock(session, parent);
 		return (ret);
 	}
 
@@ -2100,26 +2106,63 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
 	 * parent page locked, note the functions we call are responsible for
 	 * releasing that lock.
 	 */
-	return (__split_parent_climb(session, parent, hazard));
+	return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_multi --
+ *	Split a page into multiple pages.
+ */
+int
+__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+{
+	WT_DECL_RET;
+
+	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
+
+	/*
+	 * Set the session split generation to ensure underlying code isn't
+	 * surprised by internal page eviction, then proceed with the split.
+	 */
+	WT_WITH_PAGE_INDEX(session,
+	    ret = __split_multi_lock(session, ref, closing));
+	return (ret);
+}
+
+/*
+ * __split_reverse --
+ *	Reverse split (rewrite a parent page's index to reflect an empty page).
+ */
+static int
+__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+	WT_DECL_RET;
+	WT_PAGE *parent;
+
+	/* Lock the parent page, then proceed with the reverse split. */
+	WT_RET(__split_internal_lock(session, ref, false, &parent));
+	ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+	__split_internal_unlock(session, parent);
+	return (ret);
 }
 
 /*
  * __wt_split_reverse --
- *	We have a locked ref that is empty and we want to rewrite the index in
- *	its parent.
+ *	Reverse split (rewrite a parent page's index to reflect an empty page).
  */
 int
 __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
 {
 	WT_DECL_RET;
-	WT_PAGE *parent;
-	bool hazard;
 
 	__wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
 
-	WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
-	ret = __split_parent(session, ref, NULL, 0, 0, false, true);
-	WT_TRET(__split_internal_unlock(session, parent, hazard));
+	/*
+	 * Set the session split generation to ensure underlying code isn't
+	 * surprised by internal page eviction, then proceed with the reverse
+	 * split.
+	 */
+	WT_WITH_PAGE_INDEX(session, ret = __split_reverse(session, ref));
 	return (ret);
 }
 
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index e3b9bbced48..d7150859e8f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -137,7 +137,6 @@ __stat_page_col_var(
 	WT_CELL_UNPACK *unpack, _unpack;
 	WT_COL *cip;
 	WT_INSERT *ins;
-	WT_UPDATE *upd;
 	uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
 	uint32_t i;
 	bool orig_deleted;
@@ -177,31 +176,39 @@ __stat_page_col_var(
 		 * we find, correct the original count based on its state.
 		 */
 		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
-			upd = ins->upd;
-			if (upd->type == WT_UPDATE_RESERVED)
-				continue;
-			if (upd->type == WT_UPDATE_DELETED) {
+			switch (ins->upd->type) {
+			case WT_UPDATE_DELETED:
 				if (!orig_deleted) {
 					++deleted_cnt;
 					--entry_cnt;
 				}
-			} else
+				break;
+			case WT_UPDATE_MODIFIED:
+			case WT_UPDATE_STANDARD:
 				if (orig_deleted) {
 					--deleted_cnt;
 					++entry_cnt;
 				}
+				break;
+			case WT_UPDATE_RESERVED:
+				break;
+			}
 		}
 	}
 
 	/* Walk any append list. */
-	WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
-		if (ins->upd->type == WT_UPDATE_RESERVED)
-			continue;
-		if (ins->upd->type == WT_UPDATE_DELETED)
+	WT_SKIP_FOREACH(ins, WT_COL_APPEND(page))
+		switch (ins->upd->type) {
+		case WT_UPDATE_DELETED:
 			++deleted_cnt;
-		else
+			break;
+		case WT_UPDATE_MODIFIED:
+		case WT_UPDATE_STANDARD:
 			++entry_cnt;
-	}
+			break;
+		case WT_UPDATE_RESERVED:
+			break;
+		}
 
 	WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
 	WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 225e6812aa1..d783f8f6e71 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session,
 {
 	WT_PAGE_INDEX *pindex;
 	WT_REF **start, **stop, **p, **t;
+	uint64_t sleep_count, yield_count;
 	uint32_t entries, slot;
 
-	for (;;) {
+	/*
+	 * If we don't find our reference, the page split and our home
+	 * pointer references the wrong page. When internal pages
+	 * split, their WT_REF structure home values are updated; yield
+	 * and wait for that to happen.
+	 */
+	for (sleep_count = yield_count = 0;;) {
 		/*
 		 * Copy the parent page's index value: the page can split at
 		 * any time, but the index's value is always valid, even if
@@ -58,14 +65,14 @@ __ref_index_slot(WT_SESSION_IMPL *session,
 				goto found;
 			}
 		}
-
 		/*
-		 * If we don't find our reference, the page split and our home
-		 * pointer references the wrong page. When internal pages
-		 * split, their WT_REF structure home values are updated; yield
-		 * and wait for that to happen.
+		 * We failed to get the page index and slot reference, yield
+		 * before retrying, and if we've yielded enough times, start
+		 * sleeping so we don't burn CPU to no purpose.
 		 */
-		__wt_yield();
+		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+		WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked,
+		    sleep_count);
 	}
 
 found:	WT_ASSERT(session, pindex->index[slot] == ref);
@@ -177,12 +184,13 @@ __ref_descend_prev(
     WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
 {
 	WT_PAGE_INDEX *pindex;
+	uint64_t yield_count;
 
 	/*
 	 * We're passed a child page into which we're descending, and on which
 	 * we have a hazard pointer.
 	 */
-	for (;; __wt_yield()) {
+	for (yield_count = 0;; yield_count++, __wt_yield()) {
 		/*
 		 * There's a split race when a cursor moving backwards through
 		 * the tree descends the tree. If we're splitting an internal
@@ -242,6 +250,7 @@ __ref_descend_prev(
 			break;
 	}
 	*pindexp = pindex;
+	WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
 }
 
 /*
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index e2d19bf705b..a57a9c17edb 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -268,13 +268,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value,
 	 */
 	if (modify_type == WT_UPDATE_DELETED ||
 	    modify_type == WT_UPDATE_RESERVED)
-		WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd));
+		WT_RET(__wt_calloc(session, 1, WT_UPDATE_SIZE, &upd));
 	else {
 		WT_RET(__wt_calloc(
-		    session, 1, sizeof(WT_UPDATE) + value->size, &upd));
+		    session, 1, WT_UPDATE_SIZE + value->size, &upd));
 		if (value->size != 0) {
 			upd->size = WT_STORE_SIZE(value->size);
-			memcpy(WT_UPDATE_DATA(upd), value->data, value->size);
+			memcpy(upd->data, value->data, value->size);
 		}
 	}
 	upd->type = (uint8_t)modify_type;
@@ -302,9 +302,16 @@ __wt_update_obsolete_check(
 	 * freeing the memory.
 	 *
 	 * Walk the list of updates, looking for obsolete updates at the end.
+	 *
+	 * Only updates with globally visible, self-contained data can terminate
+	 * update chains, ignore modified and reserved updates. Special case the
+	 * first transaction ID, it flags column-store overflow values which can
+	 * never be discarded.
 	 */
 	for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++)
-		if (__wt_txn_upd_visible_all(session, upd)) {
+		if (WT_UPDATE_DATA_VALUE(upd) &&
+		    __wt_txn_upd_visible_all(session, upd) &&
+		    upd->txnid != WT_TXN_FIRST) {
 			if (first == NULL)
 				first = upd;
 		} else if (upd->txnid != WT_TXN_ABORTED)
author	Alex Gorrod <alexander.gorrod@mongodb.com>	2017-08-01 16:42:49 +1000
committer	Alex Gorrod <alexander.gorrod@mongodb.com>	2017-08-01 16:42:49 +1000
commit	835bfb21d8e67663d84a40aa4f7370a4403725a9 (patch)
tree	4f5edb231524f95272f834e31461ba4e17e52903 /src/third_party/wiredtiger/src/btree
parent	6300b3bd4ad9cd238a02bdb8ca681a447913f1af (diff)
download	mongo-835bfb21d8e67663d84a40aa4f7370a4403725a9.tar.gz