1 files changed, 88 insertions, 28 deletions
diff --git a/src/include/btree.i b/src/include/btree.i
index 23e0dfea2cd..94111397abd 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
  * Copyright (c) 2008-2014 WiredTiger, Inc.
  *	All rights reserved.
  *
@@ -1046,15 +1046,16 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
 	 * do it without making the appending threads wait. See if it's worth
 	 * doing a split to let the threads continue before doing eviction.
 	 *
-	 * Ignore anything other than large, dirty row-store leaf pages. The
-	 * split code only supports row-store pages, and we depend on the page
-	 * being dirty for correctness (the page must be reconciled again
+	 * Ignore anything other than large, dirty leaf pages. We depend on the
+	 * page being dirty for correctness (the page must be reconciled again
 	 * before being evicted after the split, information from a previous
 	 * reconciliation will be wrong, so we can't evict immediately).
 	 */
-	if (page->type != WT_PAGE_ROW_LEAF ||
-	    page->memory_footprint < btree->splitmempage ||
-	    !__wt_page_is_modified(page))
+	if (page->memory_footprint < btree->splitmempage)
+		return (false);
+	if (WT_PAGE_IS_INTERNAL(page))
+		return (false);
+	if (!__wt_page_is_modified(page))
 		return (false);
 
 	/*
@@ -1071,9 +1072,11 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
 #define	WT_MIN_SPLIT_COUNT	30
 #define	WT_MIN_SPLIT_MULTIPLIER 16      /* At level 2, we see 1/16th entries */
 
-	ins_head = page->pg_row_entries == 0 ?
+	ins_head = page->type == WT_PAGE_ROW_LEAF ?
+	    (page->pg_row_entries == 0 ?
 	    WT_ROW_INSERT_SMALLEST(page) :
-	    WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+	    WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) :
+	    WT_COL_APPEND(page);
 	if (ins_head == NULL)
 		return (false);
 	for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
@@ -1280,8 +1283,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
  * coupling up/down the tree.
  */
 static inline int
-__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
-    WT_REF *want, uint32_t flags
+__wt_page_swap_func(
+    WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
 #ifdef HAVE_DIAGNOSTIC
     , const char *file, int line
 #endif
@@ -1310,20 +1313,40 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
 #endif
 	    );
 
-	/* Expected failures: page not found or restart. */
-	if (ret == WT_NOTFOUND || ret == WT_RESTART)
-		return (ret);
+	/*
+	 * Expected failures: page not found or restart. Our callers list the
+	 * errors they're expecting to handle.
+	 */
+	if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
+		return (WT_NOTFOUND);
+	if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
+		return (WT_RESTART);
 
-	/* Discard the original held page. */
+	/* Discard the original held page on either success or error. */
 	acquired = ret == 0;
 	WT_TRET(__wt_page_release(session, held, flags));
 
+	/* Fast-path expected success. */
+	if (ret == 0)
+		return (0);
+
 	/*
-	 * If there was an error discarding the original held page, discard
-	 * the acquired page too, keeping it is never useful.
+	 * If there was an error at any point that our caller isn't prepared to
+	 * handle, discard any page we acquired.
 	 */
-	if (acquired && ret != 0)
+	if (acquired)
 		WT_TRET(__wt_page_release(session, want, flags));
+
+	/*
+	 * If we're returning an error, don't let it be one our caller expects
+	 * to handle as returned by page-in: the expectation includes the held
+	 * page not having been released, and that's not the case.
+	 */
+	if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
+		return (EINVAL);
+	if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
+		return (EINVAL);
+
 	return (ret);
 }
 
@@ -1437,17 +1460,54 @@ __wt_split_intl_race(
 	 *
 	 * There's a page-split race when we walk the tree: if we're splitting
 	 * an internal page into its parent, we update the parent's page index
-	 * and then update the page being split, and it's not an atomic update.
-	 * A thread could read the parent page's original page index, and then
-	 * read the page's replacement index. Because internal page splits work
-	 * by replacing the original page with the initial part of the original
-	 * page, the result of this race is we will have a key that's past the
-	 * end of the current page, and the parent's page index will have moved.
+	 * before updating the split page's page index, and it's not an atomic
+	 * update. A thread can read the parent page's original page index and
+	 * then read the split page's replacement index.
+	 *
+	 * Because internal page splits work by truncating the original page to
+	 * the initial part of the original page, the result of this race is we
+	 * will have a search key that points past the end of the current page.
+	 * This is only an issue when we search past the end of the page, if we
+	 * find a WT_REF in the page with the namespace we're searching for, we
+	 * don't care if the WT_REF moved or not while we were searching, we
+	 * have the correct page.
+	 *
+	 * For example, imagine an internal page with 3 child pages, with the
+	 * namespaces a-f, g-h and i-j; the first child page splits. The parent
+	 * starts out with the following page-index:
+	 *
+	 *	| ... | a | g | i | ... |
+	 *
+	 * which changes to this:
+	 *
+	 *	| ... | a | c | e | g | i | ... |
+	 *
+	 * The child starts out with the following page-index:
+	 *
+	 *	| a | b | c | d | e | f |
+	 *
+	 * which changes to this:
+	 *
+	 *	| a | b |
+	 *
+	 * The thread searches the original parent page index for the key "cat",
+	 * it couples to the "a" child page; if it uses the replacement child
+	 * page index, it will search past the end of the page and couple to the
+	 * "b" page, which is wrong.
+	 *
+	 * To detect the problem, we remember the parent page's page index used
+	 * to descend the tree. Whenever we search past the end of a page, we
+	 * check to see if the parent's page index has changed since our use of
+	 * it during descent. As the problem only appears if we read the split
+	 * page's replacement index, the parent page's index must already have
+	 * changed, ensuring we detect the problem.
 	 *
-	 * It's also possible a thread could read the parent page's replacement
-	 * page index, and then read the page's original index. Because internal
-	 * splits work by truncating the original page, the original page's old
-	 * content is compatible, this isn't a problem and we ignore this race.
+	 * It's possible for the opposite race to happen (a thread could read
+	 * the parent page's replacement page index and then read the split
+	 * page's original index). This isn't a problem because internal splits
+	 * work by truncating the split page, so the split page search is for
+	 * content the split page retains after the split, and we ignore this
+	 * race.
 	 */
 	WT_INTL_INDEX_GET(session, parent, pindex);
 	return (pindex != saved_pindex);