/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

#define WT_MEM_TRANSFER(from_decr, to_incr, len) do {                  \
    size_t __len = (len);                                              \
    (from_decr) += __len;                                              \
    (to_incr) += __len;                                                \
} while (0)

/*
 * A note on error handling: main split functions first allocate/initialize new
 * structures; failures during that period are handled by discarding the memory
 * and returning an error code, the caller knows the split didn't happen and
 * proceeds accordingly. Second, split functions update the tree, and a failure
 * in that period is catastrophic, any partial update to the tree requires a
 * panic, we can't recover. Third, once the split is complete and the tree has
 * been fully updated, we have to ignore most errors, the split is complete and
 * correct, callers have to proceed accordingly.
 */
typedef enum {
    WT_ERR_IGNORE,      /* Ignore minor errors */
    WT_ERR_PANIC,       /* Panic on all errors */
    WT_ERR_RETURN       /* Clean up and return error */
} WT_SPLIT_ERROR_PHASE;

/*
 * __page_split_timing_stress --
 *	Optionally add a delay to uncover and simulate race conditions in
 * page splits, for debug purposes.
 */
static void
__page_split_timing_stress(
    WT_SESSION_IMPL *session, uint32_t flag, uint64_t micro_seconds)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /* We only want to sleep when the page-split race flag is set. */
    if (FLD_ISSET(conn->timing_stress_flags, flag))
        __wt_sleep(0, micro_seconds);
}

/*
 * __split_safe_free --
 *	Free a buffer if we can be sure no thread is accessing it, or schedule
 * it to be freed otherwise.
 */
static int
__split_safe_free(WT_SESSION_IMPL *session,
    uint64_t split_gen, bool exclusive, void *p, size_t s)
{
    /* We should only call safe free if we aren't pinning the memory. */
    WT_ASSERT(session,
        __wt_session_gen(session, WT_GEN_SPLIT) != split_gen);

    /*
     * We have swapped something in a page: if we don't have exclusive
     * access, check whether there are other threads in the same tree.
     */
    if (!exclusive && __wt_gen_oldest(session, WT_GEN_SPLIT) > split_gen)
        exclusive = true;

    if (exclusive) {
        __wt_overwrite_and_free_len(session, p, s);
        return (0);
    }

    return (__wt_stash_add(session, WT_GEN_SPLIT, split_gen, p, s));
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __split_verify_intl_key_order --
 *	Verify the key order on an internal page after a split.
 */
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_ITEM *next, _next, *last, _last, *tmp;
    WT_REF *ref;
    uint64_t recno;
    int cmp;
    bool first;

    btree = S2BT(session);

    switch (page->type) {
    case WT_PAGE_COL_INT:
        recno = 0;      /* Less than any valid record number. */
        WT_INTL_FOREACH_BEGIN(session, page, ref) {
            WT_ASSERT(session, ref->home == page);

            WT_ASSERT(session, ref->ref_recno > recno);
            recno = ref->ref_recno;
        } WT_INTL_FOREACH_END;
        break;
    case WT_PAGE_ROW_INT:
        next = &_next;
        WT_CLEAR(_next);
        last = &_last;
        WT_CLEAR(_last);
        first = true;
        WT_INTL_FOREACH_BEGIN(session, page, ref) {
            WT_ASSERT(session, ref->home == page);

            /*
             * The 0th key of a row-store internal page is ignored
             * by searches and can't be compared against anything,
             * skip it.
             */
            if (first) {
                first = false;
                continue;
            }
            __wt_ref_key(page, ref, &next->data, &next->size);
            if (last->size != 0) {
                WT_ASSERT(session, __wt_compare(
                    session, btree->collator,
                    last, next, &cmp) == 0);
                WT_ASSERT(session, cmp < 0);
            }
            tmp = last;
            last = next;
            next = tmp;
        } WT_INTL_FOREACH_END;
        break;
    }
}

/*
 * __split_verify_root --
 *	Verify a root page involved in a split.
*/ static int __split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_REF *ref; uint32_t read_flags; read_flags = WT_READ_CACHE | WT_READ_NO_EVICT; /* The split is complete and live, verify all of the pages involved. */ __split_verify_intl_key_order(session, page); WT_INTL_FOREACH_BEGIN(session, page, ref) { /* * An eviction thread might be attempting to evict the page * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based * page (the WT_REF may be WT_REF_READING), or it may be in * some other state. Acquire a hazard pointer for any * in-memory pages so we know the state of the page. * * Ignore pages not in-memory (deleted, on-disk, being read), * there's no in-memory structure to check. */ if ((ret = __wt_page_in(session, ref, read_flags)) == WT_NOTFOUND) continue; WT_ERR(ret); __split_verify_intl_key_order(session, ref->page); WT_ERR(__wt_page_release(session, ref, read_flags)); } WT_INTL_FOREACH_END; return (0); err: /* Something really bad just happened. */ WT_PANIC_RET(session, ret, "fatal error during page split"); } #endif /* * __split_ovfl_key_cleanup -- * Handle cleanup for on-page row-store overflow keys. */ static int __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) { WT_CELL *cell; WT_CELL_UNPACK kpack; WT_IKEY *ikey; uint32_t cell_offset; /* There's a per-page flag if there are any overflow keys at all. */ if (!F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) return (0); /* * A key being discarded (page split) or moved to a different page (page * deepening) may be an on-page overflow key. Clear any reference to an * underlying disk image, and, if the key hasn't been deleted, delete it * along with any backing blocks. */ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) return (0); if ((cell_offset = ikey->cell_offset) == 0) return (0); /* Leak blocks rather than try this twice. */ ikey->cell_offset = 0; cell = WT_PAGE_REF_OFFSET(page, cell_offset); __wt_cell_unpack(cell, &kpack); if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM) { /* * Eviction cannot free overflow items once a checkpoint is * running in a tree: that can corrupt the checkpoint's block * management. Assert that checkpoints aren't running to make * sure we're catching all paths and to avoid regressions. */ WT_ASSERT(session, S2BT(session)->checkpointing != WT_CKPT_RUNNING || WT_SESSION_IS_CHECKPOINT(session)); WT_RET(__wt_ovfl_discard(session, cell)); } return (0); } /* * __split_ref_move -- * Move a WT_REF from one page to another, including updating accounting * information. */ static int __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp) { WT_ADDR *addr, *ref_addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; WT_REF *ref; size_t size; void *key; ref = *from_refp; addr = NULL; /* * The from-home argument is the page into which the "from" WT_REF may * point, for example, if there's an on-page key the "from" WT_REF * references, it will be on the page "from-home". * * Instantiate row-store keys, and column- and row-store addresses in * the WT_REF structures referenced by a page that's being split. The * WT_REF structures aren't moving, but the index references are moving * from the page we're splitting to a set of new pages, and so we can * no longer reference the block image that remains with the page being * split. 
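     *
     * As a concrete illustration (the sizes are hypothetical): moving a
     * WT_REF whose 20-byte row-store key is already instantiated adds
     * sizeof(WT_IKEY) + 20 bytes to the destination page's accounting
     * (*incrp) and the same amount to the source page's (*decrp), and
     * every move also transfers sizeof(WT_REF) itself between the two.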
* * No locking is required to update the WT_REF structure because we're * the only thread splitting the page, and there's no way for readers * to race with our updates of single pointers. The changes have to be * written before the page goes away, of course, our caller owns that * problem. */ if (from_home->type == WT_PAGE_ROW_INT) { /* * Row-store keys: if it's not yet instantiated, instantiate it. * If already instantiated, check for overflow cleanup (overflow * keys are always instantiated). */ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { __wt_ref_key(from_home, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); ikey = ref->ref_ikey; } else { WT_RET( __split_ovfl_key_cleanup(session, from_home, ref)); *decrp += sizeof(WT_IKEY) + ikey->size; } *incrp += sizeof(WT_IKEY) + ikey->size; } /* * If there's no address at all (the page has never been written), or * the address has already been instantiated, there's no work to do. * Otherwise, the address still references a split page on-page cell, * instantiate it. We can race with reconciliation and/or eviction of * the child pages, be cautious: read the address and verify it, and * only update it if the value is unchanged from the original. In the * case of a race, the address must no longer reference the split page, * we're done. */ WT_ORDERED_READ(ref_addr, ref->addr); if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { __wt_cell_unpack((WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); WT_ERR(__wt_memdup( session, unpack.data, unpack.size, &addr->addr)); addr->size = (uint8_t)unpack.size; switch (unpack.raw) { case WT_CELL_ADDR_INT: addr->type = WT_ADDR_INT; break; case WT_CELL_ADDR_LEAF: addr->type = WT_ADDR_LEAF; break; case WT_CELL_ADDR_LEAF_NO: addr->type = WT_ADDR_LEAF_NO; break; WT_ILLEGAL_VALUE_ERR(session); } if (__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr)) addr = NULL; } /* And finally, copy the WT_REF pointer itself. */ *to_refp = ref; WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF)); err: if (addr != NULL) { __wt_free(session, addr->addr); __wt_free(session, addr); } return (ret); } /* * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ static void __split_ref_prepare( WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; uint32_t i, j; /* The newly created subtree is complete. */ WT_WRITE_BARRIER(); /* * Update the moved WT_REFs so threads moving through them start looking * at the created children's page index information. Because we've not * yet updated the page index of the parent page into which we are going * to split this subtree, a cursor moving through these WT_REFs will * ascend into the created children, but eventually fail as that parent * page won't yet know about the created children pages. That's OK, we * spin there until the parent's page index is updated. * * Lock the newly created page to ensure it doesn't split until all * child pages have been updated. */ for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { ref = pindex->index[i]; child = ref->page; /* Switch the WT_REF's to their new page. */ j = 0; WT_PAGE_LOCK(session, child); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { child_ref->home = child; child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; WT_PAGE_UNLOCK(session, child); #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, child)); #endif } } /* * __split_root -- * Split the root page in-memory, deepening the tree. 
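 *
 * A worked example with hypothetical numbers: a root with 2000 entries
 * and split_deepen_per_child of 100 gives children = 2000 / 100 = 20,
 * chunk = 2000 / 20 = 100 and remain = 2000 - 100 * 19 = 100, so the
 * root's entries are redistributed across 20 new children of 100
 * entries each, deepening the tree by one level.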
*/ static int __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; WT_STAT_CONN_INCR(session, cache_eviction_deepen); WT_STAT_DATA_INCR(session, cache_eviction_deepen); WT_STAT_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_DATA_INCR(session, cache_eviction_split_internal); btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); /* * Our caller is holding the root page locked to single-thread splits, * which means we can safely look at the page's index without setting a * split generation. */ pindex = WT_INTL_INDEX_GET_SAFE(root); /* * Decide how many child pages to create, then calculate the standard * chunk and whatever remains. Sanity check the number of children: * the decision to split matched to the deepen-per-child configuration * might get it wrong. */ children = pindex->entries / btree->split_deepen_per_child; if (children < 10) { if (pindex->entries < 100) return (EBUSY); children = 10; } chunk = pindex->entries / children; remain = pindex->entries - chunk * (children - 1); __wt_verbose(session, WT_VERB_SPLIT, "%p: %" PRIu32 " root page elements, splitting into %" PRIu32 " children", (void *)root, pindex->entries, children); /* * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted * into the root page, replacing the root's page-index. */ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); root_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = children; alloc_refp = alloc_index->index; for (i = 0; i < children; alloc_refp++, ++i) WT_ERR(__wt_calloc_one(session, alloc_refp)); root_incr += children * sizeof(WT_REF); /* * Once the split is live, newly created internal pages might be evicted * and their WT_REF structures freed. If that happens before all threads * exit the index of the page that previously "owned" the WT_REF, a * thread might see a freed WT_REF. To ensure that doesn't happen, the * created pages are set to the current split generation and so can't be * evicted until all readers have left the old generation. * * Our thread has a stable split generation, get a copy. */ split_gen = __wt_session_gen(session, WT_GEN_SPLIT); /* Allocate child pages, and connect them into the new page index. */ for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; WT_ERR(__wt_page_alloc( session, root->type, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the * page's key. */ ref = *alloc_refp++; ref->home = root; ref->page = child; ref->addr = NULL; if (root->type == WT_PAGE_ROW_INT) { __wt_ref_key(root, *root_refp, &p, &size); WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); root_incr += sizeof(WT_IKEY) + size; } else ref->ref_recno = (*root_refp)->ref_recno; ref->state = WT_REF_MEM; /* * Initialize the child page. * Block eviction in newly created pages and mark them dirty. 
*/ child->pg_intl_parent_ref = ref; child->pg_intl_split_gen = split_gen; WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); /* * The newly allocated child's page index references the same * structures as the root. (We cannot move WT_REF structures, * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for * the page that has a page index entry for the WT_REF is about * to change. */ child_pindex = WT_INTL_INDEX_GET_SAFE(child); child_incr = 0; for (child_refp = child_pindex->index, j = 0; j < slots; ++child_refp, ++root_refp, ++j) WT_ERR(__split_ref_move(session, root, root_refp, &root_decr, child_refp, &child_incr)); __wt_cache_page_inmem_incr(session, child, child_incr); } WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries); WT_ASSERT(session, root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_prepare(session, alloc_index, false); /* Encourage a race */ __page_split_timing_stress(session, WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* * Confirm the root page's index hasn't moved, then update it, which * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; /* Encourage a race */ __page_split_timing_stress(session, WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* * Get a generation for this split, mark the root page. This must be * after the new index is swapped into place in order to know that no * readers are looking at the old index. * * Note: as the root page cannot currently be evicted, the root split * generation isn't ever used. That said, it future proofs eviction * and isn't expensive enough to special-case. * * Getting a new split generation implies a full barrier, no additional * barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); root->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_root(session, root)); WT_ERR(ret); #endif /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; /* * We can't free the previous root's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no * threads can still be using it. * * This change requires care with error handling: we have already * updated the page with a new index. Even if stashing the old value * fails, we don't roll back that change, because threads may already * be using the new index. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; /* Adjust the root's memory footprint and mark it dirty. 
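     *
     * root_incr accumulated everything allocated for the new subtree
     * (the replacement page index, the WT_REF structures and any
     * instantiated keys), root_decr everything that moved to the
     * children or was queued for safe free; both totals are applied
     * once here rather than on each individual transfer.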
 */
    __wt_cache_page_inmem_incr(session, root, root_incr);
    __wt_cache_page_inmem_decr(session, root, root_decr);
    __wt_page_modify_set(session, root);

err: switch (complete) {
    case WT_ERR_RETURN:
        __wt_free_ref_index(session, root, alloc_index, true);
        break;
    case WT_ERR_PANIC:
        __wt_err(session, ret,
            "fatal error during root page split to deepen the tree");
        ret = WT_PANIC;
        break;
    case WT_ERR_IGNORE:
        if (ret != 0 && ret != WT_PANIC) {
            __wt_err(session, ret,
                "ignoring not-fatal error during root page split "
                "to deepen the tree");
            ret = 0;
        }
        break;
    }
    return (ret);
}

/*
 * __split_parent --
 *	Resolve a multi-page split, inserting new information into the parent.
 */
static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
    uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard)
{
    WT_DECL_ITEM(scr);
    WT_DECL_RET;
    WT_IKEY *ikey;
    WT_PAGE *parent;
    WT_PAGE_INDEX *alloc_index, *pindex;
    WT_REF **alloc_refp, *next_ref;
    WT_SPLIT_ERROR_PHASE complete;
    size_t parent_decr, size;
    uint64_t split_gen;
    uint32_t deleted_entries, parent_entries, result_entries;
    uint32_t *deleted_refs;
    uint32_t hint, i, j;
    bool empty_parent;

    parent = ref->home;

    alloc_index = pindex = NULL;
    parent_decr = 0;
    empty_parent = false;
    complete = WT_ERR_RETURN;

    /* The parent page will be marked dirty, make sure that will succeed. */
    WT_RET(__wt_page_modify_init(session, parent));

    /*
     * We've locked the parent, which means it cannot split (which is the
     * only reason to worry about split generation values).
     */
    pindex = WT_INTL_INDEX_GET_SAFE(parent);
    parent_entries = pindex->entries;

    /*
     * Remove any refs to deleted pages while we are splitting, we have
     * the internal page locked down, and are copying the refs into a new
     * array anyway. Switch them to the special split state, so that any
     * reading thread will restart.
     */
    WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
    for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
        next_ref = pindex->index[i];
        WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
        if ((discard && next_ref == ref) ||
            (next_ref->state == WT_REF_DELETED &&
            __wt_delete_page_skip(session, next_ref, true) &&
            __wt_atomic_casv32(
            &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))) {
            WT_ERR(__wt_buf_grow(session, scr,
                (deleted_entries + 1) * sizeof(uint32_t)));
            deleted_refs = scr->mem;
            deleted_refs[deleted_entries++] = i;
        }
    }

    /*
     * The final entry count consists of the original count, plus any new
     * pages, less any WT_REFs we're removing (deleted entries plus the
     * entry we're replacing).
     */
    result_entries = (parent_entries + new_entries) - deleted_entries;
    if (!discard)
        --result_entries;

    /*
     * If there are no remaining entries on the parent, give up, we can't
     * leave an empty internal page. Mark it to be evicted soon and clean
     * up any references that have changed state.
     */
    if (result_entries == 0) {
        empty_parent = true;
        if (!__wt_ref_is_root(parent->pg_intl_parent_ref))
            __wt_page_evict_soon(
                session, parent->pg_intl_parent_ref);
        goto err;
    }

    /*
     * Allocate and initialize a new page index array for the parent, then
     * copy references from the original index array, plus references from
     * the newly created split array, into place.
     *
     * Update the WT_REF's page-index hint as we go. This can race with a
     * thread setting the hint based on an older page-index, and the change
     * isn't backed out in the case of an error, so there are ways for the
     * hint to be wrong; OK because it's just a hint.
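     *
     * (As we understand the hint contract: a reader whose hint is stale
     * simply fails the check that the hinted slot references its WT_REF
     * and falls back to searching the page index, so a wrong hint costs
     * time, not correctness.)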
 */
    size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
    WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
    parent_incr += size;
    alloc_index->index = (WT_REF **)(alloc_index + 1);
    alloc_index->entries = result_entries;
    for (alloc_refp = alloc_index->index,
        hint = i = 0; i < parent_entries; ++i) {
        next_ref = pindex->index[i];
        if (next_ref == ref)
            for (j = 0; j < new_entries; ++j) {
                ref_new[j]->home = parent;
                ref_new[j]->pindex_hint = hint++;
                *alloc_refp++ = ref_new[j];
            }
        else if (next_ref->state != WT_REF_SPLIT) {
            /* Skip refs we have marked for deletion. */
            next_ref->pindex_hint = hint++;
            *alloc_refp++ = next_ref;
        }
    }

    /* Check that we filled in all the entries. */
    WT_ASSERT(session,
        alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);

    /* Start making real changes to the tree, errors are fatal. */
    WT_NOT_READ(complete, WT_ERR_PANIC);

    /* Encourage a race */
    __page_split_timing_stress(session,
        WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);

    /*
     * Confirm the parent page's index hasn't moved, then update it, which
     * makes the split visible to threads descending the tree.
     */
    WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
    WT_INTL_INDEX_SET(parent, alloc_index);
    alloc_index = NULL;

    /* Encourage a race */
    __page_split_timing_stress(session,
        WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);

    /*
     * Get a generation for this split, mark the page. This must be after
     * the new index is swapped into place in order to know that no readers
     * are looking at the old index.
     *
     * Getting a new split generation implies a full barrier, no additional
     * barrier is needed.
     */
    split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
    parent->pg_intl_split_gen = split_gen;

    /*
     * If discarding the page's original WT_REF field, reset it to split.
     * Threads cursoring through the tree were blocked because that WT_REF
     * state was set to locked. Changing the locked state to split unblocks
     * those threads and causes them to re-calculate their position based
     * on the just-updated parent page's index.
     */
    if (discard) {
        /*
         * Set the discarded WT_REF state to split, ensuring we don't
         * race with any discard of the WT_REF deleted fields.
         */
        WT_PUBLISH(ref->state, WT_REF_SPLIT);

        /*
         * Push out the change: not required for correctness, but stops
         * threads spinning on incorrect page references.
         */
        WT_FULL_BARRIER();
    }

#ifdef HAVE_DIAGNOSTIC
    WT_WITH_PAGE_INDEX(session,
        __split_verify_intl_key_order(session, parent));
#endif

    /* The split is complete and verified, ignore benign errors. */
    complete = WT_ERR_IGNORE;

    /*
     * !!!
     * Swapping in the new page index released the page for eviction, we can
     * no longer look inside the page.
     */
    if (ref->page == NULL)
        __wt_verbose(session, WT_VERB_SPLIT,
            "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
            " (-%" PRIu32 ")",
            (void *)ref->page, (void *)parent, parent_entries,
            result_entries, parent_entries - result_entries);
    else
        __wt_verbose(session, WT_VERB_SPLIT,
            "%p: split into parent %p, %" PRIu32 " -> %" PRIu32
            " (+%" PRIu32 ")",
            (void *)ref->page, (void *)parent, parent_entries,
            result_entries, result_entries - parent_entries);

    /*
     * The new page index is in place, free the WT_REF we were splitting and
     * any deleted WT_REFs we found, modulo the usual safe free semantics.
     */
    for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
        next_ref = pindex->index[deleted_refs[i]];
        WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);

        /*
         * We set the WT_REF to split, discard it, freeing any resources
         * it holds.
* * Row-store trees where the old version of the page is being * discarded: the previous parent page's key for this child page * may have been an on-page overflow key. In that case, if the * key hasn't been deleted, delete it now, including its backing * blocks. We are exchanging the WT_REF that referenced it for * the split page WT_REFs and their keys, and there's no longer * any reference to it. Done after completing the split (if we * failed, we'd leak the underlying blocks, but the parent page * would be unaffected). */ if (parent->type == WT_PAGE_ROW_INT) { WT_TRET(__split_ovfl_key_cleanup( session, parent, next_ref)); ikey = __wt_ref_key_instantiated(next_ref); if (ikey != NULL) { size = sizeof(WT_IKEY) + ikey->size; WT_TRET(__split_safe_free( session, split_gen, exclusive, ikey, size)); parent_decr += size; } } /* * The page-delete and lookaside memory weren't added to the * parent's footprint, ignore it here. */ if (next_ref->page_del != NULL) { __wt_free(session, next_ref->page_del->update_list); __wt_free(session, next_ref->page_del); } __wt_free(session, next_ref->page_las); /* Free the backing block and address. */ WT_TRET(__wt_ref_block_free(session, next_ref)); WT_TRET(__split_safe_free( session, split_gen, exclusive, next_ref, sizeof(WT_REF))); parent_decr += sizeof(WT_REF); } /* * !!! * The original WT_REF has now been freed, we can no longer look at it. */ /* * We can't free the previous page index, there may be threads using it. * Add it to the session discard list, to be freed when it's safe. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); parent_decr += size; /* Adjust the parent's memory footprint and mark it dirty. */ __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); __wt_page_modify_set(session, parent); err: __wt_scr_free(session, &scr); /* * A note on error handling: if we completed the split, return success, * nothing really bad can have happened, and our caller has to proceed * with the split. */ switch (complete) { case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } __wt_free_ref_index(session, NULL, alloc_index, false); /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. */ if (empty_parent) ret = EBUSY; break; case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during parent page split"); ret = WT_PANIC; break; case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during parent page " "split"); ret = 0; } break; } return (ret); } /* * __split_internal -- * Split an internal page into its parent. 
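 *
 * For example (hypothetical numbers): an internal page with 1000
 * entries and split_deepen_per_child of 100 is right-split into 10
 * chunks; the first chunk of 100 entries stays on the original page
 * and the remaining 900 move to 9 newly created siblings, so the
 * parent's index grows by 9 entries.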
*/ static int __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; WT_STAT_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_DATA_INCR(session, cache_eviction_split_internal); /* The page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, page)); btree = S2BT(session); alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which * means we can safely look at the page's index without setting a split * generation. */ pindex = WT_INTL_INDEX_GET_SAFE(page); /* * Decide how many child pages to create, then calculate the standard * chunk and whatever remains. Sanity check the number of children: * the decision to split matched to the deepen-per-child configuration * might get it wrong. */ children = pindex->entries / btree->split_deepen_per_child; if (children < 10) { if (pindex->entries < 100) return (EBUSY); children = 10; } chunk = pindex->entries / children; remain = pindex->entries - chunk * (children - 1); __wt_verbose(session, WT_VERB_SPLIT, "%p: %" PRIu32 " internal page elements, splitting %" PRIu32 " children into parent %p", (void *)page, pindex->entries, children, (void *)parent); /* * Ideally, we'd discard the original page, but that's hard since other * threads of control are using it (for example, if eviction is walking * the tree and looking at the page.) Instead, perform a right-split, * moving all except the first chunk of the page's WT_REF objects to new * pages. * * Create and initialize a replacement WT_PAGE_INDEX for the original * page. */ size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &replace_index)); page_incr += size; replace_index->index = (WT_REF **)(replace_index + 1); replace_index->entries = chunk; for (page_refp = pindex->index, i = 0; i < chunk; ++i) replace_index->index[i] = *page_refp++; /* * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted * into the page's parent, replacing the page's page-index. * * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. * The remainder of the slots are allocated WT_REFs. */ size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = children; alloc_refp = alloc_index->index; *alloc_refp++ = page_ref; for (i = 1; i < children; ++alloc_refp, ++i) WT_ERR(__wt_calloc_one(session, alloc_refp)); parent_incr += children * sizeof(WT_REF); /* * Once the split is live, newly created internal pages might be evicted * and their WT_REF structures freed. If that happens before all threads * exit the index of the page that previously "owned" the WT_REF, a * thread might see a freed WT_REF. To ensure that doesn't happen, the * created pages are set to the current split generation and so can't be * evicted until all readers have left the old generation. * * Our thread has a stable split generation, get a copy. 
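     *
     * (The generation scheme, roughly: threads publish the split
     * generation they entered under, and memory stamped with a given
     * generation can only be freed once __wt_gen_oldest shows every
     * thread has moved past it; __split_safe_free implements exactly
     * that check.)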
 */
    split_gen = __wt_session_gen(session, WT_GEN_SPLIT);

    /* Allocate child pages, and connect them into the new page index. */
    WT_ASSERT(session, page_refp == pindex->index + chunk);
    for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
        slots = i == children - 1 ? remain : chunk;

        WT_ERR(__wt_page_alloc(
            session, page->type, slots, false, &child));

        /*
         * Initialize the page's child reference; we need a copy of the
         * page's key.
         */
        ref = *alloc_refp++;
        ref->home = parent;
        ref->page = child;
        ref->addr = NULL;
        if (page->type == WT_PAGE_ROW_INT) {
            __wt_ref_key(page, *page_refp, &p, &size);
            WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
            parent_incr += sizeof(WT_IKEY) + size;
        } else
            ref->ref_recno = (*page_refp)->ref_recno;
        ref->state = WT_REF_MEM;

        /*
         * Initialize the child page.
         * Block eviction in newly created pages and mark them dirty.
         */
        child->pg_intl_parent_ref = ref;
        child->pg_intl_split_gen = split_gen;
        WT_ERR(__wt_page_modify_init(session, child));
        __wt_page_modify_set(session, child);

        /*
         * The newly allocated child's page index references the same
         * structures as the parent. (We cannot move WT_REF structures,
         * threads may be underneath us right now changing the structure
         * state.) However, if the WT_REF structures reference on-page
         * information, we have to fix that, because the disk image for
         * the page that has a page index entry for the WT_REF is about
         * to be discarded.
         */
        child_pindex = WT_INTL_INDEX_GET_SAFE(child);
        child_incr = 0;
        for (child_refp = child_pindex->index,
            j = 0; j < slots; ++child_refp, ++page_refp, ++j)
            WT_ERR(__split_ref_move(session, page,
                page_refp, &page_decr, child_refp, &child_incr));

        __wt_cache_page_inmem_incr(session, child, child_incr);
    }
    WT_ASSERT(session, alloc_refp - alloc_index->index ==
        (ptrdiff_t)alloc_index->entries);
    WT_ASSERT(session,
        page_refp - pindex->index == (ptrdiff_t)pindex->entries);

    /* Start making real changes to the tree, errors are fatal. */
    complete = WT_ERR_PANIC;

    /* Prepare the WT_REFs for the move. */
    __split_ref_prepare(session, alloc_index, true);

    /* Encourage a race */
    __page_split_timing_stress(session,
        WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);

    /* Split into the parent. */
    WT_ERR(__split_parent(session, page_ref, alloc_index->index,
        alloc_index->entries, parent_incr, false, false));

    /*
     * Confirm the page's index hasn't moved, then update it, which
     * makes the split visible to threads descending the tree.
     */
    WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
    WT_INTL_INDEX_SET(page, replace_index);

    /* Encourage a race */
    __page_split_timing_stress(session,
        WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);

    /*
     * Get a generation for this split, mark the parent page. This must be
     * after the new index is swapped into place in order to know that no
     * readers are looking at the old index.
     *
     * Getting a new split generation implies a full barrier, no additional
     * barrier is needed.
     */
    split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
    page->pg_intl_split_gen = split_gen;

#ifdef HAVE_DIAGNOSTIC
    WT_WITH_PAGE_INDEX(session,
        __split_verify_intl_key_order(session, parent));
    WT_WITH_PAGE_INDEX(session,
        __split_verify_intl_key_order(session, page));
#endif

    /* The split is complete and verified, ignore benign errors. */
    complete = WT_ERR_IGNORE;

    /*
     * We don't care about the page-index we allocated, all we needed was
     * the array of WT_REF structures, which has now been split into the
     * parent page.
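     *
     * (A plain __wt_free is safe here: unlike the page indexes swapped
     * into place above, this scratch WT_PAGE_INDEX was never published
     * to other threads, only its contents were copied.)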
*/ __wt_free(session, alloc_index); /* * We can't free the previous page's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no * threads can still be using it. * * This change requires care with error handling, we've already updated * the parent page. Even if stashing the old value fails, we don't roll * back that change, because threads may already be using the new parent * page. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; /* Adjust the page's memory footprint, and mark it dirty. */ __wt_cache_page_inmem_incr(session, page, page_incr); __wt_cache_page_inmem_decr(session, page, page_decr); __wt_page_modify_set(session, page); err: switch (complete) { case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " "split"); ret = 0; } break; } return (ret); } /* * __split_internal_lock -- * Lock an internal page. */ static int __split_internal_lock( WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp) { WT_PAGE *parent; *parentp = NULL; /* * A checkpoint reconciling this parent page can deadlock with * our split. We have an exclusive page lock on the child before * we acquire the page's reconciliation lock, and reconciliation * acquires the page's reconciliation lock before it encounters * the child's exclusive lock (which causes reconciliation to * loop until the exclusive lock is resolved). If we want to split * the parent, give up to avoid that deadlock. */ if (!trylock && !__wt_btree_can_evict_dirty(session)) return (EBUSY); /* * Get a page-level lock on the parent to single-thread splits into the * page because we need to single-thread sizing/growing the page index. * It's OK to queue up multiple splits as the child pages split, but the * actual split into the parent has to be serialized. Note we allocate * memory inside of the lock and may want to invest effort in making the * locked period shorter. * * We use the reconciliation lock here because not only do we have to * single-thread the split, we have to lock out reconciliation of the * parent because reconciliation of the parent can't deal with finding * a split child during internal page traversal. Basically, there's no * reason to use a different lock if we have to block reconciliation * anyway. */ for (;;) { parent = ref->home; /* Encourage race */ __page_split_timing_stress(session, WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND); /* Page locks live in the modify structure. */ WT_RET(__wt_page_modify_init(session, parent)); if (trylock) WT_RET(WT_PAGE_TRYLOCK(session, parent)); else WT_PAGE_LOCK(session, parent); if (parent == ref->home) break; WT_PAGE_UNLOCK(session, parent); } /* * This child has exclusive access to split its parent and the child's * existence prevents the parent from being evicted. However, once we * update the parent's index, it may no longer refer to the child, and * could conceivably be evicted. If the parent page is dirty, our page * lock prevents eviction because reconciliation is blocked. However, * if the page were clean, it could be evicted without encountering our * page lock. 
That isn't possible because you cannot move a child page * and still leave the parent page clean. */ *parentp = parent; return (0); } /* * __split_internal_unlock -- * Unlock the parent page. */ static void __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent) { WT_PAGE_UNLOCK(session, parent); } /* * __split_internal_should_split -- * Return if we should split an internal page. */ static bool __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; WT_PAGE_INDEX *pindex; btree = S2BT(session); page = ref->page; /* * Our caller is holding the parent page locked to single-thread splits, * which means we can safely look at the page's index without setting a * split generation. */ pindex = WT_INTL_INDEX_GET_SAFE(page); /* Sanity check for a reasonable number of on-page keys. */ if (pindex->entries < 100) return (false); /* * Deepen the tree if the page's memory footprint is larger than the * maximum size for a page in memory (presumably putting eviction * pressure on the cache). */ if (page->memory_footprint > btree->maxmempage) return (true); /* * Check if the page has enough keys to make it worth splitting. If * the number of keys is allowed to grow too large, the cost of * splitting into parent pages can become large enough to result * in slow operations. */ if (pindex->entries > btree->split_deepen_min_child) return (true); return (false); } /* * __split_parent_climb -- * Check if we should split up the tree. */ static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; /* * Disallow internal splits during the final pass of a checkpoint. Most * splits are already disallowed during checkpoints, but an important * exception is insert splits. The danger is an insert split creates a * new chunk of the namespace, and then the internal split will move it * to a different part of the tree where it will be written; in other * words, in one part of the tree we'll skip the newly created insert * split chunk, but we'll write it upon finding it in a different part * of the tree. */ if (!__wt_btree_can_evict_dirty(session)) { __split_internal_unlock(session, page); return (0); } /* * Page splits trickle up the tree, that is, as leaf pages grow large * enough and are evicted, they'll split into their parent. And, as * that parent page grows large enough and is evicted, it splits into * its parent and so on. When the page split wave reaches the root, * the tree will permanently deepen as multiple root pages are written. * * However, this only helps if internal pages are evicted (and we resist * evicting internal pages for obvious reasons), or if the tree were to * be closed and re-opened from a disk image, which may be a rare event. * * To avoid internal pages becoming too large absent eviction, check * parent pages each time pages are split into them. If the page is big * enough, either split the page into its parent or, in the case of the * root, deepen the tree. * * Split up the tree. */ for (;;) { parent = NULL; ref = page->pg_intl_parent_ref; /* If we don't need to split the page, we're done. */ if (!__split_internal_should_split(session, ref)) break; /* * If we've reached the root page, there are no subsequent pages * to review, deepen the tree and quit. */ if (__wt_ref_is_root(ref)) { ret = __split_root(session, page); break; } /* * Lock the parent and split into it, then swap the parent/page * locks, lock-coupling up the tree. 
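         *
         * This is a classic lock-coupling walk: acquire the parent's
         * lock, split into it, release the child's lock, then treat
         * the parent as the new child and repeat until no further
         * split is needed.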
*/ WT_ERR(__split_internal_lock(session, ref, true, &parent)); ret = __split_internal(session, parent, page); __split_internal_unlock(session, page); page = parent; parent = NULL; WT_ERR(ret); } err: if (parent != NULL) __split_internal_unlock(session, parent); __split_internal_unlock(session, page); /* A page may have been busy, in which case return without error. */ WT_RET_BUSY_OK(ret); return (0); } /* * __split_multi_inmem -- * Instantiate a page from a disk image. */ static int __split_multi_inmem( WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT_REF *ref) { WT_CURSOR_BTREE cbt; WT_DECL_ITEM(key); WT_DECL_RET; WT_PAGE *page; WT_SAVE_UPD *supd; WT_UPDATE *upd; uint64_t recno; uint32_t i, slot; WT_ASSERT(session, multi->page_las.las_pageid == 0); /* * In 04/2016, we removed column-store record numbers from the WT_PAGE * structure, leading to hard-to-debug problems because we corrupt the * page if we search it using the wrong initial record number. For now, * assert the record number is set. */ WT_ASSERT(session, orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0); /* * This code re-creates an in-memory page from a disk image, and adds * references to any unresolved update chains to the new page. We get * here either because an update could not be written when evicting a * page, or eviction chose to keep a page in memory. * * Steal the disk image and link the page into the passed-in WT_REF to * simplify error handling: our caller will not discard the disk image * when discarding the original page, and our caller will discard the * allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem( session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, &page)); multi->disk_image = NULL; /* * Put the re-instantiated page in the same LRU queue location as the * original page, unless this was a forced eviction, in which case we * leave the new page with the read generation unset. Eviction will * set the read generation next time it visits this page. */ if (!WT_READGEN_EVICT_SOON(orig->read_gen)) page->read_gen = orig->read_gen; /* If there are no updates to apply to the page, we're done. */ if (multi->supd_entries == 0) return (0); if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); /* Re-create each modification we couldn't write. */ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) { switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: /* Build a key. */ upd = supd->ins->upd; recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ WT_ERR(__wt_col_search( session, recno, ref, &cbt, true)); /* Apply the modification. */ WT_ERR(__wt_col_modify(session, &cbt, recno, NULL, upd, WT_UPDATE_INVALID, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ if (supd->ins == NULL) { slot = WT_ROW_SLOT(orig, supd->ripcip); upd = orig->modify->mod_row_update[slot]; WT_ERR(__wt_row_leaf_key( session, orig, supd->ripcip, key, false)); } else { upd = supd->ins->upd; key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } /* Search the page. */ WT_ERR(__wt_row_search( session, key, ref, &cbt, true, true)); /* * Birthmarks should only be applied to on-page values. */ WT_ASSERT(session, cbt.compare == 0 || upd->type != WT_UPDATE_BIRTHMARK); /* Apply the modification. 
*/ WT_ERR(__wt_row_modify(session, &cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); break; WT_ILLEGAL_VALUE_ERR(session); } } /* * When modifying the page we set the first dirty transaction to the * last transaction currently running. However, the updates we made * might be older than that. Set the first dirty transaction to an * impossibly old value so this page is never skipped in a checkpoint. */ page->modify->first_dirty_txn = WT_TXN_FIRST; /* * If the new page is modified, save the eviction generation to avoid * repeatedly attempting eviction on the same page. */ page->modify->last_evict_pass_gen = orig->modify->last_evict_pass_gen; page->modify->last_eviction_id = orig->modify->last_eviction_id; __wt_timestamp_set(&page->modify->last_eviction_timestamp, &orig->modify->last_eviction_timestamp); page->modify->update_restored = 1; err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt, true)); __wt_scr_free(session, &key); return (ret); } /* * __split_multi_inmem_final -- * Discard moved update lists from the original page. */ static void __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi) { WT_SAVE_UPD *supd; uint32_t i, slot; /* * We successfully created new in-memory pages. For error-handling * reasons, we've left the update chains referenced by both the original * and new pages. We're ready to discard the original page, terminate * the original page's reference to any update list we moved. */ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: supd->ins->upd = NULL; break; case WT_PAGE_ROW_LEAF: if (supd->ins == NULL) { slot = WT_ROW_SLOT(orig, supd->ripcip); orig->modify->mod_row_update[slot] = NULL; } else supd->ins->upd = NULL; break; } } /* * __split_multi_inmem_fail -- * Discard allocated pages after failure. */ static void __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) { /* * We failed creating new in-memory pages. For error-handling reasons, * we've left the update chains referenced by both the original and * new pages. Discard the new allocated WT_REF structures and their * pages (setting a flag so the discard code doesn't discard the updates * on the page). * * Our callers allocate WT_REF arrays, then individual WT_REFs, check * for uninitialized information. */ if (ref != NULL) { if (ref->page != NULL) F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE); __wt_free_ref(session, ref, orig->type, true); } } /* * __wt_multi_to_ref -- * Move a multi-block entry into a WT_REF structure. */ int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) { WT_ADDR *addr; WT_IKEY *ikey; WT_REF *ref; /* Allocate an underlying WT_REF. */ WT_RET(__wt_calloc_one(session, refp)); ref = *refp; if (incrp) *incrp += sizeof(WT_REF); /* * Set the WT_REF key before (optionally) building the page, underlying * column-store functions need the page's key space to search it. */ switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: ikey = multi->key.ikey; WT_RET(__wt_row_ikey( session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); if (incrp) *incrp += sizeof(WT_IKEY) + ikey->size; break; default: ref->ref_recno = multi->key.recno; break; } /* * There can be an address or a disk image or both, but if there is * neither, there must be a backing lookaside page. 
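     *
     * That is, a legal multi-block entry carries an address (the block
     * was written), a disk image (possibly with saved updates, for
     * in-memory restoration), or both; only an entry backed by a
     * lookaside page may carry neither.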
*/ WT_ASSERT(session, multi->page_las.las_pageid != 0 || multi->addr.addr != NULL || multi->disk_image != NULL); /* If closing the file, there better be an address. */ WT_ASSERT(session, !closing || multi->addr.addr != NULL); /* If closing the file, there better not be any saved updates. */ WT_ASSERT(session, !closing || multi->supd == NULL); /* If there are saved updates, there better be a disk image. */ WT_ASSERT(session, multi->supd == NULL || multi->disk_image != NULL); /* Verify any disk image we have. */ WT_ASSERT(session, multi->disk_image == NULL || __wt_verify_dsk_image(session, "[page instantiate]", multi->disk_image, 0, true) == 0); /* * If there's an address, the page was written, set it. * * Copy the address: we could simply take the buffer, but that would * complicate error handling, freeing the reference array would have * to avoid freeing the memory, and it's not worth the confusion. */ if (multi->addr.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); ref->addr = addr; addr->size = multi->addr.size; addr->type = multi->addr.type; WT_RET(__wt_memdup(session, multi->addr.addr, addr->size, &addr->addr)); ref->state = WT_REF_DISK; } /* * Copy any associated lookaside reference, potentially resetting * WT_REF.state. Regardless of a backing address, WT_REF_LOOKASIDE * overrides WT_REF_DISK. */ if (multi->page_las.las_pageid != 0) { /* * We should not have a disk image if we did lookaside * eviction. */ WT_ASSERT(session, multi->disk_image == NULL); WT_RET(__wt_calloc_one(session, &ref->page_las)); *ref->page_las = multi->page_las; WT_ASSERT(session, ref->page_las->las_max_txn != WT_TXN_NONE); ref->state = WT_REF_LOOKASIDE; } /* * If we have a disk image and we're not closing the file, * re-instantiate the page. * * Discard any page image we don't use. */ if (multi->disk_image != NULL && !closing) { WT_RET(__split_multi_inmem(session, page, multi, ref)); ref->state = WT_REF_MEM; } __wt_free(session, multi->disk_image); return (0); } /* * __split_insert -- * Split a page's last insert list entries into a separate page. */ static int __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_ITEM(key); WT_DECL_RET; WT_INSERT *ins, **insp, *moved_ins, *prev_ins; WT_INSERT_HEAD *ins_head, *tmp_ins_head; WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_incr, right_incr; uint8_t type; int i; WT_STAT_CONN_INCR(session, cache_inmem_split); WT_STAT_DATA_INCR(session, cache_inmem_split); page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; type = page->type; /* * Assert splitting makes sense; specifically assert the page is dirty, * we depend on that, otherwise the page might be evicted based on its * last reconciliation which no longer matches reality after the split. * * Note this page has already been through an in-memory split. */ WT_ASSERT(session, __wt_leaf_page_can_split(session, page)); WT_ASSERT(session, __wt_page_is_modified(page)); F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* Find the last item on the page. */ if (type == WT_PAGE_ROW_LEAF) ins_head = page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->entries - 1); else ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); /* * The first page in the split is the current page, but we still have * to create a replacement WT_REF, the original WT_REF will be set to * split status and eventually freed. 
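     *
     * (Readers may still hold the original WT_REF; it can't simply be
     * reused, marking it WT_REF_SPLIT instead forces those readers to
     * restart their descent and find the replacement through the
     * parent's new index.)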
 *
 * The new WT_REF is not quite identical: we have to instantiate a key,
 * and the new reference is visible to readers once the split completes.
 *
 * Don't copy any deleted page state: we may be splitting a page that
 * was instantiated after a truncate and that history should not be
 * carried onto these new child pages.
 */
    WT_ERR(__wt_calloc_one(session, &split_ref[0]));
    parent_incr += sizeof(WT_REF);
    child = split_ref[0];
    child->page = ref->page;
    child->home = ref->home;
    child->pindex_hint = ref->pindex_hint;
    child->state = WT_REF_MEM;
    child->addr = ref->addr;

    /*
     * The address has moved to the replacement WT_REF. Make sure it isn't
     * freed when the original ref is discarded.
     */
    ref->addr = NULL;

    if (type == WT_PAGE_ROW_LEAF) {
        /*
         * Copy the first key from the original page into the first ref
         * in the new parent. Pages created in memory always have a
         * "smallest" insert list, so look there first. If we don't
         * find one, get the first key from the disk image.
         *
         * We can't just use the key from the original ref: it may have
         * been suffix-compressed, and after the split the truncated key
         * may not be valid.
         */
        WT_ERR(__wt_scr_alloc(session, 0, &key));
        if ((ins =
            WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
            key->data = WT_INSERT_KEY(ins);
            key->size = WT_INSERT_KEY_SIZE(ins);
        } else
            WT_ERR(__wt_row_leaf_key(
                session, page, &page->pg_row[0], key, true));
        WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
        parent_incr += sizeof(WT_IKEY) + key->size;
        __wt_scr_free(session, &key);
    } else
        child->ref_recno = ref->ref_recno;

    /*
     * The second page in the split is a new WT_REF/page pair.
     */
    WT_ERR(__wt_page_alloc(session, type, 0, false, &right));

    /*
     * The new page is dirty by definition, plus column-store splits update
     * the page-modify structure, so create it now.
     */
    WT_ERR(__wt_page_modify_init(session, right));
    __wt_page_modify_set(session, right);

    if (type == WT_PAGE_ROW_LEAF) {
        WT_ERR(__wt_calloc_one(
            session, &right->modify->mod_row_insert));
        WT_ERR(__wt_calloc_one(
            session, &right->modify->mod_row_insert[0]));
    } else {
        WT_ERR(__wt_calloc_one(
            session, &right->modify->mod_col_append));
        WT_ERR(__wt_calloc_one(
            session, &right->modify->mod_col_append[0]));
    }
    right_incr += sizeof(WT_INSERT_HEAD);
    right_incr += sizeof(WT_INSERT_HEAD *);

    WT_ERR(__wt_calloc_one(session, &split_ref[1]));
    parent_incr += sizeof(WT_REF);
    child = split_ref[1];
    child->page = right;
    child->state = WT_REF_MEM;
    if (type == WT_PAGE_ROW_LEAF) {
        WT_ERR(__wt_row_ikey(session, 0, WT_INSERT_KEY(moved_ins),
            WT_INSERT_KEY_SIZE(moved_ins), child));
        parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
    } else
        child->ref_recno = WT_INSERT_RECNO(moved_ins);

    /*
     * Allocation operations completed, we're going to split.
     *
     * Record the split column-store page record, used in reconciliation.
     */
    if (type != WT_PAGE_ROW_LEAF) {
        WT_ASSERT(session,
            page->modify->mod_col_split_recno == WT_RECNO_OOB);
        page->modify->mod_col_split_recno = child->ref_recno;
    }

    /*
     * Calculate how much memory we're moving: figure out how deep the skip
     * list stack is for the element we are moving, and the memory used by
     * the item's list of updates.
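     *
     * For example (hypothetical sizes): if the moved element is the tail
     * of skip list levels 0-2, the loop below stops with i == 3 and we
     * transfer sizeof(WT_INSERT) plus 3 next-pointers, plus the insert
     * key for row stores, plus whatever __wt_update_list_memsize reports
     * for the update chain.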
 */
    for (i = 0; i < WT_SKIP_MAXDEPTH &&
        ins_head->tail[i] == moved_ins; ++i)
        ;
    WT_MEM_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) +
        (size_t)i * sizeof(WT_INSERT *));
    if (type == WT_PAGE_ROW_LEAF)
        WT_MEM_TRANSFER(
            page_decr, right_incr, WT_INSERT_KEY_SIZE(moved_ins));
    WT_MEM_TRANSFER(
        page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd));

    /*
     * Move the last insert list item from the original page to the new
     * page.
     *
     * First, update the item to the new child page. (Just append the entry
     * for simplicity, the previous skip list pointers originally allocated
     * can be ignored.)
     */
    tmp_ins_head = type == WT_PAGE_ROW_LEAF ?
        right->modify->mod_row_insert[0] :
        right->modify->mod_col_append[0];
    tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins;

    /*
     * Remove the entry from the orig page (i.e., truncate the skip list).
     * Following is an example skip list that might help.
     *
     *               __
     *              |c3|
     *               |
     *   __          __    __
     *  |a2|--------|c2|--|d2|
     *   |           |     |
     *   __          __    __          __
     *  |a1|--------|c1|--|d1|--------|f1|
     *   |           |     |           |
     *   __    __    __    __    __    __
     *  |a0|--|b0|--|c0|--|d0|--|e0|--|f0|
     *
     * From the above picture:
     * The head array will be: a0, a1, a2, c3, NULL
     * The tail array will be: f0, f1, d2, c3, NULL
     * We are looking for: e1, d2, NULL
     * If there were no f1, we'd be looking for: e0, NULL
     * If there were an f2, we'd be looking for: e0, d1, d2, NULL
     *
     * The algorithm does:
     * 1) Start at the top of the head list.
     * 2) Step down until we find a level that contains more than one
     *    element.
     * 3) Step across until we reach the tail of the level.
     * 4) If the tail is the item being moved, remove it.
     * 5) Drop down a level, and go to step 3 until at level 0.
     */
    prev_ins = NULL;        /* -Wconditional-uninitialized */
    for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i];
        i >= 0; i--, insp--) {
        /* Level empty, or a single element. */
        if (ins_head->head[i] == NULL ||
            ins_head->head[i] == ins_head->tail[i]) {
            /* Remove if it is the element being moved. */
            if (ins_head->head[i] == moved_ins)
                ins_head->head[i] = ins_head->tail[i] = NULL;
            continue;
        }

        for (ins = *insp; ins != ins_head->tail[i]; ins = ins->next[i])
            prev_ins = ins;

        /*
         * Update the stack head so that we step down as far to the
         * right as possible. We know that prev_ins is valid since
         * levels must contain at least two items to be here.
         */
        insp = &prev_ins->next[i];
        if (ins == moved_ins) {
            /* Remove the item being moved. */
            WT_ASSERT(session, ins_head->head[i] != moved_ins);
            WT_ASSERT(session, prev_ins->next[i] == moved_ins);
            *insp = NULL;
            ins_head->tail[i] = prev_ins;
        }
    }

#ifdef HAVE_DIAGNOSTIC
    /*
     * Verify the moved insert item appears nowhere on the skip list.
     */
    for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i];
        i >= 0; i--, insp--)
        for (ins = *insp; ins != NULL; ins = ins->next[i])
            WT_ASSERT(session, ins != moved_ins);
#endif

    /*
     * We perform insert splits concurrently with checkpoints, where the
     * requirement is a checkpoint must include either the original page
     * or both new pages. The page we're splitting is dirty, but that's
     * insufficient: set the first dirty transaction to an impossibly old
     * value so this page is not skipped by a checkpoint.
     */
    page->modify->first_dirty_txn = WT_TXN_FIRST;

    /*
     * We modified the page above, which will have set the first dirty
     * transaction to the last transaction currently running. However, the
     * updates we installed may be older than that. Set the first dirty
     * transaction to an impossibly old value so this page is never skipped
     * in a checkpoint.
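     *
     * (WT_TXN_FIRST sorts before any running transaction ID, so a
     * checkpoint can't treat the page as clean with respect to its
     * snapshot and skip it.)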
#ifdef HAVE_DIAGNOSTIC
	/*
	 * Verify the moved insert item appears nowhere on the skip list.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i];
	    i >= 0; i--, insp--)
		for (ins = *insp; ins != NULL; ins = ins->next[i])
			WT_ASSERT(session, ins != moved_ins);
#endif

	/*
	 * We perform insert splits concurrently with checkpoints, where the
	 * requirement is a checkpoint must include either the original page
	 * or both new pages.  The page we're splitting is dirty, but that's
	 * insufficient: set the first dirty transaction to an impossibly old
	 * value so this page is not skipped by a checkpoint.
	 */
	page->modify->first_dirty_txn = WT_TXN_FIRST;

	/*
	 * We modified the page above, which will have set the first dirty
	 * transaction to the last transaction currently running.  However,
	 * the updates we installed may be older than that.  Set the first
	 * dirty transaction to an impossibly old value so this page is never
	 * skipped in a checkpoint.
	 */
	right->modify->first_dirty_txn = WT_TXN_FIRST;

	/*
	 * Update the page accounting.
	 */
	__wt_cache_page_inmem_decr(session, page, page_decr);
	__wt_cache_page_inmem_incr(session, right, right_incr);

	/*
	 * The act of splitting into the parent releases the pages for
	 * eviction; ensure the page contents are consistent.
	 */
	WT_WRITE_BARRIER();

	/*
	 * Split into the parent.
	 */
	if ((ret = __split_parent(
	    session, ref, split_ref, 2, parent_incr, false, true)) == 0)
		return (0);

	/*
	 * Failure.
	 *
	 * Reset the split column-store page record.
	 */
	if (type != WT_PAGE_ROW_LEAF)
		page->modify->mod_col_split_recno = WT_RECNO_OOB;

	/*
	 * Clear the allocated page's reference to the moved insert list
	 * element so it's not freed when we discard the page.
	 *
	 * Move the element back to the original page list.  For simplicity,
	 * the previous skip list pointers originally allocated can be
	 * ignored; just append the entry to the end of the level 0 list.
	 * As before, we depend on the list having multiple elements and
	 * ignore the edge cases small lists have.
	 */
	if (type == WT_PAGE_ROW_LEAF)
		right->modify->mod_row_insert[0]->head[0] =
		    right->modify->mod_row_insert[0]->tail[0] = NULL;
	else
		right->modify->mod_col_append[0]->head[0] =
		    right->modify->mod_col_append[0]->tail[0] = NULL;

	ins_head->tail[0]->next[0] = moved_ins;
	ins_head->tail[0] = moved_ins;

	/* Fix up accounting for the page size. */
	__wt_cache_page_inmem_incr(session, page, page_decr);

err:	if (split_ref[0] != NULL) {
		/*
		 * The address was moved to the replacement WT_REF, restore
		 * it.
		 */
		ref->addr = split_ref[0]->addr;

		if (type == WT_PAGE_ROW_LEAF)
			__wt_free(session, split_ref[0]->ref_ikey);
		__wt_free(session, split_ref[0]);
	}
	if (split_ref[1] != NULL) {
		if (type == WT_PAGE_ROW_LEAF)
			__wt_free(session, split_ref[1]->ref_ikey);
		__wt_free(session, split_ref[1]);
	}
	if (right != NULL) {
		/*
		 * We marked the new page dirty; we're going to discard it,
		 * but first mark it clean and fix up the cache statistics.
		 */
		__wt_page_modify_clear(session, right);
		__wt_page_out(session, &right);
	}
	__wt_scr_free(session, &key);
	return (ret);
}

/*
 * __split_insert_lock --
 *	Split a page's last insert list entries into a separate page.
 */
static int
__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	/* Lock the parent page, then proceed with the insert split. */
	WT_RET(__split_internal_lock(session, ref, true, &parent));
	if ((ret = __split_insert(session, ref)) != 0) {
		__split_internal_unlock(session, parent);
		return (ret);
	}

	/*
	 * Split up through the tree as necessary; we're holding the original
	 * parent page locked, and the functions we call are responsible for
	 * releasing that lock.
	 */
	return (__split_parent_climb(session, parent));
}

/*
 * __wt_split_insert --
 *	Split a page's last insert list entries into a separate page.
 */
int
__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;

	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);

	/*
	 * Set the session split generation to ensure underlying code isn't
	 * surprised by internal page eviction, then proceed with the insert
	 * split.
	 */
	WT_WITH_PAGE_INDEX(session,
	    ret = __split_insert_lock(session, ref));
	return (ret);
}
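#if 0
/*
 * A standalone sketch of the skip-list truncation performed by
 * __split_insert, kept out of the build on purpose.  The types and names
 * below are invented for the example; only the algorithm matches the code
 * above.  Like the real code, it assumes the element being removed is the
 * last element at level 0 and that the list holds more than one element.
 */
#include <stddef.h>

#define EX_MAXDEPTH	10

struct ex_insert {
	struct ex_insert *next[EX_MAXDEPTH];
};
struct ex_head {
	struct ex_insert *head[EX_MAXDEPTH];
	struct ex_insert *tail[EX_MAXDEPTH];
};

/*
 * ex_truncate --
 *	Unlink the list's last element from every level, fixing up the head
 * and tail arrays.
 */
static void
ex_truncate(struct ex_head *h, struct ex_insert *moved)
{
	struct ex_insert *ins, *prev, **insp;
	int i;

	prev = NULL;
	for (i = EX_MAXDEPTH - 1, insp = &h->head[i]; i >= 0; --i, --insp) {
		/* Level empty, or a single element. */
		if (h->head[i] == NULL || h->head[i] == h->tail[i]) {
			if (h->head[i] == moved)
				h->head[i] = h->tail[i] = NULL;
			continue;
		}

		/* Step across to the last element on this level. */
		for (ins = *insp; ins != h->tail[i]; ins = ins->next[i])
			prev = ins;

		/* Step down as far to the right as possible. */
		insp = &prev->next[i];

		/* If the tail is the element being removed, unlink it. */
		if (ins == moved) {
			*insp = NULL;
			h->tail[i] = prev;
		}
	}
}
#endif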
/*
 * __split_multi --
 *	Split a page into multiple pages.
 */
static int
__split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF **ref_new;
	size_t parent_incr;
	uint32_t i, new_entries;

	WT_STAT_CONN_INCR(session, cache_eviction_split_leaf);
	WT_STAT_DATA_INCR(session, cache_eviction_split_leaf);

	page = ref->page;
	mod = page->modify;
	new_entries = mod->mod_multi_entries;

	parent_incr = 0;

	/*
	 * Convert the split page's multiblock reconciliation information
	 * into an array of page reference structures.
	 */
	WT_RET(__wt_calloc_def(session, new_entries, &ref_new));
	for (i = 0; i < new_entries; ++i)
		WT_ERR(__wt_multi_to_ref(session,
		    page, &mod->mod_multi[i], &ref_new[i], &parent_incr,
		    closing));

	/*
	 * Split into the parent; if we're closing the file, we hold it
	 * exclusively.
	 */
	WT_ERR(__split_parent(
	    session, ref, ref_new, new_entries, parent_incr, closing, true));

	/*
	 * The split succeeded, we can no longer fail.
	 *
	 * Finalize the move, discarding moved update lists from the original
	 * page.
	 */
	for (i = 0; i < new_entries; ++i)
		__split_multi_inmem_final(page, &mod->mod_multi[i]);

	/*
	 * Pages with unresolved changes are not marked clean in
	 * reconciliation; mark the page clean now, then discard it.
	 */
	__wt_page_modify_clear(session, page);
	__wt_page_out(session, &page);

	if (0) {
err:		for (i = 0; i < new_entries; ++i)
			__split_multi_inmem_fail(session, page, ref_new[i]);
	}

	__wt_free(session, ref_new);
	return (ret);
}

/*
 * __split_multi_lock --
 *	Split a page into multiple pages.
 */
static int
__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	/* Lock the parent page, then proceed with the split. */
	WT_RET(__split_internal_lock(session, ref, false, &parent));
	if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
		__split_internal_unlock(session, parent);
		return (ret);
	}

	/*
	 * Split up through the tree as necessary; we're holding the original
	 * parent page locked, and the functions we call are responsible for
	 * releasing that lock.
	 */
	return (__split_parent_climb(session, parent));
}

/*
 * __wt_split_multi --
 *	Split a page into multiple pages.
 */
int
__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
	WT_DECL_RET;

	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);

	/*
	 * Set the session split generation to ensure underlying code isn't
	 * surprised by internal page eviction, then proceed with the split.
	 */
	WT_WITH_PAGE_INDEX(session,
	    ret = __split_multi_lock(session, ref, closing));
	return (ret);
}

/*
 * __split_reverse --
 *	Reverse split (rewrite a parent page's index to reflect an empty
 * page).
 */
static int
__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	/* Lock the parent page, then proceed with the reverse split. */
	WT_RET(__split_internal_lock(session, ref, false, &parent));
	ret = __split_parent(session, ref, NULL, 0, 0, false, true);
	__split_internal_unlock(session, parent);
	return (ret);
}

/*
 * __wt_split_reverse --
 *	Reverse split (rewrite a parent page's index to reflect an empty
 * page).
 */
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;

	__wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);

	/*
	 * Set the session split generation to ensure underlying code isn't
	 * surprised by internal page eviction, then proceed with the reverse
	 * split.
	 */
	WT_WITH_PAGE_INDEX(session,
	    ret = __split_reverse(session, ref));
	return (ret);
}
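/*
 * Note the pattern shared by the split entry points above: set the
 * session's split generation (WT_WITH_PAGE_INDEX), lock the parent page,
 * do the work, and, unless the file is being closed or this is a reverse
 * split, climb the tree splitting internal pages as necessary.  A reverse
 * split is simply __split_parent called with zero new references, which
 * removes the emptied page's WT_REF from the parent's index.
 */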
/*
 * __wt_split_rewrite --
 *	Rewrite an in-memory page with a new version.
 */
int
__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *new;

	page = ref->page;

	__wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref);

	/*
	 * This isn't a split: a reconciliation failed because we couldn't
	 * write something, and in the case of forced eviction, we need to
	 * stop this page from being such a problem.  We have exclusive
	 * access, rewrite the page in memory.  The code lives here because
	 * the split code knows how to re-create a page in memory after it's
	 * been reconciled, and that's exactly what we want to do.
	 *
	 * Build the new page.
	 *
	 * Allocate a WT_REF; the error path calls routines that free memory.
	 * The only field we need to set is the record number, as it's used
	 * by the search routines.
	 */
	WT_RET(__wt_calloc_one(session, &new));
	new->ref_recno = ref->ref_recno;

	WT_ERR(__split_multi_inmem(session, page, multi, new));

	/*
	 * The rewrite succeeded, we can no longer fail.
	 *
	 * Finalize the move, discarding moved update lists from the original
	 * page.
	 */
	__split_multi_inmem_final(page, multi);

	/*
	 * Discard the original page.
	 *
	 * Pages with unresolved changes are not marked clean during
	 * reconciliation; do it now.
	 *
	 * Don't count this as eviction making progress; we did a one-for-one
	 * rewrite of a page in memory, typical under cache pressure.
	 */
	__wt_page_modify_clear(session, page);
	F_SET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS);
	__wt_ref_out(session, ref);

	/* Swap the new page into place. */
	ref->page = new->page;
	WT_PUBLISH(ref->state, WT_REF_MEM);

	__wt_free(session, new);
	return (0);

err:	__split_multi_inmem_fail(session, page, new);
	return (ret);
}
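#if 0
/*
 * A standalone sketch of the publish-after-initialize pattern used when
 * the rewritten page is swapped into place above, kept out of the build
 * on purpose.  The types and names below are invented for the example:
 * the structure is fully initialized first, then a store with release
 * semantics (the moral equivalent of WT_PUBLISH) makes the state change
 * visible, so a reader that observes the new state also observes the
 * initialized contents.
 */
#include <stdatomic.h>

struct ex_ref {
	void *page;
	atomic_int state;	/* 0 == locked, 1 == in-memory */
};

static void
ex_publish(struct ex_ref *ref, void *new_page)
{
	/* Initialize everything readers might touch... */
	ref->page = new_page;

	/* ...then publish: the release barrier orders the writes. */
	atomic_store_explicit(&ref->state, 1, memory_order_release);
}
#endif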