/*-
 * Copyright (c) 2014-2017 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * When returning an error, we need to restore the cursor to a valid state
 * because the upper-level cursor code is likely to retry. This structure and
 * the associated functions are used to save and restore the cursor state.
 */
typedef struct {
	WT_ITEM key;		/* Snapshot of cursor->key */
	WT_ITEM value;		/* Snapshot of cursor->value */
	uint64_t recno;		/* Snapshot of cursor->recno */
	uint32_t flags;		/* Snapshot of cursor->flags */
} WT_CURFILE_STATE;

/*
 * __cursor_state_save --
 *	Record the cursor's flags, record number, key and value so they can be
 * put back if the operation fails.
 */
static inline void
__cursor_state_save(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
{
	/* Scalar fields first, then the key/value items. */
	state->flags = cursor->flags;
	state->recno = cursor->recno;
	WT_ITEM_SET(state->value, cursor->value);
	WT_ITEM_SET(state->key, cursor->key);
}

/*
 * __cursor_state_restore --
 *	Put a previously saved external state back into the cursor.
 */
static inline void
__cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
{
	cursor->recno = state->recno;

	/*
	 * Drop the internal key/value flags from the cursor, then turn back on
	 * whichever external key/value flags were set in the saved state.
	 */
	F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	F_SET(cursor, F_MASK(state, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT));

	/* An item is only put back if its external flag was saved. */
	if (F_ISSET(state, WT_CURSTD_VALUE_EXT))
		WT_ITEM_SET(cursor->value, state->value);
	if (F_ISSET(state, WT_CURSTD_KEY_EXT))
		WT_ITEM_SET(cursor->key, state->key);
}

/*
 * __cursor_page_pinned --
 *	Return if the cursor has a page pinned that hasn't been flagged for
 * forced eviction (testing for forced eviction means pages grown too large
 * are periodically released).
 */
static inline bool
__cursor_page_pinned(WT_CURSOR_BTREE *cbt)
{
	/* Without the active flag, the cursor isn't treated as pinning a page. */
	if (!F_ISSET(cbt, WT_CBT_ACTIVE))
		return (false);

	/* A page with the oldest read generation doesn't count as pinned. */
	return (cbt->ref->page->read_gen != WT_READGEN_OLDEST);
}

/*
 * __cursor_size_chk --
 *	Check the size of an item being inserted, returning an error if the
 * item cannot be stored and zero otherwise.
 */
static inline int
__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	size_t len;

	btree = S2BT(session);

	/* Fixed-length column-store values are exactly one byte. */
	if (btree->type == BTREE_COL_FIX) {
		if (kv->size != 1)
			WT_RET_MSG(session, EINVAL,
			    "item size of %" WT_SIZET_FMT " does not match "
			    "fixed-length file requirement of 1 byte",
			    kv->size);
		return (0);
	}

	/* Anything up to 1GB is accepted without further checks. */
	if (kv->size <= WT_GIGABYTE)
		return (0);

	/* Enforce the largest object the tree is willing to store. */
	if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
		WT_RET_MSG(session, EINVAL,
		    "item size of %" WT_SIZET_FMT " exceeds the maximum "
		    "supported WiredTiger size of %" PRIu32,
		    kv->size, WT_BTREE_MAX_OBJECT_SIZE);

	/*
	 * Ask the block manager whether it can write an item of this size,
	 * handing it a scratch copy of the length so the item isn't changed.
	 */
	bm = btree->bm;
	len = kv->size;
	ret = bm->write_size(bm, session, &len);
	if (ret != 0)
		WT_RET_MSG(session, ret,
		    "item size of %" WT_SIZET_FMT " refused by block manager",
		    kv->size);

	return (0);
}

/*
 * __cursor_disable_bulk --
 *	Disable bulk loads into a tree.
 *
 * Called by the insert and update paths before they modify the tree. Does
 * nothing if the tree's "original" flag is already clear. For an LSM primary
 * the flag is simply cleared; for any other tree, the caller that wins the
 * compare-and-swap clears evict_disabled_open and calls
 * __wt_evict_file_exclusive_off.
 */
static inline void
__cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree)
{
	/*
	 * Once a tree (other than the LSM primary) is no longer empty, eviction
	 * should pay attention to it, and it's no longer possible to bulk-load
	 * into it.
	 */
	if (!btree->original)
		return;
	if (btree->lsm_primary) {
		btree->original = 0;		/* Make the next test faster. */
		return;
	}

	/*
	 * We use a compare-and-swap here to avoid races among the first inserts
	 * into a tree.  Eviction is disabled when an empty tree is opened, and
	 * it must only be enabled once.
	 */
	if (__wt_atomic_cas8(&btree->original, 1, 0)) {
		/* Only the caller whose swap succeeded gets here. */
		btree->evict_disabled_open = false;
		__wt_evict_file_exclusive_off(session);
	}
}

/*
 * __cursor_fix_implicit --
 *	Return if the searched-for record implicitly exists: true only for a
 * fixed-length column-store where the search did not run past the end of the
 * tree.
 */
static inline bool
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
	/*
	 * When there's no exact match, column-store search returns the key
	 * nearest the searched-for key (continuing past keys smaller than the
	 * searched-for key to return the next-largest key). Therefore, if the
	 * returned comparison is -1, the searched-for key was larger than any
	 * row on the page's standard information or column-store insert list.
	 *
	 * If the returned comparison is NOT -1, there was a row equal to or
	 * larger than the searched-for key, and we implicitly create missing
	 * rows.
	 */
	return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}

/*
 * __wt_cursor_valid --
 *	Return if the cursor references a valid key/value pair.
 *
 * If updp is non-NULL it is first set to NULL, then set to the visible update
 * when one is found. It is left NULL when the function returns false, and when
 * it returns true based on the on-page object alone.
 */
bool
__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_COL *cip;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	btree = cbt->btree;
	page = cbt->ref->page;
	session = (WT_SESSION_IMPL *)cbt->iface.session;
	if (updp != NULL)
		*updp = NULL;

	/*
	 * We may be pointing to an insert object, and we may have a page with
	 * existing entries.  Insert objects always have associated update
	 * objects (the value).  Any update object may be deleted, or invisible
	 * to us.  In the case of an on-page entry, there is by definition a
	 * value that is visible to us, the original page cell.
	 *
	 * If we find a visible update structure, return our caller a reference
	 * to it because we don't want to repeatedly search for the update, it
	 * might suddenly become invisible (imagine a read-uncommitted session
	 * with another session's aborted insert), and we don't want to handle
	 * that potential error every time we look at the value.
	 *
	 * Unfortunately, the objects we might have and their relationships are
	 * different for the underlying page types.
	 *
	 * In the case of row-store, an insert object implies ignoring any page
	 * objects, no insert object can have the same key as an on-page object.
	 * For row-store:
	 *	if there's an insert object:
	 *		if there's a visible update:
	 *			exact match
	 *		else
	 *			no exact match
	 *	else
	 *		use the on-page object (which may have an associated
	 *		update object that may or may not be visible to us).
	 *
	 * Column-store is more complicated because an insert object can have
	 * the same key as an on-page object: updates to column-store rows
	 * are insert/object pairs, and an invisible update isn't the end as
	 * there may be an on-page object that is visible.  This changes the
	 * logic to:
	 *	if there's an insert object:
	 *		if there's a visible update:
	 *			exact match
	 *		else if the on-page object's key matches the insert key
	 *			use the on-page object
	 *	else
	 *		use the on-page object
	 *
	 * First, check for an insert object with a visible update (a visible
	 * update that's been deleted is not a valid key/value pair).
	 */
	if (cbt->ins != NULL &&
	    (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
		if (upd->type == WT_UPDATE_DELETED)
			return (false);
		if (updp != NULL)
			*updp = upd;
		return (true);
	}

	/*
	 * If we don't have an insert object, or in the case of column-store,
	 * there's an insert object but no update was visible to us and the key
	 * on the page is the same as the insert object's key, and the slot as
	 * set by the search function is valid, we can use the original page
	 * information.
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
		/*
		 * If search returned an insert object, there may or may not be
		 * a matching on-page object, we have to check.  Fixed-length
		 * column-store pages don't have slots, but map one-to-one to
		 * keys, check for retrieval past the end of the page.
		 */
		if (cbt->recno >= cbt->ref->ref_recno + page->entries)
			return (false);

		/*
		 * An update would have appeared as an "insert" object; no
		 * further checks to do.
		 */
		break;
	case BTREE_COL_VAR:
		/* The search function doesn't check for empty pages. */
		if (page->entries == 0)
			return (false);
		WT_ASSERT(session, cbt->slot < page->entries);

		/*
		 * Column-store updates are stored as "insert" objects. If
		 * search returned an insert object we can't return, the
		 * returned on-page object must be checked for a match.
		 */
		if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
			return (false);

		/*
		 * Although updates would have appeared as "insert" objects,
		 * variable-length column store deletes are written into the
		 * backing store; check the cell for a record already deleted
		 * when read.
		 */
		cip = &page->pg_var[cbt->slot];
		if ((cell = WT_COL_PTR(page, cip)) == NULL ||
		    __wt_cell_type(cell) == WT_CELL_DEL)
			return (false);
		break;
	case BTREE_ROW:
		/* The search function doesn't check for empty pages. */
		if (page->entries == 0)
			return (false);
		WT_ASSERT(session, cbt->slot < page->entries);

		/*
		 * See above: for row-store, no insert object can have the same
		 * key as an on-page object, we're done.
		 */
		if (cbt->ins != NULL)
			return (false);

		/*
		 * Check for an update. If none is visible, fall through and
		 * use the on-page object, leaving the caller's update
		 * reference NULL.
		 */
		if (page->modify != NULL &&
		    page->modify->mod_row_update != NULL &&
		    (upd = __wt_txn_read(session,
		    page->modify->mod_row_update[cbt->slot])) != NULL) {
			if (upd->type == WT_UPDATE_DELETED)
				return (false);
			if (updp != NULL)
				*updp = upd;
		}
		break;
	}
	return (true);
}

/*
 * __cursor_kv_return --
 *	Hand the key/value pair the cursor references on the page back to the
 * application: the key first, then the value.
 */
static inline int
__cursor_kv_return(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	/* A failure returning the key ends the call before the value. */
	WT_RET(__wt_key_return(session, cbt));

	return (__wt_value_return(session, cbt, upd));
}

/*
 * __cursor_col_search --
 *	Column-store search from a cursor.
 *
 * Searches for the cursor's record number (cbt->iface.recno) and returns the
 * result of __wt_col_search. Callers in this file pass a leaf reference to
 * search a pinned page, or NULL for the search they describe as being from
 * the root.
 */
static inline int
__cursor_col_search(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_REF *leaf)
{
	WT_DECL_RET;

	/* The search call is made inside the WT_WITH_PAGE_INDEX wrapper. */
	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt, false));
	return (ret);
}

/*
 * __cursor_row_search --
 *	Row-store search from a cursor.
 *
 * Searches for the cursor's key (cbt->iface.key) and returns the result of
 * __wt_row_search; the insert argument is passed through unchanged. Callers
 * in this file pass a leaf reference to search a pinned page, or NULL for the
 * search they describe as being from the root.
 */
static inline int
__cursor_row_search(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_REF *leaf, bool insert)
{
	WT_DECL_RET;

	/* The search call is made inside the WT_WITH_PAGE_INDEX wrapper. */
	WT_WITH_PAGE_INDEX(session, ret = __wt_row_search(
	    session, &cbt->iface.key, leaf, cbt, insert, false));
	return (ret);
}

/*
 * __cursor_col_modify_v --
 *	Apply a column-store modification at the cursor's record number, taking
 * the value from the caller rather than from the cursor.
 */
static inline int
__cursor_col_modify_v(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
{
	int ret;

	ret = __wt_col_modify(session,
	    cbt, cbt->iface.recno, value, NULL, modify_type, false);
	return (ret);
}

/*
 * __cursor_row_modify_v --
 *	Apply a row-store modification at the cursor's key, taking the value
 * from the caller rather than from the cursor.
 */
static inline int
__cursor_row_modify_v(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
{
	int ret;

	ret = __wt_row_modify(session,
	    cbt, &cbt->iface.key, value, NULL, modify_type, false);
	return (ret);
}

/*
 * __cursor_col_modify --
 *	Apply a column-store modification using the cursor's own record number
 * and value.
 */
static inline int
__cursor_col_modify(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
	int ret;

	ret = __wt_col_modify(session, cbt,
	    cbt->iface.recno, &cbt->iface.value, NULL, modify_type, false);
	return (ret);
}

/*
 * __cursor_row_modify --
 *	Apply a row-store modification using the cursor's own key and value.
 */
static inline int
__cursor_row_modify(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
	int ret;

	ret = __wt_row_modify(session, cbt,
	    &cbt->iface.key, &cbt->iface.value, NULL, modify_type, false);
	return (ret);
}

/*
 * __wt_btcur_reset --
 *	Invalidate the cursor position: clear the key-set and value-set flags,
 * count the reset and release the btree cursor's position.
 */
int
__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
{
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	/* The cursor no longer has a key or value set. */
	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

	/* Count the reset in the connection and data-source statistics. */
	WT_STAT_CONN_INCR(session, cursor_reset);
	WT_STAT_DATA_INCR(session, cursor_reset);

	return (__cursor_reset(cbt));
}

/*
 * __wt_btcur_search --
 *	Search for a matching record in the tree.
 *
 * Returns 0 with the key/value returned through the cursor on an exact match
 * (or for an implicit fixed-length column-store record), WT_NOTFOUND when
 * there's no match, or an error from one of the called functions. On any
 * non-zero return the cursor is reset and its saved state restored.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	bool valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */

	WT_STAT_CONN_INCR(session, cursor_search);
	WT_STAT_DATA_INCR(session, cursor_search);

	WT_RET(__wt_txn_search_check(session));
	/* Save the state first: the local-key copy below can fail. */
	__cursor_state_save(cursor, &state);

	/*
	 * The pinned page goes away if we search the tree, get a local copy of
	 * any pinned key and discard any pinned value, then re-save the cursor
	 * state. Done before searching pinned pages (unlike other cursor
	 * functions), because we don't anticipate applications searching for a
	 * key they currently have pinned.
	 */
	WT_ERR(__cursor_localkey(cursor));
	__cursor_novalue(cursor);
	__cursor_state_save(cursor, &state);

	/*
	 * If we have a page pinned, search it; if we don't have a page pinned,
	 * or the search of the pinned page doesn't find an exact match, search
	 * from the root.
	 */
	valid = false;
	if (__cursor_page_pinned(cbt)) {
		__wt_txn_cursor_op(session);

		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, cbt->ref, false) :
		    __cursor_col_search(session, cbt, cbt->ref));
		valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));

		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, false) :
		    __cursor_col_search(session, cbt, NULL));
		valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd);
	}

	if (valid)
		ret = __cursor_kv_return(session, cbt, upd);
	else if (__cursor_fix_implicit(btree, cbt)) {
		/*
		 * Creating a record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 *
		 * Return a single zero byte, held in the btree cursor, as the
		 * value.
		 */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	} else
		ret = WT_NOTFOUND;

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

	/* On failure, release the position and put back the saved state. */
err:	if (ret != 0) {
		WT_TRET(__cursor_reset(cbt));
		__cursor_state_restore(cursor, &state);
	}
	return (ret);
}

/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 *
 * On success, and if exactp is non-NULL, *exactp is set to: the search's
 * comparison value when the record found by the search is returned, 0 for an
 * implicit fixed-length column-store record, 1 when the cursor was moved to
 * the next record, or -1 when it was moved to the previous record. On any
 * non-zero return the cursor is reset and its saved state restored.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;
	bool valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */
	exact = 0;

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	WT_RET(__wt_txn_search_check(session));
	/* Save the state first: the local-key copy below can fail. */
	__cursor_state_save(cursor, &state);

	/*
	 * The pinned page goes away if we search the tree, get a local copy of
	 * any pinned key and discard any pinned value, then re-save the cursor
	 * state. Done before searching pinned pages (unlike other cursor
	 * functions), because we don't anticipate applications searching for a
	 * key they currently have pinned.
	 */
	WT_ERR(__cursor_localkey(cursor));
	__cursor_novalue(cursor);
	__cursor_state_save(cursor, &state);

	/*
	 * If we have a row-store page pinned, search it; if we don't have a
	 * page pinned, or the search of the pinned page doesn't find an exact
	 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
	 * pages in the case of column-store, search-near isn't an interesting
	 * enough case for column-store to add the complexity needed to avoid
	 * the tree search.
	 *
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position the cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	valid = false;
	if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
		__wt_txn_cursor_op(session);

		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

		/*
		 * Search-near is trickier than search when searching an already
		 * pinned page. If search returns the first or last page slots,
		 * discard the results and search the full tree as the neighbor
		 * pages might offer better matches. This test is simplistic as
		 * we're ignoring append lists (there may be no page slots or we
		 * might be legitimately positioned after the last page slot).
		 * Ignore those cases, it makes things too complicated.
		 */
		if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1)
			valid = __wt_cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		valid = __wt_cursor_valid(cbt, &upd);
	}

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 *
	 * The local "exact" may be set even when the move to the next or
	 * previous key fails with an error other than WT_NOTFOUND; that's
	 * harmless, it's only copied to the caller when the return is zero.
	 */
	if (valid) {
		exact = cbt->compare;
		ret = __cursor_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	} else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		if (__wt_cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __cursor_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
			exact = -1;
	}

	/* Only report the comparison to the caller on success. */
err:	if (ret == 0 && exactp != NULL)
		*exactp = exact;

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_TRET(__wt_cursor_key_order_init(session, cbt));
#endif

	/* On failure, release the position and put back the saved state. */
	if (ret != 0) {
		WT_TRET(__cursor_reset(cbt));
		__cursor_state_restore(cursor, &state);
	}
	return (ret);
}

/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 *
 * Returns 0 on success, WT_DUPLICATE_KEY when overwrite isn't configured and
 * the record already exists, or an error from one of the called functions.
 * WT_RESTART is handled internally by repeating the search and modify. The
 * btree cursor is reset on every exit path; on failure the saved cursor state
 * is restored.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	bool append_key;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_insert);
	WT_STAT_DATA_INCR(session, cursor_insert);
	WT_STAT_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	/* Only row-stores have a key item to size-check. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/* It's no longer possible to bulk-load into the tree. */
	__cursor_disable_bulk(session, btree);

	/*
	 * Insert a new record if WT_CURSTD_APPEND configured, (ignoring any
	 * application set record number). Although append can't be configured
	 * for a row-store, this code would break if it were, and that's owned
	 * by the upper cursor layer, be cautious.
	 */
	append_key =
	    F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW;

	/* Save the cursor state. */
	__cursor_state_save(cursor, &state);

	/*
	 * If inserting with overwrite configured, and positioned to an on-page
	 * key, the update doesn't require another search. The cursor won't be
	 * positioned on a page with an external key set, but be sure. Cursors
	 * configured for append aren't included, regardless of whether or not
	 * they meet all other criteria.
	 */
	if (__cursor_page_pinned(cbt) &&
	    F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE) &&
	    !append_key) {
		WT_ERR(__wt_txn_autocommit_check(session));
		/*
		 * The cursor position may not be exact (the cursor's comparison
		 * value not equal to zero). Correct to an exact match so we can
		 * update whatever we're pointing at.
		 */
		cbt->compare = 0;
		ret = btree->type == BTREE_ROW ?
		    __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD) :
		    __cursor_col_modify(session, cbt, WT_UPDATE_STANDARD);
		if (ret == 0)
			goto done;

		/*
		 * The pinned page goes away if we fail for any reason, get a
		 * local copy of any pinned key or value. (Restart could still
		 * use the pinned page, but that's an unlikely path.) Re-save
		 * the cursor state: we may retry but eventually fail.
		 */
		WT_TRET(__cursor_localkey(cursor));
		WT_TRET(__cursor_localvalue(cursor));
		__cursor_state_save(cursor, &state);
		goto err;
	}

	/*
	 * The pinned page goes away if we do a search, get a local copy of any
	 * pinned key or value. Re-save the cursor state: we may retry but
	 * eventually fail.
	 */
	WT_ERR(__cursor_localkey(cursor));
	WT_ERR(__cursor_localvalue(cursor));
	__cursor_state_save(cursor, &state);

retry:	WT_ERR(__cursor_func_init(cbt, true));

	if (btree->type == BTREE_ROW) {
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));
		/*
		 * If not overwriting, fail if the key exists, else insert the
		 * key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    cbt->compare == 0 && __wt_cursor_valid(cbt, NULL))
			WT_ERR(WT_DUPLICATE_KEY);

		ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD);
	} else {
		/*
		 * Optionally insert a new record (ignoring the application's
		 * record number). The real record number is allocated by the
		 * serialized append operation.
		 */
		if (append_key)
			cbt->iface.recno = WT_RECNO_OOB;

		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If not overwriting, fail if the key exists.  Creating a
		 * record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 * Fail in that case, the record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
			WT_ERR(WT_DUPLICATE_KEY);

		WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD));

		/* Hand the record number used for the append to the cursor. */
		if (append_key)
			cbt->iface.recno = cbt->recno;
	}

	/* A restart repeats the search and modify from the retry label. */
err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

done:	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0) {
		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
		/* After an append, the cursor's key is flagged as internal. */
		if (append_key)
			F_SET(cursor, WT_CURSTD_KEY_INT);
	}
	WT_TRET(__cursor_reset(cbt));
	if (ret != 0)
		__cursor_state_restore(cursor, &state);

	return (ret);
}

/*
 * __curfile_update_check --
 *	Check whether an update would conflict.
 *
 *	The cursor must already be positioned.  Call this before deciding to
 *	skip an update operation because no update is visible for the key: an
 *	update can conflict even when the transaction can't see any value.
 */
static int
__curfile_update_check(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_SESSION_IMPL *session;

	/* Nothing is checked unless the search found an exact match. */
	if (cbt->compare != 0)
		return (0);

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/* An insert object carries its own update list, check that. */
	if (cbt->ins != NULL)
		return (__wt_txn_update_check(session, cbt->ins->upd));

	/*
	 * Otherwise only a row-store page with a row-update array has an
	 * update list to check, the one for the cursor's slot.
	 */
	btree = cbt->btree;
	if (btree->type != BTREE_ROW ||
	    cbt->ref->page->modify == NULL ||
	    cbt->ref->page->modify->mod_row_update == NULL)
		return (0);

	return (__wt_txn_update_check(session,
	    cbt->ref->page->modify->mod_row_update[cbt->slot]));
}

/*
 * __wt_btcur_insert_check --
 *	Check whether an update would conflict.
 *
 * This can replace WT_CURSOR::insert, so it only checks for conflicts without
 * updating the tree. It is used to maintain snapshot isolation for transactions
 * that span multiple chunks in an LSM tree.
 *
 * Only row-stores are supported, any other tree type is reported through
 * __wt_illegal_value. WT_RESTART is handled internally by repeating the
 * search; the btree cursor is reset on every exit path.
 */
int
__wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cursor->session;

	/*
	 * The pinned page goes away if we do a search, get a local copy of any
	 * pinned key and discard any pinned value. Unlike most of the btree
	 * cursor routines, we don't have to save/restore the cursor key state,
	 * none of the work done here changes the cursor state.
	 */
	WT_ERR(__cursor_localkey(cursor));
	__cursor_novalue(cursor);

retry:	WT_ERR(__cursor_func_init(cbt, true));

	if (btree->type == BTREE_ROW) {
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));

		/* Just check for conflicts. */
		ret = __curfile_update_check(cbt);
	} else
		WT_ERR(__wt_illegal_value(session, NULL));

	/* A restart repeats the search from the retry label. */
err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0)
		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
	WT_TRET(__cursor_reset(cbt));

	return (ret);
}

/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 *
 * Returns 0 on success, WT_NOTFOUND when there's no matching record (unless
 * the cursor is configured to overwrite, in which case that's success), or an
 * error from one of the called functions. WT_RESTART is handled internally by
 * repeating the search and modify. On failure the saved cursor state is
 * restored.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	bool positioned;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_remove);
	WT_STAT_DATA_INCR(session, cursor_remove);
	WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

	/*
	 * WT_CURSOR.remove has a unique semantic, the cursor stays positioned
	 * if it starts positioned, otherwise clear the cursor on completion.
	 */
	positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT);

	/* Save the cursor state. */
	__cursor_state_save(cursor, &state);

	/*
	 * If remove positioned to an on-page key, the remove doesn't require
	 * another search. We don't care about the "overwrite" configuration
	 * because regardless of the overwrite setting, any existing record is
	 * removed, and the record must exist with a positioned cursor. The
	 * cursor won't be positioned on a page with an external key set, but
	 * be sure.
	 */
	if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
		WT_ERR(__wt_txn_autocommit_check(session));

		/*
		 * The cursor position may not be exact (the cursor's comparison
		 * value not equal to zero). Correct to an exact match so we can
		 * remove whatever we're pointing at.
		 */
		cbt->compare = 0;
		ret = btree->type == BTREE_ROW ?
		    __cursor_row_modify(session, cbt, WT_UPDATE_DELETED) :
		    __cursor_col_modify(session, cbt, WT_UPDATE_DELETED);
		if (ret == 0)
			goto done;

		/*
		 * The pinned page goes away if we fail for any reason, get a
		 * local copy of any pinned key and discard any value (remove
		 * discards any previous value on success or failure). (Restart
		 * could still use the pinned page, but that's an unlikely
		 * path.) Re-save the cursor state: we may retry but eventually
		 * fail.
		 */
		WT_TRET(__cursor_localkey(cursor));
		F_CLR(cursor, WT_CURSTD_VALUE_SET);
		__cursor_state_save(cursor, &state);
		goto err;
	}

	/*
	 * The pinned page goes away if we do a search, get a local copy of any
	 * pinned key and discard any value (remove discards any previous
	 * value on success or failure). Re-save the cursor state: we may retry
	 * but eventually fail.
	 */
	WT_ERR(__cursor_localkey(cursor));
	F_CLR(cursor, WT_CURSTD_VALUE_SET);
	__cursor_state_save(cursor, &state);

retry:	WT_ERR(__cursor_func_init(cbt, true));

	if (btree->type == BTREE_ROW) {
		WT_ERR(__cursor_row_search(session, cbt, NULL, false));

		/* Check whether an update would conflict. */
		WT_ERR(__curfile_update_check(cbt));

		if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
			WT_ERR(WT_NOTFOUND);

		ret = __cursor_row_modify(session, cbt, WT_UPDATE_DELETED);
	} else {
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If we find a matching record, check whether an update would
		 * conflict.  Do this before checking if the update is visible
		 * in __wt_cursor_valid, or we can miss conflict.
		 */
		WT_ERR(__curfile_update_check(cbt));

		/* Remove the record if it exists. */
		if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) {
			if (!__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the
			 * gap with empty records.  Return success in that
			 * case, the record was deleted successfully.
			 *
			 * Correct the btree cursor's location: the search
			 * will have pointed us at the previous/next item,
			 * and that's not correct.
			 */
			cbt->recno = cursor->recno;
		} else
			ret = __cursor_col_modify(
			    session, cbt, WT_UPDATE_DELETED);
	}

	/* A restart repeats the search and modify from the retry label. */
err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/*
	 * If the cursor is configured to overwrite and the record is not found,
	 * that is exactly what we want, return success.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
		ret = 0;

done:	/*
	 * If the cursor was positioned, it stays positioned, point the cursor
	 * at an internal copy of the key. Otherwise, there's no position or
	 * key/value.
	 */
	if (ret == 0)
		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
	if (ret == 0 && positioned)
		WT_TRET(__wt_key_return(session, cbt));
	else
		WT_TRET(__cursor_reset(cbt));
	if (ret != 0)
		__cursor_state_restore(cursor, &state);

	return (ret);
}

/*
 * __btcur_update --
 *	Update a record in the tree. This is the shared implementation behind
 * the cursor update, modify and reserve operations: the value is handed to
 * the row-store or column-store modify function together with modify_type,
 * and modify_type also selects what the cursor returns on success. On any
 * failure the btree cursor is reset and the cursor's saved external state is
 * restored before the error is returned.
 */
static int
__btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
{
	WT_BTREE *btree;
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	/* It's no longer possible to bulk-load into the tree. */
	__cursor_disable_bulk(session, btree);

	/* Save the cursor state. */
	__cursor_state_save(cursor, &state);

	/*
	 * If update positioned to an on-page key, the update doesn't require
	 * another search. We don't care about the "overwrite" configuration
	 * because regardless of the overwrite setting, any existing record is
	 * updated, and the record must exist with a positioned cursor. The
	 * cursor won't be positioned on a page with an external key set, but
	 * be sure.
	 */
	if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
		WT_ERR(__wt_txn_autocommit_check(session));

		/*
		 * The cursor position may not be exact (the cursor's comparison
		 * value not equal to zero). Correct to an exact match so we can
		 * update whatever we're pointing at.
		 */
		cbt->compare = 0;
		ret = btree->type == BTREE_ROW ?
		    __cursor_row_modify_v(session, cbt, value, modify_type) :
		    __cursor_col_modify_v(session, cbt, value, modify_type);
		if (ret == 0)
			goto done;

		/*
		 * The pinned page goes away if we fail for any reason, get a
		 * local copy of any pinned key or value. (Restart could still
		 * use the pinned page, but that's an unlikely path.) Re-save
		 * the cursor state: we may retry but eventually fail.
		 *
		 * The copies use WT_TRET rather than WT_ERR so the state is
		 * re-saved even if a copy fails; the error label then decides
		 * between retrying (restart) and failing.
		 */
		WT_TRET(__cursor_localkey(cursor));
		WT_TRET(__cursor_localvalue(cursor));
		__cursor_state_save(cursor, &state);
		goto err;
	}

	/*
	 * The pinned page goes away if we do a search, get a local copy of any
	 * pinned key or value. Re-save the cursor state: we may retry but
	 * eventually fail.
	 */
	WT_ERR(__cursor_localkey(cursor));
	WT_ERR(__cursor_localvalue(cursor));
	__cursor_state_save(cursor, &state);

	/* Slow path: (re-)initialize the cursor, search, then modify. */
retry:	WT_ERR(__cursor_func_init(cbt, true));

	if (btree->type == BTREE_ROW) {
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));

		/*
		 * If not overwriting, check for conflicts and fail if the key
		 * does not exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_row_modify_v(session, cbt, value, modify_type);
	} else {
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If not overwriting, fail if the key doesn't exist.  If we
		 * find an update for the key, check for conflicts.  Update the
		 * record if it exists.  Creating a record past the end of the
		 * tree in a fixed-length column-store implicitly fills the gap
		 * with empty records.  Update the record in that case, the
		 * record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if ((cbt->compare != 0 ||
			    !__wt_cursor_valid(cbt, NULL)) &&
			    !__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_col_modify_v(session, cbt, value, modify_type);
	}

	/*
	 * A restart error sends us back through the search path (counted in
	 * the connection and data-source statistics); any other result,
	 * success included, falls through to the common exit code.
	 */
err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/*
	 * If successful, point the cursor at internal copies of the data.  We
	 * could shuffle memory in the cursor so the key/value pair are in local
	 * buffer memory, but that's a data copy.  We don't want to do another
	 * search (and we might get a different update structure if we race).
	 * To make this work, we add a field to the btree cursor to pass back a
	 * pointer to the modify function's allocated update structure.
	 */
done:	if (ret == 0)
		switch (modify_type) {
		case WT_UPDATE_STANDARD:
			/*
			 * WT_CURSOR.update returns a key and a value.
			 */
			WT_TRET(__cursor_kv_return(
			    session, cbt, cbt->modify_update));
			break;
		case WT_UPDATE_RESERVED:
			/*
			 * WT_CURSOR.reserve doesn't return any value.
			 */
			F_CLR(cursor, WT_CURSTD_VALUE_SET);
			/* FALLTHROUGH */
		case WT_UPDATE_MODIFIED:
			/*
			 * WT_CURSOR.modify has already created the return value
			 * and our job is to leave it untouched.
			 */
			WT_TRET(__wt_key_return(session, cbt));
			break;
		case WT_UPDATE_DELETED:
		default:
			/*
			 * Deleted and unknown update types are not handled by
			 * this function, report them as illegal values.
			 */
			WT_TRET(__wt_illegal_value(session, NULL));
			break;
		}

	/*
	 * On failure (including a failure returning the key/value above),
	 * release the cursor's position and restore the saved external state.
	 */
	if (ret != 0) {
		WT_TRET(__cursor_reset(cbt));
		__cursor_state_restore(cursor, &state);
	}

	return (ret);
}

/*
 * __cursor_chain_exceeded --
 *	Return true if the update chain for the cursor's current position has
 * grown past the modify limit without reaching a full data value. A deleted
 * or standard update ends the walk: it's expected to be a sufficient base for
 * a modify (not guaranteed, it may not be visible or might abort before we
 * read it). The limit is soft, threads can race adding modify updates.
 */
static bool
__cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
{
	WT_PAGE *page;
	WT_UPDATE *upd;
	int depth;

	page = cbt->ref->page;

	/*
	 * Find the head of the chain: the insert list entry if there is one,
	 * otherwise the row-store page's update slot, otherwise no chain.
	 */
	if (cbt->ins != NULL)
		upd = cbt->ins->upd;
	else if (cbt->btree->type == BTREE_ROW &&
	    page->modify != NULL && page->modify->mod_row_update != NULL)
		upd = page->modify->mod_row_update[cbt->slot];
	else
		upd = NULL;

	depth = 0;
	while (upd != NULL) {
		/* A complete value is a usable base, the chain is fine. */
		if (WT_UPDATE_DATA_VALUE(upd))
			return (false);
		if (depth >= WT_MAX_MODIFY_UPDATE)
			return (true);
		++depth;
		upd = upd->next;
	}
	return (false);
}

/*
 * __wt_btcur_modify --
 *     Apply a list of modify entries to the record the cursor references.
 */
int
__wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
{
	WT_CURFILE_STATE state;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(modify);
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t size_after, size_before;
	bool overwrite;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_modify);
	WT_STAT_DATA_INCR(session, cursor_modify);

	/* Remember the caller's view of the cursor so errors can undo ours. */
	__cursor_state_save(cursor, &state);

	/*
	 * The current value is read and the modification applied to it up
	 * front. That gives us: the value the application reads back from the
	 * cursor, a complete value to store when the update chain is too long,
	 * something to size-check, an easy count of bytes added or removed,
	 * and protection against modifying a value that doesn't exist. The
	 * last point is why read-uncommitted transactions are refused: the
	 * value could disappear underneath us.
	 */
	if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
		WT_ERR_MSG(session, ENOTSUP,
		    "not supported in read-uncommitted transactions");

	WT_ERR(__wt_btcur_search(cbt));
	size_before = cursor->value.size;
	WT_ERR(__wt_modify_apply_api(
	    session, &cursor->value, entries, nentries));
	size_after = cursor->value.size;
	WT_ERR(__cursor_size_chk(session, &cursor->value));
	if (size_after > size_before)
		WT_STAT_DATA_INCRV(session,
		    cursor_update_bytes, size_after - size_before);
	else
		WT_STAT_DATA_DECRV(session,
		    cursor_update_bytes, size_before - size_after);

	/*
	 * WT_CURSOR.modify is update-without-overwrite, so the overwrite flag
	 * is turned off around the update and put back afterwards.
	 *
	 * Store the packed modify entries only when the value is big enough
	 * for that to save memory and the update chain is under the limit;
	 * in every other case store the complete value.
	 */
	overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
	F_CLR(cursor, WT_CURSTD_OVERWRITE);
	if (cursor->value.size > 64 && !__cursor_chain_exceeded(cbt)) {
		ret = __wt_modify_pack(session, &modify, entries, nentries);
		if (ret == 0)
			ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFIED);
	} else
		ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
	if (overwrite)
		F_SET(cursor, WT_CURSTD_OVERWRITE);

	if (ret == 0)
		goto done;

	/*
	 * The cursor was changed here before the underlying update function
	 * ran, so its own restoration isn't enough: restore the state saved
	 * on entry. The cursor may be reset more than once on this path, but
	 * that shouldn't be a problem.
	 */
err:	WT_TRET(__cursor_reset(cbt));
	__cursor_state_restore(cursor, &state);

done:	__wt_scr_free(session, &modify);
	return (ret);
}

/*
 * __wt_btcur_reserve --
 *     Reserve the record the cursor references.
 */
int
__wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_reserve);
	WT_STAT_DATA_INCR(session, cursor_reserve);

	/*
	 * WT_CURSOR.reserve is an update-without-overwrite using the special
	 * reserved update type. If overwrite is already off there's nothing
	 * to save and restore around the update.
	 */
	if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE))
		return (__btcur_update(
		    cbt, &cursor->value, WT_UPDATE_RESERVED));

	/* Otherwise switch overwrite off for the call, then back on. */
	F_CLR(cursor, WT_CURSTD_OVERWRITE);
	ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVED);
	F_SET(cursor, WT_CURSTD_OVERWRITE);
	return (ret);
}

/*
 * __wt_btcur_update --
 *     Replace the value of the record the cursor references.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_CURSOR *cursor;
	WT_ITEM *value;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	value = &cursor->value;

	WT_STAT_CONN_INCR(session, cursor_update);
	WT_STAT_DATA_INCR(session, cursor_update);
	WT_STAT_DATA_INCRV(session, cursor_update_bytes, value->size);

	/*
	 * Reject items too large to store: the value for every tree type, and
	 * the key as well for row-stores.
	 */
	if (cbt->btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, value));

	return (__btcur_update(cbt, value, WT_UPDATE_STANDARD));
}

/*
 * __wt_btcur_compare --
 *	Compare the positions of two cursors on the same tree, setting the
 * result to -1, 0 or 1 for column-stores, or to the key comparison result
 * for row-stores.
 */
int
__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
{
	WT_CURSOR *a, *b;
	WT_SESSION_IMPL *session;

	a = (WT_CURSOR *)a_arg;
	b = (WT_CURSOR *)b_arg;
	session = (WT_SESSION_IMPL *)a->session;

	/* Comparing cursors on different objects makes no sense. */
	if (a_arg->btree != b_arg->btree)
		WT_RET_MSG(
		    session, EINVAL, "Cursors must reference the same object");

	switch (a_arg->btree->type) {
	case BTREE_ROW:
		/* Row-store keys go through the tree's collator. */
		return (__wt_compare(
		    session, a_arg->btree->collator, &a->key, &b->key, cmpp));
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * Use the record number in the interface cursor rather than
		 * the one in the underlying btree cursor: the interface's is
		 * what the application sees.
		 */
		*cmpp = a->recno < b->recno ? -1 :
		    (a->recno == b->recno ? 0 : 1);
		break;
	}
	return (0);
}

/*
 * __cursor_equals --
 *	Return true if two positioned cursors reference the same row.
 */
static inline bool
__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
{
	switch (a->btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * Use the record number in the interface cursor rather than
		 * the one in the underlying btree cursor: the interface's is
		 * what the application sees.
		 */
		return (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno);
	case BTREE_ROW:
		/* Different pages can't be the same row. */
		if (a->ref != b->ref)
			return (false);

		/*
		 * If either cursor is on an insert list entry, they match
		 * only when both are on the very same entry; otherwise
		 * compare the on-page slots.
		 */
		if (a->ins != NULL || b->ins != NULL)
			return (a->ins == b->ins);
		return (a->slot == b->slot);
	}
	return (false);
}

/*
 * __wt_btcur_equals --
 *	Test two cursors on the same tree for equality, setting the result to
 * 1 if they reference the same record and 0 if they don't.
 */
int
__wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp)
{
	WT_CURSOR *a, *b;
	WT_SESSION_IMPL *session;
	int cmp;

	a = (WT_CURSOR *)a_arg;
	b = (WT_CURSOR *)b_arg;
	session = (WT_SESSION_IMPL *)a->session;
	cmp = 0;

	/* Comparing cursors on different objects makes no sense. */
	if (a_arg->btree != b_arg->btree)
		WT_RET_MSG(
		    session, EINVAL, "Cursors must reference the same object");

	/*
	 * An equals method exists because a full key comparison can sometimes
	 * be skipped: when both cursors point into the tree, comparing their
	 * positions is enough.
	 */
	if (F_ISSET(a, WT_CURSTD_KEY_INT) && F_ISSET(b, WT_CURSTD_KEY_INT)) {
		*equalp = __cursor_equals(a_arg, b_arg);
		return (0);
	}

	/* Otherwise fall back to the slower compare method. */
	WT_RET(__wt_btcur_compare(a_arg, b_arg, &cmp));
	*equalp = cmp == 0;
	return (0);
}

/*
 * __cursor_truncate --
 *	Remove the records in a cursor range from a row-store or
 * variable-length column-store tree, using the supplied modify function.
 */
static int
__cursor_truncate(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int))
{
	WT_DECL_RET;

	/*
	 * Begin with a cursor search to (re-)position the start cursor. It
	 * may have no position (the higher-level truncate code may have
	 * switched the cursors to an "external" key), and we don't keep a
	 * copy of the page's write generation information, which removing
	 * records requires.
	 *
	 * After that, records are removed without a full search. The
	 * exception is a restart error, returned when some other thread of
	 * control modified the page: then the full search is repeated to
	 * refresh the page's modification information.
	 *
	 * In a row-store, leaf pages without overflow items are deleted
	 * without being read. For that to work the page the ending cursor
	 * references must have been read, because only part of that final
	 * page may be in the truncation range. Our caller has already fully
	 * instantiated the end cursor, so that page is pinned in memory and
	 * there's nothing more to do here.
	 */
retry:	WT_RET(__wt_btcur_search(start));
	WT_ASSERT(session,
	    F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);

	/*
	 * Remove the current record, stop once the end of the range has been
	 * removed, otherwise advance; any failure ends the loop.
	 */
	while ((ret = rmfunc(session, start, WT_UPDATE_DELETED)) == 0) {
		if (stop != NULL && __cursor_equals(start, stop))
			break;
		if ((ret = __wt_btcur_next(start, true)) != 0)
			break;
		start->compare = 0;		/* Exact match */
	}

	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/* Walking off the end of the tree is not an error. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}

/*
 * __cursor_truncate_fix --
 *	Discard a cursor range from a fixed-width column-store tree.
 *
 * The start cursor is searched for, then walked forward; each record whose
 * value is non-zero is removed with the supplied modify function. The walk
 * ends after the record the stop cursor references (or at the end of the
 * tree if stop is NULL). Returns 0 on success, including running off the end
 * of the tree, or an error code.
 */
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int))
{
	WT_DECL_RET;
	const uint8_t *value;

	/*
	 * Handle fixed-length column-store objects separately: for row-store
	 * and variable-length column-store objects we have "deleted" values
	 * and so returned objects actually exist: fixed-length column-store
	 * objects are filled-in if they don't exist, that is, if you create
	 * record 37, records 1-36 magically appear.  Those records can't be
	 * deleted, which means we have to ignore already "deleted" records.
	 *
	 * First, call the cursor search method to re-position the cursor: we
	 * may not have a cursor position (if the higher-level truncate code
	 * switched the cursors to have an "external" cursor key), and because
	 * we don't save a copy of the page's write generation information,
	 * which we need to remove records.
	 *
	 * Once that's done, we can delete records without a full search, unless
	 * we encounter a restart error because the page was modified by some
	 * other thread of control; in that case, repeat the full search to
	 * refresh the page's modification information.
	 */
retry:	WT_RET(__wt_btcur_search(start));
	WT_ASSERT(session,
	    F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);

	/*
	 * Clear any restart error left over from a previous pass. The loop
	 * doesn't assign the return value when it skips an already-zero
	 * record, so without this a stale restart could survive to the stop
	 * test below and send us around the retry loop forever.
	 */
	ret = 0;

	for (;;) {
		/* Only records with a non-zero value need to be removed. */
		value = (const uint8_t *)start->iface.value.data;
		if (*value != 0 &&
		    (ret = rmfunc(session, start, WT_UPDATE_DELETED)) != 0)
			break;

		/* Done once the stop cursor's record has been handled. */
		if (stop != NULL && __cursor_equals(start, stop))
			break;
		if ((ret = __wt_btcur_next(start, true)) != 0)
			break;
		start->compare = 0;	/* Exact match */
	}

	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/* Running off the end of the tree is not an error. */
	WT_RET_NOTFOUND_OK(ret);
	return (0);
}

/*
 * __wt_btcur_range_truncate --
 *	Remove the records between two cursors from the tree.
 */
int
__wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)start->iface.session;
	btree = start->btree;
	WT_STAT_DATA_INCR(session, cursor_truncate);

	/*
	 * Recovery wants the start and stop keys of the truncate logged, not
	 * each removed record; rollback, however, needs every in-memory
	 * operation tracked.
	 *
	 * Both are satisfied by logging the truncate range first and then
	 * (in the logging code) disabling the writing of the in-memory remove
	 * records to disk.
	 */
	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
		WT_RET(__wt_txn_truncate_log(session, start, stop));

	/*
	 * Whatever the outcome of the truncate, control continues to the
	 * end-of-truncate call below.
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
		ret = __cursor_truncate_fix(
		    session, start, stop, __cursor_col_modify);
		break;
	case BTREE_COL_VAR:
		ret = __cursor_truncate(
		    session, start, stop, __cursor_col_modify);
		break;
	case BTREE_ROW:
		/*
		 * Row-store truncation needs fully instantiated cursors: the
		 * underlying cursor comparison looks at page and/or skiplist
		 * positions, not keys. (Comparing keys would work but would
		 * be relatively expensive, especially with custom collators;
		 * column-store keys are record numbers, so comparing them is
		 * cheap.) The session truncate code searched with both
		 * cursors while setting up the truncate, so they're ready; if
		 * that ever changes, something would have to be done here to
		 * ensure a fully instantiated cursor.
		 */
		ret = __cursor_truncate(
		    session, start, stop, __cursor_row_modify);
		break;
	}

	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
		__wt_txn_truncate_end(session);
	return (ret);
}

/*
 * __wt_btcur_init --
 *	Set up a btree cursor for internal use: clear the whole structure,
 * then attach it to the session and the session's current btree.
 */
void
__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	memset(cbt, 0, sizeof(*cbt));

	cbt->btree = S2BT(session);
	cbt->iface.session = &session->iface;
}

/*
 * __wt_btcur_open --
 *	Open a btree cursor: point the cursor's buffer references at the
 * buffers embedded in the cursor structure.
 */
void
__wt_btcur_open(WT_CURSOR_BTREE *cbt)
{
	cbt->tmp = &cbt->_tmp;
	cbt->row_key = &cbt->_row_key;

#ifdef HAVE_DIAGNOSTIC
	/* Diagnostic builds carry extra last-key/last-recno fields. */
	cbt->lastrecno = WT_RECNO_OOB;
	cbt->lastkey = &cbt->_lastkey;
#endif
}

/*
 * __wt_btcur_close --
 *	Close a btree cursor, releasing the buffers it owns. Returns the
 * result of resetting the cursor, or 0 for low-level cursors.
 */
int
__wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Low-level btree cursors are created by the in-memory split and
	 * lookaside table code to search/modify leaf pages. They hold no
	 * hazard pointers and aren't included in the session handle's cursor
	 * count, so the usual cursor tear-down is skipped for them.
	 */
	ret = lowlevel ? 0 : __cursor_reset(cbt);

	__wt_buf_free(session, &cbt->_row_key);
	__wt_buf_free(session, &cbt->_tmp);
#ifdef HAVE_DIAGNOSTIC
	__wt_buf_free(session, &cbt->_lastkey);
#endif

	return (ret);
}