/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);

/*
 * __wt_row_leaf_keys --
 *	Instantiate the interesting keys for random search of a page.
 */
int
__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_DECL_ITEM(key);
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_ROW *rip;
    uint32_t gap, i;

    btree = S2BT(session);

    if (page->pg_row_entries == 0 ||        /* Just checking... */
        F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY)) {
        F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
        return (0);
    }

    /*
     * Row-store leaf pages are written as one big prefix-compressed chunk,
     * that is, only the first key on the page is not prefix-compressed, and
     * to instantiate the last key on the page, you have to take the first
     * key on the page and roll it forward to the end of the page.  We don't
     * want to do that on every page access, of course, so we instantiate a
     * set of keys, essentially creating prefix chunks on the page, where we
     * can roll forward from the closest, previous, instantiated key.  The
     * complication is that not all keys on a page are equal: we're doing a
     * binary search on the page, which means there are keys we look at a
     * lot (every time we search the page), and keys we never look at unless
     * they are actually being searched for.  This function figures out the
     * "interesting" keys on a page, and then we sequentially walk that list
     * instantiating those keys.
     *
     * Allocate a bit array and figure out the set of "interesting" keys,
     * marking up the array.
     */
    WT_RET(__wt_scr_alloc(session, 0, &key));
    WT_RET(__wt_scr_alloc(session,
        (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));

    if ((gap = btree->key_gap) == 0)
        gap = 1;
    __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);

    /* Instantiate the keys. */
    for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
        if (__bit_test(tmp->mem, i))
            WT_ERR(__wt_row_leaf_key_work(
                session, page, rip, key, 1));

    F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);

err:    __wt_scr_free(&key);
    __wt_scr_free(&tmp);
    return (ret);
}

/*
 * __inmem_row_leaf_slots --
 *	Figure out the interesting slots of a page for random search, up to
 * the specified depth.
 */
static void
__inmem_row_leaf_slots(
    uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
{
    uint32_t indx, limit;

    if (entries < gap)
        return;

    /*
     * !!!
     * Don't clean this code up -- it deliberately looks like the binary
     * search code.
     *
     * !!!
     * There's got to be a function that would give me this information,
     * but I don't see any performance reason we can't just do this
     * recursively.
     */
    limit = entries;
    indx = base + (limit >> 1);
    __bit_set(list, indx);

    __inmem_row_leaf_slots(list, base, limit >> 1, gap);

    base = indx + 1;
    --limit;
    __inmem_row_leaf_slots(list, base, limit >> 1, gap);
}

/*
 * __wt_row_leaf_key_copy --
 *	Get a copy of a row-store leaf-page key.
 */
int
__wt_row_leaf_key_copy(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
{
    WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));

    /* The return buffer may only hold a reference to a key, copy it. */
    if (!WT_DATA_IN_ITEM(key))
        WT_RET(__wt_buf_set(session, key, key->data, key->size));

    return (0);
}
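
/*
 * A worked example of the prefix compression the next function undoes
 * (the keys are made up for illustration, they don't come from any real
 * page): if a page holds the keys "carrot", "cartoon" and "cartoons",
 * the on-page cells store the prefix-length/suffix pairs (0, "carrot"),
 * (3, "toon") and (7, "s").  To build "cartoons" we roll backward until
 * we find a key with a usable prefix, then roll forward, overwriting
 * prefix bytes as we go: start with "carrot", keep 3 bytes and append
 * "toon" to get "cartoon", then keep 7 bytes and append "s" to get
 * "cartoons".
 */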

/*
 * __wt_row_leaf_key_work --
 *	Return a reference to a row-store leaf-page key, optionally
 * instantiating the key into the in-memory page.
 */
int
__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate)
{
    enum { FORWARD, BACKWARD } direction;
    WT_BTREE *btree;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_IKEY *ikey;
    WT_ROW *rip, *jump_rip;
    size_t size;
    u_int last_prefix;
    int jump_slot_offset, slot_offset;
    void *key;
    const void *p;

    /*
     * !!!
     * It is unusual to call this function: most code should be calling the
     * front-end, __wt_row_leaf_key; be careful if you're calling this code
     * directly.
     */
    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY));

    btree = S2BT(session);
    unpack = &_unpack;

    rip = rip_arg;
    jump_rip = NULL;
    jump_slot_offset = 0;
    last_prefix = 0;

    direction = BACKWARD;
    for (slot_offset = 0;;) {
jump_slot:  key = WT_ROW_KEY_COPY(rip);

        /*
         * Key copied.
         *
         * If another thread instantiated the key while we were doing
         * that, we don't have any work to do.  Figure this out using
         * the key's value:
         *
         * If the key points off-page, another thread updated the key,
         * we can just use it.
         *
         * If the key points on-page, we have a copy of a WT_CELL value
         * that can be processed, regardless of what any other thread
         * is doing.
         *
         * Overflow keys are not prefix-compressed, we don't want to
         * read/write them during reconciliation simply because their
         * prefix might change.  That means we can't use instantiated
         * overflow keys to figure out the prefix for other keys,
         * specifically, in this code when we're looking for a key we
         * can roll-forward to figure out the target key's prefix,
         * instantiated overflow keys aren't useful.
         *
         * 1: the test for an on/off page reference.
         */
        if (__wt_off_page(page, key)) {
off_page:       ikey = key;

            /*
             * If this is the key we originally wanted, we don't
             * care if we're rolling forward or backward, or if
             * it's an overflow key or not, it's what we wanted.
             * Take a copy and wrap up.
             */
            if (slot_offset == 0) {
                keyb->data = WT_IKEY_DATA(ikey);
                keyb->size = ikey->size;

                /*
                 * The key is already instantiated, ignore the
                 * caller's suggestion.
                 */
                instantiate = 0;
                break;
            }

            /*
             * If we wanted a different key and this key is an
             * overflow key:
             *	If we're rolling backward, this key is useless
             * to us because it doesn't have a valid prefix: keep
             * rolling backward.
             *	If we're rolling forward, there's no work to be
             * done because prefixes skip overflow keys: keep
             * rolling forward.
             */
            if (__wt_cell_type(WT_PAGE_REF_OFFSET(
                page, ikey->cell_offset)) == WT_CELL_KEY_OVFL)
                goto next;

            /*
             * If we wanted a different key and this key is not an
             * overflow key, it has a valid prefix, we can use it.
             *	If rolling backward, take a copy of the key and
             * switch directions, we can roll forward from this key.
             *	If rolling forward, replace the key we've been
             * building with this key, it's what we would have built
             * anyway.
             * In short: if it's not an overflow key, take a copy
             * and roll forward.
             */
            keyb->data = WT_IKEY_DATA(ikey);
            keyb->size = ikey->size;
            direction = FORWARD;
            goto next;
        }
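
        /*
         * At this point the key is known to reference an on-page
         * WT_CELL: the WT_ROW key field does double duty, holding
         * either a pointer to an allocated WT_IKEY or a pointer into
         * the page's disk image, and __wt_off_page distinguishes the
         * two by address.
         */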

        /* Unpack the key's cell. */
        __wt_cell_unpack(key, unpack);

        /* 2: the test for an on-page reference to an overflow key. */
        if (unpack->type == WT_CELL_KEY_OVFL) {
            /*
             * If this is the key we wanted from the start, we don't
             * care if it's an overflow key, get a copy and wrap up.
             *
             * Avoid racing with reconciliation deleting overflow
             * keys.  Deleted overflow keys must be instantiated
             * first, acquire the overflow lock and check.  Read
             * the key if we still need to do so, but holding the
             * overflow lock.  Note we are not using the version of
             * the cell-data-ref calls that acquire the overflow
             * lock and do a look-aside into the tracking cache:
             * this is an overflow key, not a value, meaning it's
             * instantiated before being deleted, not copied into
             * the tracking cache.
             */
            if (slot_offset == 0) {
                WT_ERR(__wt_readlock(
                    session, btree->ovfl_lock));
                key = WT_ROW_KEY_COPY(rip);
                if (__wt_off_page(page, key)) {
                    WT_ERR(__wt_rwunlock(
                        session, btree->ovfl_lock));
                    goto off_page;
                }
                ret = __wt_dsk_cell_data_ref(
                    session, WT_PAGE_ROW_LEAF, unpack, keyb);
                WT_TRET(__wt_rwunlock(
                    session, btree->ovfl_lock));
                WT_ERR(ret);
                break;
            }

            /*
             * If we wanted a different key and this key is an
             * overflow key:
             *	If we're rolling backward, this key is useless
             * to us because it doesn't have a valid prefix: keep
             * rolling backward.
             *	If we're rolling forward, there's no work to be
             * done because prefixes skip overflow keys: keep
             * rolling forward.
             */
            goto next;
        }

        /*
         * 3: the test for an on-page reference to a key that isn't
         * prefix compressed.
         */
        if (unpack->prefix == 0) {
            /*
             * If this is the key we originally wanted, we don't
             * care if we're rolling forward or backward, it's
             * what we want.  Take a copy and wrap up.
             *
             * If we wanted a different key, this key has a valid
             * prefix, we can use it.
             *	If rolling backward, take a copy of the key and
             * switch directions, we can roll forward from this key.
             *	If rolling forward, there's a bug: we should have
             * found this key while rolling backward and switched
             * directions then.
             */
            if (btree->huffman_key == NULL) {
                keyb->data = unpack->data;
                keyb->size = unpack->size;
            } else
                WT_ERR(__wt_dsk_cell_data_ref(
                    session, WT_PAGE_ROW_LEAF, unpack, keyb));
            if (slot_offset == 0) {
                /*
                 * If we have an uncompressed, on-page key with
                 * no prefix, don't bother instantiating it,
                 * regardless of what our caller thought.  The
                 * memory cost is greater than the performance
                 * cost of finding the key each time we need it.
                 */
                if (btree->huffman_key == NULL)
                    instantiate = 0;
                break;
            }

            WT_ASSERT(session, direction == BACKWARD);
            direction = FORWARD;

            /*
             * Switching to the forward roll; skip over any list of
             * keys with compatible prefixes.
             */
            rip = jump_rip;
            slot_offset = jump_slot_offset;

            /*
             * I'm using an explicit branch instead of a continue
             * because it needs to be obvious that new code at the
             * top of this loop is problematic.
             */
            goto jump_slot;
        }

        /*
         * 4: an on-page reference to a key that's prefix compressed.
         *	If rolling backward, keep looking for something we can
         * use.
         *	If rolling forward, build the full key and keep rolling
         * forward.
         */
        if (direction == BACKWARD) {
            /*
             * If there's a set of keys with identical prefixes, we
             * don't want to instantiate each one, the prefixes are
             * all the same.
             *
             * As we roll backward through the page, track the last
             * time the prefix decreased in size, so we can start
             * with that key during our roll-forward.  For a page
             * populated with a single key prefix, we'll be able to
             * instantiate the key we want as soon as we find a key
             * without a prefix.
             */
            if (slot_offset == 0)
                last_prefix = unpack->prefix;
            if (slot_offset == 0 || last_prefix > unpack->prefix) {
                jump_rip = rip;
                jump_slot_offset = slot_offset;
                last_prefix = unpack->prefix;
            }
        }
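
        /*
         * A worked example with made-up prefix lengths: rolling
         * backward from the target, suppose the cells' prefixes are
         * 9, 9, 5, 9, 9 and then 0.  The jump point only moves when
         * the running minimum shrinks, so it ends up at the 5-byte
         * cell; once the zero-prefix key ends the backward roll, the
         * forward roll restarts at that cell, and the two 9-byte
         * cells between the zero-prefix key and the jump point are
         * never processed -- their first 5 bytes are the same as the
         * zero-prefix key's.
         */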

        if (direction == FORWARD) {
            /*
             * Get a reference to the current key's bytes.  Usually
             * we want bytes from the page, fast-path that case.
             */
            if (btree->huffman_key == NULL) {
                p = unpack->data;
                size = unpack->size;
            } else {
                if (tmp == NULL)
                    WT_ERR(
                        __wt_scr_alloc(session, 0, &tmp));
                WT_ERR(__wt_dsk_cell_data_ref(
                    session, WT_PAGE_ROW_LEAF, unpack, tmp));
                p = tmp->data;
                size = tmp->size;
            }

            /*
             * Grow the buffer as necessary as well as ensure data
             * has been copied into local buffer space, then append
             * the suffix to the prefix already in the buffer.
             *
             * Don't grow the buffer unnecessarily or copy data we
             * don't need, truncate the item's data length to the
             * prefix bytes.
             */
            keyb->size = unpack->prefix;
            WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
            memcpy((uint8_t *)keyb->data + keyb->size, p, size);
            keyb->size += size;

            if (slot_offset == 0)
                break;
        }

next:       switch (direction) {
        case BACKWARD:
            --rip;
            ++slot_offset;
            break;
        case FORWARD:
            ++rip;
            --slot_offset;
            break;
        }
    }

    /*
     * Optionally instantiate the key: there's a cost to figuring out a key
     * value in a leaf page with prefix-compressed or Huffman encoded keys,
     * amortize the cost by instantiating a copy of the calculated key in
     * allocated memory.  We don't instantiate keys when pages are first
     * brought into memory because it's wasted effort if the page is only
     * read by a cursor in sorted order.  If, instead, the page is read by
     * a cursor in reverse order, we immediately instantiate periodic keys
     * for the page (otherwise the reverse walk would be insanely slow).
     * If, instead, the page is randomly searched, we instantiate keys as
     * they are accessed (meaning, for example, as long as the binary
     * search only touches one-half of the page, the only keys we
     * instantiate will be in that half of the page).
     */
    if (instantiate) {
        key = WT_ROW_KEY_COPY(rip_arg);
        if (!__wt_off_page(page, key)) {
            WT_ERR(__wt_row_ikey(session,
                WT_PAGE_DISK_OFFSET(page, key),
                keyb->data, keyb->size, &ikey));

            /*
             * Serialize the swap of the key into place: on success,
             * update the page's memory footprint, on failure, free
             * the allocated memory.
             */
            if (WT_ATOMIC_CAS(WT_ROW_KEY_COPY(rip), key, ikey))
                __wt_cache_page_inmem_incr(session,
                    page, sizeof(WT_IKEY) + ikey->size);
            else
                __wt_free(session, ikey);
        }
    }

err:    __wt_scr_free(&tmp);
    return (ret);
}

/*
 * __wt_row_ikey_incr --
 *	Instantiate a key in a WT_IKEY structure and increment the page's
 * memory footprint.
 */
int
__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
{
    WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));

    __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);

    return (0);
}

/*
 * __wt_row_ikey --
 *	Instantiate a key in a WT_IKEY structure.
 */
int
__wt_row_ikey(WT_SESSION_IMPL *session,
    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
{
    WT_IKEY *ikey;

    /*
     * Allocate memory for the WT_IKEY structure and the key, then copy
     * the key into place.
     */
    WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
    ikey->size = WT_STORE_SIZE(size);
    ikey->cell_offset = cell_offset;
    memcpy(WT_IKEY_DATA(ikey), key, size);

    *(WT_IKEY **)ikeyp = ikey;
    return (0);
}
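
#ifdef ROW_KEY_SLOTS_EXAMPLE
/*
 * A minimal, self-contained sketch of the slot-marking recursion in
 * __inmem_row_leaf_slots above, guarded by a macro of our own so it is
 * never built as part of the library.  It substitutes a byte-per-slot
 * array for the __bit_set bitstring and prints the "interesting" slots
 * for a hypothetical 100-entry page with a key_gap of 10; the marked
 * slots are exactly the entries a binary search of the page visits
 * first, and slot 0 is never marked because the first key on the page
 * is stored without a prefix and is always cheap to build.
 */
#include <stdint.h>
#include <stdio.h>

static void
example_slots(unsigned char *list, uint32_t base, uint32_t entries,
    uint32_t gap)
{
    uint32_t indx, limit;

    if (entries < gap)
        return;

    /* Mark the midpoint a binary search of this range touches first. */
    limit = entries;
    indx = base + (limit >> 1);
    list[indx] = 1;

    /* Recurse into the two halves, exactly as the code above does. */
    example_slots(list, base, limit >> 1, gap);
    base = indx + 1;
    --limit;
    example_slots(list, base, limit >> 1, gap);
}

int
main(void)
{
    unsigned char list[100] = { 0 };
    uint32_t i;

    example_slots(list, 0, 100, 10);
    for (i = 0; i < 100; ++i)
        if (list[i])
            printf("%u ", (unsigned)i);
    printf("\n");
    return (0);
}
#endif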