/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * Initialize a static WT_CURSOR structure.
 */
#define	WT_CURSOR_STATIC_INIT(n,					\
	get_key,							\
	get_value,							\
	set_key,							\
	set_value,							\
	compare,							\
	equals,								\
	next,								\
	prev,								\
	reset,								\
	search,								\
	search_near,							\
	insert,								\
	update,								\
	remove,								\
	reconfigure,							\
	close)								\
	static const WT_CURSOR n = {					\
	NULL,				/* session */			\
	NULL,				/* uri */			\
	NULL,				/* key_format */		\
	NULL,				/* value_format */		\
	get_key,							\
	get_value,							\
	set_key,							\
	set_value,							\
	compare,							\
	equals,								\
	next,								\
	prev,								\
	reset,								\
	search,								\
	search_near,							\
	insert,								\
	update,								\
	remove,								\
	close,								\
	reconfigure,							\
	{ NULL, NULL },			/* TAILQ_ENTRY q */		\
	0,				/* recno key */			\
	{ 0 },				/* recno raw buffer */		\
	NULL,				/* json_private */		\
	NULL,				/* lang_private */		\
	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM key */		\
	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM value */		\
	0,				/* int saved_err */		\
	NULL,				/* internal_uri */		\
	0				/* uint32_t flags */		\
}

struct __wt_cursor_backup {
	WT_CURSOR iface;

	size_t next;			/* Cursor position */
	WT_FSTREAM *bfs;		/* Backup file stream */
	uint32_t maxid;			/* Maximum log file ID seen */

	char **list;			/* List of files to be copied. */
	size_t list_allocated;
	size_t list_next;

#define	WT_CURBACKUP_LOCKER	0x01	/* Hot-backup started */
	uint8_t	flags;
};
#define	WT_CURSOR_BACKUP_ID(cursor)	(((WT_CURSOR_BACKUP *)(cursor))->maxid)

struct __wt_cursor_btree {
	WT_CURSOR iface;

	WT_BTREE *btree;		/* Enclosing btree */

	/*
	 * The following fields are set by the search functions as a precursor
	 * to page modification: we have a page, a WT_COL/WT_ROW slot on the
	 * page, an insert head, insert list and a skiplist stack (the stack of
	 * skiplist entries leading to the insert point).  The search functions
	 * also return the relationship of the search key to the found key.
	 */
	WT_REF	  *ref;			/* Current page */
	uint32_t   slot;		/* WT_COL/WT_ROW 0-based slot */

	WT_INSERT_HEAD	*ins_head;	/* Insert chain head */
	WT_INSERT	*ins;		/* Current insert node */
					/* Search stack */
	WT_INSERT	**ins_stack[WT_SKIP_MAXDEPTH];

					/* Next item(s) found during search */
	WT_INSERT	*next_stack[WT_SKIP_MAXDEPTH];

	uint32_t page_deleted_count;	/* Deleted items on the page */

	uint64_t recno;			/* Record number */

	/*
	 * Next-random cursors can optionally be configured to step through a
	 * percentage of the total leaf pages to their next value. Note the
	 * configured value and the calculated number of leaf pages to skip.
	 */
	uint64_t next_random_leaf_skip;
	u_int	 next_random_sample_size;

	/*
	 * The search function sets compare to:
	 *	< 1 if the found key is less than the specified key
	 *	  0 if the found key matches the specified key
	 *	> 1 if the found key is larger than the specified key
	 */
	int	compare;

	/*
	 * A key returned from a binary search or cursor movement on a row-store
	 * page; if we find an exact match on a row-store leaf page in a search
	 * operation, keep a copy of key we built during the search to avoid
	 * doing the additional work of getting the key again for return to the
	 * application. Note, this only applies to exact matches when searching
	 * disk-image structures, so it's not, for example, a key from an insert
	 * list. Additionally, this structure is used to build keys when moving
	 * a cursor through a row-store leaf page.
	 */
	WT_ITEM *row_key, _row_key;

	/*
	 * It's relatively expensive to calculate the last record on a variable-
	 * length column-store page because of the repeat values.  Calculate it
	 * once per page and cache it.  This value doesn't include the skiplist
	 * of appended entries on the last page.
	 */
	uint64_t last_standard_recno;

	/*
	 * For row-store pages, we need a single item that tells us the part of
	 * the page we're walking (otherwise switching from next to prev and
	 * vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 */
	uint32_t row_iteration_slot;	/* Row-store iteration slot */

	/*
	 * Variable-length column-store values are run-length encoded and may
	 * be overflow values or Huffman encoded. To avoid repeatedly reading
	 * overflow values or decompressing encoded values, process it once and
	 * store the result in a temporary buffer.  The cip_saved field is used
	 * to determine if we've switched columns since our last cursor call.
	 */
	WT_COL *cip_saved;		/* Last iteration reference */

	/*
	 * We don't instantiate prefix-compressed keys on pages where there's no
	 * Huffman encoding because we don't want to waste memory if only moving
	 * a cursor through the page, and it's faster to build keys while moving
	 * through the page than to roll-forward from a previously instantiated
	 * key (we don't instantiate all of the keys, just the ones at binary
	 * search points).  We can't use the application's WT_CURSOR key field
	 * as a copy of the last-returned key because it may have been altered
	 * by the API layer, for example, dump cursors.  Instead we store the
	 * last-returned key in a temporary buffer.  The rip_saved field is used
	 * to determine if the key in the temporary buffer has the prefix needed
	 * for building the current key.
	 */
	WT_ROW *rip_saved;		/* Last-returned key reference */

	/*
	 * A temporary buffer for caching RLE values for column-store files (if
	 * RLE is non-zero, then we don't unpack the value every time we move
	 * to the next cursor position, we re-use the unpacked value we stored
	 * here the first time we hit the value).
	 *
	 * A temporary buffer for building on-page keys when searching row-store
	 * files.
	 */
	WT_ITEM *tmp, _tmp;

	/*
	 * The update structure allocated by the row- and column-store modify
	 * functions, used to avoid a data copy in the WT_CURSOR.update call.
	 */
	WT_UPDATE *modify_update;

	/*
	 * Fixed-length column-store items are a single byte, and it's simpler
	 * and cheaper to allocate the space for it now than keep checking to
	 * see if we need to grow the buffer.
	 */
	uint8_t v;			/* Fixed-length return value */

	uint8_t	append_tree;		/* Cursor appended to the tree */

#ifdef HAVE_DIAGNOSTIC
	/* Check that cursor next/prev never returns keys out-of-order. */
	WT_ITEM *lastkey, _lastkey;
	uint64_t lastrecno;
#endif

#define	WT_CBT_ACTIVE		0x01	/* Active in the tree */
#define	WT_CBT_ITERATE_APPEND	0x02	/* Col-store: iterating append list */
#define	WT_CBT_ITERATE_NEXT	0x04	/* Next iteration configuration */
#define	WT_CBT_ITERATE_PREV	0x08	/* Prev iteration configuration */
#define	WT_CBT_NO_TXN   	0x10	/* Non-transactional cursor
					   (e.g. on a checkpoint) */
#define	WT_CBT_SEARCH_SMALLEST	0x20	/* Row-store: small-key insert list */
#define	WT_CBT_VAR_ONPAGE_MATCH	0x40	/* Var-store: on-page recno match */

#define	WT_CBT_POSITION_MASK		/* Flags associated with position */ \
	(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
	WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH)

	uint8_t flags;
};

struct __wt_cursor_bulk {
	WT_CURSOR_BTREE cbt;

	/*
	 * Variable-length column store compares values during bulk load as
	 * part of RLE compression, row-store compares keys during bulk load
	 * to avoid corruption.
	 */
	bool	first_insert;		/* First insert */
	WT_ITEM	last;			/* Last key/value inserted */

	/*
	 * Additional column-store bulk load support.
	 */
	uint64_t recno;			/* Record number */
	uint64_t rle;			/* Variable-length RLE counter */

	/*
	 * Additional fixed-length column store bitmap bulk load support:
	 * current entry in memory chunk count, and the maximum number of
	 * records per chunk.
	 */
	bool	 bitmap;		/* Bitmap bulk load */
	uint32_t entry;			/* Entry count */
	uint32_t nrecs;			/* Max records per chunk */

	void	*reconcile;		/* Reconciliation support */
	WT_REF	*ref;			/* The leaf page */
	WT_PAGE *leaf;
};

struct __wt_cursor_config {
	WT_CURSOR iface;
};

struct __wt_cursor_data_source {
	WT_CURSOR iface;

	WT_COLLATOR *collator;		/* Configured collator */
	int collator_owned;		/* Collator needs to be terminated */

	WT_CURSOR *source;		/* Application-owned cursor */
};

struct __wt_cursor_dump {
	WT_CURSOR iface;

	WT_CURSOR *child;
};

struct __wt_cursor_index {
	WT_CURSOR iface;

	WT_TABLE *table;
	WT_INDEX *index;
	const char *key_plan, *value_plan;

	WT_CURSOR *child;
	WT_CURSOR **cg_cursors;
	uint8_t	*cg_needvalue;
};

/*
 * A join iterator structure is used to generate candidate primary keys. It
 * is the responsibility of the caller of the iterator to filter these
 * primary key against the other conditions of the join before returning
 * them the caller of WT_CURSOR::next.
 *
 * For a conjunction join (the default), entry_count will be 1, meaning that
 * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY).  That
 * is, it successively returns primary keys from a cursor for the first
 * index that was joined.  When the values returned by that cursor are
 * exhausted, the iterator has completed.  For a disjunction join,
 * exhausting a cursor just means that the iterator advances to the next
 * entry. If the next entry represents an index, a new cursor is opened and
 * primary keys from that index are then successively returned.
 *
 * When positioned on an entry that represents a nested join, a new child
 * iterator is created that will be bound to the nested WT_CURSOR_JOIN.
 * That iterator is then used to generate candidate primary keys.  When its
 * iteration is completed, that iterator is destroyed and the parent
 * iterator advances to the next entry.  Thus, depending on how deeply joins
 * are nested, a similarly deep stack of iterators is created.
 */
struct __wt_cursor_join_iter {
	WT_SESSION_IMPL		*session;
	WT_CURSOR_JOIN		*cjoin;
	WT_CURSOR_JOIN_ENTRY	*entry;
	WT_CURSOR_JOIN_ITER	*child;
	WT_CURSOR		*cursor;	/* has null projection */
	WT_ITEM			*curkey;	/* primary key */
	WT_ITEM			 idxkey;
	u_int			 entry_pos;	/* the current entry */
	u_int			 entry_count;	/* entries to walk */
	u_int			 end_pos;	/* the current endpoint */
	u_int			 end_count;	/* endpoints to walk */
	u_int			 end_skip;	/* when testing for inclusion */
						/* can we skip current end? */
	bool			 positioned;
	bool			 is_equal;
};

/*
 * A join endpoint represents a positioned cursor that is 'captured' by a
 * WT_SESSION::join call.
 */
struct __wt_cursor_join_endpoint {
	WT_ITEM			 key;
	uint8_t			 recno_buf[10];	/* holds packed recno */
	WT_CURSOR		*cursor;

#define	WT_CURJOIN_END_LT	0x01		/* include values <  cursor */
#define	WT_CURJOIN_END_EQ	0x02		/* include values == cursor */
#define	WT_CURJOIN_END_GT	0x04		/* include values >  cursor */
#define	WT_CURJOIN_END_GE	(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_LE	(WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_OWN_CURSOR 0x08		/* must close cursor */
	uint8_t			 flags;		/* range for this endpoint */
};
#define	WT_CURJOIN_END_RANGE(endp)					\
	((endp)->flags &						\
	    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT))

/*
 * Each join entry typically represents an index's participation in a join.
 * For example, if 'k' is an index, then "t.k > 10 && t.k < 20" would be
 * represented by a single entry, with two endpoints.  When the index and
 * subjoin fields are NULL, the join is on the main table.  When subjoin is
 * non-NULL, there is a nested join clause.
 */
struct __wt_cursor_join_entry {
	WT_INDEX		*index;
	WT_CURSOR		*main;		/* raw main table cursor */
	WT_CURSOR_JOIN		*subjoin;	/* a nested join clause */
	WT_BLOOM		*bloom;		/* Bloom filter handle */
	char			*repack_format; /* target format for repack */
	uint32_t		 bloom_bit_count; /* bits per item in bloom */
	uint32_t		 bloom_hash_count; /* hash functions in bloom */
	uint64_t		 count;		/* approx number of matches */

#define	WT_CURJOIN_ENTRY_BLOOM		 0x01	/* use a bloom filter */
#define	WT_CURJOIN_ENTRY_DISJUNCTION	 0x02	/* endpoints are or-ed */
#define	WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x04	/* after bloom filter do not
						 * filter false positives */
#define	WT_CURJOIN_ENTRY_OWN_BLOOM	 0x08	/* this entry owns the bloom */
	uint8_t			 flags;

	WT_CURSOR_JOIN_ENDPOINT	*ends;		/* reference endpoints */
	size_t			 ends_allocated;
	u_int			 ends_next;

	WT_JOIN_STATS		 stats;		/* Join statistics */
};

struct __wt_cursor_join {
	WT_CURSOR iface;

	WT_TABLE		*table;
	const char		*projection;
	WT_CURSOR		*main;		/* main table with projection */
	WT_CURSOR_JOIN		*parent;	/* parent of nested group */
	WT_CURSOR_JOIN_ITER	*iter;		/* chain of iterators */
	WT_CURSOR_JOIN_ENTRY	*entries;
	size_t			 entries_allocated;
	u_int			 entries_next;
	uint8_t			 recno_buf[10];	/* holds packed recno */

#define	WT_CURJOIN_DISJUNCTION		0x01	/* Entries are or-ed */
#define	WT_CURJOIN_ERROR		0x02	/* Error in initialization */
#define	WT_CURJOIN_INITIALIZED		0x04	/* Successful initialization */
	uint8_t			 flags;
};

struct __wt_cursor_json {
	char	*key_buf;		/* JSON formatted string */
	char	*value_buf;		/* JSON formatted string */
	WT_CONFIG_ITEM key_names;	/* Names of key columns */
	WT_CONFIG_ITEM value_names;	/* Names of value columns */
};

struct __wt_cursor_log {
	WT_CURSOR iface;

	WT_LSN		*cur_lsn;	/* LSN of current record */
	WT_LSN		*next_lsn;	/* LSN of next record */
	WT_ITEM		*logrec;	/* Copy of record for cursor */
	WT_ITEM		*opkey, *opvalue;	/* Op key/value copy */
	const uint8_t	*stepp, *stepp_end;	/* Pointer within record */
	uint8_t		*packed_key;	/* Packed key for 'raw' interface */
	uint8_t		*packed_value;	/* Packed value for 'raw' interface */
	uint32_t	step_count;	/* Intra-record count */
	uint32_t	rectype;	/* Record type */
	uint64_t	txnid;		/* Record txnid */

#define	WT_CURLOG_ARCHIVE_LOCK	0x01	/* Archive lock held */
	uint8_t		flags;
};

struct __wt_cursor_metadata {
	WT_CURSOR iface;

	WT_CURSOR *file_cursor;		/* Queries of regular metadata */
	WT_CURSOR *create_cursor;	/* Extra cursor for create option */

#define	WT_MDC_CREATEONLY	0x01
#define	WT_MDC_ONMETADATA	0x02
#define	WT_MDC_POSITIONED	0x04
	uint8_t	flags;
};

struct __wt_join_stats_group {
	const char *desc_prefix;	/* Prefix appears before description */
	WT_CURSOR_JOIN *join_cursor;
	ssize_t join_cursor_entry;	/* Position in entries */
	WT_JOIN_STATS join_stats;
};

struct __wt_cursor_stat {
	WT_CURSOR iface;

	bool	notinitialized;		/* Cursor not initialized */
	bool	notpositioned;		/* Cursor not positioned */

	int64_t	     *stats;		/* Statistics */
	int	      stats_base;	/* Base statistics value */
	int	      stats_count;	/* Count of statistics values */
	int	     (*stats_desc)(WT_CURSOR_STAT *, int, const char **);
					/* Statistics descriptions */
	int	     (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool,
			 bool);		/* Advance to next set */

	union {				/* Copies of the statistics */
		WT_DSRC_STATS dsrc_stats;
		WT_CONNECTION_STATS conn_stats;
		WT_JOIN_STATS_GROUP join_stats_group;
	} u;

	const char **cfg;		/* Original cursor configuration */
	char	*desc_buf;		/* Saved description string */

	int	 key;			/* Current stats key */
	uint64_t v;			/* Current stats value */
	WT_ITEM	 pv;			/* Current stats value (string) */

	/* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */
	uint32_t flags;
};

/*
 * WT_CURSOR_STATS --
 *	Return a reference to a statistic cursor's stats structures.
 */
#define	WT_CURSOR_STATS(cursor)						\
	(((WT_CURSOR_STAT *)(cursor))->stats)

struct __wt_cursor_table {
	WT_CURSOR iface;

	WT_TABLE *table;
	const char *plan;

	const char **cfg;		/* Saved configuration string */

	WT_CURSOR **cg_cursors;
	WT_ITEM *cg_valcopy;		/*
					 * Copies of column group values, for
					 * overlapping set_value calls.
					 */
	WT_CURSOR **idx_cursors;
};

#define	WT_CURSOR_PRIMARY(cursor)					\
	(((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0])

#define	WT_CURSOR_RECNO(cursor)	WT_STREQ((cursor)->key_format, "r")

/*
 * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
 *	Check if we have a key/value set.  There's an additional semantic
 * implemented here: if we're pointing into the tree, and about to perform
 * a cursor operation, get a local copy of whatever we're referencing in
 * the tree, there's an obvious race with the cursor moving and the key or
 * value reference, and it's better to solve it here than in the underlying
 * data-source layers.
 *
 * WT_CURSOR_CHECKKEY --
 *	Check if a key is set without making a copy.
 *
 * WT_CURSOR_NOVALUE --
 *	Release any cached value before an operation that could update the
 * transaction context and free data a value is pointing to.
 */
#define	WT_CURSOR_CHECKKEY(cursor) do {					\
	if (!F_ISSET(cursor, WT_CURSTD_KEY_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, true));		\
} while (0)
#define	WT_CURSOR_CHECKVALUE(cursor) do {				\
	if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, false));		\
} while (0)
#define	WT_CURSOR_NEEDKEY(cursor) do {					\
	if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->key))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->key,				\
			    (cursor)->key.data, (cursor)->key.size));	\
		F_CLR(cursor, WT_CURSTD_KEY_INT);			\
		F_SET(cursor, WT_CURSTD_KEY_EXT);			\
	}								\
	WT_CURSOR_CHECKKEY(cursor);					\
} while (0)
#define	WT_CURSOR_NEEDVALUE(cursor) do {				\
	if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->value))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->value,				\
			    (cursor)->value.data, (cursor)->value.size));\
		F_CLR(cursor, WT_CURSTD_VALUE_INT);			\
		F_SET(cursor, WT_CURSTD_VALUE_EXT);			\
	}								\
	WT_CURSOR_CHECKVALUE(cursor);					\
} while (0)
#define	WT_CURSOR_NOVALUE(cursor) do {					\
	F_CLR(cursor, WT_CURSTD_VALUE_INT);				\
} while (0)

#define	WT_CURSOR_RAW_OK						\
	(WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)