summaryrefslogtreecommitdiff
path: root/src/include/cursor.h
blob: 275e2f2db46027e5e87b5f77ab036de8ab0ec23c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/*-
 * Copyright (c) 2014-2015 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * Initialize a static WT_CURSOR structure.
 */
#define	WT_CURSOR_STATIC_INIT(n,					\
	get_key,							\
	get_value,							\
	set_key,							\
	set_value,							\
	compare,							\
	equals,								\
	next,								\
	prev,								\
	reset,								\
	search,								\
	search_near,							\
	insert,								\
	update,								\
	remove,								\
	reconfigure,							\
	close)								\
	static const WT_CURSOR n = {					\
	NULL,				/* session */			\
	NULL,				/* uri */			\
	NULL,				/* key_format */		\
	NULL,				/* value_format */		\
	(int (*)(WT_CURSOR *, ...))(get_key),				\
	(int (*)(WT_CURSOR *, ...))(get_value),				\
	(void (*)(WT_CURSOR *, ...))(set_key),				\
	(void (*)(WT_CURSOR *, ...))(set_value),			\
	(int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare),		\
	(int (*)(WT_CURSOR *, WT_CURSOR *, int *))(equals),		\
	next,								\
	prev,								\
	reset,								\
	search,								\
	(int (*)(WT_CURSOR *, int *))(search_near),			\
	insert,								\
	update,								\
	remove,								\
	close,								\
	(int (*)(WT_CURSOR *, const char *))(reconfigure),		\
	{ NULL, NULL },			/* TAILQ_ENTRY q */		\
	0,				/* recno key */			\
	{ 0 },				/* recno raw buffer */		\
	NULL,				/* json_private */		\
	NULL,				/* lang_private */		\
	{ NULL, 0, 0, NULL, 0 },	/* WT_ITEM key */		\
	{ NULL, 0, 0, NULL, 0 },	/* WT_ITEM value */		\
	0,				/* int saved_err */		\
	NULL,				/* internal_uri */		\
	0				/* uint32_t flags */		\
}

struct __wt_cursor_backup_entry {
	char *name;			/* File name */
	WT_DATA_HANDLE *handle;		/* Handle */
};
struct __wt_cursor_backup {
	WT_CURSOR iface;

	size_t next;			/* Cursor position */
	FILE *bfp;			/* Backup file */
	uint32_t maxid;			/* Maximum log file ID seen */

	WT_CURSOR_BACKUP_ENTRY *list;	/* List of files to be copied. */
	size_t list_allocated;
	size_t list_next;
};
#define	WT_CURSOR_BACKUP_ID(cursor)	(((WT_CURSOR_BACKUP *)cursor)->maxid)

struct __wt_cursor_btree {
	WT_CURSOR iface;

	WT_BTREE *btree;		/* Enclosing btree */

	/*
	 * The following fields are set by the search functions as a precursor
	 * to page modification: we have a page, a WT_COL/WT_ROW slot on the
	 * page, an insert head, insert list and a skiplist stack (the stack of
	 * skiplist entries leading to the insert point).  The search functions
	 * also return the relationship of the search key to the found key.
	 */
	WT_REF	  *ref;			/* Current page */
	uint32_t   slot;		/* WT_COL/WT_ROW 0-based slot */

	WT_INSERT_HEAD	*ins_head;	/* Insert chain head */
	WT_INSERT	*ins;		/* Current insert node */
					/* Search stack */
	WT_INSERT	**ins_stack[WT_SKIP_MAXDEPTH];

					/* Next item(s) found during search */
	WT_INSERT	*next_stack[WT_SKIP_MAXDEPTH];

	uint32_t page_deleted_count;	/* Deleted items on the page */

	uint64_t recno;			/* Record number */

	/*
	 * Next-random cursors can optionally be configured to step through a
	 * percentage of the total leaf pages to their next value. Note the
	 * configured value and the calculated number of leaf pages to skip.
	 */
	uint64_t next_random_leaf_skip;
	u_int	 next_random_sample_size;

	/*
	 * The search function sets compare to:
	 *	< 1 if the found key is less than the specified key
	 *	  0 if the found key matches the specified key
	 *	> 1 if the found key is larger than the specified key
	 */
	int	compare;

	/*
	 * A key returned from a binary search or cursor movement on a row-store
	 * page; if we find an exact match on a row-store leaf page in a search
	 * operation, keep a copy of key we built during the search to avoid
	 * doing the additional work of getting the key again for return to the
	 * application. Note, this only applies to exact matches when searching
	 * disk-image structures, so it's not, for example, a key from an insert
	 * list. Additionally, this structure is used to build keys when moving
	 * a cursor through a row-store leaf page.
	 */
	WT_ITEM *row_key, _row_key;

	/*
	 * It's relatively expensive to calculate the last record on a variable-
	 * length column-store page because of the repeat values.  Calculate it
	 * once per page and cache it.  This value doesn't include the skiplist
	 * of appended entries on the last page.
	 */
	uint64_t last_standard_recno;

	/*
	 * For row-store pages, we need a single item that tells us the part of
	 * the page we're walking (otherwise switching from next to prev and
	 * vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 */
	uint32_t row_iteration_slot;	/* Row-store iteration slot */

	/*
	 * Variable-length column-store values are run-length encoded and may
	 * be overflow values or Huffman encoded. To avoid repeatedly reading
	 * overflow values or decompressing encoded values, process it once and
	 * store the result in a temporary buffer.  The cip_saved field is used
	 * to determine if we've switched columns since our last cursor call.
	 */
	WT_COL *cip_saved;		/* Last iteration reference */

	/*
	 * We don't instantiate prefix-compressed keys on pages where there's no
	 * Huffman encoding because we don't want to waste memory if only moving
	 * a cursor through the page, and it's faster to build keys while moving
	 * through the page than to roll-forward from a previously instantiated
	 * key (we don't instantiate all of the keys, just the ones at binary
	 * search points).  We can't use the application's WT_CURSOR key field
	 * as a copy of the last-returned key because it may have been altered
	 * by the API layer, for example, dump cursors.  Instead we store the
	 * last-returned key in a temporary buffer.  The rip_saved field is used
	 * to determine if the key in the temporary buffer has the prefix needed
	 * for building the current key.
	 */
	WT_ROW *rip_saved;		/* Last-returned key reference */

	/*
	 * A temporary buffer for caching RLE values for column-store files (if
	 * RLE is non-zero, then we don't unpack the value every time we move
	 * to the next cursor position, we re-use the unpacked value we stored
	 * here the first time we hit the value).
	 *
	 * A temporary buffer for building on-page keys when searching row-store
	 * files.
	 */
	WT_ITEM *tmp, _tmp;

	/*
	 * The update structure allocated by the row- and column-store modify
	 * functions, used to avoid a data copy in the WT_CURSOR.update call.
	 */
	WT_UPDATE *modify_update;

	/*
	 * Fixed-length column-store items are a single byte, and it's simpler
	 * and cheaper to allocate the space for it now than keep checking to
	 * see if we need to grow the buffer.
	 */
	uint8_t v;			/* Fixed-length return value */

	uint8_t	append_tree;		/* Cursor appended to the tree */

#define	WT_CBT_ACTIVE		0x01	/* Active in the tree */
#define	WT_CBT_ITERATE_APPEND	0x02	/* Col-store: iterating append list */
#define	WT_CBT_ITERATE_NEXT	0x04	/* Next iteration configuration */
#define	WT_CBT_ITERATE_PREV	0x08	/* Prev iteration configuration */
#define	WT_CBT_MAX_RECORD	0x10	/* Col-store: past end-of-table */
#define	WT_CBT_NO_TXN   	0x20	/* Non-transactional cursor
					   (e.g. on a checkpoint) */
#define	WT_CBT_SEARCH_SMALLEST	0x40	/* Row-store: small-key insert list */

#define	WT_CBT_POSITION_MASK		/* Flags associated with position */ \
	(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
	WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)

	uint8_t flags;
};

struct __wt_cursor_bulk {
	WT_CURSOR_BTREE cbt;

	WT_REF	*ref;			/* The leaf page */
	WT_PAGE *leaf;

	/*
	 * Variable-length column store compares values during bulk load as
	 * part of RLE compression, row-store compares keys during bulk load
	 * to avoid corruption.
	 */
	WT_ITEM last;			/* Last key/value seen */

	/*
	 * Variable-length column-store RLE counter (also overloaded to mean
	 * the first time through the bulk-load insert routine, when set to 0).
	 */
	uint64_t rle;

	/*
	 * Fixed-length column-store current entry in memory chunk count, and
	 * the maximum number of records per chunk.
	 */
	uint32_t entry;			/* Entry count */
	uint32_t nrecs;			/* Max records per chunk */

	/* Special bitmap bulk load for fixed-length column stores. */
	bool	bitmap;

	void	*reconcile;		/* Reconciliation information */
};

struct __wt_cursor_config {
	WT_CURSOR iface;
};

struct __wt_cursor_data_source {
	WT_CURSOR iface;

	WT_COLLATOR *collator;		/* Configured collator */
	int collator_owned;		/* Collator needs to be terminated */

	WT_CURSOR *source;		/* Application-owned cursor */
};

struct __wt_cursor_dump {
	WT_CURSOR iface;

	WT_CURSOR *child;
};

struct __wt_cursor_index {
	WT_CURSOR iface;

	WT_TABLE *table;
	WT_INDEX *index;
	const char *key_plan, *value_plan;

	WT_CURSOR *child;
	WT_CURSOR **cg_cursors;
	uint8_t	*cg_needvalue;
};

struct __wt_cursor_join_iter {
	WT_SESSION_IMPL		*session;
	WT_CURSOR_JOIN		*cjoin;
	WT_CURSOR_JOIN_ENTRY	*entry;
	WT_CURSOR		*cursor;
	WT_ITEM			*curkey;
	bool			 advance;
};

struct __wt_cursor_join_endpoint {
	WT_ITEM			 key;
	uint8_t			 recno_buf[10];	/* holds packed recno */
	WT_CURSOR		*cursor;

#define	WT_CURJOIN_END_LT	0x01		/* include values <  cursor */
#define	WT_CURJOIN_END_EQ	0x02		/* include values == cursor */
#define	WT_CURJOIN_END_GT	0x04		/* include values >  cursor */
#define	WT_CURJOIN_END_GE	(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_LE	(WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_OWN_KEY	0x08		/* must free key's data */
	uint8_t			 flags;		/* range for this endpoint */
};

struct __wt_cursor_join_entry {
	WT_INDEX		*index;
	WT_CURSOR		*main;		/* raw main table cursor */
	WT_BLOOM		*bloom;		/* Bloom filter handle */
	uint32_t		 bloom_bit_count; /* bits per item in bloom */
	uint32_t		 bloom_hash_count; /* hash functions in bloom */
	uint64_t		 count;		/* approx number of matches */

#define	WT_CURJOIN_ENTRY_BLOOM		0x01	/* use a bloom filter */
#define	WT_CURJOIN_ENTRY_DISJUNCTION	0x02	/* endpoints are or-ed */
#define	WT_CURJOIN_ENTRY_OWN_BLOOM	0x04	/* this entry owns the bloom */
	uint8_t			 flags;

	WT_CURSOR_JOIN_ENDPOINT	*ends;		/* reference endpoints */
	size_t			 ends_allocated;
	u_int			 ends_next;

	WT_JOIN_STATS		 stats;		/* Join statistics */
};

struct __wt_cursor_join {
	WT_CURSOR iface;

	WT_TABLE		*table;
	const char		*projection;
	WT_CURSOR_JOIN_ITER	*iter;
	WT_CURSOR_JOIN_ENTRY	*entries;
	size_t			 entries_allocated;
	u_int			 entries_next;
	uint8_t			 recno_buf[10];	/* holds packed recno */

#define	WT_CURJOIN_ERROR		0x01	/* Error in initialization */
#define	WT_CURJOIN_INITIALIZED		0x02	/* Successful initialization */
#define	WT_CURJOIN_SKIP_FIRST_LEFT	0x04	/* First check not needed */
	uint8_t			 flags;
};

struct __wt_cursor_json {
	char	*key_buf;		/* JSON formatted string */
	char	*value_buf;		/* JSON formatted string */
	WT_CONFIG_ITEM key_names;	/* Names of key columns */
	WT_CONFIG_ITEM value_names;	/* Names of value columns */
};

struct __wt_cursor_log {
	WT_CURSOR iface;

	WT_LSN		*cur_lsn;	/* LSN of current record */
	WT_LSN		*next_lsn;	/* LSN of next record */
	WT_ITEM		*logrec;	/* Copy of record for cursor */
	WT_ITEM		*opkey, *opvalue;	/* Op key/value copy */
	const uint8_t	*stepp, *stepp_end;	/* Pointer within record */
	uint8_t		*packed_key;	/* Packed key for 'raw' interface */
	uint8_t		*packed_value;	/* Packed value for 'raw' interface */
	uint32_t	step_count;	/* Intra-record count */
	uint32_t	rectype;	/* Record type */
	uint64_t	txnid;		/* Record txnid */
	uint32_t	flags;
};

struct __wt_cursor_metadata {
	WT_CURSOR iface;

	WT_CURSOR *file_cursor;		/* Queries of regular metadata */

#define	WT_MDC_CREATEONLY	0x01
#define	WT_MDC_ONMETADATA	0x02
#define	WT_MDC_POSITIONED	0x04
	uint32_t flags;
};

struct __wt_join_stats_group {
	const char *desc_prefix;	/* Prefix appears before description */
	WT_CURSOR_JOIN *join_cursor;
	ssize_t join_cursor_entry;	/* Position in entries */
	WT_JOIN_STATS join_stats;
};

struct __wt_cursor_stat {
	WT_CURSOR iface;

	bool	notinitialized;		/* Cursor not initialized */
	bool	notpositioned;		/* Cursor not positioned */

	int64_t	     *stats;		/* Statistics */
	int	      stats_base;	/* Base statistics value */
	int	      stats_count;	/* Count of statistics values */
	int	     (*stats_desc)(WT_CURSOR_STAT *, int, const char **);
					/* Statistics descriptions */
	int	     (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool,
			 bool);		/* Advance to next set */

	union {				/* Copies of the statistics */
		WT_DSRC_STATS dsrc_stats;
		WT_CONNECTION_STATS conn_stats;
		WT_JOIN_STATS_GROUP join_stats_group;
	} u;

	const char **cfg;		/* Original cursor configuration */
	char	*desc_buf;		/* Saved description string */

	int	 key;			/* Current stats key */
	uint64_t v;			/* Current stats value */
	WT_ITEM	 pv;			/* Current stats value (string) */

	/* Uses the same values as WT_CONNECTION::stat_flags field */
	uint32_t flags;
};

/*
 * WT_CURSOR_STATS --
 *	Return a reference to a statistic cursor's stats structures.
 */
#define	WT_CURSOR_STATS(cursor)						\
	(((WT_CURSOR_STAT *)cursor)->stats)

struct __wt_cursor_table {
	WT_CURSOR iface;

	WT_TABLE *table;
	const char *plan;

	const char **cfg;		/* Saved configuration string */

	WT_CURSOR **cg_cursors;
	WT_ITEM *cg_valcopy;		/*
					 * Copies of column group values, for
					 * overlapping set_value calls.
					 */
	WT_CURSOR **idx_cursors;
};

#define	WT_CURSOR_PRIMARY(cursor)					\
	(((WT_CURSOR_TABLE *)cursor)->cg_cursors[0])

#define	WT_CURSOR_RECNO(cursor)	WT_STREQ((cursor)->key_format, "r")

/*
 * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
 *	Check if we have a key/value set.  There's an additional semantic
 * implemented here: if we're pointing into the tree, and about to perform
 * a cursor operation, get a local copy of whatever we're referencing in
 * the tree, there's an obvious race with the cursor moving and the key or
 * value reference, and it's better to solve it here than in the underlying
 * data-source layers.
 *
 * WT_CURSOR_CHECKKEY --
 *	Check if a key is set without making a copy.
 *
 * WT_CURSOR_NOVALUE --
 *	Release any cached value before an operation that could update the
 * transaction context and free data a value is pointing to.
 */
#define	WT_CURSOR_CHECKKEY(cursor) do {					\
	if (!F_ISSET(cursor, WT_CURSTD_KEY_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, true));		\
} while (0)
#define	WT_CURSOR_CHECKVALUE(cursor) do {				\
	if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, false));		\
} while (0)
#define	WT_CURSOR_NEEDKEY(cursor) do {					\
	if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->key))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->key,				\
			    (cursor)->key.data, (cursor)->key.size));	\
		F_CLR(cursor, WT_CURSTD_KEY_INT);			\
		F_SET(cursor, WT_CURSTD_KEY_EXT);			\
	}								\
	WT_CURSOR_CHECKKEY(cursor);					\
} while (0)
#define	WT_CURSOR_NEEDVALUE(cursor) do {				\
	if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->value))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->value,				\
			    (cursor)->value.data, (cursor)->value.size));\
		F_CLR(cursor, WT_CURSTD_VALUE_INT);			\
		F_SET(cursor, WT_CURSTD_VALUE_EXT);			\
	}								\
	WT_CURSOR_CHECKVALUE(cursor);					\
} while (0)
#define	WT_CURSOR_NOVALUE(cursor) do {					\
	F_CLR(cursor, WT_CURSTD_VALUE_INT);				\
} while (0)

#define	WT_CURSOR_RAW_OK						\
	WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW