summaryrefslogtreecommitdiff
path: root/src/include/cursor.h
blob: f32b4250d300b3bf082ecbd0938ec560b73bd5cd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * Initialize a static WT_CURSOR structure.
 */
#define	WT_CURSOR_STATIC_INIT(n,					\
	get_key,							\
	get_value,							\
	set_key,							\
	set_value,							\
	compare,							\
	equals,								\
	next,								\
	prev,								\
	reset,								\
	search,								\
	search_near,							\
	insert,								\
	update,								\
	remove,								\
	reconfigure,							\
	close)								\
	static const WT_CURSOR n = {					\
	NULL,				/* session */			\
	NULL,				/* uri */			\
	NULL,				/* key_format */		\
	NULL,				/* value_format */		\
	get_key,							\
	get_value,							\
	set_key,							\
	set_value,							\
	compare,							\
	equals,								\
	next,								\
	prev,								\
	reset,								\
	search,								\
	search_near,							\
	insert,								\
	update,								\
	remove,								\
	close,								\
	reconfigure,							\
	{ NULL, NULL },			/* TAILQ_ENTRY q */		\
	0,				/* recno key */			\
	{ 0 },				/* recno raw buffer */		\
	NULL,				/* json_private */		\
	NULL,				/* lang_private */		\
	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM key */		\
	{ NULL, 0, NULL, 0, 0 },	/* WT_ITEM value */		\
	0,				/* int saved_err */		\
	NULL,				/* internal_uri */		\
	0				/* uint32_t flags */		\
}

struct __wt_cursor_backup {
	WT_CURSOR iface;

	size_t next;			/* Cursor position */
	WT_FSTREAM *bfs;		/* Backup file stream */
	uint32_t maxid;			/* Maximum log file ID seen */

	char **list;			/* List of files to be copied. */
	size_t list_allocated;
	size_t list_next;

#define	WT_CURBACKUP_LOCKER	0x01	/* Hot-backup started */
	uint8_t	flags;
};
#define	WT_CURSOR_BACKUP_ID(cursor)	(((WT_CURSOR_BACKUP *)(cursor))->maxid)

struct __wt_cursor_btree {
	WT_CURSOR iface;

	WT_BTREE *btree;		/* Enclosing btree */

	/*
	 * The following fields are set by the search functions as a precursor
	 * to page modification: we have a page, a WT_COL/WT_ROW slot on the
	 * page, an insert head, insert list and a skiplist stack (the stack of
	 * skiplist entries leading to the insert point).  The search functions
	 * also return the relationship of the search key to the found key.
	 */
	WT_REF	  *ref;			/* Current page */
	uint32_t   slot;		/* WT_COL/WT_ROW 0-based slot */

	WT_INSERT_HEAD	*ins_head;	/* Insert chain head */
	WT_INSERT	*ins;		/* Current insert node */
					/* Search stack */
	WT_INSERT	**ins_stack[WT_SKIP_MAXDEPTH];

					/* Next item(s) found during search */
	WT_INSERT	*next_stack[WT_SKIP_MAXDEPTH];

	uint32_t page_deleted_count;	/* Deleted items on the page */

	uint64_t recno;			/* Record number */

	/*
	 * Next-random cursors can optionally be configured to step through a
	 * percentage of the total leaf pages to their next value. Note the
	 * configured value and the calculated number of leaf pages to skip.
	 */
	uint64_t next_random_leaf_skip;
	u_int	 next_random_sample_size;

	/*
	 * The search function sets compare to:
	 *	< 1 if the found key is less than the specified key
	 *	  0 if the found key matches the specified key
	 *	> 1 if the found key is larger than the specified key
	 */
	int	compare;

	/*
	 * A key returned from a binary search or cursor movement on a row-store
	 * page; if we find an exact match on a row-store leaf page in a search
	 * operation, keep a copy of key we built during the search to avoid
	 * doing the additional work of getting the key again for return to the
	 * application. Note, this only applies to exact matches when searching
	 * disk-image structures, so it's not, for example, a key from an insert
	 * list. Additionally, this structure is used to build keys when moving
	 * a cursor through a row-store leaf page.
	 */
	WT_ITEM *row_key, _row_key;

	/*
	 * It's relatively expensive to calculate the last record on a variable-
	 * length column-store page because of the repeat values.  Calculate it
	 * once per page and cache it.  This value doesn't include the skiplist
	 * of appended entries on the last page.
	 */
	uint64_t last_standard_recno;

	/*
	 * For row-store pages, we need a single item that tells us the part of
	 * the page we're walking (otherwise switching from next to prev and
	 * vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 */
	uint32_t row_iteration_slot;	/* Row-store iteration slot */

	/*
	 * Variable-length column-store values are run-length encoded and may
	 * be overflow values or Huffman encoded. To avoid repeatedly reading
	 * overflow values or decompressing encoded values, process it once and
	 * store the result in a temporary buffer.  The cip_saved field is used
	 * to determine if we've switched columns since our last cursor call.
	 */
	WT_COL *cip_saved;		/* Last iteration reference */

	/*
	 * We don't instantiate prefix-compressed keys on pages where there's no
	 * Huffman encoding because we don't want to waste memory if only moving
	 * a cursor through the page, and it's faster to build keys while moving
	 * through the page than to roll-forward from a previously instantiated
	 * key (we don't instantiate all of the keys, just the ones at binary
	 * search points).  We can't use the application's WT_CURSOR key field
	 * as a copy of the last-returned key because it may have been altered
	 * by the API layer, for example, dump cursors.  Instead we store the
	 * last-returned key in a temporary buffer.  The rip_saved field is used
	 * to determine if the key in the temporary buffer has the prefix needed
	 * for building the current key.
	 */
	WT_ROW *rip_saved;		/* Last-returned key reference */

	/*
	 * A temporary buffer for caching RLE values for column-store files (if
	 * RLE is non-zero, then we don't unpack the value every time we move
	 * to the next cursor position, we re-use the unpacked value we stored
	 * here the first time we hit the value).
	 *
	 * A temporary buffer for building on-page keys when searching row-store
	 * files.
	 */
	WT_ITEM *tmp, _tmp;

	/*
	 * The update structure allocated by the row- and column-store modify
	 * functions, used to avoid a data copy in the WT_CURSOR.update call.
	 */
	WT_UPDATE *modify_update;

	/*
	 * Fixed-length column-store items are a single byte, and it's simpler
	 * and cheaper to allocate the space for it now than keep checking to
	 * see if we need to grow the buffer.
	 */
	uint8_t v;			/* Fixed-length return value */

	uint8_t	append_tree;		/* Cursor appended to the tree */

#ifdef HAVE_DIAGNOSTIC
	/* Check that cursor next/prev never returns keys out-of-order. */
	WT_ITEM *lastkey, _lastkey;
	uint64_t lastrecno;
#endif

#define	WT_CBT_ACTIVE		0x01	/* Active in the tree */
#define	WT_CBT_ITERATE_APPEND	0x02	/* Col-store: iterating append list */
#define	WT_CBT_ITERATE_NEXT	0x04	/* Next iteration configuration */
#define	WT_CBT_ITERATE_PREV	0x08	/* Prev iteration configuration */
#define	WT_CBT_NO_TXN   	0x10	/* Non-transactional cursor
					   (e.g. on a checkpoint) */
#define	WT_CBT_SEARCH_SMALLEST	0x20	/* Row-store: small-key insert list */
#define	WT_CBT_VAR_ONPAGE_MATCH	0x40	/* Var-store: on-page recno match */

#define	WT_CBT_POSITION_MASK		/* Flags associated with position */ \
	(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
	WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH)

	uint8_t flags;
};

struct __wt_cursor_bulk {
	WT_CURSOR_BTREE cbt;

	/*
	 * Variable-length column store compares values during bulk load as
	 * part of RLE compression, row-store compares keys during bulk load
	 * to avoid corruption.
	 */
	bool	first_insert;		/* First insert */
	WT_ITEM	last;			/* Last key/value inserted */

	/*
	 * Additional column-store bulk load support.
	 */
	uint64_t recno;			/* Record number */
	uint64_t rle;			/* Variable-length RLE counter */

	/*
	 * Additional fixed-length column store bitmap bulk load support:
	 * current entry in memory chunk count, and the maximum number of
	 * records per chunk.
	 */
	bool	 bitmap;		/* Bitmap bulk load */
	uint32_t entry;			/* Entry count */
	uint32_t nrecs;			/* Max records per chunk */

	void	*reconcile;		/* Reconciliation support */
	WT_REF	*ref;			/* The leaf page */
	WT_PAGE *leaf;
};

struct __wt_cursor_config {
	WT_CURSOR iface;
};

struct __wt_cursor_data_source {
	WT_CURSOR iface;

	WT_COLLATOR *collator;		/* Configured collator */
	int collator_owned;		/* Collator needs to be terminated */

	WT_CURSOR *source;		/* Application-owned cursor */
};

struct __wt_cursor_dump {
	WT_CURSOR iface;

	WT_CURSOR *child;
};

struct __wt_cursor_index {
	WT_CURSOR iface;

	WT_TABLE *table;
	WT_INDEX *index;
	const char *key_plan, *value_plan;

	WT_CURSOR *child;
	WT_CURSOR **cg_cursors;
	uint8_t	*cg_needvalue;
};

/*
 * A join iterator structure is used to generate candidate primary keys. It
 * is the responsibility of the caller of the iterator to filter these
 * primary key against the other conditions of the join before returning
 * them the caller of WT_CURSOR::next.
 *
 * For a conjunction join (the default), entry_count will be 1, meaning that
 * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY).  That
 * is, it successively returns primary keys from a cursor for the first
 * index that was joined.  When the values returned by that cursor are
 * exhausted, the iterator has completed.  For a disjunction join,
 * exhausting a cursor just means that the iterator advances to the next
 * entry. If the next entry represents an index, a new cursor is opened and
 * primary keys from that index are then successively returned.
 *
 * When positioned on an entry that represents a nested join, a new child
 * iterator is created that will be bound to the nested WT_CURSOR_JOIN.
 * That iterator is then used to generate candidate primary keys.  When its
 * iteration is completed, that iterator is destroyed and the parent
 * iterator advances to the next entry.  Thus, depending on how deeply joins
 * are nested, a similarly deep stack of iterators is created.
 */
struct __wt_cursor_join_iter {
	WT_SESSION_IMPL		*session;
	WT_CURSOR_JOIN		*cjoin;
	WT_CURSOR_JOIN_ENTRY	*entry;
	WT_CURSOR_JOIN_ITER	*child;
	WT_CURSOR		*cursor;	/* has null projection */
	WT_ITEM			*curkey;	/* primary key */
	WT_ITEM			 idxkey;
	u_int			 entry_pos;	/* the current entry */
	u_int			 entry_count;	/* entries to walk */
	u_int			 end_pos;	/* the current endpoint */
	u_int			 end_count;	/* endpoints to walk */
	u_int			 end_skip;	/* when testing for inclusion */
						/* can we skip current end? */
	bool			 positioned;
	bool			 is_equal;
};

/*
 * A join endpoint represents a positioned cursor that is 'captured' by a
 * WT_SESSION::join call.
 */
struct __wt_cursor_join_endpoint {
	WT_ITEM			 key;
	uint8_t			 recno_buf[10];	/* holds packed recno */
	WT_CURSOR		*cursor;

#define	WT_CURJOIN_END_LT	0x01		/* include values <  cursor */
#define	WT_CURJOIN_END_EQ	0x02		/* include values == cursor */
#define	WT_CURJOIN_END_GT	0x04		/* include values >  cursor */
#define	WT_CURJOIN_END_GE	(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_LE	(WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ)
#define	WT_CURJOIN_END_OWN_CURSOR 0x08		/* must close cursor */
	uint8_t			 flags;		/* range for this endpoint */
};
#define	WT_CURJOIN_END_RANGE(endp)					\
	((endp)->flags &						\
	    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT))

/*
 * Each join entry typically represents an index's participation in a join.
 * For example, if 'k' is an index, then "t.k > 10 && t.k < 20" would be
 * represented by a single entry, with two endpoints.  When the index and
 * subjoin fields are NULL, the join is on the main table.  When subjoin is
 * non-NULL, there is a nested join clause.
 */
struct __wt_cursor_join_entry {
	WT_INDEX		*index;
	WT_CURSOR		*main;		/* raw main table cursor */
	WT_CURSOR_JOIN		*subjoin;	/* a nested join clause */
	WT_BLOOM		*bloom;		/* Bloom filter handle */
	char			*repack_format; /* target format for repack */
	uint32_t		 bloom_bit_count; /* bits per item in bloom */
	uint32_t		 bloom_hash_count; /* hash functions in bloom */
	uint64_t		 count;		/* approx number of matches */

#define	WT_CURJOIN_ENTRY_BLOOM		 0x01	/* use a bloom filter */
#define	WT_CURJOIN_ENTRY_DISJUNCTION	 0x02	/* endpoints are or-ed */
#define	WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x04	/* after bloom filter do not
						 * filter false positives */
#define	WT_CURJOIN_ENTRY_OWN_BLOOM	 0x08	/* this entry owns the bloom */
	uint8_t			 flags;

	WT_CURSOR_JOIN_ENDPOINT	*ends;		/* reference endpoints */
	size_t			 ends_allocated;
	u_int			 ends_next;

	WT_JOIN_STATS		 stats;		/* Join statistics */
};

struct __wt_cursor_join {
	WT_CURSOR iface;

	WT_TABLE		*table;
	const char		*projection;
	WT_CURSOR		*main;		/* main table with projection */
	WT_CURSOR_JOIN		*parent;	/* parent of nested group */
	WT_CURSOR_JOIN_ITER	*iter;		/* chain of iterators */
	WT_CURSOR_JOIN_ENTRY	*entries;
	size_t			 entries_allocated;
	u_int			 entries_next;
	uint8_t			 recno_buf[10];	/* holds packed recno */

#define	WT_CURJOIN_DISJUNCTION		0x01	/* Entries are or-ed */
#define	WT_CURJOIN_ERROR		0x02	/* Error in initialization */
#define	WT_CURJOIN_INITIALIZED		0x04	/* Successful initialization */
	uint8_t			 flags;
};

struct __wt_cursor_json {
	char	*key_buf;		/* JSON formatted string */
	char	*value_buf;		/* JSON formatted string */
	WT_CONFIG_ITEM key_names;	/* Names of key columns */
	WT_CONFIG_ITEM value_names;	/* Names of value columns */
};

struct __wt_cursor_log {
	WT_CURSOR iface;

	WT_LSN		*cur_lsn;	/* LSN of current record */
	WT_LSN		*next_lsn;	/* LSN of next record */
	WT_ITEM		*logrec;	/* Copy of record for cursor */
	WT_ITEM		*opkey, *opvalue;	/* Op key/value copy */
	const uint8_t	*stepp, *stepp_end;	/* Pointer within record */
	uint8_t		*packed_key;	/* Packed key for 'raw' interface */
	uint8_t		*packed_value;	/* Packed value for 'raw' interface */
	uint32_t	step_count;	/* Intra-record count */
	uint32_t	rectype;	/* Record type */
	uint64_t	txnid;		/* Record txnid */

#define	WT_CURLOG_ARCHIVE_LOCK	0x01	/* Archive lock held */
	uint8_t		flags;
};

struct __wt_cursor_metadata {
	WT_CURSOR iface;

	WT_CURSOR *file_cursor;		/* Queries of regular metadata */
	WT_CURSOR *create_cursor;	/* Extra cursor for create option */

#define	WT_MDC_CREATEONLY	0x01
#define	WT_MDC_ONMETADATA	0x02
#define	WT_MDC_POSITIONED	0x04
	uint8_t	flags;
};

struct __wt_join_stats_group {
	const char *desc_prefix;	/* Prefix appears before description */
	WT_CURSOR_JOIN *join_cursor;
	ssize_t join_cursor_entry;	/* Position in entries */
	WT_JOIN_STATS join_stats;
};

struct __wt_cursor_stat {
	WT_CURSOR iface;

	bool	notinitialized;		/* Cursor not initialized */
	bool	notpositioned;		/* Cursor not positioned */

	int64_t	     *stats;		/* Statistics */
	int	      stats_base;	/* Base statistics value */
	int	      stats_count;	/* Count of statistics values */
	int	     (*stats_desc)(WT_CURSOR_STAT *, int, const char **);
					/* Statistics descriptions */
	int	     (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool,
			 bool);		/* Advance to next set */

	union {				/* Copies of the statistics */
		WT_DSRC_STATS dsrc_stats;
		WT_CONNECTION_STATS conn_stats;
		WT_JOIN_STATS_GROUP join_stats_group;
	} u;

	const char **cfg;		/* Original cursor configuration */
	char	*desc_buf;		/* Saved description string */

	int	 key;			/* Current stats key */
	uint64_t v;			/* Current stats value */
	WT_ITEM	 pv;			/* Current stats value (string) */

	/* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */
	uint32_t flags;
};

/*
 * WT_CURSOR_STATS --
 *	Return a reference to a statistic cursor's stats structures.
 */
#define	WT_CURSOR_STATS(cursor)						\
	(((WT_CURSOR_STAT *)(cursor))->stats)

struct __wt_cursor_table {
	WT_CURSOR iface;

	WT_TABLE *table;
	const char *plan;

	const char **cfg;		/* Saved configuration string */

	WT_CURSOR **cg_cursors;
	WT_ITEM *cg_valcopy;		/*
					 * Copies of column group values, for
					 * overlapping set_value calls.
					 */
	WT_CURSOR **idx_cursors;
};

#define	WT_CURSOR_PRIMARY(cursor)					\
	(((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0])

#define	WT_CURSOR_RECNO(cursor)	WT_STREQ((cursor)->key_format, "r")

/*
 * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
 *	Check if we have a key/value set.  There's an additional semantic
 * implemented here: if we're pointing into the tree, and about to perform
 * a cursor operation, get a local copy of whatever we're referencing in
 * the tree, there's an obvious race with the cursor moving and the key or
 * value reference, and it's better to solve it here than in the underlying
 * data-source layers.
 *
 * WT_CURSOR_CHECKKEY --
 *	Check if a key is set without making a copy.
 *
 * WT_CURSOR_NOVALUE --
 *	Release any cached value before an operation that could update the
 * transaction context and free data a value is pointing to.
 */
#define	WT_CURSOR_CHECKKEY(cursor) do {					\
	if (!F_ISSET(cursor, WT_CURSTD_KEY_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, true));		\
} while (0)
#define	WT_CURSOR_CHECKVALUE(cursor) do {				\
	if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET))			\
		WT_ERR(__wt_cursor_kv_not_set(cursor, false));		\
} while (0)
#define	WT_CURSOR_NEEDKEY(cursor) do {					\
	if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->key))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->key,				\
			    (cursor)->key.data, (cursor)->key.size));	\
		F_CLR(cursor, WT_CURSTD_KEY_INT);			\
		F_SET(cursor, WT_CURSTD_KEY_EXT);			\
	}								\
	WT_CURSOR_CHECKKEY(cursor);					\
} while (0)
#define	WT_CURSOR_NEEDVALUE(cursor) do {				\
	if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {			\
		if (!WT_DATA_IN_ITEM(&(cursor)->value))			\
			WT_ERR(__wt_buf_set(				\
			    (WT_SESSION_IMPL *)(cursor)->session,	\
			    &(cursor)->value,				\
			    (cursor)->value.data, (cursor)->value.size));\
		F_CLR(cursor, WT_CURSTD_VALUE_INT);			\
		F_SET(cursor, WT_CURSTD_VALUE_EXT);			\
	}								\
	WT_CURSOR_CHECKVALUE(cursor);					\
} while (0)
#define	WT_CURSOR_NOVALUE(cursor) do {					\
	F_CLR(cursor, WT_CURSTD_VALUE_INT);				\
} while (0)

#define	WT_CURSOR_RAW_OK						\
	(WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)