summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/include/cell.h
blob: e17a037042591f96c72ea628c86cd9cc56cce85b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/*-
 * Copyright (c) 2014-2020 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * WT_CELL --
 *	Variable-length cell type.
 *
 * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT,
 * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have
 * cells after the page header.
 *
 * There are 4 basic cell types: keys and data (each of which has an overflow
 * form), deleted cells and off-page references.  The cell is usually followed
 * by additional data, varying by type: keys are followed by a chunk of data,
 * values are followed by an optional validity window and a chunk of data,
 * overflow and off-page cells are followed by an optional validity window and
 * an address cookie.
 *
 * Deleted cells are place-holders for column-store files, where entries cannot
 * be removed in order to preserve the record count.
 *
 * Here's the cell use by page type:
 *
 * WT_PAGE_ROW_INT (row-store internal page):
 *	Key and off-page reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
 *	cell followed by a WT_CELL_ADDR_XXX cell).
 *
 * WT_PAGE_ROW_LEAF (row-store leaf page):
 *	Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
 *	normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
 *
 *	WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
 *	byte count immediately following the cell.
 *
 * WT_PAGE_COL_INT (column-store internal page):
 *	Off-page references (a WT_CELL_ADDR_XXX cell).
 *
 * WT_PAGE_COL_VAR (column-store leaf page storing variable-length cells):
 *	Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
 *	cells (a WT_CELL_DEL cell).
 *
 * Each cell starts with a descriptor byte:
 *
 * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
 * carrying data less than 64B, where we can store the data length in the cell
 * descriptor byte):
 *	0b00	Not a short key/data cell
 *	0b01	Short key cell
 *	0b10	Short key cell, with a following prefix-compression byte
 *	0b11	Short value cell
 * In the "short" variants, the other 6 bits of the descriptor byte are the
 * data length.
 *
 * Bit 3 marks an 8B packed, uint64_t value following the cell descriptor byte.
 * (A run-length counter or a record number for variable-length column store.)
 *
 * Bit 4 marks a value with an additional descriptor byte. If this flag is set,
 * the next byte after the initial cell byte is an additional descriptor byte.
 * The bottom bit in this additional byte indicates that the cell is part of a
 * prepared, but not yet committed, transaction. The next 6 bits describe a
 * validity and durability window of timestamps and transaction IDs. The top bit
 * is currently unused.
 *
 * Bits 5-8 are cell "types".
 */
/*
 * Low four bits of the first descriptor byte. The bottom two bits select a "short" cell type
 * (zero means the cell is not short); in short cells the other six bits are the data length.
 */
#define WT_CELL_SHORT_SHIFT 2  /* Shift for short key/value */
#define WT_CELL_SHORT_MAX 0x3f /* Maximum short key/value (63) */

#define WT_CELL_SHORT_TYPE(v) ((v) & 0x3U)
#define WT_CELL_KEY_SHORT 1     /* 0b01: short key */
#define WT_CELL_KEY_SHORT_PFX 2 /* 0b10: short key with prefix byte */
#define WT_CELL_VALUE_SHORT 3   /* 0b11: short data */

#define WT_CELL_64V (1 << 2)         /* Associated value */
#define WT_CELL_SECOND_DESC (1 << 3) /* Second descriptor byte */

/*
 * Flags in the second descriptor byte: the bottom bit marks a prepared transaction and the next
 * six bits describe the validity and durability window. The top bit (0x80) is currently unused.
 */
#define WT_CELL_PREPARE (1 << 0)          /* Part of prepared transaction */
#define WT_CELL_TS_DURABLE_START (1 << 1) /* Start durable timestamp */
#define WT_CELL_TS_DURABLE_STOP (1 << 2)  /* Stop durable timestamp */
#define WT_CELL_TS_START (1 << 3)         /* Oldest-start timestamp */
#define WT_CELL_TS_STOP (1 << 4)          /* Newest-stop timestamp */
#define WT_CELL_TXN_START (1 << 5)        /* Oldest-start txn ID */
#define WT_CELL_TXN_STOP (1 << 6)         /* Newest-stop txn ID */

/*
 * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf block location, and
 * WT_CELL_ADDR_LEAF_NO is a leaf block location where the page has no overflow items. (The goal is
 * to speed up truncation as we don't have to read pages without overflow items in order to delete
 * them. Note, WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without overflow
 * items, the only guarantee is that if set, the page has no overflow items.)
 *
 * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting value dictionaries:
 * if the two values are the same, we only store them once and have any second and subsequent uses
 * reference the original.
 */
/* Cell types live in the top four bits of the descriptor byte; listed here in numeric order. */
#define WT_CELL_ADDR_DEL 0x00      /* Address: deleted */
#define WT_CELL_ADDR_INT 0x10      /* Address: internal */
#define WT_CELL_ADDR_LEAF 0x20     /* Address: leaf */
#define WT_CELL_ADDR_LEAF_NO 0x30  /* Address: leaf no overflow */
#define WT_CELL_DEL 0x40           /* Deleted value */
#define WT_CELL_KEY 0x50           /* Key */
#define WT_CELL_KEY_OVFL 0x60      /* Overflow key */
#define WT_CELL_KEY_PFX 0x70       /* Key with prefix byte */
#define WT_CELL_VALUE 0x80         /* Value */
#define WT_CELL_VALUE_COPY 0x90    /* Value copy */
#define WT_CELL_VALUE_OVFL 0xa0    /* Overflow value */
#define WT_CELL_VALUE_OVFL_RM 0xb0 /* Overflow value (removed) */
#define WT_CELL_KEY_OVFL_RM 0xc0   /* Overflow key (removed) */

#define WT_CELL_TYPE_MASK 0xf0U /* Maximum 16 cell types */
#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)

/*
 * When unable to create a short key or value (and where it wasn't an associated RLE or validity
 * window that prevented creating a short value), the data must be at least 64B, else we'd have used
 * a short cell. When packing/unpacking the size, decrement/increment the size, in the hopes that a
 * smaller size will pack into a single byte instead of two.
 *
 * The adjustment is one more than the largest short-cell length, that is, the smallest data size
 * that does not fit in a short cell.
 */
#define WT_CELL_SIZE_ADJUST (WT_CELL_SHORT_MAX + 1)

/*
 * WT_CELL --
 *	Variable-length, on-page cell header.
 *
 * The structure has no named fields: it is a byte array sized for the largest possible cell
 * header, whose contents are variable-length encoded.
 */
struct __wt_cell {
    /*
     * Maximum of 71 bytes:
     *  1: cell descriptor byte
     *  1: prefix compression count
     *  1: secondary descriptor byte
     * 36: 4 timestamps		(uint64_t encoding, max 9 bytes)
     * 18: 2 transaction IDs	(uint64_t encoding, max 9 bytes)
     *  9: associated 64-bit value	(uint64_t encoding, max 9 bytes)
     *  5: data length		(uint32_t encoding, max 5 bytes)
     *
     * In the array size below, the three leading 1s are the two descriptor bytes and the prefix
     * compression count, the seven 64-bit terms are the four timestamps, the two transaction IDs
     * and the associated 64-bit value, and the 32-bit term is the data length.
     *
     * This calculation is extremely pessimistic: the prefix compression
     * count and 64V value overlap, and the validity window, 64V value
     * and data length are all optional in some cases.
     */
    uint8_t __chunk[1 + 1 + 1 + 7 * WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
};

/*
 * Flag values for unpacked cells. The START/STOP markers indicate the values between them are
 * generated automatically, so hand edits are presumably overwritten by the generator.
 * NOTE(review): these look like the values stored in the "flags" field of the unpacked-cell
 * structures -- confirm against the unpack code.
 */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_CELL_UNPACK_OVERFLOW 0x1u            /* cell is an overflow */
#define WT_CELL_UNPACK_TIME_WINDOW_CLEARED 0x2u /* time window cleared because of restart */
                                                /* AUTOMATIC FLAG VALUE GENERATION STOP */

/*
 * We have two "unpacked cell" structures: one holds unpacked cells from internal nodes (address
 * pages), and one holds unpacked cells from leaf nodes (key/value pages). They share a common set
 * of initial fields: in a few places where a function has to handle both types of unpacked cells,
 * the unpacked cell structures are cast to an "unpack-common" structure that can only reference
 * shared fields.
 *
 * WT_CELL_COMMON_FIELDS declares those shared fields. Because the structures are cast to one
 * another, it has to come first in each of them so the shared fields line up. The macro ends
 * without a semicolon; the structure using it supplies one.
 */
#define WT_CELL_COMMON_FIELDS                                                                   \
    WT_CELL *cell; /* Cell's disk image address */                                              \
                                                                                                \
    uint64_t v; /* RLE count or recno */                                                        \
                                                                                                \
    /*                                                                                          \
     * The size and __len fields would naturally be type size_t, but must stay uint32_t:        \
     * performance drops significantly if they're type size_t.                                  \
     */                                                                                         \
    const void *data; /* Data */                                                                \
    uint32_t size;    /* Data size */                                                           \
                                                                                                \
    uint32_t __len; /* Cell + data length (usually) */                                          \
                                                                                                \
    uint8_t prefix; /* Cell prefix length */                                                    \
                                                                                                \
    uint8_t raw;  /* Raw cell type (includes "shorts") */                                       \
    uint8_t type; /* Cell type */                                                               \
                                                                                                \
    uint8_t flags

/*
 * WT_CELL_UNPACK_COMMON --
 *     Unpacked cell, only the fields common to address and key/value cells. Code that has to handle
 *     either kind of unpacked cell casts to this structure, which exposes the shared fields and
 *     nothing else.
 */
struct __wt_cell_unpack_common {
    WT_CELL_COMMON_FIELDS;
};

/*
 * WT_CELL_UNPACK_ADDR --
 *     Unpacked address cell (a cell from an internal node): the common fields followed by the
 *     address's validity window.
 */
struct __wt_cell_unpack_addr {
    WT_CELL_COMMON_FIELDS; /* Shared initial fields */

    WT_TIME_AGGREGATE ta; /* Address validity window */
};

/*
 * WT_CELL_UNPACK_KV --
 *     Unpacked key/value cell (a cell from a leaf node): the common fields followed by the value's
 *     validity window.
 */
struct __wt_cell_unpack_kv {
    WT_CELL_COMMON_FIELDS; /* Shared initial fields */

    WT_TIME_WINDOW tw; /* Value validity window */
};