1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
/*-
* Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
/*
* Supported btree formats: the "current" version is the maximum supported
* major/minor versions.
*/
#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
#define WT_BTREE_MINOR_VERSION_MIN 1
#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
#define WT_BTREE_MINOR_VERSION_MAX 1
/*
* The maximum btree leaf and internal page size is 512MB (2^29). The limit
* is enforced in software, it could be larger, specifically, the underlying
* default block manager can support 4GB (2^32). Currently, the maximum page
* size must accommodate our dependence on the maximum page size fitting into
* a number of bits less than 32; see the row-store page key-lookup functions
* for the magic.
*/
#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
/*
* The length of variable-length column-store values and row-store keys/values
* are stored in a 4B type, so the largest theoretical key/value item is 4GB.
* However, in the WT_UPDATE structure we use the UINT32_MAX size as a "deleted"
* flag, and second, the size of an overflow object is constrained by what an
* underlying block manager can actually write. (For example, in the default
* block manager, writing an overflow item includes the underlying block's page
* header and block manager specific structure, aligned to an allocation-sized
* unit). The btree engine limits the size of a single object to (4GB - 1KB);
* that gives us additional bytes if we ever want to store a structure length
* plus the object size in 4B, or if we need additional flag values. Attempts
* to store large key/value items in the tree trigger an immediate check to the
* block manager, to make sure it can write the item. Storing 4GB objects in a
* btree borders on clinical insanity, anyway.
*
* Record numbers are stored in 64-bit unsigned integers, meaning the largest
* record number is "really, really big".
*/
#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024))
/*
* A location in a file is a variable-length cookie, but it has a maximum size
* so it's easy to create temporary space in which to store them. (Locations
* can't be much larger than this anyway, they must fit onto the minimum size
* page because a reference to an overflow page is itself a location.)
*/
#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
/* Evict pages if we see this many consecutive deleted records. */
#define WT_BTREE_DELETE_THRESHOLD 1000
/*
* Minimum size of the chunks (in percentage of the page size) a page gets split
* into during reconciliation.
*/
#define WT_BTREE_MIN_SPLIT_PCT 50
/*
* WT_BTREE --
* A btree handle.
*/
struct __wt_btree {
WT_DATA_HANDLE *dhandle;
WT_CKPT *ckpt; /* Checkpoint information */
enum { BTREE_COL_FIX=1, /* Fixed-length column store */
BTREE_COL_VAR=2, /* Variable-length column store */
BTREE_ROW=3 /* Row-store */
} type; /* Type */
const char *key_format; /* Key format */
const char *value_format; /* Value format */
uint8_t bitcnt; /* Fixed-length field size in bits */
WT_COLLATOR *collator; /* Row-store comparator */
int collator_owned; /* The collator needs to be freed */
uint32_t id; /* File ID, for logging */
uint32_t key_gap; /* Row-store prefix key gap */
uint32_t allocsize; /* Allocation size */
uint32_t maxintlpage; /* Internal page max size */
uint32_t maxintlkey; /* Internal page max key size */
uint32_t maxleafpage; /* Leaf page max size */
uint32_t maxleafkey; /* Leaf page max key size */
uint32_t maxleafvalue; /* Leaf page max value size */
uint64_t maxmempage; /* In-memory page max size */
uint64_t splitmempage; /* In-memory split trigger size */
void *huffman_key; /* Key huffman encoding */
void *huffman_value; /* Value huffman encoding */
enum { CKSUM_ON=1, /* On */
CKSUM_OFF=2, /* Off */
CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */
} checksum; /* Checksum configuration */
/*
* Reconciliation...
*/
u_int dictionary; /* Dictionary slots */
bool internal_key_truncate; /* Internal key truncate */
int maximum_depth; /* Maximum tree depth */
bool prefix_compression; /* Prefix compression */
u_int prefix_compression_min; /* Prefix compression min */
#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000
u_int split_deepen_min_child; /* Minimum entries to deepen tree */
#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100
u_int split_deepen_per_child; /* Entries per child when deepened */
int split_pct; /* Split page percent */
WT_COMPRESSOR *compressor; /* Page compressor */
WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */
WT_RWLOCK ovfl_lock; /* Overflow lock */
uint64_t last_recno; /* Column-store last record number */
WT_REF root; /* Root page reference */
bool modified; /* If the tree ever modified */
uint8_t original; /* Newly created: bulk-load possible
(want a bool but needs atomic cas) */
bool lsm_primary; /* Handle is/was the LSM primary */
WT_BM *bm; /* Block manager reference */
u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
uint64_t write_gen; /* Write generation */
uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */
uint64_t checkpoint_gen; /* Checkpoint generation */
volatile enum {
WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
} checkpointing; /* Checkpoint in progress */
uint64_t bytes_inmem; /* Cache bytes in memory. */
uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */
uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */
WT_REF *evict_ref; /* Eviction thread's location */
uint64_t evict_priority; /* Relative priority of cached pages */
u_int evict_walk_period; /* Skip this many LRU walks */
u_int evict_walk_saved; /* Saved walk skips for checkpoints */
u_int evict_walk_skips; /* Number of walks skipped */
int32_t evict_disabled; /* Eviction disabled count */
volatile uint32_t evict_busy; /* Count of threads in eviction */
int evict_start_type; /* Start position for eviction walk
(see WT_EVICT_WALK_START). */
/*
* We flush pages from the tree (in order to make checkpoint faster),
* without a high-level lock. To avoid multiple threads flushing at
* the same time, lock the tree.
*/
WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */
#define WT_BTREE_CLOSED 0x000200 /* Handle closed */
#define WT_BTREE_IGNORE_CACHE 0x000400 /* Cache-resident object */
#define WT_BTREE_IN_MEMORY 0x000800 /* Cache-resident object */
#define WT_BTREE_LOOKASIDE 0x001000 /* Look-aside table */
#define WT_BTREE_NO_CHECKPOINT 0x002000 /* Disable checkpoints */
#define WT_BTREE_NO_LOGGING 0x004000 /* Disable logging */
#define WT_BTREE_REBALANCE 0x008000 /* Handle is for rebalance */
#define WT_BTREE_SALVAGE 0x010000 /* Handle is for salvage */
#define WT_BTREE_SKIP_CKPT 0x020000 /* Handle skipped checkpoint */
#define WT_BTREE_UPGRADE 0x040000 /* Handle is for upgrade */
#define WT_BTREE_VERIFY 0x080000 /* Handle is for verify */
uint32_t flags;
};
/* Flags that make a btree handle special (not for normal use). */
#define WT_BTREE_SPECIAL_FLAGS \
(WT_BTREE_BULK | WT_BTREE_REBALANCE | \
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
/*
* WT_SALVAGE_COOKIE --
* Encapsulation of salvage information for reconciliation.
*/
struct __wt_salvage_cookie {
uint64_t missing; /* Initial items to create */
uint64_t skip; /* Initial items to skip */
uint64_t take; /* Items to take */
bool done; /* Ignore the rest */
};
|