summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/include/btree.h
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2019-09-05 15:55:04 +1000
committer: Luke Chen <luke.chen@mongodb.com> 2019-09-05 15:55:04 +1000
commit: d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371 (patch)
tree: 457f5fe506097b766e5e1695ba9d7d2662910416 /src/third_party/wiredtiger/src/include/btree.h
parent: 41a74df493503fec4ce054cc380a0d7eb01d374c (diff)
download: mongo-d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371.tar.gz
Import wiredtiger: 543111d3d8737ada1b741b3a25a201feb2ed13a3 from branch mongodb-4.0
ref: 48bf8dae7c..543111d3d8
for: 4.0.13

WT-4502 Assertion checking hazard pointers on page discard is too strong
WT-4658 Apply Clang Format
WT-4792 Add stat to track pages queued for eviction after LRU sorting
WT-4840 WT_CURSOR.modify must require explicit, snapshot-isolation transaction
WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4882 Improve checkpoint performance when there are large metadata pages
WT-4892 Improve statistics about forced eviction
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4895 Fix debug eviction mode so it chooses skew more randomly
WT-4898 Don't allow the eviction server to reconcile if it's busy
WT-4920 Add statistics tracking when eviction server is waiting for page transitions
WT-4957 Revert part of a change about when pages are queued for urgent eviction
WT-5050 Assertion failure during urgent eviction of metadata page
Diffstat (limited to 'src/third_party/wiredtiger/src/include/btree.h')
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.h 405
1 files changed, 199 insertions, 206 deletions
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 17722a806e5..c81f402e6c3 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -7,24 +7,21 @@
*/
/*
- * Supported btree formats: the "current" version is the maximum supported
- * major/minor versions.
+ * Supported btree formats: the "current" version is the maximum supported major/minor versions.
*/
-#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
-#define WT_BTREE_MINOR_VERSION_MIN 1
+#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
+#define WT_BTREE_MINOR_VERSION_MIN 1
-#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
-#define WT_BTREE_MINOR_VERSION_MAX 1
+#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
+#define WT_BTREE_MINOR_VERSION_MAX 1
/*
- * The maximum btree leaf and internal page size is 512MB (2^29). The limit
- * is enforced in software, it could be larger, specifically, the underlying
- * default block manager can support 4GB (2^32). Currently, the maximum page
- * size must accommodate our dependence on the maximum page size fitting into
- * a number of bits less than 32; see the row-store page key-lookup functions
- * for the magic.
+ * The maximum btree leaf and internal page size is 512MB (2^29). The limit is enforced in software,
+ * it could be larger, specifically, the underlying default block manager can support 4GB (2^32).
+ * Currently, the maximum page size must accommodate our dependence on the maximum page size fitting
+ * into a number of bits less than 32; see the row-store page key-lookup functions for the magic.
*/
-#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
+#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
/*
* The length of variable-length column-store values and row-store keys/values
@@ -44,230 +41,226 @@
* Record numbers are stored in 64-bit unsigned integers, meaning the largest
* record number is "really, really big".
*/
-#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024))
+#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024))
/*
- * A location in a file is a variable-length cookie, but it has a maximum size
- * so it's easy to create temporary space in which to store them. (Locations
- * can't be much larger than this anyway, they must fit onto the minimum size
- * page because a reference to an overflow page is itself a location.)
+ * A location in a file is a variable-length cookie, but it has a maximum size so it's easy to
+ * create temporary space in which to store them. (Locations can't be much larger than this anyway,
+ * they must fit onto the minimum size page because a reference to an overflow page is itself a
+ * location.)
*/
-#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
+#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
/* Evict pages if we see this many consecutive deleted records. */
-#define WT_BTREE_DELETE_THRESHOLD 1000
+#define WT_BTREE_DELETE_THRESHOLD 1000
/*
- * Minimum size of the chunks (in percentage of the page size) a page gets split
- * into during reconciliation.
+ * Minimum size of the chunks (in percentage of the page size) a page gets split into during
+ * reconciliation.
*/
-#define WT_BTREE_MIN_SPLIT_PCT 50
+#define WT_BTREE_MIN_SPLIT_PCT 50
/*
* WT_BTREE --
* A btree handle.
*/
struct __wt_btree {
- WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE *dhandle;
- WT_CKPT *ckpt; /* Checkpoint information */
+ WT_CKPT *ckpt; /* Checkpoint information */
- enum { BTREE_COL_FIX=1, /* Fixed-length column store */
- BTREE_COL_VAR=2, /* Variable-length column store */
- BTREE_ROW=3 /* Row-store */
- } type; /* Type */
+ enum {
+ BTREE_COL_FIX = 1, /* Fixed-length column store */
+ BTREE_COL_VAR = 2, /* Variable-length column store */
+ BTREE_ROW = 3 /* Row-store */
+ } type; /* Type */
- const char *key_format; /* Key format */
- const char *value_format; /* Value format */
- uint8_t bitcnt; /* Fixed-length field size in bits */
+ const char *key_format; /* Key format */
+ const char *value_format; /* Value format */
+ uint8_t bitcnt; /* Fixed-length field size in bits */
- WT_COLLATOR *collator; /* Row-store comparator */
- int collator_owned; /* The collator needs to be freed */
+ WT_COLLATOR *collator; /* Row-store comparator */
+ int collator_owned; /* The collator needs to be freed */
- uint32_t id; /* File ID, for logging */
+ uint32_t id; /* File ID, for logging */
- uint32_t key_gap; /* Row-store prefix key gap */
+ uint32_t key_gap; /* Row-store prefix key gap */
- uint32_t allocsize; /* Allocation size */
- uint32_t maxintlpage; /* Internal page max size */
- uint32_t maxintlkey; /* Internal page max key size */
- uint32_t maxleafpage; /* Leaf page max size */
- uint32_t maxleafkey; /* Leaf page max key size */
- uint32_t maxleafvalue; /* Leaf page max value size */
- uint64_t maxmempage; /* In-memory page max size */
- uint32_t maxmempage_image; /* In-memory page image max size */
- uint64_t splitmempage; /* In-memory split trigger size */
+ uint32_t allocsize; /* Allocation size */
+ uint32_t maxintlpage; /* Internal page max size */
+ uint32_t maxintlkey; /* Internal page max key size */
+ uint32_t maxleafpage; /* Leaf page max size */
+ uint32_t maxleafkey; /* Leaf page max key size */
+ uint32_t maxleafvalue; /* Leaf page max value size */
+ uint64_t maxmempage; /* In-memory page max size */
+ uint32_t maxmempage_image; /* In-memory page image max size */
+ uint64_t splitmempage; /* In-memory split trigger size */
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u
-#define WT_ASSERT_COMMIT_TS_KEYS 0x02u
-#define WT_ASSERT_COMMIT_TS_NEVER 0x04u
-#define WT_ASSERT_READ_TS_ALWAYS 0x08u
-#define WT_ASSERT_READ_TS_NEVER 0x10u
-/* AUTOMATIC FLAG VALUE GENERATION STOP */
- uint32_t assert_flags; /* Debugging assertion information */
-
- void *huffman_key; /* Key huffman encoding */
- void *huffman_value; /* Value huffman encoding */
-
- enum { CKSUM_ON=1, /* On */
- CKSUM_OFF=2, /* Off */
- CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */
- } checksum; /* Checksum configuration */
-
- /*
- * Reconciliation...
- */
- u_int dictionary; /* Dictionary slots */
- bool internal_key_truncate; /* Internal key truncate */
- bool prefix_compression; /* Prefix compression */
- u_int prefix_compression_min; /* Prefix compression min */
-
-#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000
- u_int split_deepen_min_child; /* Minimum entries to deepen tree */
-#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100
- u_int split_deepen_per_child; /* Entries per child when deepened */
- int split_pct; /* Split page percent */
-
- WT_COMPRESSOR *compressor; /* Page compressor */
- /*
- * When doing compression, the pre-compression in-memory byte size is
- * optionally adjusted based on previous compression results.
- * It's an 8B value because it's updated without a lock.
- */
- bool leafpage_compadjust; /* Run-time compression adjustment */
- uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */
- bool intlpage_compadjust; /* Run-time compression adjustment */
- uint64_t maxintlpage_precomp; /* Internal page pre-compression size */
-
- WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */
-
- WT_RWLOCK ovfl_lock; /* Overflow lock */
-
- int maximum_depth; /* Maximum tree depth during search */
- u_int rec_multiblock_max; /* Maximum blocks written for a page */
-
- uint64_t last_recno; /* Column-store last record number */
-
- WT_REF root; /* Root page reference */
- bool modified; /* If the tree ever modified */
- uint8_t original; /* Newly created: bulk-load possible
- (want a bool but needs atomic cas) */
-
- bool lookaside_entries; /* Has entries in the lookaside table */
- bool lsm_primary; /* Handle is/was the LSM primary */
-
- WT_BM *bm; /* Block manager reference */
- u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
-
- uint64_t write_gen; /* Write generation */
- uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */
- wt_timestamp_t rec_max_timestamp;
-
- uint64_t checkpoint_gen; /* Checkpoint generation */
- WT_SESSION_IMPL *sync_session; /* Syncing session */
- volatile enum {
- WT_BTREE_SYNC_OFF, WT_BTREE_SYNC_WAIT, WT_BTREE_SYNC_RUNNING
- } syncing; /* Sync status */
-
- /*
- * Helper macros:
- * WT_BTREE_SYNCING indicates if a sync is active (either waiting to
- * start or already running), so no new operations should start that
- * would conflict with the sync.
- * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync
- * on its current tree.
- * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an
- * operation that would conflict with a sync.
- */
-#define WT_BTREE_SYNCING(btree) \
- ((btree)->syncing != WT_BTREE_SYNC_OFF)
-#define WT_SESSION_BTREE_SYNC(session) \
- (S2BT(session)->sync_session == (session))
-#define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \
- ((btree)->syncing != WT_BTREE_SYNC_RUNNING || \
- (btree)->sync_session == (session))
-
- uint64_t bytes_inmem; /* Cache bytes in memory. */
- uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */
- uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */
- uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */
-
- /*
- * The maximum bytes allowed to be used for the table on disk. This is
- * currently only used for the lookaside table.
- */
- uint64_t file_max;
-
- /*
- * We flush pages from the tree (in order to make checkpoint faster),
- * without a high-level lock. To avoid multiple threads flushing at
- * the same time, lock the tree.
- */
- WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
-
- /*
- * All of the following fields live at the end of the structure so it's
- * easier to clear everything but the fields that persist.
- */
-#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref))
-
- /*
- * Eviction information is maintained in the btree handle, but owned by
- * eviction, not the btree code.
- */
- WT_REF *evict_ref; /* Eviction thread's location */
- uint64_t evict_priority; /* Relative priority of cached pages */
- uint32_t evict_walk_progress;/* Eviction walk progress */
- uint32_t evict_walk_target; /* Eviction walk target */
- u_int evict_walk_period; /* Skip this many LRU walks */
- u_int evict_walk_saved; /* Saved walk skips for checkpoints */
- u_int evict_walk_skips; /* Number of walks skipped */
- int32_t evict_disabled; /* Eviction disabled count */
- bool evict_disabled_open;/* Eviction disabled on open */
- volatile uint32_t evict_busy; /* Count of threads in eviction */
- enum { /* Start position for eviction walk */
- WT_EVICT_WALK_NEXT,
- WT_EVICT_WALK_PREV,
- WT_EVICT_WALK_RAND_NEXT,
- WT_EVICT_WALK_RAND_PREV
- } evict_start_type;
-
- /*
- * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't
- * automatically generate these flag values for that reason, there's
- * no way to start at an offset.
- */
-#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */
-#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */
-#define WT_BTREE_CLOSED 0x000400u /* Handle closed */
-#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */
-#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */
-#define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */
-#define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */
-#define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */
-#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */
-#define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */
-#define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */
- uint32_t flags;
+#define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u
+#define WT_ASSERT_COMMIT_TS_KEYS 0x02u
+#define WT_ASSERT_COMMIT_TS_NEVER 0x04u
+#define WT_ASSERT_READ_TS_ALWAYS 0x08u
+#define WT_ASSERT_READ_TS_NEVER 0x10u
+ /* AUTOMATIC FLAG VALUE GENERATION STOP */
+ uint32_t assert_flags; /* Debugging assertion information */
+
+ void *huffman_key; /* Key huffman encoding */
+ void *huffman_value; /* Value huffman encoding */
+
+ enum {
+ CKSUM_ON = 1, /* On */
+ CKSUM_OFF = 2, /* Off */
+ CKSUM_UNCOMPRESSED = 3 /* Uncompressed blocks only */
+ } checksum; /* Checksum configuration */
+
+ /*
+ * Reconciliation...
+ */
+ u_int dictionary; /* Dictionary slots */
+ bool internal_key_truncate; /* Internal key truncate */
+ bool prefix_compression; /* Prefix compression */
+ u_int prefix_compression_min; /* Prefix compression min */
+
+#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000
+ u_int split_deepen_min_child; /* Minimum entries to deepen tree */
+#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100
+ u_int split_deepen_per_child; /* Entries per child when deepened */
+ int split_pct; /* Split page percent */
+
+ WT_COMPRESSOR *compressor; /* Page compressor */
+ /*
+ * When doing compression, the pre-compression in-memory byte size
+ * is optionally adjusted based on previous compression results.
+ * It's an 8B value because it's updated without a lock.
+ */
+ bool leafpage_compadjust; /* Run-time compression adjustment */
+ uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */
+ bool intlpage_compadjust; /* Run-time compression adjustment */
+ uint64_t maxintlpage_precomp; /* Internal page pre-compression size */
+
+ WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */
+
+ WT_RWLOCK ovfl_lock; /* Overflow lock */
+
+ int maximum_depth; /* Maximum tree depth during search */
+ u_int rec_multiblock_max; /* Maximum blocks written for a page */
+
+ uint64_t last_recno; /* Column-store last record number */
+
+ WT_REF root; /* Root page reference */
+ bool modified; /* If the tree ever modified */
+ uint8_t original; /* Newly created: bulk-load possible
+ (want a bool but needs atomic cas) */
+
+ bool lookaside_entries; /* Has entries in the lookaside table */
+ bool lsm_primary; /* Handle is/was the LSM primary */
+
+ WT_BM *bm; /* Block manager reference */
+ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
+
+ uint64_t write_gen; /* Write generation */
+ uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */
+ wt_timestamp_t rec_max_timestamp;
+
+ uint64_t checkpoint_gen; /* Checkpoint generation */
+ WT_SESSION_IMPL *sync_session; /* Syncing session */
+ volatile enum {
+ WT_BTREE_SYNC_OFF,
+ WT_BTREE_SYNC_WAIT,
+ WT_BTREE_SYNC_RUNNING
+ } syncing; /* Sync status */
+
+/*
+ * Helper macros: WT_BTREE_SYNCING indicates if a sync is active (either waiting to start or already
+ * running), so no new operations should start that would conflict with the sync.
+ * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync on its current tree.
+ * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an operation that would conflict
+ * with a sync.
+ */
+#define WT_BTREE_SYNCING(btree) ((btree)->syncing != WT_BTREE_SYNC_OFF)
+#define WT_SESSION_BTREE_SYNC(session) (S2BT(session)->sync_session == (session))
+#define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \
+ ((btree)->syncing != WT_BTREE_SYNC_RUNNING || (btree)->sync_session == (session))
+
+ uint64_t bytes_inmem; /* Cache bytes in memory. */
+ uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */
+ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */
+ uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */
+
+ /*
+ * The maximum bytes allowed to be used for the table on disk. This is currently only used for
+ * the lookaside table.
+ */
+ uint64_t file_max;
+
+ /*
+ * We flush pages from the tree (in order to make checkpoint faster), without a high-level lock.
+ * To avoid multiple threads flushing at the same time, lock the tree.
+ */
+ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
+
+/*
+ * All of the following fields live at the end of the structure so it's easier to clear everything
+ * but the fields that persist.
+ */
+#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref))
+
+ /*
+ * Eviction information is maintained in the btree handle, but owned by eviction, not the btree
+ * code.
+ */
+ WT_REF *evict_ref; /* Eviction thread's location */
+ uint64_t evict_priority; /* Relative priority of cached pages */
+ uint32_t evict_walk_progress; /* Eviction walk progress */
+ uint32_t evict_walk_target; /* Eviction walk target */
+ u_int evict_walk_period; /* Skip this many LRU walks */
+ u_int evict_walk_saved; /* Saved walk skips for checkpoints */
+ u_int evict_walk_skips; /* Number of walks skipped */
+ int32_t evict_disabled; /* Eviction disabled count */
+ bool evict_disabled_open; /* Eviction disabled on open */
+ volatile uint32_t evict_busy; /* Count of threads in eviction */
+ enum { /* Start position for eviction walk */
+ WT_EVICT_WALK_NEXT,
+ WT_EVICT_WALK_PREV,
+ WT_EVICT_WALK_RAND_NEXT,
+ WT_EVICT_WALK_RAND_PREV
+ } evict_start_type;
+
+/*
+ * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these
+ * flag values for that reason, there's no way to start at an offset.
+ */
+#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */
+#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */
+#define WT_BTREE_CLOSED 0x000400u /* Handle closed */
+#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */
+#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */
+#define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */
+#define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */
+#define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */
+#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */
+#define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */
+#define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */
+ uint32_t flags;
};
/* Flags that make a btree handle special (not for normal use). */
-#define WT_BTREE_SPECIAL_FLAGS \
- (WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE | \
- WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+#define WT_BTREE_SPECIAL_FLAGS \
+ (WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | \
+ WT_BTREE_VERIFY)
/*
* WT_SALVAGE_COOKIE --
* Encapsulation of salvage information for reconciliation.
*/
struct __wt_salvage_cookie {
- uint64_t missing; /* Initial items to create */
- uint64_t skip; /* Initial items to skip */
- uint64_t take; /* Items to take */
+ uint64_t missing; /* Initial items to create */
+ uint64_t skip; /* Initial items to skip */
+ uint64_t take; /* Items to take */
- bool done; /* Ignore the rest */
+ bool done; /* Ignore the rest */
};