diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-09-05 15:55:04 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-09-05 15:55:04 +1000 |
commit | d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371 (patch) | |
tree | 457f5fe506097b766e5e1695ba9d7d2662910416 /src/third_party/wiredtiger/src/include/btree.h | |
parent | 41a74df493503fec4ce054cc380a0d7eb01d374c (diff) | |
download | mongo-d2c2e6c73c424d5a28d5bd2a9031e4796a5e4371.tar.gz |
Import wiredtiger: 543111d3d8737ada1b741b3a25a201feb2ed13a3 from branch mongodb-4.0
ref: 48bf8dae7c..543111d3d8
for: 4.0.13
WT-4502 Assertion checking hazard pointers on page discard is too strong
WT-4658 Apply Clang Format
WT-4792 Add stat to track pages queued for eviction after LRU sorting
WT-4840 WT_CURSOR.modify must require explicit, snapshot-isolation transaction
WT-4869 Stop adding cache pressure when eviction is falling behind
WT-4881 Soften the restrictions on re-entering reconciliation
WT-4882 Improve checkpoint performance when there are large metadata pages
WT-4892 Improve statistics about forced eviction
WT-4893 Fix a race between internal page child-page eviction checks and cursors in the tree
WT-4895 Fix debug eviction mode so it chooses skew more randomly
WT-4898 Don't allow the eviction server to reconcile if it's busy
WT-4920 Add statistics tracking when eviction server is waiting for page transitions
WT-4957 Revert part of a change about when pages are queued for urgent eviction
WT-5050 Assertion failure during urgent eviction of metadata page
Diffstat (limited to 'src/third_party/wiredtiger/src/include/btree.h')
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.h | 405 |
1 files changed, 199 insertions, 206 deletions
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 17722a806e5..c81f402e6c3 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -7,24 +7,21 @@ */ /* - * Supported btree formats: the "current" version is the maximum supported - * major/minor versions. + * Supported btree formats: the "current" version is the maximum supported major/minor versions. */ -#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */ -#define WT_BTREE_MINOR_VERSION_MIN 1 +#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */ +#define WT_BTREE_MINOR_VERSION_MIN 1 -#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */ -#define WT_BTREE_MINOR_VERSION_MAX 1 +#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */ +#define WT_BTREE_MINOR_VERSION_MAX 1 /* - * The maximum btree leaf and internal page size is 512MB (2^29). The limit - * is enforced in software, it could be larger, specifically, the underlying - * default block manager can support 4GB (2^32). Currently, the maximum page - * size must accommodate our dependence on the maximum page size fitting into - * a number of bits less than 32; see the row-store page key-lookup functions - * for the magic. + * The maximum btree leaf and internal page size is 512MB (2^29). The limit is enforced in software, + * it could be larger, specifically, the underlying default block manager can support 4GB (2^32). + * Currently, the maximum page size must accommodate our dependence on the maximum page size fitting + * into a number of bits less than 32; see the row-store page key-lookup functions for the magic. */ -#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE) +#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE) /* * The length of variable-length column-store values and row-store keys/values @@ -44,230 +41,226 @@ * Record numbers are stored in 64-bit unsigned integers, meaning the largest * record number is "really, really big". */ -#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024)) +#define WT_BTREE_MAX_OBJECT_SIZE ((uint32_t)(UINT32_MAX - 1024)) /* - * A location in a file is a variable-length cookie, but it has a maximum size - * so it's easy to create temporary space in which to store them. (Locations - * can't be much larger than this anyway, they must fit onto the minimum size - * page because a reference to an overflow page is itself a location.) + * A location in a file is a variable-length cookie, but it has a maximum size so it's easy to + * create temporary space in which to store them. (Locations can't be much larger than this anyway, + * they must fit onto the minimum size page because a reference to an overflow page is itself a + * location.) */ -#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */ +#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */ /* Evict pages if we see this many consecutive deleted records. */ -#define WT_BTREE_DELETE_THRESHOLD 1000 +#define WT_BTREE_DELETE_THRESHOLD 1000 /* - * Minimum size of the chunks (in percentage of the page size) a page gets split - * into during reconciliation. + * Minimum size of the chunks (in percentage of the page size) a page gets split into during + * reconciliation. */ -#define WT_BTREE_MIN_SPLIT_PCT 50 +#define WT_BTREE_MIN_SPLIT_PCT 50 /* * WT_BTREE -- * A btree handle. */ struct __wt_btree { - WT_DATA_HANDLE *dhandle; + WT_DATA_HANDLE *dhandle; - WT_CKPT *ckpt; /* Checkpoint information */ + WT_CKPT *ckpt; /* Checkpoint information */ - enum { BTREE_COL_FIX=1, /* Fixed-length column store */ - BTREE_COL_VAR=2, /* Variable-length column store */ - BTREE_ROW=3 /* Row-store */ - } type; /* Type */ + enum { + BTREE_COL_FIX = 1, /* Fixed-length column store */ + BTREE_COL_VAR = 2, /* Variable-length column store */ + BTREE_ROW = 3 /* Row-store */ + } type; /* Type */ - const char *key_format; /* Key format */ - const char *value_format; /* Value format */ - uint8_t bitcnt; /* Fixed-length field size in bits */ + const char *key_format; /* Key format */ + const char *value_format; /* Value format */ + uint8_t bitcnt; /* Fixed-length field size in bits */ - WT_COLLATOR *collator; /* Row-store comparator */ - int collator_owned; /* The collator needs to be freed */ + WT_COLLATOR *collator; /* Row-store comparator */ + int collator_owned; /* The collator needs to be freed */ - uint32_t id; /* File ID, for logging */ + uint32_t id; /* File ID, for logging */ - uint32_t key_gap; /* Row-store prefix key gap */ + uint32_t key_gap; /* Row-store prefix key gap */ - uint32_t allocsize; /* Allocation size */ - uint32_t maxintlpage; /* Internal page max size */ - uint32_t maxintlkey; /* Internal page max key size */ - uint32_t maxleafpage; /* Leaf page max size */ - uint32_t maxleafkey; /* Leaf page max key size */ - uint32_t maxleafvalue; /* Leaf page max value size */ - uint64_t maxmempage; /* In-memory page max size */ - uint32_t maxmempage_image; /* In-memory page image max size */ - uint64_t splitmempage; /* In-memory split trigger size */ + uint32_t allocsize; /* Allocation size */ + uint32_t maxintlpage; /* Internal page max size */ + uint32_t maxintlkey; /* Internal page max key size */ + uint32_t maxleafpage; /* Leaf page max size */ + uint32_t maxleafkey; /* Leaf page max key size */ + uint32_t maxleafvalue; /* Leaf page max value size */ + uint64_t maxmempage; /* In-memory page max size */ + uint32_t maxmempage_image; /* In-memory page image max size */ + uint64_t splitmempage; /* In-memory split trigger size */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u -#define WT_ASSERT_COMMIT_TS_KEYS 0x02u -#define WT_ASSERT_COMMIT_TS_NEVER 0x04u -#define WT_ASSERT_READ_TS_ALWAYS 0x08u -#define WT_ASSERT_READ_TS_NEVER 0x10u -/* AUTOMATIC FLAG VALUE GENERATION STOP */ - uint32_t assert_flags; /* Debugging assertion information */ - - void *huffman_key; /* Key huffman encoding */ - void *huffman_value; /* Value huffman encoding */ - - enum { CKSUM_ON=1, /* On */ - CKSUM_OFF=2, /* Off */ - CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */ - } checksum; /* Checksum configuration */ - - /* - * Reconciliation... - */ - u_int dictionary; /* Dictionary slots */ - bool internal_key_truncate; /* Internal key truncate */ - bool prefix_compression; /* Prefix compression */ - u_int prefix_compression_min; /* Prefix compression min */ - -#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000 - u_int split_deepen_min_child; /* Minimum entries to deepen tree */ -#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100 - u_int split_deepen_per_child; /* Entries per child when deepened */ - int split_pct; /* Split page percent */ - - WT_COMPRESSOR *compressor; /* Page compressor */ - /* - * When doing compression, the pre-compression in-memory byte size is - * optionally adjusted based on previous compression results. - * It's an 8B value because it's updated without a lock. - */ - bool leafpage_compadjust; /* Run-time compression adjustment */ - uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */ - bool intlpage_compadjust; /* Run-time compression adjustment */ - uint64_t maxintlpage_precomp; /* Internal page pre-compression size */ - - WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ - - WT_RWLOCK ovfl_lock; /* Overflow lock */ - - int maximum_depth; /* Maximum tree depth during search */ - u_int rec_multiblock_max; /* Maximum blocks written for a page */ - - uint64_t last_recno; /* Column-store last record number */ - - WT_REF root; /* Root page reference */ - bool modified; /* If the tree ever modified */ - uint8_t original; /* Newly created: bulk-load possible - (want a bool but needs atomic cas) */ - - bool lookaside_entries; /* Has entries in the lookaside table */ - bool lsm_primary; /* Handle is/was the LSM primary */ - - WT_BM *bm; /* Block manager reference */ - u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ - - uint64_t write_gen; /* Write generation */ - uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ - wt_timestamp_t rec_max_timestamp; - - uint64_t checkpoint_gen; /* Checkpoint generation */ - WT_SESSION_IMPL *sync_session; /* Syncing session */ - volatile enum { - WT_BTREE_SYNC_OFF, WT_BTREE_SYNC_WAIT, WT_BTREE_SYNC_RUNNING - } syncing; /* Sync status */ - - /* - * Helper macros: - * WT_BTREE_SYNCING indicates if a sync is active (either waiting to - * start or already running), so no new operations should start that - * would conflict with the sync. - * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync - * on its current tree. - * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an - * operation that would conflict with a sync. - */ -#define WT_BTREE_SYNCING(btree) \ - ((btree)->syncing != WT_BTREE_SYNC_OFF) -#define WT_SESSION_BTREE_SYNC(session) \ - (S2BT(session)->sync_session == (session)) -#define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \ - ((btree)->syncing != WT_BTREE_SYNC_RUNNING || \ - (btree)->sync_session == (session)) - - uint64_t bytes_inmem; /* Cache bytes in memory. */ - uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ - uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ - uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */ - - /* - * The maximum bytes allowed to be used for the table on disk. This is - * currently only used for the lookaside table. - */ - uint64_t file_max; - - /* - * We flush pages from the tree (in order to make checkpoint faster), - * without a high-level lock. To avoid multiple threads flushing at - * the same time, lock the tree. - */ - WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ - - /* - * All of the following fields live at the end of the structure so it's - * easier to clear everything but the fields that persist. - */ -#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref)) - - /* - * Eviction information is maintained in the btree handle, but owned by - * eviction, not the btree code. - */ - WT_REF *evict_ref; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages */ - uint32_t evict_walk_progress;/* Eviction walk progress */ - uint32_t evict_walk_target; /* Eviction walk target */ - u_int evict_walk_period; /* Skip this many LRU walks */ - u_int evict_walk_saved; /* Saved walk skips for checkpoints */ - u_int evict_walk_skips; /* Number of walks skipped */ - int32_t evict_disabled; /* Eviction disabled count */ - bool evict_disabled_open;/* Eviction disabled on open */ - volatile uint32_t evict_busy; /* Count of threads in eviction */ - enum { /* Start position for eviction walk */ - WT_EVICT_WALK_NEXT, - WT_EVICT_WALK_PREV, - WT_EVICT_WALK_RAND_NEXT, - WT_EVICT_WALK_RAND_PREV - } evict_start_type; - - /* - * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't - * automatically generate these flag values for that reason, there's - * no way to start at an offset. - */ -#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ -#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ -#define WT_BTREE_CLOSED 0x000400u /* Handle closed */ -#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */ -#define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */ -#define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */ -#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */ -#define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */ -#define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */ - uint32_t flags; +#define WT_ASSERT_COMMIT_TS_ALWAYS 0x01u +#define WT_ASSERT_COMMIT_TS_KEYS 0x02u +#define WT_ASSERT_COMMIT_TS_NEVER 0x04u +#define WT_ASSERT_READ_TS_ALWAYS 0x08u +#define WT_ASSERT_READ_TS_NEVER 0x10u + /* AUTOMATIC FLAG VALUE GENERATION STOP */ + uint32_t assert_flags; /* Debugging assertion information */ + + void *huffman_key; /* Key huffman encoding */ + void *huffman_value; /* Value huffman encoding */ + + enum { + CKSUM_ON = 1, /* On */ + CKSUM_OFF = 2, /* Off */ + CKSUM_UNCOMPRESSED = 3 /* Uncompressed blocks only */ + } checksum; /* Checksum configuration */ + + /* + * Reconciliation... + */ + u_int dictionary; /* Dictionary slots */ + bool internal_key_truncate; /* Internal key truncate */ + bool prefix_compression; /* Prefix compression */ + u_int prefix_compression_min; /* Prefix compression min */ + +#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000 + u_int split_deepen_min_child; /* Minimum entries to deepen tree */ +#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100 + u_int split_deepen_per_child; /* Entries per child when deepened */ + int split_pct; /* Split page percent */ + + WT_COMPRESSOR *compressor; /* Page compressor */ + /* + * When doing compression, the pre-compression in-memory byte size + * is optionally adjusted based on previous compression results. + * It's an 8B value because it's updated without a lock. + */ + bool leafpage_compadjust; /* Run-time compression adjustment */ + uint64_t maxleafpage_precomp; /* Leaf page pre-compression size */ + bool intlpage_compadjust; /* Run-time compression adjustment */ + uint64_t maxintlpage_precomp; /* Internal page pre-compression size */ + + WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ + + WT_RWLOCK ovfl_lock; /* Overflow lock */ + + int maximum_depth; /* Maximum tree depth during search */ + u_int rec_multiblock_max; /* Maximum blocks written for a page */ + + uint64_t last_recno; /* Column-store last record number */ + + WT_REF root; /* Root page reference */ + bool modified; /* If the tree ever modified */ + uint8_t original; /* Newly created: bulk-load possible + (want a bool but needs atomic cas) */ + + bool lookaside_entries; /* Has entries in the lookaside table */ + bool lsm_primary; /* Handle is/was the LSM primary */ + + WT_BM *bm; /* Block manager reference */ + u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ + + uint64_t write_gen; /* Write generation */ + uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ + wt_timestamp_t rec_max_timestamp; + + uint64_t checkpoint_gen; /* Checkpoint generation */ + WT_SESSION_IMPL *sync_session; /* Syncing session */ + volatile enum { + WT_BTREE_SYNC_OFF, + WT_BTREE_SYNC_WAIT, + WT_BTREE_SYNC_RUNNING + } syncing; /* Sync status */ + +/* + * Helper macros: WT_BTREE_SYNCING indicates if a sync is active (either waiting to start or already + * running), so no new operations should start that would conflict with the sync. + * WT_SESSION_BTREE_SYNC indicates if the session is performing a sync on its current tree. + * WT_SESSION_BTREE_SYNC_SAFE checks whether it is safe to perform an operation that would conflict + * with a sync. + */ +#define WT_BTREE_SYNCING(btree) ((btree)->syncing != WT_BTREE_SYNC_OFF) +#define WT_SESSION_BTREE_SYNC(session) (S2BT(session)->sync_session == (session)) +#define WT_SESSION_BTREE_SYNC_SAFE(session, btree) \ + ((btree)->syncing != WT_BTREE_SYNC_RUNNING || (btree)->sync_session == (session)) + + uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ + uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. */ + uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */ + + /* + * The maximum bytes allowed to be used for the table on disk. This is currently only used for + * the lookaside table. + */ + uint64_t file_max; + + /* + * We flush pages from the tree (in order to make checkpoint faster), without a high-level lock. + * To avoid multiple threads flushing at the same time, lock the tree. + */ + WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ + +/* + * All of the following fields live at the end of the structure so it's easier to clear everything + * but the fields that persist. + */ +#define WT_BTREE_CLEAR_SIZE (offsetof(WT_BTREE, evict_ref)) + + /* + * Eviction information is maintained in the btree handle, but owned by eviction, not the btree + * code. + */ + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + uint32_t evict_walk_progress; /* Eviction walk progress */ + uint32_t evict_walk_target; /* Eviction walk target */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_saved; /* Saved walk skips for checkpoints */ + u_int evict_walk_skips; /* Number of walks skipped */ + int32_t evict_disabled; /* Eviction disabled count */ + bool evict_disabled_open; /* Eviction disabled on open */ + volatile uint32_t evict_busy; /* Count of threads in eviction */ + enum { /* Start position for eviction walk */ + WT_EVICT_WALK_NEXT, + WT_EVICT_WALK_PREV, + WT_EVICT_WALK_RAND_NEXT, + WT_EVICT_WALK_RAND_PREV + } evict_start_type; + +/* + * Flag values up to 0xff are reserved for WT_DHANDLE_XXX. We don't automatically generate these + * flag values for that reason, there's no way to start at an offset. + */ +#define WT_BTREE_ALTER 0x000100u /* Handle is for alter */ +#define WT_BTREE_BULK 0x000200u /* Bulk-load handle */ +#define WT_BTREE_CLOSED 0x000400u /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x000800u /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x001000u /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x002000u /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x004000u /* Disable checkpoints */ +#define WT_BTREE_NO_LOGGING 0x008000u /* Disable logging */ +#define WT_BTREE_READONLY 0x010000u /* Handle is readonly */ +#define WT_BTREE_REBALANCE 0x020000u /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 0x040000u /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x080000u /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x100000u /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x200000u /* Handle is for verify */ + uint32_t flags; }; /* Flags that make a btree handle special (not for normal use). */ -#define WT_BTREE_SPECIAL_FLAGS \ - (WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE | \ - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY) +#define WT_BTREE_SPECIAL_FLAGS \ + (WT_BTREE_ALTER | WT_BTREE_BULK | WT_BTREE_REBALANCE | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | \ + WT_BTREE_VERIFY) /* * WT_SALVAGE_COOKIE -- * Encapsulation of salvage information for reconciliation. */ struct __wt_salvage_cookie { - uint64_t missing; /* Initial items to create */ - uint64_t skip; /* Initial items to skip */ - uint64_t take; /* Items to take */ + uint64_t missing; /* Initial items to create */ + uint64_t skip; /* Initial items to skip */ + uint64_t take; /* Items to take */ - bool done; /* Ignore the rest */ + bool done; /* Ignore the rest */ }; |