diff options
author | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-12-17 17:57:45 +0000 |
---|---|---|
committer | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-12-17 17:57:45 +0000 |
commit | e7038697261d94f52bc2fef638519a8c1ad60cc1 (patch) | |
tree | ba1060da8ce09cfb0273254b132fb64852c2664e /src/include/cell.i | |
parent | 1187d187021d6229aad874c4a1b57d1722585fb0 (diff) | |
download | mongo-e7038697261d94f52bc2fef638519a8c1ad60cc1.tar.gz |
Replace WT_OFF/WT_OFF_RECORD structures on row- and column-store internal
pages with variable length address cookies.
--HG--
extra : rebase_source : cfcfa6948a709ec0e3ff2682628feab59e83438d
Diffstat (limited to 'src/include/cell.i')
-rw-r--r-- | src/include/cell.i | 211 |
1 files changed, 135 insertions, 76 deletions
diff --git a/src/include/cell.i b/src/include/cell.i index 2141f4dddb8..f1e632f919b 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -15,25 +15,27 @@ * There are 4 basic cell types: keys and data (each of which has an overflow * form), deleted cells and off-page references. The cell is usually followed * by additional data, varying by type: a key or data cell is followed by a set - * of bytes, a WT_OFF structure with addr/size pair follows overflow or off-page - * cells. + * of bytes, an address cookie follows overflow or off-page cells. * * Deleted cells are place-holders for column-store files, where entries cannot * be removed in order to preserve the record count. * * Here's cell use by page type: * - * WT_PAGE_ROW_INT (row-store internal pages): - * Keys with offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL - * cell followed by a WT_CELL_OFF cell). + * WT_PAGE_ROW_INT (row-store internal page): + * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL + * cell followed by a WT_CELL_ADDR cell). * - * WT_PAGE_ROW_LEAF (row-store leaf pages): + * WT_PAGE_ROW_LEAF (row-store leaf page): * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell, * optionally followed by a WT_CELL_VALUE or WT_CELL_VALUE_OVFL cell). * * Both WT_PAGE_ROW_INT and WT_PAGE_ROW_LEAF pages prefix compress keys, using * a single byte immediately following the cell. * + * WT_PAGE_COL_INT (Column-store internal page): + * Off-page references (a WT_CELL_ADDR cell). + * * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells): * Data cells (a WT_CELL_VALUE or WT_CELL_VALUE_OVFL cell), and deleted * cells (a WT_CELL_DEL cell). @@ -47,26 +49,26 @@ * 0x03 bit combination (setting both 0x01 and 0x02) is unused, but will require * code changes. * - * Bit 3 marks run-length encoded variable-length column store data: immediately - * after the cell description byte, there's a uint32_t repeat count. + * Bit 3 marks variable-length column store data with an associated run-length + * counter or a record number: there is a uint64_t value immediately after the + * cell description byte. */ #define WT_CELL_VALUE_SHORT 0x001 /* Short data */ #define WT_CELL_KEY_SHORT 0x002 /* Short key */ -#define WT_CELL_RLE 0x004 /* Run-length encoding */ +#define WT_CELL_64V 0x004 /* RLE or recno */ #define WT_CELL_UNUSED_BIT4 0x008 /* Unused */ #define WT_CELL_UNUSED_BIT5 0x010 /* Unused */ /* * Bits 6-8 are for other cell types (there are currently 6 cell types). */ -#define WT_CELL_DEL (0 << 5) /* Deleted */ -#define WT_CELL_KEY (1 << 5) /* Key */ -#define WT_CELL_KEY_OVFL (2 << 5) /* Key overflow */ -#define WT_CELL_OFF (3 << 5) /* Off-page ref */ +#define WT_CELL_ADDR (0 << 5) /* Location reference */ +#define WT_CELL_DEL (1 << 5) /* Deleted */ +#define WT_CELL_KEY (2 << 5) /* Key */ +#define WT_CELL_KEY_OVFL (3 << 5) /* Key overflow */ #define WT_CELL_VALUE (4 << 5) /* Value */ #define WT_CELL_VALUE_OVFL (5 << 5) /* Value overflow */ #define WT_CELL_UNUSED_TYPE6 (6 << 5) /* Unused */ -#define WT_CELL_UNUSED_TYPE7 (7 << 5) /* Unused */ #define WT_CELL_TYPE_MASK (7 << 5) /* @@ -75,13 +77,16 @@ */ struct __wt_cell { /* - * Maximum of 12 bytes: - * 0: type, RLE flag - * 1: prefix compression count - * 2-6: optional RLE count (uint32_t encoding, max 5 bytes) - * 7-11: optional data length (uint32_t encoding, max 5 bytes) + * Maximum of 16 bytes: + * 1: type + 64V flag (recno/rle) + * 1: prefix compression count + * 9: optional RLE or recno (uint64_t encoding, max 9 bytes) + * 5: optional data length (uint32_t encoding, max 5 bytes) + * + * The prefix compression count and 64V value overlap, this calculation + * is pessimistic, but it isn't worth optimizing. */ - uint8_t __chunk[12]; + uint8_t __chunk[16]; }; /* @@ -89,12 +94,10 @@ struct __wt_cell { * Unpacked cell. */ struct __wt_cell_unpack { - WT_OFF off; /* WT_OFF structure */ - - uint64_t rle; /* RLE count */ + uint64_t v; /* RLE count or recno */ const void *data; /* Data */ - uint32_t size; /* Data size */ + uint32_t size; /* Data size */ uint32_t len; /* Cell + data total length */ @@ -114,34 +117,19 @@ struct __wt_cell_unpack { (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->len), --(i)) /* - * __wt_cell_pack_key -- - * Set a key's WT_CELL contents. + * __wt_cell_pack_addr -- + * Pack an address cell. */ static inline uint32_t -__wt_cell_pack_key(WT_CELL *cell, uint8_t prefix, uint32_t size) +__wt_cell_pack_addr(WT_CELL *cell, uint64_t recno, uint32_t size) { - uint8_t byte, *p; - - /* - * Short keys have 6-bits of length in the descriptor byte and no length - * bytes. - * - * Bit 0 is 0, bit 1 is the WT_CELL_KEY_SHORT flag; the other 6 bits are - * the size. - */ - if (size <= 0x3f) { - byte = (uint8_t)size; - cell->__chunk[0] = (byte << 2) | WT_CELL_KEY_SHORT; - cell->__chunk[1] = prefix; - return (2); - } - - cell->__chunk[0] = WT_CELL_KEY; /* Type */ - cell->__chunk[1] = prefix; /* Prefix */ + uint8_t *p; - p = cell->__chunk + 2; /* Length */ + p = cell->__chunk + 1; /* Type + recno */ + cell->__chunk[0] = WT_CELL_ADDR | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, recno); + /* Length */ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); - return (WT_PTRDIFF32(p, cell)); } @@ -160,7 +148,7 @@ __wt_cell_pack_data(WT_CELL *cell, uint64_t rle, uint32_t size) * * Bit 0 is the WT_CELL_VALUE_SHORT flag; the other 7 bits are the size. */ - if (rle < 2 && size <= 0x7f) { + if (rle == 0 && size <= 0x7f) { byte = (uint8_t)size; cell->__chunk[0] = (byte << 1) | WT_CELL_VALUE_SHORT; return (1); @@ -170,7 +158,7 @@ __wt_cell_pack_data(WT_CELL *cell, uint64_t rle, uint32_t size) if (rle < 2) /* Type + RLE */ cell->__chunk[0] = WT_CELL_VALUE; else { - cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_RLE; + cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V; (void)__wt_vpack_uint(&p, 0, rle); } /* Length */ @@ -180,29 +168,111 @@ __wt_cell_pack_data(WT_CELL *cell, uint64_t rle, uint32_t size) } /* - * __wt_cell_pack_type -- - * Write a WT_CELL's contents based on a type with optional RLE count but - * fixed-size data (that is, no need for data length bytes). + * __wt_cell_pack_del -- + * Write a deleted value cell. */ static inline uint32_t -__wt_cell_pack_type(WT_CELL *cell, uint8_t type, uint64_t rle) +__wt_cell_pack_del(WT_CELL *cell, uint64_t rle) { uint8_t *p; + p = cell->__chunk + 1; if (rle < 2) { /* Type + RLE */ - cell->__chunk[0] = type; + cell->__chunk[0] = WT_CELL_DEL; return (1); } - cell->__chunk[0] = type | WT_CELL_RLE; + cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); + + return (WT_PTRDIFF32(p, cell)); +} + +/* + * __wt_cell_pack_key -- + * Set a key's WT_CELL contents. + */ +static inline uint32_t +__wt_cell_pack_key(WT_CELL *cell, uint8_t prefix, uint32_t size) +{ + uint8_t byte, *p; + + /* + * Short keys have 6-bits of length in the descriptor byte and no length + * bytes. + * + * Bit 0 is 0, bit 1 is the WT_CELL_KEY_SHORT flag; the other 6 bits are + * the size. + */ + if (size <= 0x3f) { + byte = (uint8_t)size; + cell->__chunk[0] = (byte << 2) | WT_CELL_KEY_SHORT; + cell->__chunk[1] = prefix; + return (2); + } + + cell->__chunk[0] = WT_CELL_KEY; /* Type */ + cell->__chunk[1] = prefix; /* Prefix */ + + p = cell->__chunk + 2; /* Length */ + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); + + return (WT_PTRDIFF32(p, cell)); +} + +/* + * __wt_cell_pack_key_empty -- + * Write an empty key cell. + */ +static inline void +__wt_cell_pack_key_empty(WT_CELL *cell) +{ + /* + * At the end of a row-store leaf page we have to write an empty key to + * act as a marker in case the last value on the page is zero-length. + * See the caller of this function for details. + */ + cell->__chunk[0] = WT_CELL_KEY; +} + +/* + * __wt_cell_pack_ovfl -- + * Pack an overflow cell. + */ +static inline uint32_t +__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, uint32_t size) +{ + uint8_t *p; p = cell->__chunk + 1; - (void)__wt_vpack_uint(&p, 0, rle); + if (rle < 2) /* Type + RLE */ + cell->__chunk[0] = type; + else { + cell->__chunk[0] = type | WT_CELL_64V; + (void)__wt_vpack_uint(&p, 0, rle); + } + /* Length */ + (void)__wt_vpack_uint(&p, 0, (uint64_t)size); return (WT_PTRDIFF32(p, cell)); } /* + * __wt_cell_rle -- + * Return the cell's RLE value. + */ +static inline uint64_t +__wt_cell_rle(WT_CELL_UNPACK *unpack) +{ + /* + * Any item with only 1 occurrence is stored with an RLE of 0, that is, + * without any RLE at all. This code is a single place to handle that + * correction, for simplicity. + */ + return (unpack->v < 2 ? 1 : unpack->v); +} + +/* * __wt_cell_type -- * Return the cell's type (collapsing "short" types). */ @@ -243,7 +313,6 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) } while (0) memset(unpack, 0, sizeof(*unpack)); - unpack->rle = 1; /* * Check the cell description byte, then get the cell type. @@ -279,14 +348,6 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) unpack->size = cell->__chunk[0] >> 2; unpack->len = 2 + unpack->size; goto done; - case WT_CELL_OFF: - /* - * Check the WT_OFF that follows the cell descriptor byte. - */ - CHK(cell, 1 + sizeof(WT_OFF)); /* check WT_OFF */ - memcpy(&unpack->off, cell->__chunk + 1, sizeof(WT_OFF)); - unpack->len = 1 + sizeof(WT_OFF); - goto done; case WT_CELL_VALUE_SHORT: /* * Not reading any more memory, no further checks until the @@ -313,25 +374,23 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) unpack->prefix = cell->__chunk[1]; } - if (cell->__chunk[0] & WT_CELL_RLE) /* skip RLE */ + if (cell->__chunk[0] & WT_CELL_64V) /* skip RLE/recno */ WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : (size_t)(end - p), &unpack->rle)); + &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v)); /* - * Overflow and deleted cells have known sizes, and no length bytes. - * Key/data cells have data length bytes. + * Deleted cells have known sizes, and no length bytes. + * Key/data and overflow cells have data length bytes. */ switch (unpack->raw) { - case WT_CELL_KEY_OVFL: - case WT_CELL_VALUE_OVFL: - CHK(p, sizeof(WT_OFF)); /* check WT_OFF */ - unpack->ovfl = 1; - memcpy(&unpack->off, p, sizeof(WT_OFF)); - unpack->len = WT_PTRDIFF32(p + sizeof(WT_OFF), cell); - break; case WT_CELL_DEL: unpack->len = WT_PTRDIFF32(p, cell); break; + case WT_CELL_KEY_OVFL: + case WT_CELL_VALUE_OVFL: + unpack->ovfl = 1; + /* FALLTHROUGH */ + case WT_CELL_ADDR: case WT_CELL_KEY: case WT_CELL_VALUE: WT_RET(__wt_vunpack_uint( |