diff options
author | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-02-07 12:47:13 -0500 |
---|---|---|
committer | Keith Bostic <keith.bostic@wiredtiger.com> | 2011-02-07 12:47:13 -0500 |
commit | 841f45dd46cc2f3abad12688c60eb4bf192558b3 (patch) | |
tree | bc24486f67ac8e2758c2f0cf816aee8433829165 | |
parent | 88a359cb1fa3f276613da0903c60185f04c1f0bc (diff) | |
download | mongo-841f45dd46cc2f3abad12688c60eb4bf192558b3.tar.gz |
Split WT_OFF into WT_OFF and WT_OFF_RECORD: the former is used when we don't
need record counts associated with the subtree (row-store internal pages), and
the latter when we do need record counts associated with the subtree (column-store
internal pages and row-store leaf pages referencing off-page duplicate trees).
-rw-r--r-- | dist/serial.py | 2 | ||||
-rw-r--r-- | src/btree/bt_bulk.c | 88 | ||||
-rw-r--r-- | src/btree/bt_debug.c | 77 | ||||
-rw-r--r-- | src/btree/bt_dump.c | 8 | ||||
-rw-r--r-- | src/btree/bt_misc.c | 2 | ||||
-rw-r--r-- | src/btree/bt_page.c | 22 | ||||
-rw-r--r-- | src/btree/bt_read.c | 17 | ||||
-rw-r--r-- | src/btree/bt_reconcile.c | 43 | ||||
-rw-r--r-- | src/btree/bt_stat.c | 8 | ||||
-rw-r--r-- | src/btree/bt_vrfy.c | 64 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 11 | ||||
-rw-r--r-- | src/btree/col_srch.c | 6 | ||||
-rw-r--r-- | src/include/btree.h | 150 | ||||
-rw-r--r-- | src/include/extern.h | 2 | ||||
-rw-r--r-- | src/include/serial.h | 2 | ||||
-rw-r--r-- | src/include/verify_build.h | 1 | ||||
-rw-r--r-- | src/include/wt_internal.in | 5 |
17 files changed, 301 insertions, 207 deletions
diff --git a/dist/serial.py b/dist/serial.py index a34b26c53d4..5460524a038 100644 --- a/dist/serial.py +++ b/dist/serial.py @@ -44,7 +44,7 @@ serial['cache_read'] = Serial( 'WT_WORKQ_READ', '0', ['WT_PAGE */parent', 'WT_REF */ref', - 'WT_OFF */off', + 'void */off', 'int/dsk_verify']) # func_serial -- diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c index 2f791b75655..ec71111b176 100644 --- a/src/btree/bt_bulk.c +++ b/src/btree/bt_bulk.c @@ -29,7 +29,8 @@ typedef struct { static int __wt_bulk_dbt_copy(ENV *, DBT *, DBT *); static int __wt_bulk_dup_offpage(WT_TOC *, DBT **, DBT **, DBT *, WT_ITEM *, - uint32_t, uint32_t, WT_OFF *, int (*)(DB *, DBT **, DBT **)); + uint32_t, uint32_t, WT_OFF_RECORD *, + int (*)(DB *, DBT **, DBT **)); static int __wt_bulk_fix(WT_TOC *, void (*)(const char *, uint64_t), int (*)(DB *, DBT **, DBT **)); static int __wt_bulk_ovfl_copy(WT_TOC *, WT_OVFL *, WT_OVFL *); @@ -247,14 +248,14 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags, ENV *env; IDB *idb; WT_ITEM key_item, data_item, *dup_key, *dup_data; - WT_OFF off; + WT_OFF_RECORD off_record; WT_OVFL key_ovfl, data_ovfl; WT_PAGE *page, *next; WT_STACK stack; uint64_t insert_cnt; uint32_t dup_count, dup_space, len, next_space_avail, space_avail; - uint8_t *first_free, *next_first_free, *p, type; - int ret; + uint8_t *first_free, *next_first_free, *p, page_type; + int is_column, ret; db = toc->db; tmp1 = tmp2 = NULL; @@ -265,7 +266,7 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags, WT_CLEAR(stack); dup_space = dup_count = 0; insert_cnt = 0; - type = F_ISSET(idb, WT_COLUMN) ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF; + is_column = F_ISSET(idb, WT_COLUMN) ? 1 : 0; lastkey = &lastkey_std; WT_CLEAR(data_copy); @@ -275,11 +276,12 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags, WT_ERR(__wt_scr_alloc(toc, 0, &lastkey_copy)); /* Get a scratch buffer and make it look like our work page. */ + page_type = is_column ? 
WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF; WT_ERR(__wt_bulk_scratch_page( - toc, db->leafmin, type, WT_LLEAF, &page, &tmp1)); + toc, db->leafmin, page_type, WT_LLEAF, &page, &tmp1)); __wt_set_ff_and_sa_from_offset( page, WT_PAGE_BYTE(page), &first_free, &space_avail); - if (type == WT_PAGE_COL_VAR) + if (is_column) page->dsk->start_recno = 1; while ((ret = cb(db, &key, &data)) == 0) { @@ -409,11 +411,11 @@ skip_read: /* if ((key == NULL ? 0 : WT_ITEM_SPACE_REQ(key->size)) + WT_ITEM_SPACE_REQ(data->size) > space_avail) { WT_ERR(__wt_bulk_scratch_page(toc, - db->leafmin, type, WT_LLEAF, &next, &tmp2)); + db->leafmin, page_type, WT_LLEAF, &next, &tmp2)); __wt_set_ff_and_sa_from_offset(next, WT_PAGE_BYTE(next), &next_first_free, &next_space_avail); - if (type == WT_PAGE_COL_VAR) + if (is_column) next->dsk->start_recno = insert_cnt; /* @@ -586,24 +588,25 @@ skip_read: /* WT_ERR(__wt_bulk_dup_offpage(toc, &key, &data, lastkey, dup_data, (uint32_t)(first_free - (uint8_t *)dup_data), - dup_count, &off, cb)); + dup_count, &off_record, cb)); /* Reset the page entry and record counts. */ page->dsk->u.entries -= (dup_count - 1); - page->records -= dup_count; - page->records += WT_RECORDS(&off); /* - * Replace the duplicate set with a WT_OFF structure, - * that is, we've replaced dup_count entries with a - * single entry. + * Replace the set of duplicates with a WT_OFF_RECORD + * structure, that is, we've replaced dup_count entries + * with a single offpage reference. */ - WT_ITEM_SET(&data_item, WT_ITEM_OFF, sizeof(WT_OFF)); + WT_ITEM_SET(&data_item, + WT_ITEM_OFF_RECORD, sizeof(WT_OFF_RECORD)); p = (uint8_t *)dup_data; memcpy(p, &data_item, sizeof(data_item)); - memcpy(p + sizeof(data_item), &off, sizeof(WT_OFF)); + memcpy(p + sizeof(data_item), + &off_record, sizeof(WT_OFF_RECORD)); __wt_set_ff_and_sa_from_offset(page, - (uint8_t *)p + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)), + (uint8_t *)p + + WT_ITEM_SPACE_REQ(sizeof(WT_OFF_RECORD)), &first_free, &space_avail); /* Reset local counters. 
*/ @@ -646,8 +649,9 @@ err: WT_TRET(__wt_bulk_stack_put(toc, &stack)); * then load the rest of the duplicate set. */ static int -__wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, - WT_ITEM *dup_data, uint32_t dup_len, uint32_t dup_count, WT_OFF *off, +__wt_bulk_dup_offpage( + WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, WT_ITEM *dup_data, + uint32_t dup_len, uint32_t dup_count, WT_OFF_RECORD *off_record, int (*cb)(DB *, DBT **, DBT **)) { DB *db; @@ -689,8 +693,8 @@ __wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, * On-page reference to the first duplicate data item in the set. * dup_count -- * Count of duplicates in the set. - * off -- - * Callers WT_OFF structure, which we have to fill in. + * off_record -- + * Callers WT_OFF_RECORD structure, which we have to fill in. * cb -- * User's callback function. */ @@ -789,9 +793,9 @@ __wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, WT_ERR(__wt_page_write(toc, page)); /* Fill in the caller's WT_OFF structure. */ - WT_RECORDS(off) = dup_count; - off->addr = root_addr; - off->size = db->intlmin; + WT_RECORDS(off_record) = dup_count; + off_record->addr = root_addr; + off_record->size = db->intlmin; err: WT_TRET(__wt_bulk_stack_put(toc, &stack)); if (tmp != NULL) @@ -813,6 +817,7 @@ __wt_bulk_promote(WT_TOC *toc, WT_PAGE *page, uint64_t incr, ENV *env; WT_ITEM *key_item, item; WT_OFF off; + WT_OFF_RECORD off_record; WT_OVFL tmp_ovfl; WT_PAGE *next, *parent; WT_PAGE_DISK *dsk; @@ -1069,20 +1074,21 @@ split: switch (dsk->type) { */ switch (parent->dsk->type) { case WT_PAGE_COL_INT: - if (elem->space_avail < sizeof(WT_OFF)) + if (elem->space_avail < sizeof(WT_OFF_RECORD)) goto split; - /* Create the WT_OFF reference. */ - WT_RECORDS(&off) = page->records; - off.addr = page->addr; - off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin; + /* Create the WT_OFF_RECORD reference. 
*/ + WT_RECORDS(&off_record) = page->records; + off_record.addr = page->addr; + off_record.size = + dsk->level == WT_LLEAF ? db->leafmin : db->intlmin; /* Store the data item. */ ++parent->dsk->u.entries; parent_data = elem->first_free; - memcpy(elem->first_free, &off, sizeof(off)); - elem->first_free += sizeof(WT_OFF); - elem->space_avail -= sizeof(WT_OFF); + memcpy(elem->first_free, &off_record, sizeof(off_record)); + elem->first_free += sizeof(WT_OFF_RECORD); + elem->space_avail -= sizeof(WT_OFF_RECORD); /* Track the last entry on the page for record count updates. */ stack->elem[level].data = parent_data; @@ -1103,7 +1109,6 @@ split: switch (dsk->type) { /* Create the WT_ITEM(WT_OFF) reference. */ WT_ITEM_SET(&item, WT_ITEM_OFF, sizeof(WT_OFF)); - WT_RECORDS(&off) = 0; off.addr = page->addr; off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin; @@ -1138,18 +1143,11 @@ split: switch (dsk->type) { * page, so proceed from there to the root. */ for (elem = - &stack->elem[level + 1]; elem->page != NULL; ++elem) { - switch (elem->page->dsk->type) { - case WT_PAGE_COL_INT: - WT_RECORDS((WT_OFF *)elem->data) += incr; - break; - case WT_PAGE_ROW_INT: - case WT_PAGE_DUP_INT: - break; - WT_ILLEGAL_FORMAT(db); + &stack->elem[level + 1]; elem->page != NULL; ++elem) + if (elem->page->dsk->type == WT_PAGE_COL_INT) { + elem->page->records += incr; + WT_RECORDS((WT_OFF_RECORD *)elem->data) += incr; } - elem->page->records += incr; - } } err: if (next_tmp != NULL) diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index dc61838b89a..d7cd101f44f 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -14,15 +14,14 @@ static void __wt_debug_dsk_col_fix(DB *, WT_PAGE_DISK *, FILE *); static void __wt_debug_dsk_col_int(WT_PAGE_DISK *, FILE *); static void __wt_debug_dsk_col_rle(DB *, WT_PAGE_DISK *, FILE *); static int __wt_debug_dsk_item(WT_TOC *, WT_PAGE_DISK *, FILE *); +static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *); +static int 
__wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp); static void __wt_debug_page_col_fix(WT_TOC *, WT_PAGE *, FILE *); static void __wt_debug_page_col_int(WT_PAGE *, FILE *); static void __wt_debug_page_col_rle(WT_TOC *, WT_PAGE *, FILE *); static int __wt_debug_page_col_var(WT_TOC *, WT_PAGE *, FILE *); static void __wt_debug_page_row_int(WT_PAGE *, FILE *); static int __wt_debug_page_row_leaf(WT_TOC *, WT_PAGE *, FILE *); -static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *); -static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp); -static void __wt_debug_off(WT_OFF *, const char *, FILE *); static void __wt_debug_pair(const char *, void *, uint32_t, FILE *); static void __wt_debug_repl(WT_REPL *, FILE *); static void __wt_debug_rleexp(WT_RLE_EXPAND *, FILE *); @@ -254,13 +253,18 @@ static void __wt_debug_page_col_int(WT_PAGE *page, FILE *fp) { WT_COL *cip; + WT_OFF_RECORD *off_record; uint32_t i; if (fp == NULL) /* Default to stderr */ fp = stderr; - WT_INDX_FOREACH(page, cip, i) - __wt_debug_off(cip->data, "\t", fp); + WT_INDX_FOREACH(page, cip, i) { + off_record = cip->data; + fprintf(fp, "\toffpage: addr %lu, size %lu, records %llu\n", + (u_long)off_record->addr, (u_long)off_record->size, + (unsigned long long)WT_RECORDS(off_record)); + } } /* @@ -357,6 +361,7 @@ __wt_debug_page_row_leaf(WT_TOC *toc, WT_PAGE *page, FILE *fp) static void __wt_debug_page_row_int(WT_PAGE *page, FILE *fp) { + WT_OFF *off; WT_ROW *rip; uint32_t i; @@ -368,8 +373,9 @@ __wt_debug_page_row_int(WT_PAGE *page, FILE *fp) fprintf(fp, "\tkey: {requires processing}\n"); else __wt_debug_dbt("\tkey", rip, fp); - - __wt_debug_off(rip->data, "\t", fp); + off = rip->data; + fprintf(fp, "\toffpage: addr %lu, size %lu\n", + (u_long)off->addr, (u_long)off->size); } } @@ -439,6 +445,8 @@ static int __wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp) { DB *db; + WT_OFF *off; + WT_OFF_RECORD *off_record; WT_OVFL *ovfl; if (fp == NULL) /* Default to stderr */ @@ -450,25 +458,31 @@ 
__wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp) __wt_item_type_string(item), (u_long)WT_ITEM_LEN(item)); switch (WT_ITEM_TYPE(item)) { - case WT_ITEM_KEY: - case WT_ITEM_KEY_DUP: case WT_ITEM_DATA: case WT_ITEM_DATA_DUP: + case WT_ITEM_DEL: + case WT_ITEM_KEY: + case WT_ITEM_KEY_DUP: break; - case WT_ITEM_KEY_OVFL: - case WT_ITEM_KEY_DUP_OVFL: - case WT_ITEM_DATA_OVFL: case WT_ITEM_DATA_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + case WT_ITEM_KEY_DUP_OVFL: + case WT_ITEM_KEY_OVFL: ovfl = WT_ITEM_BYTE_OVFL(item); fprintf(fp, ", addr %lu, size %lu", (u_long)ovfl->addr, (u_long)ovfl->size); break; - case WT_ITEM_DEL: - fprintf(fp, "\n"); - return (0); case WT_ITEM_OFF: - __wt_debug_off(WT_ITEM_BYTE_OFF(item), ", ", fp); - return (0); + off = WT_ITEM_BYTE_OFF(item); + fprintf(fp, ", offpage: addr %lu, size %lu\n", + (u_long)off->addr, (u_long)off->size); + break; + case WT_ITEM_OFF_RECORD: + off_record = WT_ITEM_BYTE_OFF_RECORD(item); + fprintf(fp, ", offpage: addr %lu, size %lu, records %llu\n", + (u_long)off_record->addr, (u_long)off_record->size, + (unsigned long long)WT_RECORDS(off_record)); + break; WT_ILLEGAL_FORMAT(db); } @@ -485,14 +499,16 @@ __wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp) static void __wt_debug_dsk_col_int(WT_PAGE_DISK *dsk, FILE *fp) { - WT_OFF *off; + WT_OFF_RECORD *off_record; uint32_t i; if (fp == NULL) /* Default to stderr */ fp = stderr; - WT_OFF_FOREACH(dsk, off, i) - __wt_debug_off(off, "\t", fp); + WT_OFF_FOREACH(dsk, off_record, i) + fprintf(fp, "\toffpage: addr %lu, size %lu, records %llu\n", + (u_long)off_record->addr, (u_long)off_record->size, + (unsigned long long)WT_RECORDS(off_record)); } /* @@ -593,7 +609,11 @@ process: WT_ERR(__wt_scr_alloc(toc, 0, &tmp)); break; case WT_ITEM_OFF: p = (uint8_t *)"offpage"; - size = 7; + size = sizeof("offpage") - 1; + break; + case WT_ITEM_OFF_RECORD: + p = (uint8_t *)"offpage_record"; + size = sizeof("offpage_record") - 1; break; WT_ILLEGAL_FORMAT_ERR(db, ret); } @@ -606,21 +626,6 @@ 
err: if (tmp != NULL) } /* - * __wt_debug_off -- - * Dump a WT_OFF structure. - */ -static void -__wt_debug_off(WT_OFF *off, const char *prefix, FILE *fp) -{ - if (fp == NULL) /* Default to stderr */ - fp = stderr; - - fprintf(fp, "%soffpage: addr %lu, size %lu, records %llu\n", - prefix, (u_long)off->addr, (u_long)off->size, - (unsigned long long)WT_RECORDS(off)); -} - -/* * __wt_debug_dbt -- * Dump a single DBT in debugging mode, with an optional tag. */ diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c index 4d46fceff27..503f06c24b6 100644 --- a/src/btree/bt_dump.c +++ b/src/btree/bt_dump.c @@ -322,7 +322,7 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) DB *db; DBT *key, *data, *key_tmp, *data_tmp, key_local, data_local; WT_ITEM *item; - WT_OFF *off; + WT_OFF_RECORD *off_record; WT_REF *ref; WT_REPL *repl; WT_ROW *rip; @@ -388,7 +388,7 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) WT_ERR(__wt_item_process(toc, item, data_tmp)); data = data_tmp; break; - case WT_ITEM_OFF: + case WT_ITEM_OFF_RECORD: /* * Set the key and recursively call the tree-walk code * for any off-page duplicate trees. 
(Check for any @@ -399,8 +399,8 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) dp->dupkey = key; ref = WT_ROW_DUP(page, rip); - off = WT_ROW_OFF(rip); - WT_RET(__wt_page_in(toc, page, ref, off, 0)); + off_record = WT_ROW_OFF_RECORD(rip); + WT_RET(__wt_page_in(toc, page, ref, off_record, 0)); ret = __wt_tree_walk(toc, ref, 0, __wt_dump_page, dp); __wt_hazard_clear(toc, ref->page); if (ret != 0) diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index f8267429d3d..cf271c89c9f 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -97,6 +97,8 @@ __wt_item_type_string(WT_ITEM *item) return ("key-overflow"); case WT_ITEM_OFF: return ("off-page"); + case WT_ITEM_OFF_RECORD: + return ("off-page-records"); default: break; } diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index ade0dcc187c..e66ea50dc80 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -25,7 +25,7 @@ static int __wt_page_inmem_row_leaf(DB *, WT_PAGE *); */ int __wt_page_in( - WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify) + WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, void *off, int dsk_verify) { ENV *env; WT_CACHE *cache; @@ -209,7 +209,8 @@ __wt_page_inmem_col_fix(DB *db, WT_PAGE *page) ++cip; } - page->indx_count = page->records = dsk->u.entries; + page->indx_count = dsk->u.entries; + page->records = page->indx_count; } /* @@ -220,7 +221,7 @@ static void __wt_page_inmem_col_int(WT_PAGE *page) { WT_COL *cip; - WT_OFF *off; + WT_OFF_RECORD *off_record; WT_PAGE_DISK *dsk; uint64_t records; uint32_t i; @@ -231,12 +232,12 @@ __wt_page_inmem_col_int(WT_PAGE *page) /* * Walk the page, building indices and finding the end of the page. - * The page contains WT_OFF structures. + * The page contains WT_OFF_RECORD structures. 
*/ - WT_OFF_FOREACH(dsk, off, i) { - cip->data = off; + WT_OFF_FOREACH(dsk, off_record, i) { + cip->data = off_record; ++cip; - records += WT_RECORDS(off); + records += WT_RECORDS(off_record); } page->indx_count = dsk->u.entries; @@ -302,7 +303,8 @@ __wt_page_inmem_col_var(WT_PAGE *page) ++cip; } - page->indx_count = page->records = dsk->u.entries; + page->indx_count = dsk->u.entries; + page->records = page->indx_count; } /* @@ -433,7 +435,7 @@ __wt_page_inmem_row_leaf(DB *db, WT_PAGE *page) * single on-page (WT_ITEM_DATA) or overflow (WT_ITEM_DATA_OVFL) item; * a group of duplicate data items where each duplicate is an on-page * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DUP_OVFL) item; or an offpage - * reference (WT_ITEM_OFF). + * reference (WT_ITEM_OFF_RECORDS). */ rip = NULL; indx_count = 0; @@ -469,7 +471,7 @@ __wt_page_inmem_row_leaf(DB *db, WT_PAGE *page) case WT_ITEM_DATA_OVFL: rip->data = item; break; - case WT_ITEM_OFF: + case WT_ITEM_OFF_RECORD: rip->data = item; /* diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index f7e594d2217..769a047cf15 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -69,10 +69,10 @@ __wt_cache_read_serial_func(WT_TOC *toc) { ENV *env; WT_CACHE *cache; - WT_OFF *off; WT_PAGE *parent; WT_READ_REQ *rr, *rr_end; WT_REF *ref; + void *off; int dsk_verify; __wt_cache_read_unpack(toc, parent, ref, off, dsk_verify); @@ -190,7 +190,6 @@ __wt_cache_read(WT_READ_REQ *rr) { ENV *env; WT_CACHE *cache; - WT_OFF *off; WT_PAGE *page; WT_PAGE_DISK *dsk; WT_REF *ref; @@ -200,9 +199,13 @@ __wt_cache_read(WT_READ_REQ *rr) toc = rr->toc; ref = rr->ref; - off = rr->off; - addr = off->addr; - size = off->size; + + /* + * We're passed a reference to a WT_OFF or a WT_OFF_RECORD structure; + * the initial addr/size pair fields are the same, get what we came for. 
+ */ + addr = ((WT_OFF *)rr->off)->addr; + size = ((WT_OFF *)rr->off)->size; env = toc->env; cache = env->ienv->cache; @@ -241,13 +244,13 @@ __wt_cache_read(WT_READ_REQ *rr) /* * Fill in the WT_PAGE addr, size. - * Reference the parent's WT_PAGE and parent's WT_OFF structures. + * Reference the parent's WT_PAGE and WT_OFF/WT_OFF_RECORD structures. * Reference the underlying disk page. */ page->addr = addr; page->size = size; page->parent = rr->parent; - page->parent_off = off; + page->parent_off = rr->off; page->dsk = dsk; /* Build the in-memory version of the page. */ diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c index 64b7044da9b..3987a621958 100644 --- a/src/btree/bt_reconcile.c +++ b/src/btree/bt_reconcile.c @@ -189,7 +189,7 @@ static int __wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) { WT_COL *cip; - WT_OFF *from; + WT_OFF_RECORD *from; WT_PAGE_DISK *dsk; uint32_t i, space_avail; uint8_t *first_free; @@ -207,16 +207,16 @@ __wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) * size, but it still wasn't enough. We must allocate another * page and split the parent. */ - if (sizeof(WT_OFF) > space_avail) { + if (sizeof(WT_OFF_RECORD) > space_avail) { fprintf(stderr, "__wt_rec_col_int: page %lu split\n", (u_long)page->addr); __wt_abort(toc->env); } - memcpy(first_free, from, sizeof(WT_OFF)); - first_free += sizeof(WT_OFF); - space_avail -= sizeof(WT_OFF); + memcpy(first_free, from, sizeof(WT_OFF_RECORD)); + first_free += sizeof(WT_OFF_RECORD); + space_avail -= sizeof(WT_OFF_RECORD); ++dsk->u.entries; } @@ -959,9 +959,12 @@ __wt_rec_page_write(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) static int __wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) { + ENV *env; IDB *idb; - WT_OFF *parent_off; + WT_OFF *off; + WT_OFF_RECORD *off_record; + env = toc->env; idb = toc->db->idb; /* @@ -976,15 +979,27 @@ __wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) } /* - * Update the relevant WT_OFF structure. 
There are two memory locations - * that change (address and size), and we could race, but that's not a - * problem. Only a single thread ever reconciles a page at a time, and - * pages cannot leave memory while they have children. + * Update the relevant WT_OFF/WT_OFF_RECORD structure. There are two + * memory locations that change (address and size), and we could race, + * but that's not a problem. Only a single thread ever reconciles a + * page at a time, and pages cannot leave memory if they have children. */ - parent_off = page->parent_off; - WT_RECORDS(parent_off) = new->records; - parent_off->addr = new->addr; - parent_off->size = new->size; + switch (page->dsk->type) { + case WT_PAGE_COL_INT: + off_record = page->parent_off; + off_record->addr = new->addr; + off_record->size = new->size; + WT_ASSERT(env, WT_RECORDS(off_record) == new->records); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + off = page->parent_off; + off->addr = new->addr; + off->size = new->size; + break; + default: + break; + } /* * Mark the parent page as dirty. diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 5beb931f578..ffe1f6f512f 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -251,7 +251,7 @@ static int __wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg) { DB *db; - WT_OFF *off; + WT_OFF_RECORD *off_record; WT_REF *ref; WT_REPL *repl; WT_ROW *rip; @@ -299,7 +299,7 @@ __wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg) WT_STAT_INCR(stats, ITEM_DATA_OVFL); WT_STAT_INCR(stats, ITEM_TOTAL_DATA); break; - case WT_ITEM_OFF: + case WT_ITEM_OFF_RECORD: /* * Recursively call the tree-walk code for any off-page * duplicate trees. (Check for any off-page duplicate @@ -308,8 +308,8 @@ __wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg) * and in the tree-walk function.) 
*/ ref = WT_ROW_REF(page, rip); - off = WT_ROW_OFF(rip); - WT_RET(__wt_page_in(toc, page, ref, off, 0)); + off_record = WT_ROW_OFF_RECORD(rip); + WT_RET(__wt_page_in(toc, page, ref, off_record, 0)); ret = __wt_tree_walk(toc, ref, 0, __wt_page_stat, arg); __wt_hazard_clear(toc, ref->page); if (ret != 0) diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 1c52586ec83..612502b5070 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -146,6 +146,7 @@ __wt_verify_tree( WT_COL *cip; WT_ITEM *item; WT_OFF *off; + WT_OFF_RECORD *off_record; WT_PAGE *page; WT_PAGE_DISK *dsk; WT_REPL *repl; @@ -219,11 +220,11 @@ __wt_verify_tree( * Check the starting record number and record counts. * * Confirm the number of records found on this page (by summing the - * WT_OFF structure record counts) matches the WT_OFF structure record - * count in our parent. Use the in-memory record count for internal - * pages -- we could sum the record counts as we walk the page below, - * but we did that when building the in-memory version of the page, - * there's no reason to do it again. + * WT_OFF_RECORD structure record counts) matches the WT_OFF_RECORD + * structure record count in our parent. Use the in-memory record + * count for internal pages -- we could sum the record counts as we + * walk the page below, but we did that when building the in-memory + * version of the page, there's no reason to do it again. 
*/ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -322,9 +323,9 @@ __wt_verify_tree( WT_INDX_FOREACH(page, cip, i) { /* cip references the subtree containing the record */ ref = WT_COL_REF(page, cip); - off = WT_COL_OFF(cip); + off_record = WT_COL_OFF(cip); records = WT_COL_OFF_RECORDS(cip); - WT_ERR(__wt_page_in(toc, page, ref, off, 1)); + WT_ERR(__wt_page_in(toc, page, ref, off_record, 1)); ret = __wt_verify_tree(toc, NULL, records, start_recno, level - 1, ref, vs); __wt_hazard_clear(toc, ref->page); @@ -408,22 +409,22 @@ __wt_verify_tree( page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl)) continue; item = rip->data; - if (WT_ITEM_TYPE(item) != WT_ITEM_OFF) + if (WT_ITEM_TYPE(item) != WT_ITEM_OFF_RECORD) continue; /* Verify the off-page duplicate tree. */ vs->duptree = 0; ref = WT_ROW_DUP(page, rip); - off = WT_ROW_OFF(rip); - WT_ERR(__wt_page_in(toc, page, ref, off, 1)); + off_record = WT_ROW_OFF_RECORD(rip); + WT_ERR(__wt_page_in(toc, page, ref, off_record, 1)); ret = __wt_verify_tree(toc, NULL, (uint64_t)0, (uint64_t)0, WT_NOLEVEL, ref, vs); __wt_hazard_clear(toc, ref->page); if (ret != 0) goto err; - if (vs->duptree != WT_RECORDS(off)) { + if (vs->duptree != WT_RECORDS(off_record)) { __wt_api_db_errx(db, "off-page duplicate tree referenced from " "item %lu of page %lu has a record count " @@ -431,7 +432,7 @@ __wt_verify_tree( "expected", (u_long)item_num, (u_long)page->addr, (unsigned long long)vs->duptree, - (unsigned long long)WT_RECORDS(off)); + (unsigned long long)WT_RECORDS(off_record)); goto err; } } @@ -735,6 +736,7 @@ __wt_verify_dsk_item( WT_ITEM *item; WT_OVFL *ovfl; WT_OFF *off; + WT_OFF_RECORD *off_record; off_t file_size; uint8_t *end; uint32_t i, item_num, item_len, item_type; @@ -789,7 +791,11 @@ __wt_verify_dsk_item( case WT_ITEM_OFF: if (dsk->type != WT_PAGE_DUP_INT && dsk->type != WT_PAGE_ROW_INT && - dsk->type != WT_PAGE_ROW_LEAF) { + dsk->type != WT_PAGE_ROW_LEAF) + goto item_vs_page; + break; + case WT_ITEM_OFF_RECORD: + if (dsk->type != 
WT_PAGE_ROW_LEAF) { item_vs_page: __wt_api_db_errx(db, "illegal item and page type combination " "(item %lu on page at addr %lu is a %s " @@ -817,7 +823,7 @@ item_vs_page: __wt_api_db_errx(db, goto skip_order_check; /* - * For row-stores leaf pages, check for: + * For row-store leaf pages, check for: * two keys in a row, * two non-dup data items in a row, * inter-mixed dup and non-dup data items, @@ -842,12 +848,12 @@ item_vs_page: __wt_api_db_errx(db, case WT_ITEM_DATA_DUP_OVFL: case WT_ITEM_DATA_OVFL: case WT_ITEM_DEL: - case WT_ITEM_OFF: + case WT_ITEM_OFF_RECORD: switch (item_type) { case WT_ITEM_DATA: case WT_ITEM_DATA_OVFL: case WT_ITEM_DEL: - case WT_ITEM_OFF: + case WT_ITEM_OFF_RECORD: switch (last_item_type) { case IS_FIRST: goto first_data; @@ -916,7 +922,11 @@ skip_order_check: goto item_len; break; case WT_ITEM_OFF: - if (item_len != sizeof(WT_OFF)) { + if (item_len != sizeof(WT_OFF)) + goto item_len; + break; + case WT_ITEM_OFF_RECORD: + if (item_len != sizeof(WT_OFF_RECORD)) { item_len: __wt_api_db_errx(db, "item %lu on page at addr %lu has an " "incorrect length", @@ -945,8 +955,14 @@ item_len: __wt_api_db_errx(db, break; case WT_ITEM_OFF: off = WT_ITEM_BYTE_OFF(item); - if (WT_ADDR_TO_OFF(db, off->addr) + - off->size > file_size) + if (WT_ADDR_TO_OFF(db, + off->addr) + off->size > file_size) + goto eof; + break; + case WT_ITEM_OFF_RECORD: + off_record = WT_ITEM_BYTE_OFF_RECORD(item); + if (WT_ADDR_TO_OFF(db, + off_record->addr) + off_record->size > file_size) goto eof; break; default: @@ -967,7 +983,7 @@ static int __wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) { IDB *idb; - WT_OFF *off; + WT_OFF_RECORD *off_record; uint8_t *end; uint32_t i, entry_num; @@ -975,16 +991,16 @@ __wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) end = (uint8_t *)dsk + size; entry_num = 0; - WT_OFF_FOREACH(dsk, off, i) { + WT_OFF_FOREACH(dsk, off_record, i) { ++entry_num; /* Check if this entry is entirely on 
the page. */ - if ((uint8_t *)off + sizeof(WT_OFF) > end) + if ((uint8_t *)off_record + sizeof(WT_OFF_RECORD) > end) return (__wt_verify_eop(db, entry_num, addr)); /* Check if the reference is past the end-of-file. */ - if (WT_ADDR_TO_OFF( - db, off->addr) + off->size > idb->fh->file_size) + if (WT_ADDR_TO_OFF(db, + off_record->addr) + off_record->size > idb->fh->file_size) return (__wt_verify_eof(db, entry_num, addr)); } diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index f5ef9674f9b..3228fe53600 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -45,6 +45,7 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref, IDB *idb; WT_COL *cip; WT_OFF *off; + WT_OFF_RECORD *off_record; WT_PAGE *page; WT_ROW *rip; uint32_t i; @@ -82,8 +83,8 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref, if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK) continue; - off = WT_COL_OFF(cip); - WT_RET(__wt_page_in(toc, page, ref, off, 0)); + off_record = WT_COL_OFF(cip); + WT_RET(__wt_page_in(toc, page, ref, off_record, 0)); ret = __wt_tree_walk(toc, ref, flags, work, arg); __wt_hazard_clear(toc, ref->page); if (ret != 0) @@ -110,7 +111,7 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref, if (!LF_ISSET(WT_WALK_OFFDUP)) break; WT_INDX_FOREACH(page, rip, i) { - if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF) + if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF_RECORD) break; /* @@ -121,8 +122,8 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref, if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK) continue; - off = WT_ROW_OFF(rip); - WT_RET(__wt_page_in(toc, page, ref, off, 0)); + off_record = WT_ROW_OFF_RECORD(rip); + WT_RET(__wt_page_in(toc, page, ref, off_record, 0)); ret = __wt_tree_walk(toc, ref, flags, work, arg); __wt_hazard_clear(toc, ref->page); if (ret != 0) diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 81c24e3d54f..0549e7c6e04 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -19,7 +19,7 @@ __wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags) DB *db; IDB 
*idb; WT_COL *cip; - WT_OFF *off; + WT_OFF_RECORD *off_record; WT_PAGE *page; WT_PAGE_DISK *dsk; WT_RLE_EXPAND *exp; @@ -90,8 +90,8 @@ __wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags) /* cip references the subtree containing the record. */ ref = WT_COL_REF(page, cip); - off = WT_COL_OFF(cip); - WT_ERR(__wt_page_in(toc, page, ref, off, 0)); + off_record = WT_COL_OFF(cip); + WT_ERR(__wt_page_in(toc, page, ref, off_record, 0)); /* Swap the parent page for the child page. */ if (page != idb->root_page.page) diff --git a/src/include/btree.h b/src/include/btree.h index edcf235b7f8..722c969af90 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -136,8 +136,13 @@ struct __wt_page { /* Record count is only maintained for column-store files. */ uint64_t records; /* Records in this subtree */ + /* + * Two links to the parent's WT_PAGE structure -- the physical parent + * page, and the WT_OFF or WT_OFF_RECORD structure used to find this + * page. + */ WT_PAGE *parent; /* Page's parent */ - WT_OFF *parent_off; /* Page's parent reference */ + void *parent_off; /* Page's parent reference */ WT_PAGE_DISK *dsk; /* Page's on-disk representation */ @@ -157,7 +162,7 @@ struct __wt_page { * value. If we ever add a flags field to this structure, the pinned * flag could move there. */ -#define WT_PAGE_SET_PIN(p) (p)->read_gen = UINT64_MAX +#define WT_PAGE_SET_PIN(p) ((p)->read_gen = UINT64_MAX) #define WT_PAGE_IS_PINNED(p) ((p)->read_gen == UINT64_MAX) uint64_t read_gen; @@ -311,10 +316,8 @@ struct __wt_page { * individually allocated structures. The WT_{COL,ROW}_REF macros return * the appropriate entry based on a WT_{COL,ROW} reference. 
*/ -#define WT_COL_REF(page, ip) \ - (&((page)->u3.ref[WT_COL_SLOT(page, ip)])) -#define WT_ROW_REF(page, ip) \ - (&((page)->u3.ref[WT_ROW_SLOT(page, ip)])) +#define WT_COL_REF(page, ip) (&((page)->u3.ref[WT_COL_SLOT(page, ip)])) +#define WT_ROW_REF(page, ip) (&((page)->u3.ref[WT_ROW_SLOT(page, ip)])) /* * The other arrays may not exist, and are arrays of pointers to individually @@ -613,19 +616,24 @@ struct __wt_rle_expand { (dupp) = (page)->u3.dup; (i) > 0; ++(dupp), --(i)) /* - * On both row- and column-store internal pages, the on-page data referenced - * by the WT_ROW/WT_COL data field is a WT_OFF structure, which contains a - * record count and a page addr/size pair. Macros to reach into the on-page - * structure and return the values. + * On row-store internal pages, the on-page data referenced by the WT_ROW field + * is a WT_OFF structure, which contains a page addr/size pair. + */ +#define WT_ROW_OFF(ip) \ + ((WT_OFF *)WT_ITEM_BYTE(((WT_ROW *)ip)->data)) +#define WT_ROW_OFF_RECORD(ip) \ + ((WT_OFF_RECORD *)WT_ITEM_BYTE(((WT_ROW *)ip)->data)) + +/* + * On column-store internal pages, the on-page data referenced by the WT_COL + * field is a WT_OFF_RECORD structure which contains a page addr/size pair + * and a total record count. */ #define WT_COL_OFF(ip) \ - ((WT_OFF *)(((WT_COL *)ip)->data)) + ((WT_OFF_RECORD *)(((WT_COL *)ip)->data)) #define WT_COL_OFF_RECORDS(ip) \ WT_RECORDS(WT_COL_OFF(ip)) -#define WT_ROW_OFF(ip) \ - ((WT_OFF *)WT_ITEM_BYTE(((WT_ROW *)ip)->data)) - /* * WT_ITEM -- * Trailing data length (in bytes) plus item type. @@ -667,14 +675,18 @@ struct __wt_item { * items, each of which has an overflow form. Items are followed by additional * data, which varies by type: a key, duplicate key, data or duplicate item is * followed by a set of bytes; a WT_OVFL structure follows an overflow form. 
- * There are two additional types: First, a deleted type (a place-holder for - * deleted items where the item cannot be removed, for example, an column store - * item that must remain to preserve the record count). Second, a subtree - * reference for keys that reference subtrees of information (for example, an - * internal Btree page has a key and a reference to the tree that contains all - * key/data pairs greater than the internal page's key, or, a leaf Btree page - * where a key references all of the duplicate data items for the key when the - * duplicate data items can no longer fit onto the Btree leaf page). + * There are 2 additional types: (1) a deleted type (a place-holder for deleted + * items where the item cannot be removed, for example, a column-store item + * that must remain to preserve the record count); (2a) a subtree reference for + * keys that reference subtrees without an associated record count (a row-store + * internal page has key/reference pairs for the tree containing all key/data + * pairs greater than the key); (2b) a subtree reference for keys that reference + * subtrees with an associated record count (a column-store internal page has + * a reference for the tree containing all records greater than the specified + * record, or leaf Btree pages where a key references a set of duplicate data + * items for the key when the duplicate data items no longer fit onto the leaf + * page itself -- offpage duplicate data sets are counted, which is why Btree + * leaf pages fall under 2b, and not 2a). 
* * Here's the usage by page type: * @@ -687,10 +699,11 @@ struct __wt_item { * WT_ITEM_KEY_OVFL item followed by a WT_ITEM_DATA or WT_ITEM_DATA_OVFL * item); * -- Variable-length key and set of duplicates moved into a separate tree - * (a WT_ITEM_KEY or WT_ITEM_KEY_OVFL item followed by a WT_ITEM_OFF item); + * (a WT_ITEM_KEY or WT_ITEM_KEY_OVFL item followed by a WT_ITEM_OFF_RECORD + * item); * -- Variable-length key and set of duplicates not yet moved into a separate - * tree (a WT_ITEM_KEY/KEY_OVFL item followed by two or more - * WT_ITEM_DATA_DUP or WT_ITEM_DATA_DUP_OVFL items). + * tree (a WT_ITEM_KEY/KEY_OVFL item followed by two or more WT_ITEM_DATA_DUP + * or WT_ITEM_DATA_DUP_OVFL items). * * WT_PAGE_DUP_INT (row-store offpage duplicates internal pages): * -- Variable-length duplicate key and offpage-reference pairs (a @@ -710,16 +723,26 @@ struct __wt_item { * These pages contain fixed-sized structures (WT_PAGE_COL_{INT,FIX,RLE}), * or a string of bytes (WT_PAGE_OVFL), not WT_ITEM structures. * - * There are currently 10 item types, requiring 4 bits, with 6 values unused. + * There are currently 11 item types, using 4 bits, with 5 values unused. If + * we run out of bits, we could compress the item types in a couple of ways: + * + * We could merge the WT_ITEM_KEY and WT_ITEM_KEY_DUP types, but that requires + * we know the page's type in order to know how an item might be encoded (that + * is, if it's an off-page duplicate key, it's encoded using the Huffman data + * coder, or if it's a Btree row store key, it's encoded using the Huffman key + * encoder). * - * We could compress the item types in a couple of ways. We could merge the - * WT_ITEM_KEY and WT_ITEM_KEY_DUP types, but that would require we know the - * underlying page type in order to know how an item might be encoded (that - * is, if it's an off-page duplicate key, encoded using the Huffman data coder, - * or a Btree row store key, encoded using the Huffman key encoder). 
We could - * also use a bit to mean overflow, merging all overflow types into a single - * bit plus the ""primary" item type, but that would require more bit shuffling + * We could use a single bit to mean overflow, merging all overflow types into + * that bit plus the "primary" item type, but that requires more bit shuffling * than the current scheme. + * + * We could combine WT_ITEM_OFF and WT_ITEM_OFF_RECORD types, again, by using + * the underlying page type to know what kind of off-page reference it is (if + * it's a row-store leaf or column-store internal, it's a WT_ITEM_OFF_RECORD, + * if it's a row-store internal, it's a WT_ITEM_OFF). + * + * All of these changes require some amount of compatibility work because they + * involve on-page format information. */ #define WT_ITEM_KEY 0x00000000 /* Key */ #define WT_ITEM_KEY_OVFL 0x01000000 /* Key: overflow */ @@ -731,6 +754,7 @@ struct __wt_item { #define WT_ITEM_DATA_DUP_OVFL 0x07000000 /* Data: duplicate overflow */ #define WT_ITEM_DEL 0x08000000 /* Deleted */ #define WT_ITEM_OFF 0x09000000 /* Off-page reference */ +#define WT_ITEM_OFF_RECORD 0x0a000000 /* Off-page reference with records */ #define WT_ITEM_TYPE(addr) \ (((WT_ITEM *)(addr))->__item_chunk & 0x0f000000) @@ -749,12 +773,13 @@ struct __wt_item { /* * On row-store pages, the on-page data referenced by the WT_ROW data field - * may be a WT_OVFL (which contains the address for the start of the overflow - * pages and its length), or a WT_OFF structure. These macros do the cast - * to the right type. + * may be WT_OFF, WT_OFF_RECORD or WT_OVFL structures. These macros do the + * cast to the right type. */ #define WT_ITEM_BYTE_OFF(addr) \ ((WT_OFF *)(WT_ITEM_BYTE(addr))) +#define WT_ITEM_BYTE_OFF_RECORD(addr) \ + ((WT_OFF_RECORD *)(WT_ITEM_BYTE(addr))) #define WT_ITEM_BYTE_OVFL(addr) \ ((WT_OVFL *)(WT_ITEM_BYTE(addr))) @@ -778,32 +803,57 @@ struct __wt_item { /* * WT_OFF -- - * Btree internal items and offpage duplicates reference another tree. 
+ * Row-store internal pages reference subtrees with no record count. + * + * WT_OFF_RECORD -- + * Column-store internal pages, and row-store leaf pages with offpage + * duplicate references, reference subtrees, including total record counts + * for the subtree. + * + * !!! + * Note the initial two fields of the WT_OFF and WT_OFF_RECORD structures are the + * same -- this is deliberate, and we use it to pass references to places that + * only care about the addr/size information. */ struct __wt_off { + uint32_t addr; /* Subtree root page address */ + uint32_t size; /* Subtree root page length */ +}; /* - * Solaris and the gcc compiler on Linux pad the WT_OFF structure because of the - * 64-bit records field. This is an on-disk structure, which means we have to - * have a fixed size, without padding, so we declare it as two 32-bit fields and - * cast it. We haven't yet found a compiler that aligns the 32-bit fields such - * that a cast won't work; if we find one, we'll have to go to bit masks, or to - * reading/write the bytes to/from a local variable. + * WT_OFF_SIZE is the expected structure size -- we verify the build to + * ensure the compiler hasn't inserted padding (which would break the world). */ -#define WT_RECORDS(offp) (*(uint64_t *)(&(offp)->__record_chunk[0])) - uint32_t __record_chunk[2]; /* Subtree record count */ +#define WT_OFF_SIZE 8 +/* + * + * Compilers pad the WT_OFF_RECORD structure because of the 64-bit record count + * field. This is an on-disk structure, which means we require a fixed size, + * so we declare it as two 32-bit fields and cast it. We haven't yet found a + * compiler that aligns the 32-bit fields such that a cast won't work; if we + * find one, we'll have to go to bit masks, or to copying bytes to/from a local + * variable. 
+ */ +struct __wt_off_record { uint32_t addr; /* Subtree root page address */ uint32_t size; /* Subtree root page length */ + +#define WT_RECORDS(offp) (*(uint64_t *)(&(offp)->__record_chunk[0])) + uint32_t __record_chunk[2]; /* Subtree record count */ }; /* - * WT_OFF_SIZE is the expected structure size -- we verify the build to + * WT_OFF_RECORD_SIZE is the expected structure size -- we verify the build to * ensure the compiler hasn't inserted padding (which would break the world). */ -#define WT_OFF_SIZE 16 +#define WT_OFF_RECORD_SIZE 16 -/* WT_OFF_FOREACH is a loop that walks offpage references on a page */ +/* + * WT_OFF_FOREACH -- + * Walks WT_OFF/WT_OFF_RECORD references on a page, incrementing a pointer + * based on its declared type. + */ #define WT_OFF_FOREACH(dsk, offp, i) \ - for ((offp) = (WT_OFF *)WT_PAGE_DISK_BYTE(dsk), \ - (i) = dsk->u.entries; (i) > 0; ++(offp), --(i)) + for ((offp) = WT_PAGE_DISK_BYTE(dsk), \ + (i) = (dsk)->u.entries; (i) > 0; ++(offp), --(i)) /* * Btree overflow items reference another page, and so the data is another diff --git a/src/include/extern.h b/src/include/extern.h index c7264163a34..5814735d646 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -87,7 +87,7 @@ int __wt_ovfl_in(WT_TOC *toc, WT_OVFL *ovfl, DBT *store); int __wt_page_in( - WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify); + WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, void *off, int dsk_verify); int __wt_page_inmem(WT_TOC *toc, WT_PAGE *page); int diff --git a/src/include/serial.h b/src/include/serial.h index af6c87e83c1..19f201e49d4 100644 --- a/src/include/serial.h +++ b/src/include/serial.h @@ -3,7 +3,7 @@ typedef struct { WT_PAGE * parent; WT_REF * ref; - WT_OFF * off; + void * off; int dsk_verify; } __wt_cache_read_args; #define __wt_cache_read_serial(\ diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 48d9fba760a..228636b2249 100644 --- a/src/include/verify_build.h +++ 
b/src/include/verify_build.h @@ -44,6 +44,7 @@ __wt_verify_build(void) STATIC_ASSERT(sizeof(WT_COL) == WT_COL_SIZE); STATIC_ASSERT(sizeof(WT_ITEM) == WT_ITEM_SIZE); STATIC_ASSERT(sizeof(WT_OFF) == WT_OFF_SIZE); + STATIC_ASSERT(sizeof(WT_OFF_RECORD) == WT_OFF_RECORD_SIZE); STATIC_ASSERT(sizeof(WT_OVFL) == WT_OVFL_SIZE); STATIC_ASSERT(sizeof(WT_PAGE) == WT_PAGE_SIZE); STATIC_ASSERT(sizeof(WT_PAGE_DESC) == WT_PAGE_DESC_SIZE); diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in index d05d77b7443..1263be0c9c3 100644 --- a/src/include/wt_internal.in +++ b/src/include/wt_internal.in @@ -30,6 +30,7 @@ struct __wt_item; typedef struct __wt_item WT_ITEM; struct __wt_lsn; typedef struct __wt_lsn WT_LSN; struct __wt_mtx; typedef struct __wt_mtx WT_MTX; struct __wt_off; typedef struct __wt_off WT_OFF; +struct __wt_off_record; typedef struct __wt_off_record WT_OFF_RECORD; struct __wt_ovfl; typedef struct __wt_ovfl WT_OVFL; struct __wt_page; typedef struct __wt_page WT_PAGE; struct __wt_page_desc; typedef struct __wt_page_desc WT_PAGE_DESC; @@ -130,8 +131,8 @@ struct __idb { uint32_t file_id; /* In-memory file ID */ WT_FH *fh; /* Backing file handle */ - WT_REF root_page; /* Root page reference */ - WT_OFF root_off; /* Root page location */ + WT_REF root_page; /* Root page reference */ + WT_OFF_RECORD root_off; /* Root page location */ WT_WALK evict_walk; /* Eviction thread's walk state */ |