summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@wiredtiger.com>2011-02-07 12:47:13 -0500
committerKeith Bostic <keith.bostic@wiredtiger.com>2011-02-07 12:47:13 -0500
commit841f45dd46cc2f3abad12688c60eb4bf192558b3 (patch)
treebc24486f67ac8e2758c2f0cf816aee8433829165
parent88a359cb1fa3f276613da0903c60185f04c1f0bc (diff)
downloadmongo-841f45dd46cc2f3abad12688c60eb4bf192558b3.tar.gz
Split WT_OFF into WT_OFF and WT_OFF_RECORD: the former is used when we don't
need record counts associated with the subtree (row-store internal pages), and the latter is used when we do need record counts associated with the subtree (column-store internal pages and row-store leaf pages referencing off-page duplicate trees).
-rw-r--r--dist/serial.py2
-rw-r--r--src/btree/bt_bulk.c88
-rw-r--r--src/btree/bt_debug.c77
-rw-r--r--src/btree/bt_dump.c8
-rw-r--r--src/btree/bt_misc.c2
-rw-r--r--src/btree/bt_page.c22
-rw-r--r--src/btree/bt_read.c17
-rw-r--r--src/btree/bt_reconcile.c43
-rw-r--r--src/btree/bt_stat.c8
-rw-r--r--src/btree/bt_vrfy.c64
-rw-r--r--src/btree/bt_walk.c11
-rw-r--r--src/btree/col_srch.c6
-rw-r--r--src/include/btree.h150
-rw-r--r--src/include/extern.h2
-rw-r--r--src/include/serial.h2
-rw-r--r--src/include/verify_build.h1
-rw-r--r--src/include/wt_internal.in5
17 files changed, 301 insertions, 207 deletions
diff --git a/dist/serial.py b/dist/serial.py
index a34b26c53d4..5460524a038 100644
--- a/dist/serial.py
+++ b/dist/serial.py
@@ -44,7 +44,7 @@ serial['cache_read'] = Serial(
'WT_WORKQ_READ', '0',
['WT_PAGE */parent',
'WT_REF */ref',
- 'WT_OFF */off',
+ 'void */off',
'int/dsk_verify'])
# func_serial --
diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c
index 2f791b75655..ec71111b176 100644
--- a/src/btree/bt_bulk.c
+++ b/src/btree/bt_bulk.c
@@ -29,7 +29,8 @@ typedef struct {
static int __wt_bulk_dbt_copy(ENV *, DBT *, DBT *);
static int __wt_bulk_dup_offpage(WT_TOC *, DBT **, DBT **, DBT *, WT_ITEM *,
- uint32_t, uint32_t, WT_OFF *, int (*)(DB *, DBT **, DBT **));
+ uint32_t, uint32_t, WT_OFF_RECORD *,
+ int (*)(DB *, DBT **, DBT **));
static int __wt_bulk_fix(WT_TOC *, void (*)(const char *,
uint64_t), int (*)(DB *, DBT **, DBT **));
static int __wt_bulk_ovfl_copy(WT_TOC *, WT_OVFL *, WT_OVFL *);
@@ -247,14 +248,14 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags,
ENV *env;
IDB *idb;
WT_ITEM key_item, data_item, *dup_key, *dup_data;
- WT_OFF off;
+ WT_OFF_RECORD off_record;
WT_OVFL key_ovfl, data_ovfl;
WT_PAGE *page, *next;
WT_STACK stack;
uint64_t insert_cnt;
uint32_t dup_count, dup_space, len, next_space_avail, space_avail;
- uint8_t *first_free, *next_first_free, *p, type;
- int ret;
+ uint8_t *first_free, *next_first_free, *p, page_type;
+ int is_column, ret;
db = toc->db;
tmp1 = tmp2 = NULL;
@@ -265,7 +266,7 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags,
WT_CLEAR(stack);
dup_space = dup_count = 0;
insert_cnt = 0;
- type = F_ISSET(idb, WT_COLUMN) ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF;
+ is_column = F_ISSET(idb, WT_COLUMN) ? 1 : 0;
lastkey = &lastkey_std;
WT_CLEAR(data_copy);
@@ -275,11 +276,12 @@ __wt_bulk_var(WT_TOC *toc, uint32_t flags,
WT_ERR(__wt_scr_alloc(toc, 0, &lastkey_copy));
/* Get a scratch buffer and make it look like our work page. */
+ page_type = is_column ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF;
WT_ERR(__wt_bulk_scratch_page(
- toc, db->leafmin, type, WT_LLEAF, &page, &tmp1));
+ toc, db->leafmin, page_type, WT_LLEAF, &page, &tmp1));
__wt_set_ff_and_sa_from_offset(
page, WT_PAGE_BYTE(page), &first_free, &space_avail);
- if (type == WT_PAGE_COL_VAR)
+ if (is_column)
page->dsk->start_recno = 1;
while ((ret = cb(db, &key, &data)) == 0) {
@@ -409,11 +411,11 @@ skip_read: /*
if ((key == NULL ? 0 : WT_ITEM_SPACE_REQ(key->size)) +
WT_ITEM_SPACE_REQ(data->size) > space_avail) {
WT_ERR(__wt_bulk_scratch_page(toc,
- db->leafmin, type, WT_LLEAF, &next, &tmp2));
+ db->leafmin, page_type, WT_LLEAF, &next, &tmp2));
__wt_set_ff_and_sa_from_offset(next,
WT_PAGE_BYTE(next),
&next_first_free, &next_space_avail);
- if (type == WT_PAGE_COL_VAR)
+ if (is_column)
next->dsk->start_recno = insert_cnt;
/*
@@ -586,24 +588,25 @@ skip_read: /*
WT_ERR(__wt_bulk_dup_offpage(toc, &key, &data, lastkey,
dup_data,
(uint32_t)(first_free - (uint8_t *)dup_data),
- dup_count, &off, cb));
+ dup_count, &off_record, cb));
/* Reset the page entry and record counts. */
page->dsk->u.entries -= (dup_count - 1);
- page->records -= dup_count;
- page->records += WT_RECORDS(&off);
/*
- * Replace the duplicate set with a WT_OFF structure,
- * that is, we've replaced dup_count entries with a
- * single entry.
+ * Replace the set of duplicates with a WT_OFF_RECORD
+ * structure, that is, we've replaced dup_count entries
+ * with a single offpage reference.
*/
- WT_ITEM_SET(&data_item, WT_ITEM_OFF, sizeof(WT_OFF));
+ WT_ITEM_SET(&data_item,
+ WT_ITEM_OFF_RECORD, sizeof(WT_OFF_RECORD));
p = (uint8_t *)dup_data;
memcpy(p, &data_item, sizeof(data_item));
- memcpy(p + sizeof(data_item), &off, sizeof(WT_OFF));
+ memcpy(p + sizeof(data_item),
+ &off_record, sizeof(WT_OFF_RECORD));
__wt_set_ff_and_sa_from_offset(page,
- (uint8_t *)p + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)),
+ (uint8_t *)p +
+ WT_ITEM_SPACE_REQ(sizeof(WT_OFF_RECORD)),
&first_free, &space_avail);
/* Reset local counters. */
@@ -646,8 +649,9 @@ err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
* then load the rest of the duplicate set.
*/
static int
-__wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey,
- WT_ITEM *dup_data, uint32_t dup_len, uint32_t dup_count, WT_OFF *off,
+__wt_bulk_dup_offpage(
+ WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, WT_ITEM *dup_data,
+ uint32_t dup_len, uint32_t dup_count, WT_OFF_RECORD *off_record,
int (*cb)(DB *, DBT **, DBT **))
{
DB *db;
@@ -689,8 +693,8 @@ __wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey,
* On-page reference to the first duplicate data item in the set.
* dup_count --
* Count of duplicates in the set.
- * off --
- * Callers WT_OFF structure, which we have to fill in.
+ * off_record --
+ * Caller's WT_OFF_RECORD structure, which we have to fill in.
* cb --
* User's callback function.
*/
@@ -789,9 +793,9 @@ __wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey,
WT_ERR(__wt_page_write(toc, page));
/* Fill in the caller's WT_OFF structure. */
- WT_RECORDS(off) = dup_count;
- off->addr = root_addr;
- off->size = db->intlmin;
+ WT_RECORDS(off_record) = dup_count;
+ off_record->addr = root_addr;
+ off_record->size = db->intlmin;
err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
if (tmp != NULL)
@@ -813,6 +817,7 @@ __wt_bulk_promote(WT_TOC *toc, WT_PAGE *page, uint64_t incr,
ENV *env;
WT_ITEM *key_item, item;
WT_OFF off;
+ WT_OFF_RECORD off_record;
WT_OVFL tmp_ovfl;
WT_PAGE *next, *parent;
WT_PAGE_DISK *dsk;
@@ -1069,20 +1074,21 @@ split: switch (dsk->type) {
*/
switch (parent->dsk->type) {
case WT_PAGE_COL_INT:
- if (elem->space_avail < sizeof(WT_OFF))
+ if (elem->space_avail < sizeof(WT_OFF_RECORD))
goto split;
- /* Create the WT_OFF reference. */
- WT_RECORDS(&off) = page->records;
- off.addr = page->addr;
- off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
+ /* Create the WT_OFF_RECORD reference. */
+ WT_RECORDS(&off_record) = page->records;
+ off_record.addr = page->addr;
+ off_record.size =
+ dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
/* Store the data item. */
++parent->dsk->u.entries;
parent_data = elem->first_free;
- memcpy(elem->first_free, &off, sizeof(off));
- elem->first_free += sizeof(WT_OFF);
- elem->space_avail -= sizeof(WT_OFF);
+ memcpy(elem->first_free, &off_record, sizeof(off_record));
+ elem->first_free += sizeof(WT_OFF_RECORD);
+ elem->space_avail -= sizeof(WT_OFF_RECORD);
/* Track the last entry on the page for record count updates. */
stack->elem[level].data = parent_data;
@@ -1103,7 +1109,6 @@ split: switch (dsk->type) {
/* Create the WT_ITEM(WT_OFF) reference. */
WT_ITEM_SET(&item, WT_ITEM_OFF, sizeof(WT_OFF));
- WT_RECORDS(&off) = 0;
off.addr = page->addr;
off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
@@ -1138,18 +1143,11 @@ split: switch (dsk->type) {
* page, so proceed from there to the root.
*/
for (elem =
- &stack->elem[level + 1]; elem->page != NULL; ++elem) {
- switch (elem->page->dsk->type) {
- case WT_PAGE_COL_INT:
- WT_RECORDS((WT_OFF *)elem->data) += incr;
- break;
- case WT_PAGE_ROW_INT:
- case WT_PAGE_DUP_INT:
- break;
- WT_ILLEGAL_FORMAT(db);
+ &stack->elem[level + 1]; elem->page != NULL; ++elem)
+ if (elem->page->dsk->type == WT_PAGE_COL_INT) {
+ elem->page->records += incr;
+ WT_RECORDS((WT_OFF_RECORD *)elem->data) += incr;
}
- elem->page->records += incr;
- }
}
err: if (next_tmp != NULL)
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index dc61838b89a..d7cd101f44f 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -14,15 +14,14 @@ static void __wt_debug_dsk_col_fix(DB *, WT_PAGE_DISK *, FILE *);
static void __wt_debug_dsk_col_int(WT_PAGE_DISK *, FILE *);
static void __wt_debug_dsk_col_rle(DB *, WT_PAGE_DISK *, FILE *);
static int __wt_debug_dsk_item(WT_TOC *, WT_PAGE_DISK *, FILE *);
+static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *);
+static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp);
static void __wt_debug_page_col_fix(WT_TOC *, WT_PAGE *, FILE *);
static void __wt_debug_page_col_int(WT_PAGE *, FILE *);
static void __wt_debug_page_col_rle(WT_TOC *, WT_PAGE *, FILE *);
static int __wt_debug_page_col_var(WT_TOC *, WT_PAGE *, FILE *);
static void __wt_debug_page_row_int(WT_PAGE *, FILE *);
static int __wt_debug_page_row_leaf(WT_TOC *, WT_PAGE *, FILE *);
-static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *);
-static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp);
-static void __wt_debug_off(WT_OFF *, const char *, FILE *);
static void __wt_debug_pair(const char *, void *, uint32_t, FILE *);
static void __wt_debug_repl(WT_REPL *, FILE *);
static void __wt_debug_rleexp(WT_RLE_EXPAND *, FILE *);
@@ -254,13 +253,18 @@ static void
__wt_debug_page_col_int(WT_PAGE *page, FILE *fp)
{
WT_COL *cip;
+ WT_OFF_RECORD *off_record;
uint32_t i;
if (fp == NULL) /* Default to stderr */
fp = stderr;
- WT_INDX_FOREACH(page, cip, i)
- __wt_debug_off(cip->data, "\t", fp);
+ WT_INDX_FOREACH(page, cip, i) {
+ off_record = cip->data;
+ fprintf(fp, "\toffpage: addr %lu, size %lu, records %llu\n",
+ (u_long)off_record->addr, (u_long)off_record->size,
+ (unsigned long long)WT_RECORDS(off_record));
+ }
}
/*
@@ -357,6 +361,7 @@ __wt_debug_page_row_leaf(WT_TOC *toc, WT_PAGE *page, FILE *fp)
static void
__wt_debug_page_row_int(WT_PAGE *page, FILE *fp)
{
+ WT_OFF *off;
WT_ROW *rip;
uint32_t i;
@@ -368,8 +373,9 @@ __wt_debug_page_row_int(WT_PAGE *page, FILE *fp)
fprintf(fp, "\tkey: {requires processing}\n");
else
__wt_debug_dbt("\tkey", rip, fp);
-
- __wt_debug_off(rip->data, "\t", fp);
+ off = rip->data;
+ fprintf(fp, "\toffpage: addr %lu, size %lu\n",
+ (u_long)off->addr, (u_long)off->size);
}
}
@@ -439,6 +445,8 @@ static int
__wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
DB *db;
+ WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_OVFL *ovfl;
if (fp == NULL) /* Default to stderr */
@@ -450,25 +458,31 @@ __wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
__wt_item_type_string(item), (u_long)WT_ITEM_LEN(item));
switch (WT_ITEM_TYPE(item)) {
- case WT_ITEM_KEY:
- case WT_ITEM_KEY_DUP:
case WT_ITEM_DATA:
case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DEL:
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_DUP:
break;
- case WT_ITEM_KEY_OVFL:
- case WT_ITEM_KEY_DUP_OVFL:
- case WT_ITEM_DATA_OVFL:
case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ case WT_ITEM_KEY_OVFL:
ovfl = WT_ITEM_BYTE_OVFL(item);
fprintf(fp, ", addr %lu, size %lu",
(u_long)ovfl->addr, (u_long)ovfl->size);
break;
- case WT_ITEM_DEL:
- fprintf(fp, "\n");
- return (0);
case WT_ITEM_OFF:
- __wt_debug_off(WT_ITEM_BYTE_OFF(item), ", ", fp);
- return (0);
+ off = WT_ITEM_BYTE_OFF(item);
+ fprintf(fp, ", offpage: addr %lu, size %lu\n",
+ (u_long)off->addr, (u_long)off->size);
+ break;
+ case WT_ITEM_OFF_RECORD:
+ off_record = WT_ITEM_BYTE_OFF_RECORD(item);
+ fprintf(fp, ", offpage: addr %lu, size %lu, records %llu\n",
+ (u_long)off_record->addr, (u_long)off_record->size,
+ (unsigned long long)WT_RECORDS(off_record));
+ break;
WT_ILLEGAL_FORMAT(db);
}
@@ -485,14 +499,16 @@ __wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
static void
__wt_debug_dsk_col_int(WT_PAGE_DISK *dsk, FILE *fp)
{
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
uint32_t i;
if (fp == NULL) /* Default to stderr */
fp = stderr;
- WT_OFF_FOREACH(dsk, off, i)
- __wt_debug_off(off, "\t", fp);
+ WT_OFF_FOREACH(dsk, off_record, i)
+ fprintf(fp, "\toffpage: addr %lu, size %lu, records %llu\n",
+ (u_long)off_record->addr, (u_long)off_record->size,
+ (unsigned long long)WT_RECORDS(off_record));
}
/*
@@ -593,7 +609,11 @@ process: WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
break;
case WT_ITEM_OFF:
p = (uint8_t *)"offpage";
- size = 7;
+ size = sizeof("offpage") - 1;
+ break;
+ case WT_ITEM_OFF_RECORD:
+ p = (uint8_t *)"offpage_record";
+ size = sizeof("offpage_record") - 1;
break;
WT_ILLEGAL_FORMAT_ERR(db, ret);
}
@@ -606,21 +626,6 @@ err: if (tmp != NULL)
}
/*
- * __wt_debug_off --
- * Dump a WT_OFF structure.
- */
-static void
-__wt_debug_off(WT_OFF *off, const char *prefix, FILE *fp)
-{
- if (fp == NULL) /* Default to stderr */
- fp = stderr;
-
- fprintf(fp, "%soffpage: addr %lu, size %lu, records %llu\n",
- prefix, (u_long)off->addr, (u_long)off->size,
- (unsigned long long)WT_RECORDS(off));
-}
-
-/*
* __wt_debug_dbt --
* Dump a single DBT in debugging mode, with an optional tag.
*/
diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c
index 4d46fceff27..503f06c24b6 100644
--- a/src/btree/bt_dump.c
+++ b/src/btree/bt_dump.c
@@ -322,7 +322,7 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
DB *db;
DBT *key, *data, *key_tmp, *data_tmp, key_local, data_local;
WT_ITEM *item;
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_REF *ref;
WT_REPL *repl;
WT_ROW *rip;
@@ -388,7 +388,7 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
WT_ERR(__wt_item_process(toc, item, data_tmp));
data = data_tmp;
break;
- case WT_ITEM_OFF:
+ case WT_ITEM_OFF_RECORD:
/*
* Set the key and recursively call the tree-walk code
* for any off-page duplicate trees. (Check for any
@@ -399,8 +399,8 @@ __wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
dp->dupkey = key;
ref = WT_ROW_DUP(page, rip);
- off = WT_ROW_OFF(rip);
- WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ off_record = WT_ROW_OFF_RECORD(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off_record, 0));
ret = __wt_tree_walk(toc, ref, 0, __wt_dump_page, dp);
__wt_hazard_clear(toc, ref->page);
if (ret != 0)
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index f8267429d3d..cf271c89c9f 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -97,6 +97,8 @@ __wt_item_type_string(WT_ITEM *item)
return ("key-overflow");
case WT_ITEM_OFF:
return ("off-page");
+ case WT_ITEM_OFF_RECORD:
+ return ("off-page-records");
default:
break;
}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index ade0dcc187c..e66ea50dc80 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -25,7 +25,7 @@ static int __wt_page_inmem_row_leaf(DB *, WT_PAGE *);
*/
int
__wt_page_in(
- WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify)
+ WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, void *off, int dsk_verify)
{
ENV *env;
WT_CACHE *cache;
@@ -209,7 +209,8 @@ __wt_page_inmem_col_fix(DB *db, WT_PAGE *page)
++cip;
}
- page->indx_count = page->records = dsk->u.entries;
+ page->indx_count = dsk->u.entries;
+ page->records = page->indx_count;
}
/*
@@ -220,7 +221,7 @@ static void
__wt_page_inmem_col_int(WT_PAGE *page)
{
WT_COL *cip;
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_PAGE_DISK *dsk;
uint64_t records;
uint32_t i;
@@ -231,12 +232,12 @@ __wt_page_inmem_col_int(WT_PAGE *page)
/*
* Walk the page, building indices and finding the end of the page.
- * The page contains WT_OFF structures.
+ * The page contains WT_OFF_RECORD structures.
*/
- WT_OFF_FOREACH(dsk, off, i) {
- cip->data = off;
+ WT_OFF_FOREACH(dsk, off_record, i) {
+ cip->data = off_record;
++cip;
- records += WT_RECORDS(off);
+ records += WT_RECORDS(off_record);
}
page->indx_count = dsk->u.entries;
@@ -302,7 +303,8 @@ __wt_page_inmem_col_var(WT_PAGE *page)
++cip;
}
- page->indx_count = page->records = dsk->u.entries;
+ page->indx_count = dsk->u.entries;
+ page->records = page->indx_count;
}
/*
@@ -433,7 +435,7 @@ __wt_page_inmem_row_leaf(DB *db, WT_PAGE *page)
* single on-page (WT_ITEM_DATA) or overflow (WT_ITEM_DATA_OVFL) item;
* a group of duplicate data items where each duplicate is an on-page
* (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DUP_OVFL) item; or an offpage
- * reference (WT_ITEM_OFF).
+ * reference (WT_ITEM_OFF_RECORD).
*/
rip = NULL;
indx_count = 0;
@@ -469,7 +471,7 @@ __wt_page_inmem_row_leaf(DB *db, WT_PAGE *page)
case WT_ITEM_DATA_OVFL:
rip->data = item;
break;
- case WT_ITEM_OFF:
+ case WT_ITEM_OFF_RECORD:
rip->data = item;
/*
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index f7e594d2217..769a047cf15 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -69,10 +69,10 @@ __wt_cache_read_serial_func(WT_TOC *toc)
{
ENV *env;
WT_CACHE *cache;
- WT_OFF *off;
WT_PAGE *parent;
WT_READ_REQ *rr, *rr_end;
WT_REF *ref;
+ void *off;
int dsk_verify;
__wt_cache_read_unpack(toc, parent, ref, off, dsk_verify);
@@ -190,7 +190,6 @@ __wt_cache_read(WT_READ_REQ *rr)
{
ENV *env;
WT_CACHE *cache;
- WT_OFF *off;
WT_PAGE *page;
WT_PAGE_DISK *dsk;
WT_REF *ref;
@@ -200,9 +199,13 @@ __wt_cache_read(WT_READ_REQ *rr)
toc = rr->toc;
ref = rr->ref;
- off = rr->off;
- addr = off->addr;
- size = off->size;
+
+ /*
+ * We're passed a reference to a WT_OFF or a WT_OFF_RECORD structure;
+ * the initial addr/size pair fields are the same, get what we came for.
+ */
+ addr = ((WT_OFF *)rr->off)->addr;
+ size = ((WT_OFF *)rr->off)->size;
env = toc->env;
cache = env->ienv->cache;
@@ -241,13 +244,13 @@ __wt_cache_read(WT_READ_REQ *rr)
/*
* Fill in the WT_PAGE addr, size.
- * Reference the parent's WT_PAGE and parent's WT_OFF structures.
+ * Reference the parent's WT_PAGE and WT_OFF/WT_OFF_RECORD structures.
* Reference the underlying disk page.
*/
page->addr = addr;
page->size = size;
page->parent = rr->parent;
- page->parent_off = off;
+ page->parent_off = rr->off;
page->dsk = dsk;
/* Build the in-memory version of the page. */
diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c
index 64b7044da9b..3987a621958 100644
--- a/src/btree/bt_reconcile.c
+++ b/src/btree/bt_reconcile.c
@@ -189,7 +189,7 @@ static int
__wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
WT_COL *cip;
- WT_OFF *from;
+ WT_OFF_RECORD *from;
WT_PAGE_DISK *dsk;
uint32_t i, space_avail;
uint8_t *first_free;
@@ -207,16 +207,16 @@ __wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
* size, but it still wasn't enough. We must allocate another
* page and split the parent.
*/
- if (sizeof(WT_OFF) > space_avail) {
+ if (sizeof(WT_OFF_RECORD) > space_avail) {
fprintf(stderr,
"__wt_rec_col_int: page %lu split\n",
(u_long)page->addr);
__wt_abort(toc->env);
}
- memcpy(first_free, from, sizeof(WT_OFF));
- first_free += sizeof(WT_OFF);
- space_avail -= sizeof(WT_OFF);
+ memcpy(first_free, from, sizeof(WT_OFF_RECORD));
+ first_free += sizeof(WT_OFF_RECORD);
+ space_avail -= sizeof(WT_OFF_RECORD);
++dsk->u.entries;
}
@@ -959,9 +959,12 @@ __wt_rec_page_write(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
static int
__wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
+ ENV *env;
IDB *idb;
- WT_OFF *parent_off;
+ WT_OFF *off;
+ WT_OFF_RECORD *off_record;
+ env = toc->env;
idb = toc->db->idb;
/*
@@ -976,15 +979,27 @@ __wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
}
/*
- * Update the relevant WT_OFF structure. There are two memory locations
- * that change (address and size), and we could race, but that's not a
- * problem. Only a single thread ever reconciles a page at a time, and
- * pages cannot leave memory while they have children.
+ * Update the relevant WT_OFF/WT_OFF_RECORD structure. There are two
+ * memory locations that change (address and size), and we could race,
+ * but that's not a problem. Only a single thread ever reconciles a
+ * page at a time, and pages cannot leave memory if they have children.
*/
- parent_off = page->parent_off;
- WT_RECORDS(parent_off) = new->records;
- parent_off->addr = new->addr;
- parent_off->size = new->size;
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_INT:
+ off_record = page->parent_off;
+ off_record->addr = new->addr;
+ off_record->size = new->size;
+ WT_ASSERT(env, WT_RECORDS(off_record) == new->records);
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ off = page->parent_off;
+ off->addr = new->addr;
+ off->size = new->size;
+ break;
+ default:
+ break;
+ }
/*
* Mark the parent page as dirty.
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 5beb931f578..ffe1f6f512f 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -251,7 +251,7 @@ static int
__wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg)
{
DB *db;
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_REF *ref;
WT_REPL *repl;
WT_ROW *rip;
@@ -299,7 +299,7 @@ __wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg)
WT_STAT_INCR(stats, ITEM_DATA_OVFL);
WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
break;
- case WT_ITEM_OFF:
+ case WT_ITEM_OFF_RECORD:
/*
* Recursively call the tree-walk code for any off-page
* duplicate trees. (Check for any off-page duplicate
@@ -308,8 +308,8 @@ __wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg)
* and in the tree-walk function.)
*/
ref = WT_ROW_REF(page, rip);
- off = WT_ROW_OFF(rip);
- WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ off_record = WT_ROW_OFF_RECORD(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off_record, 0));
ret = __wt_tree_walk(toc, ref, 0, __wt_page_stat, arg);
__wt_hazard_clear(toc, ref->page);
if (ret != 0)
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 1c52586ec83..612502b5070 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -146,6 +146,7 @@ __wt_verify_tree(
WT_COL *cip;
WT_ITEM *item;
WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_PAGE *page;
WT_PAGE_DISK *dsk;
WT_REPL *repl;
@@ -219,11 +220,11 @@ __wt_verify_tree(
* Check the starting record number and record counts.
*
* Confirm the number of records found on this page (by summing the
- * WT_OFF structure record counts) matches the WT_OFF structure record
- * count in our parent. Use the in-memory record count for internal
- * pages -- we could sum the record counts as we walk the page below,
- * but we did that when building the in-memory version of the page,
- * there's no reason to do it again.
+ * WT_OFF_RECORD structure record counts) matches the WT_OFF_RECORD
+ * structure record count in our parent. Use the in-memory record
+ * count for internal pages -- we could sum the record counts as we
+ * walk the page below, but we did that when building the in-memory
+ * version of the page, there's no reason to do it again.
*/
switch (dsk->type) {
case WT_PAGE_COL_FIX:
@@ -322,9 +323,9 @@ __wt_verify_tree(
WT_INDX_FOREACH(page, cip, i) {
/* cip references the subtree containing the record */
ref = WT_COL_REF(page, cip);
- off = WT_COL_OFF(cip);
+ off_record = WT_COL_OFF(cip);
records = WT_COL_OFF_RECORDS(cip);
- WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+ WT_ERR(__wt_page_in(toc, page, ref, off_record, 1));
ret = __wt_verify_tree(toc, NULL,
records, start_recno, level - 1, ref, vs);
__wt_hazard_clear(toc, ref->page);
@@ -408,22 +409,22 @@ __wt_verify_tree(
page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
continue;
item = rip->data;
- if (WT_ITEM_TYPE(item) != WT_ITEM_OFF)
+ if (WT_ITEM_TYPE(item) != WT_ITEM_OFF_RECORD)
continue;
/* Verify the off-page duplicate tree. */
vs->duptree = 0;
ref = WT_ROW_DUP(page, rip);
- off = WT_ROW_OFF(rip);
- WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+ off_record = WT_ROW_OFF_RECORD(rip);
+ WT_ERR(__wt_page_in(toc, page, ref, off_record, 1));
ret = __wt_verify_tree(toc, NULL,
(uint64_t)0, (uint64_t)0, WT_NOLEVEL, ref, vs);
__wt_hazard_clear(toc, ref->page);
if (ret != 0)
goto err;
- if (vs->duptree != WT_RECORDS(off)) {
+ if (vs->duptree != WT_RECORDS(off_record)) {
__wt_api_db_errx(db,
"off-page duplicate tree referenced from "
"item %lu of page %lu has a record count "
@@ -431,7 +432,7 @@ __wt_verify_tree(
"expected",
(u_long)item_num, (u_long)page->addr,
(unsigned long long)vs->duptree,
- (unsigned long long)WT_RECORDS(off));
+ (unsigned long long)WT_RECORDS(off_record));
goto err;
}
}
@@ -735,6 +736,7 @@ __wt_verify_dsk_item(
WT_ITEM *item;
WT_OVFL *ovfl;
WT_OFF *off;
+ WT_OFF_RECORD *off_record;
off_t file_size;
uint8_t *end;
uint32_t i, item_num, item_len, item_type;
@@ -789,7 +791,11 @@ __wt_verify_dsk_item(
case WT_ITEM_OFF:
if (dsk->type != WT_PAGE_DUP_INT &&
dsk->type != WT_PAGE_ROW_INT &&
- dsk->type != WT_PAGE_ROW_LEAF) {
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_OFF_RECORD:
+ if (dsk->type != WT_PAGE_ROW_LEAF) {
item_vs_page: __wt_api_db_errx(db,
"illegal item and page type combination "
"(item %lu on page at addr %lu is a %s "
@@ -817,7 +823,7 @@ item_vs_page: __wt_api_db_errx(db,
goto skip_order_check;
/*
- * For row-stores leaf pages, check for:
+ * For row-store leaf pages, check for:
* two keys in a row,
* two non-dup data items in a row,
* inter-mixed dup and non-dup data items,
@@ -842,12 +848,12 @@ item_vs_page: __wt_api_db_errx(db,
case WT_ITEM_DATA_DUP_OVFL:
case WT_ITEM_DATA_OVFL:
case WT_ITEM_DEL:
- case WT_ITEM_OFF:
+ case WT_ITEM_OFF_RECORD:
switch (item_type) {
case WT_ITEM_DATA:
case WT_ITEM_DATA_OVFL:
case WT_ITEM_DEL:
- case WT_ITEM_OFF:
+ case WT_ITEM_OFF_RECORD:
switch (last_item_type) {
case IS_FIRST:
goto first_data;
@@ -916,7 +922,11 @@ skip_order_check:
goto item_len;
break;
case WT_ITEM_OFF:
- if (item_len != sizeof(WT_OFF)) {
+ if (item_len != sizeof(WT_OFF))
+ goto item_len;
+ break;
+ case WT_ITEM_OFF_RECORD:
+ if (item_len != sizeof(WT_OFF_RECORD)) {
item_len: __wt_api_db_errx(db,
"item %lu on page at addr %lu has an "
"incorrect length",
@@ -945,8 +955,14 @@ item_len: __wt_api_db_errx(db,
break;
case WT_ITEM_OFF:
off = WT_ITEM_BYTE_OFF(item);
- if (WT_ADDR_TO_OFF(db, off->addr) +
- off->size > file_size)
+ if (WT_ADDR_TO_OFF(db,
+ off->addr) + off->size > file_size)
+ goto eof;
+ break;
+ case WT_ITEM_OFF_RECORD:
+ off_record = WT_ITEM_BYTE_OFF_RECORD(item);
+ if (WT_ADDR_TO_OFF(db,
+ off_record->addr) + off_record->size > file_size)
goto eof;
break;
default:
@@ -967,7 +983,7 @@ static int
__wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
{
IDB *idb;
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
uint8_t *end;
uint32_t i, entry_num;
@@ -975,16 +991,16 @@ __wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
end = (uint8_t *)dsk + size;
entry_num = 0;
- WT_OFF_FOREACH(dsk, off, i) {
+ WT_OFF_FOREACH(dsk, off_record, i) {
++entry_num;
/* Check if this entry is entirely on the page. */
- if ((uint8_t *)off + sizeof(WT_OFF) > end)
+ if ((uint8_t *)off_record + sizeof(WT_OFF_RECORD) > end)
return (__wt_verify_eop(db, entry_num, addr));
/* Check if the reference is past the end-of-file. */
- if (WT_ADDR_TO_OFF(
- db, off->addr) + off->size > idb->fh->file_size)
+ if (WT_ADDR_TO_OFF(db,
+ off_record->addr) + off_record->size > idb->fh->file_size)
return (__wt_verify_eof(db, entry_num, addr));
}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index f5ef9674f9b..3228fe53600 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -45,6 +45,7 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref,
IDB *idb;
WT_COL *cip;
WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_PAGE *page;
WT_ROW *rip;
uint32_t i;
@@ -82,8 +83,8 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref,
if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
continue;
- off = WT_COL_OFF(cip);
- WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ off_record = WT_COL_OFF(cip);
+ WT_RET(__wt_page_in(toc, page, ref, off_record, 0));
ret = __wt_tree_walk(toc, ref, flags, work, arg);
__wt_hazard_clear(toc, ref->page);
if (ret != 0)
@@ -110,7 +111,7 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref,
if (!LF_ISSET(WT_WALK_OFFDUP))
break;
WT_INDX_FOREACH(page, rip, i) {
- if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF)
+ if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF_RECORD)
break;
/*
@@ -121,8 +122,8 @@ __wt_tree_walk(WT_TOC *toc, WT_REF *ref,
if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
continue;
- off = WT_ROW_OFF(rip);
- WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ off_record = WT_ROW_OFF_RECORD(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off_record, 0));
ret = __wt_tree_walk(toc, ref, flags, work, arg);
__wt_hazard_clear(toc, ref->page);
if (ret != 0)
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 81c24e3d54f..0549e7c6e04 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -19,7 +19,7 @@ __wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags)
DB *db;
IDB *idb;
WT_COL *cip;
- WT_OFF *off;
+ WT_OFF_RECORD *off_record;
WT_PAGE *page;
WT_PAGE_DISK *dsk;
WT_RLE_EXPAND *exp;
@@ -90,8 +90,8 @@ __wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags)
/* cip references the subtree containing the record. */
ref = WT_COL_REF(page, cip);
- off = WT_COL_OFF(cip);
- WT_ERR(__wt_page_in(toc, page, ref, off, 0));
+ off_record = WT_COL_OFF(cip);
+ WT_ERR(__wt_page_in(toc, page, ref, off_record, 0));
/* Swap the parent page for the child page. */
if (page != idb->root_page.page)
diff --git a/src/include/btree.h b/src/include/btree.h
index edcf235b7f8..722c969af90 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -136,8 +136,13 @@ struct __wt_page {
/* Record count is only maintained for column-store files. */
uint64_t records; /* Records in this subtree */
+ /*
+ * Two links to the parent's WT_PAGE structure -- the physical parent
+ * page, and the WT_OFF or WT_OFF_RECORD structure used to find this
+ * page.
+ */
WT_PAGE *parent; /* Page's parent */
- WT_OFF *parent_off; /* Page's parent reference */
+ void *parent_off; /* Page's parent reference */
WT_PAGE_DISK *dsk; /* Page's on-disk representation */
@@ -157,7 +162,7 @@ struct __wt_page {
* value. If we ever add a flags field to this structure, the pinned
* flag could move there.
*/
-#define WT_PAGE_SET_PIN(p) (p)->read_gen = UINT64_MAX
+#define WT_PAGE_SET_PIN(p) ((p)->read_gen = UINT64_MAX)
#define WT_PAGE_IS_PINNED(p) ((p)->read_gen == UINT64_MAX)
uint64_t read_gen;
@@ -311,10 +316,8 @@ struct __wt_page {
* individually allocated structures. The WT_{COL,ROW}_REF macros return
* the appropriate entry based on a WT_{COL,ROW} reference.
*/
-#define WT_COL_REF(page, ip) \
- (&((page)->u3.ref[WT_COL_SLOT(page, ip)]))
-#define WT_ROW_REF(page, ip) \
- (&((page)->u3.ref[WT_ROW_SLOT(page, ip)]))
+#define WT_COL_REF(page, ip) (&((page)->u3.ref[WT_COL_SLOT(page, ip)]))
+#define WT_ROW_REF(page, ip) (&((page)->u3.ref[WT_ROW_SLOT(page, ip)]))
/*
* The other arrays may not exist, and are arrays of pointers to individually
@@ -613,19 +616,24 @@ struct __wt_rle_expand {
(dupp) = (page)->u3.dup; (i) > 0; ++(dupp), --(i))
/*
- * On both row- and column-store internal pages, the on-page data referenced
- * by the WT_ROW/WT_COL data field is a WT_OFF structure, which contains a
- * record count and a page addr/size pair. Macros to reach into the on-page
- * structure and return the values.
+ * On row-store internal pages, the on-page data referenced by the WT_ROW field
+ * is a WT_OFF structure, which contains a page addr/size pair.
+ */
+#define WT_ROW_OFF(ip) \
+ ((WT_OFF *)WT_ITEM_BYTE(((WT_ROW *)ip)->data))
+#define WT_ROW_OFF_RECORD(ip) \
+ ((WT_OFF_RECORD *)WT_ITEM_BYTE(((WT_ROW *)ip)->data))
+
+/*
+ * On column-store internal pages, the on-page data referenced by the WT_COL
+ * field is a WT_OFF_RECORD structure which contains a page addr/size pair
+ * and a total record count.
*/
#define WT_COL_OFF(ip) \
- ((WT_OFF *)(((WT_COL *)ip)->data))
+ ((WT_OFF_RECORD *)(((WT_COL *)ip)->data))
#define WT_COL_OFF_RECORDS(ip) \
WT_RECORDS(WT_COL_OFF(ip))
-#define WT_ROW_OFF(ip) \
- ((WT_OFF *)WT_ITEM_BYTE(((WT_ROW *)ip)->data))
-
/*
* WT_ITEM --
* Trailing data length (in bytes) plus item type.
@@ -667,14 +675,18 @@ struct __wt_item {
* items, each of which has an overflow form. Items are followed by additional
* data, which varies by type: a key, duplicate key, data or duplicate item is
* followed by a set of bytes; a WT_OVFL structure follows an overflow form.
- * There are two additional types: First, a deleted type (a place-holder for
- * deleted items where the item cannot be removed, for example, an column store
- * item that must remain to preserve the record count). Second, a subtree
- * reference for keys that reference subtrees of information (for example, an
- * internal Btree page has a key and a reference to the tree that contains all
- * key/data pairs greater than the internal page's key, or, a leaf Btree page
- * where a key references all of the duplicate data items for the key when the
- * duplicate data items can no longer fit onto the Btree leaf page).
+ * There are 2 additional types: (1) a deleted type (a place-holder for deleted
+ * items where the item cannot be removed, for example, a column store item
+ * that must remain to preserve the record count); (2a) a subtree reference for
+ * keys that reference subtrees without an associated record count (a row-store
+ * internal page has key/reference pairs for the tree containing all key/data
+ * pairs greater than the key); (2b) a subtree reference for keys that reference
+ * subtrees with an associated record count (a column-store internal page has
+ * a reference for the tree containing all records greater than the specified
+ * record, or leaf Btree pages where a key references a set of duplicate data
+ * items for the key when the duplicate data items no longer fit onto the leaf
+ * page itself -- offpage duplicate data sets are counted, which is why Btree
+ * leaf pages fall under 2b, and not 2a).
*
* Here's the usage by page type:
*
@@ -687,10 +699,11 @@ struct __wt_item {
* WT_ITEM_KEY_OVFL item followed by a WT_ITEM_DATA or WT_ITEM_DATA_OVFL
* item);
* -- Variable-length key and set of duplicates moved into a separate tree
- * (a WT_ITEM_KEY or WT_ITEM_KEY_OVFL item followed by a WT_ITEM_OFF item);
+ * (a WT_ITEM_KEY or WT_ITEM_KEY_OVFL item followed by a WT_ITEM_OFF_RECORD
+ * item);
* -- Variable-length key and set of duplicates not yet moved into a separate
- * tree (a WT_ITEM_KEY/KEY_OVFL item followed by two or more
- * WT_ITEM_DATA_DUP or WT_ITEM_DATA_DUP_OVFL items).
+ * tree (a WT_ITEM_KEY/KEY_OVFL item followed by two or more WT_ITEM_DATA_DUP
+ * or WT_ITEM_DATA_DUP_OVFL items).
*
* WT_PAGE_DUP_INT (row-store offpage duplicates internal pages):
* -- Variable-length duplicate key and offpage-reference pairs (a
@@ -710,16 +723,26 @@ struct __wt_item {
* These pages contain fixed-sized structures (WT_PAGE_COL_{INT,FIX,RLE}),
* or a string of bytes (WT_PAGE_OVFL), not WT_ITEM structures.
*
- * There are currently 10 item types, requiring 4 bits, with 6 values unused.
+ * There are currently 11 item types, using 4 bits, with 5 values unused. If
+ * we run out of bits, we could compress the item types in a couple of ways:
+ *
+ * We could merge the WT_ITEM_KEY and WT_ITEM_KEY_DUP types, but that requires
+ * we know the page's type in order to know how an item might be encoded (that
+ * is, if it's an off-page duplicate key, it's encoded using the Huffman data
+ * coder, or if it's a Btree row store key, it's encoded using the Huffman key
+ * encoder).
*
- * We could compress the item types in a couple of ways. We could merge the
- * WT_ITEM_KEY and WT_ITEM_KEY_DUP types, but that would require we know the
- * underlying page type in order to know how an item might be encoded (that
- * is, if it's an off-page duplicate key, encoded using the Huffman data coder,
- * or a Btree row store key, encoded using the Huffman key encoder). We could
- * also use a bit to mean overflow, merging all overflow types into a single
- * bit plus the ""primary" item type, but that would require more bit shuffling
+ * We could use a single bit to mean overflow, merging all overflow types into
+ * that bit plus the "primary" item type, but that requires more bit shuffling
* than the current scheme.
+ *
+ * We could combine WT_ITEM_OFF and WT_ITEM_OFF_RECORD types, again, by using
+ * the underlying page type to know what kind of off-page reference it is (if
+ * it's a row-store leaf or column-store internal, it's a WT_ITEM_OFF_RECORD,
+ * if it's a row-store internal, it's a WT_ITEM_OFF).
+ *
+ * All of these changes require some amount of compatibility work because they
+ * involve on-page format information.
*/
#define WT_ITEM_KEY 0x00000000 /* Key */
#define WT_ITEM_KEY_OVFL 0x01000000 /* Key: overflow */
@@ -731,6 +754,7 @@ struct __wt_item {
#define WT_ITEM_DATA_DUP_OVFL 0x07000000 /* Data: duplicate overflow */
#define WT_ITEM_DEL 0x08000000 /* Deleted */
#define WT_ITEM_OFF 0x09000000 /* Off-page reference */
+#define WT_ITEM_OFF_RECORD 0x0a000000 /* Off-page reference with records */
#define WT_ITEM_TYPE(addr) \
(((WT_ITEM *)(addr))->__item_chunk & 0x0f000000)
@@ -749,12 +773,13 @@ struct __wt_item {
/*
* On row-store pages, the on-page data referenced by the WT_ROW data field
- * may be a WT_OVFL (which contains the address for the start of the overflow
- * pages and its length), or a WT_OFF structure. These macros do the cast
- * to the right type.
+ * may be WT_OFF, WT_OFF_RECORD or WT_OVFL structures. These macros do the
+ * cast to the right type.
*/
#define WT_ITEM_BYTE_OFF(addr) \
((WT_OFF *)(WT_ITEM_BYTE(addr)))
+#define WT_ITEM_BYTE_OFF_RECORD(addr) \
+ ((WT_OFF_RECORD *)(WT_ITEM_BYTE(addr)))
#define WT_ITEM_BYTE_OVFL(addr) \
((WT_OVFL *)(WT_ITEM_BYTE(addr)))
@@ -778,32 +803,57 @@ struct __wt_item {
/*
* WT_OFF --
- * Btree internal items and offpage duplicates reference another tree.
+ * Row-store internal pages reference subtrees with no record count.
+ *
+ * WT_OFF_RECORD --
+ * Column-store internal pages, and row-store leaf pages with offpage
+ * duplicate references, reference subtrees, including total record counts
+ * for the subtree.
+ *
+ * !!!
+ * Note the initial two fields of the WT_OFF and WT_OFF_RECORD structures are
+ * the same -- this is deliberate, and we use it to pass references to places
+ * that only care about the addr/size information.
*/
struct __wt_off {
+ uint32_t addr; /* Subtree root page address */
+ uint32_t size; /* Subtree root page length */
+};
/*
- * Solaris and the gcc compiler on Linux pad the WT_OFF structure because of the
- * 64-bit records field. This is an on-disk structure, which means we have to
- * have a fixed size, without padding, so we declare it as two 32-bit fields and
- * cast it. We haven't yet found a compiler that aligns the 32-bit fields such
- * that a cast won't work; if we find one, we'll have to go to bit masks, or to
- * reading/write the bytes to/from a local variable.
+ * WT_OFF_SIZE is the expected structure size -- we verify the build to
+ * ensure the compiler hasn't inserted padding (which would break the world).
*/
-#define WT_RECORDS(offp) (*(uint64_t *)(&(offp)->__record_chunk[0]))
- uint32_t __record_chunk[2]; /* Subtree record count */
+#define WT_OFF_SIZE 8
+/*
+ *
+ * Compilers pad the WT_OFF_RECORD structure because of the 64-bit record count
+ * field. This is an on-disk structure, which means we require a fixed size,
+ * so we declare it as two 32-bit fields and cast it. We haven't yet found a
+ * compiler that aligns the 32-bit fields such that a cast won't work; if we
+ * find one, we'll have to go to bit masks, or to copying bytes to/from a local
+ * variable.
+ */
+struct __wt_off_record {
uint32_t addr; /* Subtree root page address */
uint32_t size; /* Subtree root page length */
+
+#define WT_RECORDS(offp) (*(uint64_t *)(&(offp)->__record_chunk[0]))
+ uint32_t __record_chunk[2]; /* Subtree record count */
};
/*
- * WT_OFF_SIZE is the expected structure size -- we verify the build to
+ * WT_OFF_RECORD_SIZE is the expected structure size -- we verify the build to
* ensure the compiler hasn't inserted padding (which would break the world).
*/
-#define WT_OFF_SIZE 16
+#define WT_OFF_RECORD_SIZE 16
-/* WT_OFF_FOREACH is a loop that walks offpage references on a page */
+/*
+ * WT_OFF_FOREACH --
+ * Walks WT_OFF/WT_OFF_RECORD references on a page, incrementing a pointer
+ * based on its declared type.
+ */
#define WT_OFF_FOREACH(dsk, offp, i) \
- for ((offp) = (WT_OFF *)WT_PAGE_DISK_BYTE(dsk), \
- (i) = dsk->u.entries; (i) > 0; ++(offp), --(i))
+ for ((offp) = WT_PAGE_DISK_BYTE(dsk), \
+ (i) = (dsk)->u.entries; (i) > 0; ++(offp), --(i))
/*
* Btree overflow items reference another page, and so the data is another
diff --git a/src/include/extern.h b/src/include/extern.h
index c7264163a34..5814735d646 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -87,7 +87,7 @@ int
__wt_ovfl_in(WT_TOC *toc, WT_OVFL *ovfl, DBT *store);
int
__wt_page_in(
- WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify);
+ WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, void *off, int dsk_verify);
int
__wt_page_inmem(WT_TOC *toc, WT_PAGE *page);
int
diff --git a/src/include/serial.h b/src/include/serial.h
index af6c87e83c1..19f201e49d4 100644
--- a/src/include/serial.h
+++ b/src/include/serial.h
@@ -3,7 +3,7 @@
typedef struct {
WT_PAGE * parent;
WT_REF * ref;
- WT_OFF * off;
+ void * off;
int dsk_verify;
} __wt_cache_read_args;
#define __wt_cache_read_serial(\
diff --git a/src/include/verify_build.h b/src/include/verify_build.h
index 48d9fba760a..228636b2249 100644
--- a/src/include/verify_build.h
+++ b/src/include/verify_build.h
@@ -44,6 +44,7 @@ __wt_verify_build(void)
STATIC_ASSERT(sizeof(WT_COL) == WT_COL_SIZE);
STATIC_ASSERT(sizeof(WT_ITEM) == WT_ITEM_SIZE);
STATIC_ASSERT(sizeof(WT_OFF) == WT_OFF_SIZE);
+ STATIC_ASSERT(sizeof(WT_OFF_RECORD) == WT_OFF_RECORD_SIZE);
STATIC_ASSERT(sizeof(WT_OVFL) == WT_OVFL_SIZE);
STATIC_ASSERT(sizeof(WT_PAGE) == WT_PAGE_SIZE);
STATIC_ASSERT(sizeof(WT_PAGE_DESC) == WT_PAGE_DESC_SIZE);
diff --git a/src/include/wt_internal.in b/src/include/wt_internal.in
index d05d77b7443..1263be0c9c3 100644
--- a/src/include/wt_internal.in
+++ b/src/include/wt_internal.in
@@ -30,6 +30,7 @@ struct __wt_item; typedef struct __wt_item WT_ITEM;
struct __wt_lsn; typedef struct __wt_lsn WT_LSN;
struct __wt_mtx; typedef struct __wt_mtx WT_MTX;
struct __wt_off; typedef struct __wt_off WT_OFF;
+struct __wt_off_record; typedef struct __wt_off_record WT_OFF_RECORD;
struct __wt_ovfl; typedef struct __wt_ovfl WT_OVFL;
struct __wt_page; typedef struct __wt_page WT_PAGE;
struct __wt_page_desc; typedef struct __wt_page_desc WT_PAGE_DESC;
@@ -130,8 +131,8 @@ struct __idb {
uint32_t file_id; /* In-memory file ID */
WT_FH *fh; /* Backing file handle */
- WT_REF root_page; /* Root page reference */
- WT_OFF root_off; /* Root page location */
+ WT_REF root_page; /* Root page reference */
+ WT_OFF_RECORD root_off; /* Root page location */
WT_WALK evict_walk; /* Eviction thread's walk state */