summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Keith Bostic <keith@wiredtiger.com> 2012-10-24 20:01:10 +0000
committer: Keith Bostic <keith@wiredtiger.com> 2012-10-24 20:01:10 +0000
commit: 448ac2fd58f312861c55508dc0976138831d0775 (patch)
tree: 926cbe8fc4174d12d86f28e18260ace4ca203826
parent: 5da25a5291b2db1103b4c358b12ee386b0604a24 (diff)
download: mongo-448ac2fd58f312861c55508dc0976138831d0775.tar.gz
Instead of entering a fake key cell after the last cell on the page just
in case the page ends with a key cell which has no value, use the end of the page to detect that case. This simplifies the changes required for the new compression API. Reference #324.
-rw-r--r-- src/btree/bt_page.c 8
-rw-r--r-- src/btree/rec_write.c 39
-rw-r--r-- src/btree/row_key.c 24
-rw-r--r-- src/docs/upgrading.dox 11
-rw-r--r-- src/include/cell.i 28
5 files changed, 39 insertions, 71 deletions
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 2631ee29d39..5a0c17dcf5f 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -515,6 +515,14 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
}
}
+ /*
+ * We use the fact that cells exactly fill a page to detect the case of
+ * a row-store leaf page where the last cell is a key (that is, there's
+ * no subsequent value cell). Assert that to be true; the bug would be
+ * difficult to find/diagnose in the field.
+ */
+ WT_ASSERT(session, cell == (WT_CELL *)((uint8_t *)dsk + dsk->size));
+
WT_RET((__wt_calloc_def(session, (size_t)nindx, &page->u.row.d)));
if (inmem_sizep != NULL)
*inmem_sizep += nindx * sizeof(*page->u.row.d);
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index e13184b389d..a23afca13d9 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1191,7 +1191,6 @@ static int
__rec_split_write(WT_SESSION_IMPL *session,
WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int checkpoint)
{
- WT_CELL *cell;
WT_PAGE_HEADER *dsk;
uint32_t size;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
@@ -1199,27 +1198,6 @@ __rec_split_write(WT_SESSION_IMPL *session,
dsk = buf->mem;
/*
- * We always write an additional byte on row-store leaf pages after the
- * key value pairs. The reason is that zero-length value items are not
- * written on the page and they're detected by finding two adjacent key
- * cells. If the last value item on a page is zero length, we need a
- * key cell after it on the page to detect it. The row-store leaf page
- * reconciliation code made sure we had a spare byte in the buffer, now
- * write a trailing zero-length key cell. This isn't a valid key cell,
- * but since it's not referenced by the entries on the page, no code but
- * the code reading after the key cell, to find the key value, will ever
- * see it.
- */
-#define WT_TRAILING_KEY_CELL (sizeof(uint8_t))
- if (dsk->type == WT_PAGE_ROW_LEAF) {
- WT_ASSERT_RET(session, buf->size < buf->memsize);
-
- cell = (WT_CELL *)&(((uint8_t *)buf->data)[buf->size]);
- __wt_cell_pack_key_empty(cell);
- ++buf->size;
- }
-
- /*
* Write the chunk and save the location information. There is one big
* question: if this is a checkpoint, we're going to have to wrap up
* our tracking information (freeing blocks we no longer need) before we
@@ -1441,11 +1419,8 @@ __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk)
/*
* Boundary, split or write the page.
- *
- * We write a trailing key cell on the page after the K/V pairs
- * (see WT_TRAILING_KEY_CELL for more information).
*/
- while (key->len + val->len + WT_TRAILING_KEY_CELL > r->space_avail) {
+ while (key->len + val->len > r->space_avail) {
/* Split the page. */
WT_RET(__rec_split(session, r));
@@ -2999,12 +2974,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
/*
* Boundary, split or write the page.
- *
- * We write a trailing key cell on the page after the K/V pairs
- * (see WT_TRAILING_KEY_CELL for more information).
*/
- while (key->len +
- val->len + WT_TRAILING_KEY_CELL > r->space_avail) {
+ while (key->len + val->len > r->space_avail) {
/*
* In one path above, we copied the key from the page
* rather than building the actual key. In that case,
@@ -3086,12 +3057,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
/*
* Boundary, split or write the page.
- *
- * We write a trailing key cell on the page after the K/V pairs
- * (see WT_TRAILING_KEY_CELL for more information).
*/
- while (key->len +
- val->len + WT_TRAILING_KEY_CELL > r->space_avail) {
+ while (key->len + val->len > r->space_avail) {
WT_RET(__rec_split(session, r));
/*
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index bf2326a19e4..f6dc88aa5af 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -342,16 +342,17 @@ WT_CELL *
__wt_row_value(WT_PAGE *page, WT_ROW *rip)
{
WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
u_int type;
cell = WT_ROW_KEY_COPY(rip);
/*
* Key copied.
*
- * Now, cell either references a WT_IKEY structure that has a value-cell
- * offset, or references the on-page key WT_CELL, and we can walk past
- * that to find the value WT_CELL. Both can be processed regardless of
- * what other threads are doing.
+ * Cell now either references a WT_IKEY structure with a cell offset,
+ * or references the on-page key WT_CELL. Both can be processed
+ * regardless of what other threads are doing. If it's the former,
+ * use it to get the latter.
*/
if (__wt_off_page(page, cell))
cell = WT_PAGE_REF_OFFSET(page, ((WT_IKEY *)cell)->cell_offset);
@@ -359,10 +360,19 @@ __wt_row_value(WT_PAGE *page, WT_ROW *rip)
/*
* Row-store leaf pages may have a single data cell between each key, or
* keys may be adjacent (when the data cell is empty). Move to the next
- * key. The page reconciliation code guarantees there is always a key
- * cell after an empty data cell, so this is safe.
+ * cell and check its type.
+ *
+ * One special case: if the last key on a page is a key without a value,
+ * don't walk off the end of the page: the size of the underlying disk
+ * image is exact, which means the start of the last cell on the page plus
+ * the length of the cell should be the byte immediately after the page
+ * disk image.
*/
- cell = __wt_cell_next(cell);
+ __wt_cell_unpack(cell, &unpack);
+ cell = (WT_CELL *)((uint8_t *)cell + __wt_cell_total_len(&unpack));
+ if (__wt_off_page(page, cell))
+ return (NULL);
+
type = __wt_cell_type(cell);
return (type == WT_CELL_KEY || type == WT_CELL_KEY_OVFL ? NULL : cell);
}
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index ed2a6927fe5..65894cb032f 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -1,5 +1,16 @@
/*! @page upgrading Upgrading WiredTiger applications
+@section version_135 Upgrading to Version 1.3.5
+<dl>
+
+<dt>Version 1.3.5 file format changes</dt>
+<dd>
+The underlying file formats changed in the 1.3.5 release; tables and files
+should be dumped and re-loaded into a new database.
+</dd>
+
+</dl>
+<hr>
@section version_13 Upgrading to Version 1.3
<dl>
diff --git a/src/include/cell.i b/src/include/cell.i
index 888a367801d..7a919d5ebe3 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -330,21 +330,6 @@ __wt_cell_pack_key(WT_CELL *cell, uint8_t prefix, uint32_t size)
}
/*
- * __wt_cell_pack_key_empty --
- * Write an empty key cell.
- */
-static inline void
-__wt_cell_pack_key_empty(WT_CELL *cell)
-{
- /*
- * At the end of a row-store leaf page we have to write an empty key to
- * act as a marker in case the last value on the page is zero-length.
- * See the caller of this function for details.
- */
- cell->__chunk[0] = WT_CELL_KEY;
-}
-
-/*
* __wt_cell_pack_ovfl --
* Pack an overflow cell.
*/
@@ -629,19 +614,6 @@ __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
}
/*
- * __wt_cell_next --
- * Return the next WT_CELL on the page.
- */
-static inline WT_CELL *
-__wt_cell_next(WT_CELL *cell)
-{
- WT_CELL_UNPACK unpack;
-
- __wt_cell_unpack(cell, &unpack);
- return ((WT_CELL *)((uint8_t *)cell + unpack.__len));
-}
-
-/*
* __wt_cell_unpack_ref --
* Set a buffer to reference the data from an unpacked cell.
*/