summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorKeith Bostic <keith@wiredtiger.com>2013-10-18 12:01:11 -0400
committerKeith Bostic <keith@wiredtiger.com>2013-10-18 12:01:11 -0400
commit5fe7cd979fc015b644bc4f8f046ea4fca4a69915 (patch)
tree5fb2f70d13834e55b8beca27b2748d2c5fb4fc07 /src/include
parent72d8cffd91d3c877eaf17cd8846fe64394f4b84d (diff)
downloadmongo-5fe7cd979fc015b644bc4f8f046ea4fca4a69915.tar.gz
There's magic code to handle overflow records that have been deleted but
still need to be visible to other threads (see bt_ovfl.c for lots more information). We were using WT_RESTART returns from the overflow read function to handle this, but that led to a bug: if the read of an overflow item returns restart, the cursor may have already moved past the entry, and so the restart potentially skips a row. Row-store used to store cached "deleted overflow" items in each row's WT_UPDATE list, while column-store stored cached "deleted overflow" items in the page's modify structure. Change row-store to do the same thing as column-store, and then change the system to pass enough information into the overflow read function that it never needs to return restart; it can simply look aside into the cached entries as soon as we realize we're reading a "deleted overflow" item. This means restart is now only returned as a result of modification failure, so remove a bunch of restart handling code.
Diffstat (limited to 'src/include')
-rw-r--r--src/include/cell.i46
-rw-r--r--src/include/cursor.i3
-rw-r--r--src/include/extern.h5
3 files changed, 40 insertions, 14 deletions
diff --git a/src/include/cell.i b/src/include/cell.i
index 31826c6dcfd..bc528a5c1ee 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -648,12 +648,12 @@ __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
}
/*
- * __wt_cell_unpack_ref --
+ * __cell_data_ref --
* Set a buffer to reference the data from an unpacked cell.
*/
static inline int
-__wt_cell_unpack_ref(WT_SESSION_IMPL *session,
- int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+__cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
{
WT_BTREE *btree;
void *huffman;
@@ -676,14 +676,14 @@ __wt_cell_unpack_ref(WT_SESSION_IMPL *session,
huffman = btree->huffman_value;
break;
case WT_CELL_KEY_OVFL:
- WT_RET(__wt_ovfl_read(session, unpack, store));
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
if (page_type == WT_PAGE_ROW_INT)
return (0);
huffman = btree->huffman_key;
break;
case WT_CELL_VALUE_OVFL:
- WT_RET(__wt_ovfl_read(session, unpack, store));
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
huffman = btree->huffman_value;
break;
WT_ILLEGAL_VALUE(session);
@@ -695,11 +695,38 @@ __wt_cell_unpack_ref(WT_SESSION_IMPL *session,
}
/*
- * __wt_cell_unpack_copy --
+ * __wt_dsk_cell_data_ref, __wt_page_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell, two flavors.
+ * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an
+ * overflow item is deleted, its backing blocks are removed; if there are still
+ * running transactions that might need to see the overflow item, we cache a
+ * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we
+ * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
+ * page reference to look aside into the cache. So, calling the "dsk" version
+ * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
+ * and calling the "page" version means it might be.
+ */
+static inline int
+__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_ASSERT(session,
+ __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
+ return (__cell_data_ref(session, NULL, page_type, unpack, store));
+}
+static inline int
+__wt_page_cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ return (__cell_data_ref(session, page, page->type, unpack, store));
+}
+
+/*
+ * __wt_cell_data_copy --
* Copy the data from an unpacked cell into a buffer.
*/
static inline int
-__wt_cell_unpack_copy(WT_SESSION_IMPL *session,
+__wt_cell_data_copy(WT_SESSION_IMPL *session,
int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
{
/*
@@ -710,8 +737,11 @@ __wt_cell_unpack_copy(WT_SESSION_IMPL *session,
* a copy will be made (for example, when reading an overflow item from
* the underlying object. If that happens, we're done, otherwise make
* a copy.
+ *
+ * We don't require two versions of this function, no callers need to
+ * handle WT_CELL_VALUE_OVFL_RM cells.
*/
- WT_RET(__wt_cell_unpack_ref(session, page_type, unpack, store));
+ WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
if (!WT_DATA_IN_ITEM(store))
WT_RET(__wt_buf_set(session, store, store->data, store->size));
return (0);
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 3b07bd162f6..477b879cfd8 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -253,8 +253,7 @@ slow: WT_RET(__wt_row_leaf_key_work(
vb->size = 0;
} else {
__wt_cell_unpack(cell, unpack);
- WT_RET(__wt_cell_unpack_ref(
- session, WT_PAGE_ROW_LEAF, unpack, vb));
+ WT_RET(__wt_page_cell_data_ref(session, cbt->page, unpack, vb));
}
return (0);
diff --git a/src/include/extern.h b/src/include/extern.h
index a9c19c1b499..a4b7e4a4aed 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -319,13 +319,10 @@ extern const char *__wt_addr_string( WT_SESSION_IMPL *session,
const uint8_t *addr,
uint32_t size);
extern int __wt_ovfl_read(WT_SESSION_IMPL *session,
- WT_CELL_UNPACK *unpack,
- WT_ITEM *store);
-extern int __wt_ovfl_cache_col_restart(WT_SESSION_IMPL *session,
WT_PAGE *page,
WT_CELL_UNPACK *unpack,
WT_ITEM *store);
-extern int __wt_val_ovfl_cache(WT_SESSION_IMPL *session,
+extern int __wt_ovfl_cache(WT_SESSION_IMPL *session,
WT_PAGE *page,
void *cookie,
WT_CELL_UNPACK *unpack);