diff options
author | Keith Bostic <keith@wiredtiger.com> | 2013-10-18 12:01:11 -0400 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2013-10-18 12:01:11 -0400 |
commit | 5fe7cd979fc015b644bc4f8f046ea4fca4a69915 (patch) | |
tree | 5fb2f70d13834e55b8beca27b2748d2c5fb4fc07 /src/include | |
parent | 72d8cffd91d3c877eaf17cd8846fe64394f4b84d (diff) | |
download | mongo-5fe7cd979fc015b644bc4f8f046ea4fca4a69915.tar.gz |
There's magic code to handle overflow records that have been deleted but
still need to be visible to other threads (see bt_ovfl.c for lots more
information). We were using WT_RESTART returns from the overflow read
function to handle this, but that led to a bug where, if the read of an
overflow item returns restart, the cursor may have already moved past
the entry, and so restart potentially skips a row.
Row-store used to store cached "deleted overflow" items in each row's
WT_UPDATE list, while column-store stored cached "deleted overflow"
items in the page's modify structure. Change row-store to do the same
thing as column-store, and then change the system to pass enough
information into the overflow read function that it never needs to
return restart; it can simply look aside into the cached entries as soon
as we realize we're reading a "deleted overflow" item.
This means restart is now only returned as a result of modification
failure, so remove a bunch of restart-handling code.
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/cell.i | 46 | ||||
-rw-r--r-- | src/include/cursor.i | 3 | ||||
-rw-r--r-- | src/include/extern.h | 5 |
3 files changed, 40 insertions, 14 deletions
diff --git a/src/include/cell.i b/src/include/cell.i index 31826c6dcfd..bc528a5c1ee 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -648,12 +648,12 @@ __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack) } /* - * __wt_cell_unpack_ref -- + * __cell_data_ref -- * Set a buffer to reference the data from an unpacked cell. */ static inline int -__wt_cell_unpack_ref(WT_SESSION_IMPL *session, - int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +__cell_data_ref(WT_SESSION_IMPL *session, + WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) { WT_BTREE *btree; void *huffman; @@ -676,14 +676,14 @@ __wt_cell_unpack_ref(WT_SESSION_IMPL *session, huffman = btree->huffman_value; break; case WT_CELL_KEY_OVFL: - WT_RET(__wt_ovfl_read(session, unpack, store)); + WT_RET(__wt_ovfl_read(session, page, unpack, store)); if (page_type == WT_PAGE_ROW_INT) return (0); huffman = btree->huffman_key; break; case WT_CELL_VALUE_OVFL: - WT_RET(__wt_ovfl_read(session, unpack, store)); + WT_RET(__wt_ovfl_read(session, page, unpack, store)); huffman = btree->huffman_value; break; WT_ILLEGAL_VALUE(session); @@ -695,11 +695,38 @@ __wt_cell_unpack_ref(WT_SESSION_IMPL *session, } /* - * __wt_cell_unpack_copy -- + * __wt_dsk_cell_data_ref, __wt_page_cell_data_ref -- + * Set a buffer to reference the data from an unpacked cell, two flavors. + * There are two version because of WT_CELL_VALUE_OVFL_RM type cells. When an + * overflow item is deleted, its backing blocks are removed; if there are still + * running transactions that might need to see the overflow item, we cache a + * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we + * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the + * page reference to look aside into the cache. So, calling the "dsk" version + * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM, + * and calling the "page" version means it might be. 
+ */ +static inline int +__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session, + int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + WT_ASSERT(session, + __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM); + return (__cell_data_ref(session, NULL, page_type, unpack, store)); +} +static inline int +__wt_page_cell_data_ref(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) +{ + return (__cell_data_ref(session, page, page->type, unpack, store)); +} + +/* + * __wt_cell_data_copy -- * Copy the data from an unpacked cell into a buffer. */ static inline int -__wt_cell_unpack_copy(WT_SESSION_IMPL *session, +__wt_cell_data_copy(WT_SESSION_IMPL *session, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store) { /* @@ -710,8 +737,11 @@ __wt_cell_unpack_copy(WT_SESSION_IMPL *session, * a copy will be made (for example, when reading an overflow item from * the underlying object. If that happens, we're done, otherwise make * a copy. + * + * We don't require two versions of this function, no callers need to + * handle WT_CELL_VALUE_OVFL_RM cells. 
*/ - WT_RET(__wt_cell_unpack_ref(session, page_type, unpack, store)); + WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store)); if (!WT_DATA_IN_ITEM(store)) WT_RET(__wt_buf_set(session, store, store->data, store->size)); return (0); diff --git a/src/include/cursor.i b/src/include/cursor.i index 3b07bd162f6..477b879cfd8 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -253,8 +253,7 @@ slow: WT_RET(__wt_row_leaf_key_work( vb->size = 0; } else { __wt_cell_unpack(cell, unpack); - WT_RET(__wt_cell_unpack_ref( - session, WT_PAGE_ROW_LEAF, unpack, vb)); + WT_RET(__wt_page_cell_data_ref(session, cbt->page, unpack, vb)); } return (0); diff --git a/src/include/extern.h b/src/include/extern.h index a9c19c1b499..a4b7e4a4aed 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -319,13 +319,10 @@ extern const char *__wt_addr_string( WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t size); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, - WT_CELL_UNPACK *unpack, - WT_ITEM *store); -extern int __wt_ovfl_cache_col_restart(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store); -extern int __wt_val_ovfl_cache(WT_SESSION_IMPL *session, +extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *unpack); |