diff options
author | Alex Gorrod <alexg@wiredtiger.com> | 2015-12-24 00:17:06 +0000 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2015-12-28 15:36:22 -0500 |
commit | afb0edb67cf8f342f68bbdbcc42d4537899cf1d2 (patch) | |
tree | e693f3067fb8b7202e39d27f8ee0562c9996e2e4 | |
parent | 12aaeb6ad9ce1cebba26bcfb96613ccad7eda78e (diff) | |
download | mongo-afb0edb67cf8f342f68bbdbcc42d4537899cf1d2.tar.gz |
WT-2307 workaround for cursor next bug.
Detect when a cursor next returns an out-of-order key and retry
the next operation. This is ugly because it adds a key comparison to every
cursor next, but it fixes the symptom until we can fix the underlying bug.
-rw-r--r-- | src/btree/bt_curnext.c | 77 | ||||
-rw-r--r-- | src/btree/bt_curprev.c | 1 | ||||
-rw-r--r-- | src/btree/bt_cursor.c | 11 | ||||
-rw-r--r-- | src/include/cursor.h | 16 | ||||
-rw-r--r-- | src/include/extern.h | 1 |
5 files changed, 105 insertions, 1 deletions
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 55843d1cae5..8b74a2c7ffd 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -431,6 +431,62 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) } /* + * __wt_set_last_op -- + * Set the last operation. + */ +void +__wt_set_last_op(WT_CURSOR_BTREE *cbt, int v) +{ + int i; + + for (i = 20; --i > 0;) + cbt->last_op[i] = cbt->last_op[i - 1]; + cbt->last_op[0] = v; +} + +/* + * __key_order_check -- + * Check key ordering for cursor movements. + */ +static int +__key_order_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_ITEM *key; + WT_DECL_RET; + int cmp; + + btree = S2BT(session); + key = &cbt->iface.key; + + if ((ret = __wt_compare( + session, btree->collator, cbt->lastkey, key, &cmp)) != 0) + WT_RET_MSG(session, ret, + "WT-2307: comparison function failed"); + if (cmp < 0) + return (0); + + /* Flag an error and keep going */ + __wt_errx(session, "encountered out of order key"); + + /* + * The cursor next hit a bug due to a race in splits, move the cursor + * back to the last known good position and retry the next. + */ + key->data = cbt->lastkey->data; + key->size = cbt->lastkey->size; + if ((ret = __wt_btcur_search(cbt)) != 0) + WT_RET_MSG(session, ret, + "WT-2307: searching for the previous key failed"); + + /* Set last op as a next, in case we need to loop retrying */ + cbt->last_op[0] = WT_LASTOP_NEXT; + + /* Return a duplicate key error to tell next it needs to retry. */ + return (WT_DUPLICATE_KEY); +} + +/* * __wt_btcur_next -- * Move to the next record in the tree. 
*/ @@ -452,7 +508,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) if (truncating) LF_SET(WT_READ_TRUNCATE); - WT_RET(__cursor_func_init(cbt, false)); +retry: WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's @@ -531,6 +587,25 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } + /* + * WT-2307 check that the previous key returned sorts less than + * the current key being returned. If it didn't we've searched back + * to the previous key, retry the next operation. + */ + if (ret == 0 && page->type == WT_PAGE_ROW_LEAF) { + if (cbt->last_op[0] == WT_LASTOP_NEXT && + cbt->lastkey != NULL && cbt->lastkey->size != 0) { + ret = __key_order_check(session, cbt); + if (ret == WT_DUPLICATE_KEY) + goto retry; + WT_ERR(ret); + } + + WT_ERR(__wt_buf_set(session, + cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size)); + } + __wt_set_last_op(cbt, WT_LASTOP_NEXT); + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 1d23b976edd..5ce54f60648 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -532,6 +532,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) bool newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; + __wt_set_last_op(cbt, WT_LASTOP_PREV); WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index cc3f5b11a9e..6f12b51c7fc 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -288,6 +288,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cbt->iface.session; + __wt_set_last_op(cbt, WT_LASTOP_RESET); WT_STAT_FAST_CONN_INCR(session, cursor_reset); WT_STAT_FAST_DATA_INCR(session, cursor_reset); @@ -313,6 +314,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* 
-Wuninitialized */ + __wt_set_last_op(cbt, WT_LASTOP_SEARCH); WT_STAT_FAST_CONN_INCR(session, cursor_search); WT_STAT_FAST_DATA_INCR(session, cursor_search); @@ -380,6 +382,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) upd = NULL; /* -Wuninitialized */ exact = 0; + __wt_set_last_op(cbt, WT_LASTOP_SEARCH_NEAR); WT_STAT_FAST_CONN_INCR(session, cursor_search_near); WT_STAT_FAST_DATA_INCR(session, cursor_search_near); @@ -487,6 +490,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; + __wt_set_last_op(cbt, WT_LASTOP_INSERT); WT_STAT_FAST_CONN_INCR(session, cursor_insert); WT_STAT_FAST_DATA_INCR(session, cursor_insert); WT_STAT_FAST_DATA_INCRV(session, @@ -655,6 +659,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; + __wt_set_last_op(cbt, WT_LASTOP_REMOVE); WT_STAT_FAST_CONN_INCR(session, cursor_remove); WT_STAT_FAST_DATA_INCR(session, cursor_remove); WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); @@ -740,6 +745,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; + __wt_set_last_op(cbt, WT_LASTOP_UPDATE); WT_STAT_FAST_CONN_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCRV( @@ -1159,6 +1165,8 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) cbt = (start != NULL) ? 
start : stop; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; + + __wt_set_last_op(cbt, WT_LASTOP_TRUNCATE); WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* @@ -1233,6 +1241,8 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + + cbt->lastkey = &cbt->_lastkey; } /* @@ -1258,6 +1268,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); + __wt_buf_free(session, &cbt->_lastkey); return (ret); } diff --git a/src/include/cursor.h b/src/include/cursor.h index 13f18adab4a..db5774e5cb6 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -192,6 +192,22 @@ struct __wt_cursor_btree { WT_UPDATE *modify_update; /* + * WT-2307 tracking for a bug where cursor next jumps backwards. + */ + WT_ITEM *lastkey, _lastkey; + +#define WT_LASTOP_NEXT 1 +#define WT_LASTOP_PREV 2 +#define WT_LASTOP_RESET 3 +#define WT_LASTOP_SEARCH 4 +#define WT_LASTOP_SEARCH_NEAR 5 +#define WT_LASTOP_INSERT 6 +#define WT_LASTOP_TRUNCATE 7 +#define WT_LASTOP_UPDATE 8 +#define WT_LASTOP_REMOVE 9 + uint8_t last_op[20]; /* Last 20 operations */ + + /* * Fixed-length column-store items are a single byte, and it's simpler * and cheaper to allocate the space for it now than keep checking to * see if we need to grow the buffer. 
diff --git a/src/include/extern.h b/src/include/extern.h index b4e977595b4..b3257f8a045 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -91,6 +91,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); +extern void __wt_set_last_op(WT_CURSOR_BTREE *cbt, int v); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt); |