summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Gorrod <alexg@wiredtiger.com> 2015-12-24 00:17:06 +0000
committer Keith Bostic <keith@wiredtiger.com> 2015-12-28 15:36:22 -0500
commit afb0edb67cf8f342f68bbdbcc42d4537899cf1d2 (patch)
tree e693f3067fb8b7202e39d27f8ee0562c9996e2e4
parent 12aaeb6ad9ce1cebba26bcfb96613ccad7eda78e (diff)
download mongo-afb0edb67cf8f342f68bbdbcc42d4537899cf1d2.tar.gz
WT-2307 workaround for cursor next bug.
Detect when a cursor next returns an out-of-order key and retry the next operation. This is horrible because it adds a key comparison to every cursor next, but it fixes the symptom until we can fix the underlying bug.
-rw-r--r-- src/btree/bt_curnext.c 77
-rw-r--r-- src/btree/bt_curprev.c 1
-rw-r--r-- src/btree/bt_cursor.c 11
-rw-r--r-- src/include/cursor.h 16
-rw-r--r-- src/include/extern.h 1
5 files changed, 105 insertions, 1 deletions
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 55843d1cae5..8b74a2c7ffd 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -431,6 +431,62 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
}
/*
+ * __wt_set_last_op --
+ * Record the latest cursor operation, shifting older entries back one slot.
+ */
+void
+__wt_set_last_op(WT_CURSOR_BTREE *cbt, int v)
+{
+ int i;
+
+ for (i = 20; --i > 0;)
+ cbt->last_op[i] = cbt->last_op[i - 1];
+ cbt->last_op[0] = v;
+}
+
+/*
+ * __key_order_check --
+ * Verify cursor next returned a key greater than the last one; if not, search back to the last key and return WT_DUPLICATE_KEY.
+ */
+static int
+__key_order_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_ITEM *key;
+ WT_DECL_RET;
+ int cmp;
+
+ btree = S2BT(session);
+ key = &cbt->iface.key;
+
+ if ((ret = __wt_compare(
+ session, btree->collator, cbt->lastkey, key, &cmp)) != 0)
+ WT_RET_MSG(session, ret,
+ "WT-2307: comparison function failed");
+ if (cmp < 0)
+ return (0);
+
+ /* Flag an error and keep going */
+ __wt_errx(session, "encountered out of order key");
+
+ /*
+ * The cursor next hit a bug caused by a race in splits; move the cursor
+ * back to the last known good position and retry the next operation.
+ */
+ key->data = cbt->lastkey->data;
+ key->size = cbt->lastkey->size;
+ if ((ret = __wt_btcur_search(cbt)) != 0)
+ WT_RET_MSG(session, ret,
+ "WT-2307: searching for the previous key failed");
+
+ /* The search above recorded itself as the last op; set it back to next so a repeated retry is still checked. */
+ cbt->last_op[0] = WT_LASTOP_NEXT;
+
+ /* Return a duplicate key error to tell next it needs to retry. */
+ return (WT_DUPLICATE_KEY);
+}
+
+/*
* __wt_btcur_next --
* Move to the next record in the tree.
*/
@@ -452,7 +508,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
if (truncating)
LF_SET(WT_READ_TRUNCATE);
- WT_RET(__cursor_func_init(cbt, false));
+retry: WT_RET(__cursor_func_init(cbt, false));
/*
* If we aren't already iterating in the right direction, there's
@@ -531,6 +587,25 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+ /*
+ * WT-2307: check that the previously returned key sorts less than
+ * the key being returned now. If it doesn't, the cursor has been moved
+ * back to the previous key; retry the next operation.
+ */
+ if (ret == 0 && page->type == WT_PAGE_ROW_LEAF) {
+ if (cbt->last_op[0] == WT_LASTOP_NEXT &&
+ cbt->lastkey != NULL && cbt->lastkey->size != 0) {
+ ret = __key_order_check(session, cbt);
+ if (ret == WT_DUPLICATE_KEY)
+ goto retry;
+ WT_ERR(ret);
+ }
+
+ WT_ERR(__wt_buf_set(session,
+ cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
+ }
+ __wt_set_last_op(cbt, WT_LASTOP_NEXT);
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 1d23b976edd..5ce54f60648 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -532,6 +532,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
bool newpage;
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ __wt_set_last_op(cbt, WT_LASTOP_PREV);
WT_STAT_FAST_CONN_INCR(session, cursor_prev);
WT_STAT_FAST_DATA_INCR(session, cursor_prev);
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index cc3f5b11a9e..6f12b51c7fc 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -288,6 +288,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cbt->iface.session;
+ __wt_set_last_op(cbt, WT_LASTOP_RESET);
WT_STAT_FAST_CONN_INCR(session, cursor_reset);
WT_STAT_FAST_DATA_INCR(session, cursor_reset);
@@ -313,6 +314,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cursor->session;
upd = NULL; /* -Wuninitialized */
+ __wt_set_last_op(cbt, WT_LASTOP_SEARCH);
WT_STAT_FAST_CONN_INCR(session, cursor_search);
WT_STAT_FAST_DATA_INCR(session, cursor_search);
@@ -380,6 +382,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
upd = NULL; /* -Wuninitialized */
exact = 0;
+ __wt_set_last_op(cbt, WT_LASTOP_SEARCH_NEAR);
WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
@@ -487,6 +490,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_set_last_op(cbt, WT_LASTOP_INSERT);
WT_STAT_FAST_CONN_INCR(session, cursor_insert);
WT_STAT_FAST_DATA_INCR(session, cursor_insert);
WT_STAT_FAST_DATA_INCRV(session,
@@ -655,6 +659,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_set_last_op(cbt, WT_LASTOP_REMOVE);
WT_STAT_FAST_CONN_INCR(session, cursor_remove);
WT_STAT_FAST_DATA_INCR(session, cursor_remove);
WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
@@ -740,6 +745,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_set_last_op(cbt, WT_LASTOP_UPDATE);
WT_STAT_FAST_CONN_INCR(session, cursor_update);
WT_STAT_FAST_DATA_INCR(session, cursor_update);
WT_STAT_FAST_DATA_INCRV(
@@ -1159,6 +1165,8 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
cbt = (start != NULL) ? start : stop;
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = cbt->btree;
+
+ __wt_set_last_op(cbt, WT_LASTOP_TRUNCATE);
WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
/*
@@ -1233,6 +1241,8 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
{
cbt->row_key = &cbt->_row_key;
cbt->tmp = &cbt->_tmp;
+
+ cbt->lastkey = &cbt->_lastkey;
}
/*
@@ -1258,6 +1268,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
+ __wt_buf_free(session, &cbt->_lastkey);
return (ret);
}
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 13f18adab4a..db5774e5cb6 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -192,6 +192,22 @@ struct __wt_cursor_btree {
WT_UPDATE *modify_update;
/*
+ * WT-2307: tracking state for a bug where cursor next jumps backwards.
+ */
+ WT_ITEM *lastkey, _lastkey;
+
+#define WT_LASTOP_NEXT 1
+#define WT_LASTOP_PREV 2
+#define WT_LASTOP_RESET 3
+#define WT_LASTOP_SEARCH 4
+#define WT_LASTOP_SEARCH_NEAR 5
+#define WT_LASTOP_INSERT 6
+#define WT_LASTOP_TRUNCATE 7
+#define WT_LASTOP_UPDATE 8
+#define WT_LASTOP_REMOVE 9
+ uint8_t last_op[20]; /* Last 20 operations */
+
+ /*
* Fixed-length column-store items are a single byte, and it's simpler
* and cheaper to allocate the space for it now than keep checking to
* see if we need to grow the buffer.
diff --git a/src/include/extern.h b/src/include/extern.h
index b4e977595b4..b3257f8a045 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -91,6 +91,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt);
+extern void __wt_set_last_op(WT_CURSOR_BTREE *cbt, int v);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);