author	Etienne Petrel <etienne.petrel@mongodb.com>	2023-02-07 03:53:15 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-02-07 04:46:39 +0000
commit	d1ec31a870e6189634389f8a1c4dac293c9352f5 (patch)
tree	1a21fd48ba41c9459b6b2a55803fc3b2e3aa0b97
parent	c03da2a6212a0fc6ce55b7a0ba62a12177072688 (diff)
download	mongo-d1ec31a870e6189634389f8a1c4dac293c9352f5.tar.gz
Import wiredtiger: 3687d84457c5468542645aff23930a2a248a16ed from branch mongodb-master
ref: 3e8c3bdd67..3687d84457 for: 6.3.0-rc0 WT-10461 Fix key out of order in skip list on weakly ordered architecture
-rw-r--r--	src/third_party/wiredtiger/import.data	2
-rw-r--r--	src/third_party/wiredtiger/src/btree/row_srch.c	59
-rw-r--r--	src/third_party/wiredtiger/src/include/serial_inline.h	22
3 files changed, 78 insertions, 5 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 83f6f45a5a0..da6f0711304 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "3e8c3bdd675df363540399db8c4e3b409db95bd8"
+ "commit": "3687d84457c5468542645aff23930a2a248a16ed"
}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index abd98bbd506..e42b207b0d8 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -102,7 +102,18 @@ __wt_search_insert(
match = skiphigh = skiplow = 0;
ins = last_ins = NULL;
for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) {
- if ((ins = *insp) == NULL) {
+ /*
+ * The algorithm requires that the skip list insert pointer is read only once within the
+ * loop; without a barrier, the compiler is free to transform the code so that the pointer
+ * is re-read from memory in the code that follows.
+ *
+ * In addition, a CPU with weak memory ordering, such as ARM, may reorder the reads and
+ * observe a stale value, violating the requirement explained in the large comment below.
+ *
+ * Place a read barrier here to avoid both issues.
+ */
+ WT_ORDERED_READ(ins, *insp);
+ if (ins == NULL) {
cbt->next_stack[i] = NULL;
cbt->ins_stack[i--] = insp--;
continue;
@@ -116,6 +127,46 @@ __wt_search_insert(
last_ins = ins;
key.data = WT_INSERT_KEY(ins);
key.size = WT_INSERT_KEY_SIZE(ins);
+ /*
+ * We have an optimization to reduce the number of bytes we need to compare during the
+ * search if we know a prefix of the search key matches the keys we have already
+ * compared on the upper stacks. This works because we know the keys become denser down
+ * the stack.
+ *
+ * However, things become tricky if we have another key inserted concurrently next to
+ * the search key. The current search may or may not see the concurrently inserted key
+ * but it should always see a valid skip list. In other words,
+ *
+ * 1) at any level of the list, keys are in sorted order;
+ *
+ * 2) if a reader sees a key in level N, that key is also in all levels below N.
+ *
+ * Otherwise, we may wrongly skip the comparison of a prefix and land on the wrong spot.
+ * Here's an example:
+ *
+ * Suppose we have a skip list:
+ *
+ * L1: AA -> BA
+ *
+ * L0: AA -> BA
+ *
+ * and we search for AB while a key AC is concurrently inserted. If the search sees the
+ * following skip list:
+ *
+ * L1: AA -> AC -> BA
+ *
+ * L0: AA -> BA
+ *
+ * Since we have compared against AA and AC on level 1 before dropping down to level 0,
+ * we decide we can skip comparing the first byte of the key. However, because we do not
+ * see AC on level 0, we compare against BA from the second byte onward, never notice
+ * that BA's first byte already differs, and land on the wrong spot.
+ *
+ * On architectures with strong memory ordering, this requirement is satisfied by
+ * inserting the new key into the skip list from the lowest level to the highest with an
+ * atomic compare-and-swap, which acts as a full barrier. That is not enough on weakly
+ * ordered architectures such as ARM, so an extra read barrier is needed there.
+ */
match = WT_MIN(skiplow, skiphigh);
WT_RET(__wt_compare_skip(session, collator, srch_key, &key, &cmp, &match));
}
@@ -129,7 +180,11 @@ __wt_search_insert(
skiphigh = match;
} else
for (; i >= 0; i--) {
- cbt->next_stack[i] = ins->next[i];
+ /*
+ * Due to CPU read reordering, we might otherwise read a stale next pointer for the
+ * lower levels. Add a read barrier to avoid this issue.
+ */
+ WT_ORDERED_READ(cbt->next_stack[i], ins->next[i]);
cbt->ins_stack[i] = &ins->next[i];
}
}
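
To make the pattern above concrete, the following is a minimal, self-contained sketch of a
search loop with the same two properties: each forward pointer is read exactly once per
iteration with an ordered load, and the prefix-skip state (skiplow/skiphigh) is carried
across levels. This is not the WiredTiger implementation: the node, list, and stack types,
the byte-string keys, and the function names are invented for illustration, a C11 acquire
load stands in for WT_ORDERED_READ, and the exact-match case is folded into the drop-down
branch for brevity.

#include <stdatomic.h>
#include <stddef.h>

#define MAXDEPTH 10

struct node {                                   /* Hypothetical skip-list node. */
    const unsigned char *key;
    size_t key_size;
    _Atomic(struct node *) next[MAXDEPTH];
};

struct list {
    _Atomic(struct node *) head[MAXDEPTH];
};

/* Compare byte strings, assuming the first *matchp bytes are already known to be equal. */
static int
compare_skip(
    const unsigned char *a, size_t alen, const unsigned char *b, size_t blen, size_t *matchp)
{
    size_t i, len;

    for (i = *matchp, len = alen < blen ? alen : blen; i < len; i++)
        if (a[i] != b[i]) {
            *matchp = i;
            return (a[i] < b[i] ? -1 : 1);
        }
    *matchp = len;
    return (alen == blen ? 0 : (alen < blen ? -1 : 1));
}

/*
 * Fill ins_stack[] with, for each level, the address of the pointer an insert of srch_key
 * would have to update. Each forward pointer is read exactly once per loop iteration, with
 * acquire ordering, so a concurrently published node is either seen fully initialized or
 * not seen at all.
 */
static void
search_insert(struct list *l, const unsigned char *srch_key, size_t srch_size,
    _Atomic(struct node *) *ins_stack[MAXDEPTH])
{
    _Atomic(struct node *) *insp;
    struct node *ins, *last_ins;
    size_t match, skiphigh, skiplow;
    int cmp, i;

    cmp = 0;
    match = skiphigh = skiplow = 0;
    last_ins = NULL;
    for (i = MAXDEPTH - 1, insp = &l->head[i]; i >= 0;) {
        /* The single ordered read of this level's forward pointer. */
        ins = atomic_load_explicit(insp, memory_order_acquire);
        if (ins == NULL) {                      /* End of this level: drop down. */
            ins_stack[i--] = insp--;
            continue;
        }
        if (ins != last_ins) {                  /* Don't repeat a comparison. */
            last_ins = ins;
            /* Skip the prefix already known to match from the levels above. */
            match = skiplow < skiphigh ? skiplow : skiphigh;
            cmp = compare_skip(srch_key, srch_size, ins->key, ins->key_size, &match);
        }
        if (cmp > 0) {                          /* Keep going at this level. */
            insp = &ins->next[i];
            skiplow = match;
        } else {                                /* At or past the key: drop down. */
            ins_stack[i--] = insp--;
            skiphigh = match;
        }
    }
}

The ordered loads are what let the prefix-skip optimization trust invariant (2) from the
comment above; the matching publish side is sketched after the serial_inline.h diff below.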
diff --git a/src/third_party/wiredtiger/src/include/serial_inline.h b/src/third_party/wiredtiger/src/include/serial_inline.h
index ced6d3f7373..252a0190398 100644
--- a/src/third_party/wiredtiger/src/include/serial_inline.h
+++ b/src/third_party/wiredtiger/src/include/serial_inline.h
@@ -14,6 +14,7 @@ static inline int
__insert_simple_func(
WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
{
+ WT_INSERT *old_ins;
u_int i;
WT_UNUSED(session);
@@ -29,7 +30,15 @@ __insert_simple_func(
* implementations read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
- WT_INSERT *old_ins = *ins_stack[i];
+ /*
+ * The insert stack position must be read only once: if the compiler chooses to re-read
+ * the shared variable, it could lead to skip list corruption. Specifically, the
+ * comparison against the next pointer might indicate that the skip list location is
+ * still valid, but that may no longer be true when the atomic_cas operation executes.
+ *
+ * Place a read barrier here to avoid this issue.
+ */
+ WT_ORDERED_READ(old_ins, *ins_stack[i]);
if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
return (i == 0 ? WT_RESTART : 0);
}
@@ -45,6 +54,7 @@ static inline int
__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack,
WT_INSERT *new_ins, u_int skipdepth)
{
+ WT_INSERT *old_ins;
u_int i;
/* The cursor should be positioned. */
@@ -63,7 +73,15 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSE
* implementations read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
- WT_INSERT *old_ins = *ins_stack[i];
+ /*
+ * The insert stack position must be read only once: if the compiler chooses to re-read
+ * the shared variable, it could lead to skip list corruption. Specifically, the
+ * comparison against the next pointer might indicate that the skip list location is
+ * still valid, but that may no longer be true when the atomic_cas operation executes.
+ *
+ * Place a read barrier here to avoid this issue.
+ */
+ WT_ORDERED_READ(old_ins, *ins_stack[i]);
if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
return (i == 0 ? WT_RESTART : 0);
if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i])
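
For completeness, here is a matching sketch of the publish side that the serial_inline.h
hunks guard. Again, this is not the WiredTiger code: atomic_compare_exchange_strong stands
in for __wt_atomic_cas_ptr, the acquire load for WT_ORDERED_READ, the struct node type
repeats the hypothetical one from the previous sketch so the block stands alone, and the
WT_RESTART/0 return convention is simplified to a bool.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MAXDEPTH 10

struct node {                                   /* Same hypothetical node as above. */
    const unsigned char *key;
    size_t key_size;
    _Atomic(struct node *) next[MAXDEPTH];
};

/*
 * Splice new_ins into the list, bottom level first. ins_stack[i] is the address of the
 * pointer to update on level i (filled in by the search), and new_ins->next[i] was
 * pre-filled by the caller with the successor the search saw on that level. Returns false
 * if the bottom level changed underneath us and the whole operation must be retried.
 */
static bool
insert_simple(_Atomic(struct node *) *ins_stack[], struct node *new_ins, unsigned skipdepth)
{
    struct node *old_ins;
    unsigned i;

    for (i = 0; i < skipdepth; i++) {
        /*
         * Read the shared slot exactly once. If the compiler re-read it between the
         * equality check and the compare-and-swap, the two could observe different
         * values and corrupt the list, which is what the patch's comment describes.
         */
        old_ins = atomic_load_explicit(ins_stack[i], memory_order_acquire);

        /*
         * If the slot no longer points at the successor we expected, or the CAS loses a
         * race, stop. Failing on level 0 means the node is not linked at all and the
         * caller must retry; failing on a higher level just leaves the node shorter than
         * intended, which is harmless. The (sequentially consistent) CAS also provides
         * the release ordering that publishes the node's contents, matching the
         * "functions as a full barrier" remark in the row_srch.c comment.
         */
        if (old_ins != atomic_load_explicit(&new_ins->next[i], memory_order_relaxed) ||
            !atomic_compare_exchange_strong(ins_stack[i], &old_ins, new_ins))
            return (i != 0);
    }
    return (true);
}

Reading ins_stack[i] once, checking it against the successor recorded in new_ins->next[i],
and only then attempting the CAS mirrors the loops in __insert_simple_func and
__insert_serial_func; the patch's read barrier pins down the "read once" part against both
compiler and CPU reordering.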