-rw-r--r--  src/third_party/wiredtiger/import.data                    2
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c           59
-rw-r--r--  src/third_party/wiredtiger/src/include/serial_inline.h    22
3 files changed, 78 insertions, 5 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 83f6f45a5a0..da6f0711304 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
  "vendor": "wiredtiger",
  "github": "wiredtiger/wiredtiger.git",
  "branch": "mongodb-master",
- "commit": "3e8c3bdd675df363540399db8c4e3b409db95bd8"
+ "commit": "3687d84457c5468542645aff23930a2a248a16ed"
 }
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index abd98bbd506..e42b207b0d8 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -102,7 +102,18 @@ __wt_search_insert(
     match = skiphigh = skiplow = 0;
     ins = last_ins = NULL;
     for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) {
-        if ((ins = *insp) == NULL) {
+        /*
+         * The algorithm requires that the skip list insert pointer is only read once within the
+         * loop. However, the compiler may transform the code so that it reads the insert pointer
+         * value from memory again in the code that follows.
+         *
+         * In addition, a CPU with weak memory ordering, such as ARM, may reorder the reads and
+         * observe a stale value. That is not OK; the reason is explained in the comment below.
+         *
+         * Place a read barrier here to avoid these issues.
+         */
+        WT_ORDERED_READ(ins, *insp);
+        if (ins == NULL) {
             cbt->next_stack[i] = NULL;
             cbt->ins_stack[i--] = insp--;
             continue;
@@ -116,6 +127,46 @@ __wt_search_insert(
             last_ins = ins;
             key.data = WT_INSERT_KEY(ins);
             key.size = WT_INSERT_KEY_SIZE(ins);
+            /*
+             * We have an optimization to reduce the number of bytes we need to compare during the
+             * search if we know a prefix of the search key matches the keys we have already
+             * compared on the upper stacks. This works because we know the keys become denser down
+             * the stack.
+             *
+             * However, things become tricky if we have another key inserted concurrently next to
+             * the search key. The current search may or may not see the concurrently inserted key,
+             * but it should always see a valid skip list. In other words,
+             *
+             * 1) at any level of the list, keys are in sorted order;
+             *
+             * 2) if a reader sees a key in level N, that key is also in all levels below N.
+             *
+             * Otherwise, we may wrongly skip the comparison of a prefix and land on the wrong spot.
+             * Here's an example:
+             *
+             * Suppose we have a skip list:
+             *
+             * L1: AA -> BA
+             *
+             * L0: AA -> BA
+             *
+             * and we want to search for AB while a key AC is inserted concurrently. If we see the
+             * following skip list in the search:
+             *
+             * L1: AA -> AC -> BA
+             *
+             * L0: AA -> BA
+             *
+             * Since we have compared with AA and AC on level 1 before dropping down to level 0, we
+             * decide we can skip comparing the first byte of the key. However, since we don't see
+             * AC on level 0, we compare with BA and wrongly skip the comparison with the prefix B.
+             *
+             * On architectures with strong memory ordering, the requirement is satisfied by
+             * inserting the new key into the skip list from the lower stack to the upper stack
+             * using an atomic compare-and-swap operation, which functions as a full barrier.
+             * However, that is not enough on architectures with weaker memory ordering, such as
+             * ARM. Therefore, an extra read barrier is needed for these platforms.
+             */
             match = WT_MIN(skiplow, skiphigh);
             WT_RET(__wt_compare_skip(session, collator, srch_key, &key, &cmp, &match));
         }
@@ -129,7 +180,11 @@ __wt_search_insert(
             skiphigh = match;
         } else
             for (; i >= 0; i--) {
-                cbt->next_stack[i] = ins->next[i];
+                /*
+                 * It is possible that we read an old value down the stack due to CPU read
+                 * reordering. Add a read barrier to avoid this issue.
+                 */
+                WT_ORDERED_READ(cbt->next_stack[i], ins->next[i]);
                 cbt->ins_stack[i] = &ins->next[i];
             }
     }
diff --git a/src/third_party/wiredtiger/src/include/serial_inline.h b/src/third_party/wiredtiger/src/include/serial_inline.h
index ced6d3f7373..252a0190398 100644
--- a/src/third_party/wiredtiger/src/include/serial_inline.h
+++ b/src/third_party/wiredtiger/src/include/serial_inline.h
@@ -14,6 +14,7 @@ static inline int
 __insert_simple_func(
   WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
 {
+    WT_INSERT *old_ins;
     u_int i;
 
     WT_UNUSED(session);
@@ -29,7 +30,15 @@ __insert_simple_func(
      * implementations read the old value multiple times.
      */
     for (i = 0; i < skipdepth; i++) {
-        WT_INSERT *old_ins = *ins_stack[i];
+        /*
+         * The insert stack position must be read only once - if the compiler chooses to re-read
+         * the shared variable, it could lead to skip list corruption. Specifically, the comparison
+         * against the next pointer might indicate that the skip list location is still valid, but
+         * that may no longer be true when the atomic_cas operation executes.
+         *
+         * Place a read barrier here to avoid this issue.
+         */
+        WT_ORDERED_READ(old_ins, *ins_stack[i]);
         if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
             return (i == 0 ? WT_RESTART : 0);
     }
@@ -45,6 +54,7 @@ static inline int
 __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack,
   WT_INSERT *new_ins, u_int skipdepth)
 {
+    WT_INSERT *old_ins;
     u_int i;
 
     /* The cursor should be positioned. */
@@ -63,7 +73,15 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSE
      * implementations read the old value multiple times.
      */
     for (i = 0; i < skipdepth; i++) {
-        WT_INSERT *old_ins = *ins_stack[i];
+        /*
+         * The insert stack position must be read only once - if the compiler chooses to re-read
+         * the shared variable, it could lead to skip list corruption. Specifically, the comparison
+         * against the next pointer might indicate that the skip list location is still valid, but
+         * that may no longer be true when the atomic_cas operation executes.
+         *
+         * Place a read barrier here to avoid this issue.
+         */
+        WT_ORDERED_READ(old_ins, *ins_stack[i]);
         if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
             return (i == 0 ? WT_RESTART : 0);
         if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i])
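
For illustration only (this is not part of the patch and not WiredTiger code): a minimal, self-contained C11 sketch of the two patterns the change depends on, namely reading each skip list forward pointer exactly once with acquire semantics during search, and linking a new node level by level with a compare-and-swap. The names (node, search_ge, insert_level, MAXDEPTH) are hypothetical; WiredTiger itself uses WT_ORDERED_READ and __wt_atomic_cas_ptr rather than C11 atomics.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MAXDEPTH 4

struct node {
    int key;
    _Atomic(struct node *) next[MAXDEPTH];
};

/*
 * Search one level at a time, loading each forward pointer exactly once with
 * acquire semantics so that neither the compiler nor a weakly ordered CPU
 * (e.g. ARM) can hand back a re-read or stale value between the load and the
 * key comparison that follows. Returns the first node with key >= the search
 * key, or NULL. `head` is assumed to be a sentinel node with no key.
 */
static struct node *
search_ge(struct node *head, int key)
{
    struct node *cur, *next;
    int i;

    cur = head;
    for (i = MAXDEPTH - 1; i >= 0; i--)
        for (;;) {
            /* One acquire load per step; never re-read the slot. */
            next = atomic_load_explicit(&cur->next[i], memory_order_acquire);
            if (next == NULL || next->key >= key)
                break;
            cur = next;
        }
    return (atomic_load_explicit(&cur->next[0], memory_order_acquire));
}

/*
 * Link `new_node` into one level, called for level 0 first and upper levels
 * after. The re-check of `expected` fails (so the caller can restart) if a
 * concurrent insert raced us at this level, and the release CAS publishes the
 * node only after its lower-level links are already in place.
 */
static bool
insert_level(_Atomic(struct node *) *slot, struct node *expected, struct node *new_node)
{
    return atomic_compare_exchange_strong_explicit(
        slot, &expected, new_node, memory_order_release, memory_order_relaxed);
}

In this sketch the acquire load plays the role WT_ORDERED_READ plays in the patch, and the release compare-and-swap the role of __wt_atomic_cas_ptr: the traversal never re-reads a forward pointer, and a node becomes visible at level N only after its links at the levels below N are established.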