-rw-r--r--  src/third_party/wiredtiger/import.data                    2
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c           59
-rw-r--r--  src/third_party/wiredtiger/src/include/serial_inline.h    22
3 files changed, 78 insertions, 5 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 83f6f45a5a0..da6f0711304 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
  "vendor": "wiredtiger",
  "github": "wiredtiger/wiredtiger.git",
  "branch": "mongodb-master",
- "commit": "3e8c3bdd675df363540399db8c4e3b409db95bd8"
+ "commit": "3687d84457c5468542645aff23930a2a248a16ed"
 }
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index abd98bbd506..e42b207b0d8 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -102,7 +102,18 @@ __wt_search_insert(
     match = skiphigh = skiplow = 0;
     ins = last_ins = NULL;
     for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) {
-        if ((ins = *insp) == NULL) {
+        /*
+         * The algorithm requires that the skip list insert pointer is only read once within the
+         * loop. However, the compiler may transform the code so that it reads the insert pointer
+         * value from memory again in the code that follows.
+         *
+         * In addition, a CPU with weak memory ordering, such as ARM, may reorder the reads and
+         * observe a stale value. That is not OK; the reason is explained in the comment below.
+         *
+         * Place a read barrier here to avoid these issues.
+         */
+        WT_ORDERED_READ(ins, *insp);
+        if (ins == NULL) {
             cbt->next_stack[i] = NULL;
             cbt->ins_stack[i--] = insp--;
             continue;
@@ -116,6 +127,46 @@ __wt_search_insert(
             last_ins = ins;
             key.data = WT_INSERT_KEY(ins);
             key.size = WT_INSERT_KEY_SIZE(ins);
+            /*
+             * We have an optimization to reduce the number of bytes we need to compare during the
+             * search if we know a prefix of the search key matches the keys we have already
+             * compared on the upper stacks. This works because we know the keys become denser down
+             * the stack.
+             *
+             * However, things become tricky if we have another key inserted concurrently next to
+             * the search key. The current search may or may not see the concurrently inserted key,
+             * but it should always see a valid skip list. In other words,
+             *
+             * 1) at any level of the list, keys are in sorted order;
+             *
+             * 2) if a reader sees a key in level N, that key is also in all levels below N.
+             *
+             * Otherwise, we may wrongly skip the comparison of a prefix and land on the wrong spot.
+             * Here's an example:
+             *
+             * Suppose we have a skip list:
+             *
+             * L1: AA -> BA
+             *
+             * L0: AA -> BA
+             *
+             * and we want to search for AB while a key AC is inserted concurrently. If we see the
+             * following skip list in the search:
+             *
+             * L1: AA -> AC -> BA
+             *
+             * L0: AA -> BA
+             *
+             * Since we have compared with AA and AC on level 1 before dropping down to level 0, we
+             * decide we can skip comparing the first byte of the key. However, since we don't see
+             * AC on level 0, we compare with BA and wrongly skip the comparison with the prefix B.
+             *
+             * On architectures with strong memory ordering, the requirement is satisfied by
+             * inserting the new key into the skip list from the lower stack to the upper stack
+             * using an atomic compare-and-swap operation, which functions as a full barrier.
+             * However, that is not enough on architectures with weaker memory ordering, such as
+             * ARM. Therefore, an extra read barrier is needed for these platforms.
+             */
             match = WT_MIN(skiplow, skiphigh);
             WT_RET(__wt_compare_skip(session, collator, srch_key, &key, &cmp, &match));
         }
@@ -129,7 +180,11 @@ __wt_search_insert(
             skiphigh = match;
         } else
             for (; i >= 0; i--) {
-                cbt->next_stack[i] = ins->next[i];
+                /*
+                 * It is possible that we read an old value down the stack due to CPU read
+                 * reordering. Add a read barrier to avoid this issue.
+                 */
+                WT_ORDERED_READ(cbt->next_stack[i], ins->next[i]);
                 cbt->ins_stack[i] = &ins->next[i];
             }
     }
diff --git a/src/third_party/wiredtiger/src/include/serial_inline.h b/src/third_party/wiredtiger/src/include/serial_inline.h
index ced6d3f7373..252a0190398 100644
--- a/src/third_party/wiredtiger/src/include/serial_inline.h
+++ b/src/third_party/wiredtiger/src/include/serial_inline.h
@@ -14,6 +14,7 @@ static inline int
 __insert_simple_func(
   WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
 {
+    WT_INSERT *old_ins;
     u_int i;
 
     WT_UNUSED(session);
@@ -29,7 +30,15 @@ __insert_simple_func(
      * implementations read the old value multiple times.
      */
     for (i = 0; i < skipdepth; i++) {
-        WT_INSERT *old_ins = *ins_stack[i];
+        /*
+         * The insert stack position must be read only once - if the compiler chooses to re-read
+         * the shared variable, it could lead to skip list corruption. Specifically, the comparison
+         * against the next pointer might indicate that the skip list location is still valid, but
+         * that may no longer be true when the atomic_cas operation executes.
+         *
+         * Place a read barrier here to avoid this issue.
+         */
+        WT_ORDERED_READ(old_ins, *ins_stack[i]);
         if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
             return (i == 0 ? WT_RESTART : 0);
     }
@@ -45,6 +54,7 @@ static inline int
 __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack,
   WT_INSERT *new_ins, u_int skipdepth)
 {
+    WT_INSERT *old_ins;
     u_int i;
 
     /* The cursor should be positioned. */
@@ -63,7 +73,15 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_INSE
      * implementations read the old value multiple times.
      */
     for (i = 0; i < skipdepth; i++) {
-        WT_INSERT *old_ins = *ins_stack[i];
+        /*
+         * The insert stack position must be read only once - if the compiler chooses to re-read
+         * the shared variable, it could lead to skip list corruption. Specifically, the comparison
+         * against the next pointer might indicate that the skip list location is still valid, but
+         * that may no longer be true when the atomic_cas operation executes.
+         *
+         * Place a read barrier here to avoid this issue.
+         */
+        WT_ORDERED_READ(old_ins, *ins_stack[i]);
         if (old_ins != new_ins->next[i] || !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
             return (i == 0 ? WT_RESTART : 0);
         if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i])
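
For illustration only (this is not part of the patch and not WiredTiger code): a minimal, self-contained C11 sketch of the two patterns the change depends on, namely reading each skip list forward pointer exactly once with acquire semantics during search, and linking a new node level by level with a compare-and-swap. The names (node, search_ge, insert_level, MAXDEPTH) are hypothetical; WiredTiger itself uses WT_ORDERED_READ and __wt_atomic_cas_ptr rather than C11 atomics.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MAXDEPTH 4

struct node {
    int key;
    _Atomic(struct node *) next[MAXDEPTH];
};

/*
 * Search one level at a time, loading each forward pointer exactly once with
 * acquire semantics so that neither the compiler nor a weakly ordered CPU
 * (e.g. ARM) can hand back a re-read or stale value between the load and the
 * key comparison that follows. Returns the first node with key >= the search
 * key, or NULL. `head` is assumed to be a sentinel node with no key.
 */
static struct node *
search_ge(struct node *head, int key)
{
    struct node *cur, *next;
    int i;

    cur = head;
    for (i = MAXDEPTH - 1; i >= 0; i--)
        for (;;) {
            /* One acquire load per step; never re-read the slot. */
            next = atomic_load_explicit(&cur->next[i], memory_order_acquire);
            if (next == NULL || next->key >= key)
                break;
            cur = next;
        }
    return (atomic_load_explicit(&cur->next[0], memory_order_acquire));
}

/*
 * Link `new_node` into one level, called for level 0 first and upper levels
 * after. The re-check of `expected` fails (so the caller can restart) if a
 * concurrent insert raced us at this level, and the release CAS publishes the
 * node only after its lower-level links are already in place.
 */
static bool
insert_level(_Atomic(struct node *) *slot, struct node *expected, struct node *new_node)
{
    return atomic_compare_exchange_strong_explicit(
        slot, &expected, new_node, memory_order_release, memory_order_relaxed);
}

In this sketch the acquire load plays the role WT_ORDERED_READ plays in the patch, and the release compare-and-swap the role of __wt_atomic_cas_ptr: the traversal never re-reads a forward pointer, and a node becomes visible at level N only after its links at the levels below N are established.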