summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2015-10-01 16:45:32 +1000
committerMichael Cahill <michael.cahill@mongodb.com>2015-10-01 16:45:32 +1000
commit3e911933d058aa5842af77240fb0ac5027b1468d (patch)
treef8db5f2679bd2bff45d2db1ba1512678da4ec4a5
parentfc51ae17d2dab4d76e4bac3cbbc1f0eb5bb29b1a (diff)
parent79f74e505ae7b15c3c695cdc72f71e4f9a105647 (diff)
downloadmongo-3e911933d058aa5842af77240fb0ac5027b1468d.tar.gz
Merge branch 'develop' into log-zero
-rw-r--r--bench/wtperf/runners/multi-btree-stress.wtperf17
-rw-r--r--bench/wtperf/wtperf.c31
-rw-r--r--dist/s_string.ok1
-rw-r--r--src/btree/bt_compact.c6
-rw-r--r--src/btree/bt_debug.c7
-rw-r--r--src/btree/bt_discard.c2
-rw-r--r--src/btree/bt_split.c47
-rw-r--r--src/conn/conn_log.c19
-rw-r--r--src/evict/evict_lru.c66
-rw-r--r--src/include/btmem.h9
-rw-r--r--src/include/connection.h13
-rw-r--r--src/include/hardware.h10
-rw-r--r--src/include/mutex.h18
-rw-r--r--src/include/mutex.i88
-rw-r--r--src/include/serial.i5
-rw-r--r--src/include/wt_internal.h2
-rw-r--r--src/reconcile/rec_write.c4
17 files changed, 236 insertions, 109 deletions
diff --git a/bench/wtperf/runners/multi-btree-stress.wtperf b/bench/wtperf/runners/multi-btree-stress.wtperf
new file mode 100644
index 00000000000..b10b08f6035
--- /dev/null
+++ b/bench/wtperf/runners/multi-btree-stress.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: multi-database configuration attempting to
+# trigger slow operations by overloading CPU and disk.
+# References Jira WT-2131
+conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=2),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)"
+table_config="allocation_size=4k,prefix_compression=false,split_pct=75,leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file"
+# Divide original icount by database_count.
+database_count=5
+icount=50000
+populate_threads=1
+random_range=50000000
+report_interval=5
+run_time=3600
+threads=((count=1,inserts=1),(count=10,reads=1))
+value_sz=100
+max_latency=1000
+sample_interval=5
+sample_rate=1
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 8dceeab2832..0d30bb6b2b7 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -600,7 +600,34 @@ worker(void *arg)
if (ret == WT_NOTFOUND)
break;
-op_err: lprintf(cfg, ret, 0,
+op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) {
+ /*
+ * If we are running with explicit transactions
+ * configured and we hit a WT_ROLLBACK, then we
+ * should rollback the current transaction and
+ * attempt to continue.
+ * This does break the guarantee of insertion
+ * order in cases of ordered inserts, as we
+ * aren't retrying here.
+ */
+ lprintf(cfg, ret, 1,
+ "%s for: %s, range: %"PRIu64, op_name(op),
+ key_buf, wtperf_value_range(cfg));
+ if ((ret = session->rollback_transaction(
+ session, NULL)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Failed rollback_transaction");
+ goto err;
+ }
+ if ((ret = session->begin_transaction(
+ session, NULL)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Worker begin transaction failed");
+ goto err;
+ }
+ break;
+ }
+ lprintf(cfg, ret, 0,
"%s failed for: %s, range: %"PRIu64,
op_name(op), key_buf, wtperf_value_range(cfg));
goto err;
@@ -644,7 +671,7 @@ op_err: lprintf(cfg, ret, 0,
if ((ret = session->begin_transaction(
session, NULL)) != 0) {
lprintf(cfg, ret, 0,
- "Worker transaction commit failed");
+ "Worker begin transaction failed");
goto err;
}
}
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 021e222919e..d234a3c101f 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -646,6 +646,7 @@ intrin
inuse
io
ip
+islocked
ispo
iteratively
jnr
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 18b6860c758..b2c9e4b67f8 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -55,10 +55,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
* The page's modification information can change underfoot if
* the page is being reconciled, serialize with reconciliation.
*/
- F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_lock(session, &page->page_lock));
+
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+
+ WT_TRET(__wt_fair_unlock(session, &page->page_lock));
WT_RET(ret);
}
return (0);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index ee2898f60be..15ae93522a7 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -636,7 +636,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ": %s\n", __wt_page_type_string(page->type));
__dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries);
- __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean");
+ __dmsg(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean");
+ __dmsg(ds, ", %s", __wt_fair_islocked(
+ session, &page->page_lock) ? "locked" : "unlocked");
+
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
__dmsg(ds, ", keys-built");
if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
@@ -647,8 +650,6 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", evict-lru");
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
__dmsg(ds, ", overflow-keys");
- if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
- __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index c27d42d38f4..998667e3e1f 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -55,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION));
+ WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));
#ifdef HAVE_DIAGNOSTIC
{
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 29153ced178..adda9145ee4 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -866,6 +866,18 @@ __split_parent_lock(
*parentp = NULL;
/*
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we want to split
+ * the parent, give up to avoid that deadlock.
+ */
+ if (S2BT(session)->checkpointing != WT_CKPT_OFF)
+ return (EBUSY);
+
+ /*
* Get a page-level lock on the parent to single-thread splits into the
* page because we need to single-thread sizing/growing the page index.
* It's OK to queue up multiple splits as the child pages split, but the
@@ -882,32 +894,11 @@ __split_parent_lock(
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret);
- if (ret == 0) {
- /*
- * We can race with another thread deepening our parent.
- * To deal with that, read the parent pointer each time
- * we try to lock it, and check it's still correct after
- * it's locked.
- */
- if (parent == ref->home)
- break;
- F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
- continue;
- }
-
- /*
- * A checkpoint reconciling this parent page can deadlock with
- * our split. We have an exclusive page lock on the child before
- * we acquire the page's reconciliation lock, and reconciliation
- * acquires the page's reconciliation lock before it encounters
- * the child's exclusive lock (which causes reconciliation to
- * loop until the exclusive lock is resolved). If we can't lock
- * the parent, give up to avoid that deadlock.
- */
- if (S2BT(session)->checkpointing != WT_CKPT_OFF)
- return (EBUSY);
- __wt_yield();
+ WT_RET(__wt_fair_lock(session, &parent->page_lock));
+ if (parent == ref->home)
+ break;
+ /* Try again if the page deepened while we were waiting. */
+ WT_RET(__wt_fair_unlock(session, &parent->page_lock));
}
/*
@@ -930,7 +921,7 @@ __split_parent_lock(
*parentp = parent;
return (0);
-err: F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
+err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
return (ret);
}
@@ -946,7 +937,7 @@ __split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent);
- F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
+ WT_TRET(__wt_fair_unlock(session, &parent->page_lock));
return (ret);
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 41420462f6e..6f9021ccfc3 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -50,15 +50,14 @@ __logmgr_config(
conn = S2C(session);
- WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
/*
* If we're reconfiguring, enabled must match the already
* existing setting.
- */
- /*
+ *
+ * If it is off and the user is turning it on, or it is on
* and the user is turning it off, return an error.
*/
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
if (reconfig &&
((cval.val != 0 &&
!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
@@ -104,20 +103,19 @@ __logmgr_config(
log_max_filesize, conn->log_file_max);
}
- WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
/*
* If pre-allocation is configured, set the initial number to a few.
* We'll adapt as load dictates.
*/
- if (cval.val != 0) {
- FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC);
+ WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
+ if (cval.val != 0)
conn->log_prealloc = 5;
- }
- WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
+
/*
* Note that it is meaningless to reconfigure this value during
* runtime. It only matters on create before recovery runs.
*/
+ WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len))
FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);
@@ -858,11 +856,6 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
&conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
conn->log_wrlsn_tid_set = true;
- /* If no log thread services are configured, we're done. */
- if (!FLD_ISSET(conn->log_flags,
- (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
- return (0);
-
/*
* If a log server thread exists, the user may have reconfigured
* archiving or pre-allocation. Signal the thread. Otherwise the
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 3c00ee30896..d0cc60b583d 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1457,15 +1457,12 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
WT_DECL_RET;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- int count;
- bool q_found, txn_busy;
+ uint64_t init_evict_count, max_pages_evicted;
+ bool txn_busy;
conn = S2C(session);
cache = conn->cache;
- /* First, wake the eviction server. */
- WT_RET(__wt_evict_server_wake(session));
-
/*
* If the current transaction is keeping the oldest ID pinned, it is in
* the middle of an operation. This may prevent the oldest ID from
@@ -1479,11 +1476,15 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
session->nhazard > 0 ||
(txn_state->snap_min != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id);
- if (txn_busy) {
- if (pct_full < 100)
- return (0);
- busy = true;
- }
+
+ if (txn_busy && pct_full < 100)
+ return (0);
+
+ if (busy == 1)
+ txn_busy = 1;
+
+ /* Wake the eviction server if we need to do work. */
+ WT_RET(__wt_evict_server_wake(session));
/*
* If we're busy, either because of the transaction check we just did,
@@ -1491,9 +1492,11 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
* as a page read), limit the work to a single eviction and return. If
* that's not the case, we can do more.
*/
- count = busy ? 1 : 10;
+ init_evict_count = cache->pages_evict;
for (;;) {
+ max_pages_evicted = txn_busy ? 5 : 20;
+
/*
* A pathological case: if we're the oldest transaction in the
* system and the eviction server is stuck trying to find space,
@@ -1507,43 +1510,34 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
return (WT_ROLLBACK);
}
+ /* See if eviction is still needed. */
+ if (!__wt_eviction_needed(session, NULL) ||
+ cache->pages_evict > init_evict_count + max_pages_evicted)
+ return (0);
+
/* Evict a page. */
- q_found = false;
switch (ret = __evict_page(session, false)) {
case 0:
cache->app_evicts++;
- if (--count == 0)
+ if (txn_busy)
return (0);
-
- q_found = true;
- break;
+ /* FALLTHROUGH */
case EBUSY:
- continue;
+ break;
case WT_NOTFOUND:
+ /* Allow the queue to re-populate before retrying. */
+ WT_RET(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 100000));
+ cache->app_waits++;
break;
default:
return (ret);
}
- /* See if eviction is still needed. */
- if (!__wt_eviction_needed(session, NULL))
- return (0);
-
- /* If we found pages in the eviction queue, continue there. */
- if (q_found)
- continue;
-
- /* Wait for the queue to re-populate before trying again. */
- WT_RET(
- __wt_cond_wait(session, cache->evict_waiter_cond, 100000));
-
- cache->app_waits++;
- /* Check if things have changed so that we are busy. */
- if (!busy && txn_state->snap_min != WT_TXN_NONE &&
- txn_global->current != txn_global->oldest_id) {
- busy = true;
- count = 1;
- }
+ /* Check if we have become busy. */
+ if (!txn_busy && txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id)
+ txn_busy = true;
}
/* NOTREACHED */
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 0302533bb04..41b2c98f9e8 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -578,8 +578,7 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_RECONCILIATION 0x20 /* Page reconciliation lock */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -603,6 +602,12 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
+ /*
+ * Used to protect and co-ordinate splits for internal pages and
+ * reconciliation for all pages.
+ */
+ WT_FAIR_LOCK page_lock;
+
size_t memory_footprint; /* Memory attached to the page */
/* Page's on-disk representation: NULL for pages created in memory. */
diff --git a/src/include/connection.h b/src/include/connection.h
index 0273414f42e..61ef9e2391c 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -336,13 +336,12 @@ struct __wt_connection_impl {
const char *stat_stamp; /* Statistics log entry timestamp */
uint64_t stat_usecs; /* Statistics log period */
-#define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */
-#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
-#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
-#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
-#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */
-#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */
-#define WT_CONN_LOG_ZERO_FILL 0x40 /* Manually zero files */
+#define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */
+#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
+#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
+#define WT_CONN_LOG_RECOVER_DONE 0x08 /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */
+#define WT_CONN_LOG_ZERO_FILL 0x20 /* Manually zero files */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
WT_SESSION_IMPL *log_session; /* Log server session */
diff --git a/src/include/hardware.h b/src/include/hardware.h
index 32353072c5b..c9b72f8a609 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -50,16 +50,6 @@
&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
-#define F_CAS_ATOMIC_WAIT(p, mask) do { \
- int __ret; \
- for (;;) { \
- F_CAS_ATOMIC(p, mask, __ret); \
- if (__ret == 0) \
- break; \
- __wt_yield(); \
- } \
-} while (0)
-
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 1f1bb8f4b5c..b67e5e610e8 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -52,6 +52,24 @@ struct __wt_rwlock {
};
/*
+ * A lightweight lock that can be used to replace spinlocks if fairness is
+ * necessary. Implements a ticket-based back off spin lock.
+ * The fields are available as a union to allow for atomically setting
+ * the state of the entire lock.
+ */
+struct __wt_fair_lock {
+ union {
+ uint32_t lock;
+ struct {
+ uint16_t owner; /* Ticket for current owner */
+ uint16_t waiter; /* Last allocated ticket */
+ } s;
+ } u;
+#define fair_lock_owner u.s.owner
+#define fair_lock_waiter u.s.waiter
+};
+
+/*
* Spin locks:
*
* WiredTiger uses spinlocks for fast mutual exclusion (where operations done
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 5ea4583a2ab..54a9cc6f9fd 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -251,3 +251,91 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
#error Unknown spinlock type
#endif
+
+/*
+ * __wt_fair_trylock --
+ * Try to get a lock - give up if it is not immediately available.
+ */
+static inline int
+__wt_fair_trylock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_FAIR_LOCK new, old;
+
+ WT_UNUSED(session);
+
+ old = new = *lock;
+
+ /* Exit early if there is no chance we can get the lock. */
+ if (old.fair_lock_waiter != old.fair_lock_owner)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.fair_lock_waiter;
+ return (__wt_atomic_cas32(
+ &lock->u.lock, old.u.lock, new.u.lock) ? 0 : EBUSY);
+}
+
+/*
+ * __wt_fair_lock --
+ * Get a lock.
+ */
+static inline int
+__wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ uint16_t ticket;
+ int pause_cnt;
+
+ WT_UNUSED(session);
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = __wt_atomic_fetch_add16(&lock->fair_lock_waiter, 1);
+ for (pause_cnt = 0; ticket != lock->fair_lock_owner;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_fair_unlock --
+ * Release a fair lock.
+ */
+static inline int
+__wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_UNUSED(session);
+
+ /*
+ * We have exclusive access - the update does not need to be atomic.
+ */
+ ++lock->fair_lock_owner;
+
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_fair_islocked --
+ * Test whether the lock is currently held
+ */
+static inline bool
+__wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
+{
+ WT_UNUSED(session);
+
+ return (lock->fair_lock_waiter != lock->fair_lock_owner);
+}
+#endif
diff --git a/src/include/serial.i b/src/include/serial.i
index 5358b874c06..ca22ce12d81 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -316,12 +316,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/* If we can't lock it, don't scan, that's okay. */
- F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
- if (ret != 0)
+ if (__wt_fair_trylock(session, &page->page_lock) != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_unlock(session, &page->page_lock));
if (obsolete != NULL)
__wt_update_obsolete_free(session, page, obsolete);
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 4d46a25b63c..3f4e0ada7f1 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -164,6 +164,8 @@ struct __wt_ext;
typedef struct __wt_ext WT_EXT;
struct __wt_extlist;
typedef struct __wt_extlist WT_EXTLIST;
+struct __wt_fair_lock;
+ typedef struct __wt_fair_lock WT_FAIR_LOCK;
struct __wt_fh;
typedef struct __wt_fh WT_FH;
struct __wt_hazard;
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 82264f7c58f..f2c32a434bf 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -394,7 +394,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
+ WT_RET(__wt_fair_lock(session, &page->page_lock));
/* Reconcile the page. */
switch (page->type) {
@@ -432,7 +432,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+ WT_TRET(__wt_fair_unlock(session, &page->page_lock));
/* Update statistics. */
WT_STAT_FAST_CONN_INCR(session, rec_pages);