summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Keith Bostic <keith@wiredtiger.com> 2015-08-23 10:11:36 -0400
committer: Keith Bostic <keith@wiredtiger.com> 2015-08-23 10:11:36 -0400
commit: 3a41ccbadfa59eb1dffcb2f8173299a1c974d815 (patch)
tree: ba405cefd6d98f25f9282b4173e934f48b2eac18 /src
parent: dc2adba3e2d9c51dfb23bac3442a986acb444c25 (diff)
download: mongo-3a41ccbadfa59eb1dffcb2f8173299a1c974d815.tar.gz
There are three locks in play with the lookaside file, and that's leading
to deadlock. The three locks are as follows: reconciliation takes a page lock (IFF compaction is running), then subsequently acquires the shared lookaside cursor (IFF eviction is being performed by an application thread), then acquires a second page lock in order to insert a record into the lookaside file. The simple deadlock is when the two page locks are the same. More complicated deadlocks are possible, for example, thread X acquires page lock 5, acquires the shared lookaside cursor, sleeps; thread Y acquires page lock 6, then waits on the lookaside cursor; thread X wakes and attempts to acquire page lock 6 as its second page lock. There's a WT_PAGE_SCANNING lock that reconciliation always acquires in order to block threads trimming update lists while reconciliation is running. Rename that lock to WT_PAGE_RECONCILIATION and give it the more general meaning that reconciliation is working on a page. Change compaction to use the new WT_PAGE_RECONCILIATION lock instead of the page lock. This means compaction can collide with threads trimming update lists, but compaction is both a relatively rare operation and only holds the lock for a short time. Page locks revert to their original (and only remaining) use, serialization around page inserts. Reconciliation does less work when compaction is configured (acquiring one less lock), and the combination of compaction and reconciliation no longer blocks page inserts. This also simplifies compaction. Previously, compaction set a flag to instruct reconciliation to start taking page locks, and then waited for on-going reconciliation work to drain to ensure it didn't race; because compaction now uses a lock that reconciliation always acquires, the wait-to-drain is no longer necessary. Add a new F_CAS_ATOMIC_WAIT macro, the same as F_CAS_ATOMIC, but it loops until successful.
Diffstat (limited to 'src')
-rw-r--r-- src/btree/bt_compact.c 35
-rw-r--r-- src/btree/bt_debug.c 4
-rw-r--r-- src/include/btmem.h 2
-rw-r--r-- src/include/connection.h 2
-rw-r--r-- src/include/hardware.h 10
-rw-r--r-- src/include/serial.i 14
-rw-r--r-- src/reconcile/rec_write.c 39
7 files changed, 34 insertions, 72 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 18f8ca54601..79a52dbcaa3 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
} else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
/*
* The page's modification information can change underfoot if
- * the page is being reconciled, lock the page down.
+ * the page is being reconciled, serialize with reconciliation.
*/
- WT_PAGE_LOCK(session, page);
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- WT_PAGE_UNLOCK(session, page);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
WT_RET(ret);
}
return (0);
@@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_REF *ref;
- int block_manager_begin, evict_reset, skip;
+ int block_manager_begin, skip;
WT_UNUSED(cfg);
- conn = S2C(session);
btree = S2BT(session);
bm = btree->bm;
ref = NULL;
@@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_spin_lock(session, &btree->flush_lock);
- /*
- * That leaves eviction, we don't want to block eviction. Set a flag
- * so reconciliation knows compaction is running. If reconciliation
- * sees the flag it locks the page it's writing, we acquire the same
- * lock when reading the page's modify information, serializing access.
- * The same page lock blocks work on the page, but compaction is an
- * uncommon, heavy-weight operation. If it's ever a problem, there's
- * no reason we couldn't use an entirely separate lock than the page
- * lock.
- *
- * We also need to ensure we don't race with an on-going reconciliation.
- * After we set the flag, wait for eviction of this file to drain, and
- * then let eviction continue;
- */
- conn->compact_in_memory_pass = 1;
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
/* Start compaction. */
WT_ERR(bm->compact_start(bm, session));
block_manager_begin = 1;
@@ -172,11 +151,7 @@ err: if (ref != NULL)
if (block_manager_begin)
WT_TRET(bm->compact_end(bm, session));
- /*
- * Unlock will be a release barrier, use it to update the compaction
- * status for reconciliation.
- */
- conn->compact_in_memory_pass = 0;
+ /* Unblock threads writing leaf pages. */
__wt_spin_unlock(session, &btree->flush_lock);
return (ret);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index a320cbd3ef7..daa2e200854 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -645,8 +645,8 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", disk-mapped");
if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
__dmsg(ds, ", evict-lru");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
- __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
+ __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED))
diff --git a/src/include/btmem.h b/src/include/btmem.h
index f613c082c77..e313ff412da 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -579,7 +579,7 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
diff --git a/src/include/connection.h b/src/include/connection.h
index 4203907a4e8..d4583025ecf 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -288,8 +288,6 @@ struct __wt_connection_impl {
uint64_t ckpt_time_recent; /* Checkpoint time recent/total */
uint64_t ckpt_time_total;
- int compact_in_memory_pass; /* Compaction serialization */
-
#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
diff --git a/src/include/hardware.h b/src/include/hardware.h
index 335663fc651..e1dc065a93d 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -50,6 +50,16 @@
&(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
+#define F_CAS_ATOMIC_WAIT(p, mask) do { \
+ int __ret; \
+ for (;;) { \
+ F_CAS_ATOMIC(p, mask, __ret); \
+ if (__ret == 0) \
+ break; \
+ __wt_yield(); \
+ } \
+} while (0)
+
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
diff --git a/src/include/serial.i b/src/include/serial.i
index 7b62e66eccb..152addf07f9 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -292,20 +292,20 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
__wt_page_modify_set(session, page);
/*
- * If there are subsequent WT_UPDATE structures, we're evicting pages
- * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
- * structures. Serialization is needed so only one thread does the
- * obsolete check at a time, and to protect updates from disappearing
- * under reconciliation.
+ * If there are subsequent obsolete WT_UPDATE structures, discard them.
+ * Serialization is needed because reconciliation reads the update list,
+ * and obsolete updates cannot be discarded while reconciliation is in
+ * progress. Serialization is also needed so only one thread does the
+ * obsolete check at a time.
*/
if (upd->next != NULL &&
__wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
/* If we can't lock it, don't scan, that's okay. */
if (ret != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
if (obsolete != NULL) {
page->modify->obsolete_check_txn = WT_TXN_NONE;
__wt_update_obsolete_free(session, page, obsolete);
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index a4a2da1f6b6..28c33b9a2b4 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -369,12 +369,12 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
- int page_lock, scan_lock, split_lock;
+ int split_lock;
conn = S2C(session);
page = ref->page;
mod = page->modify;
- page_lock = scan_lock = split_lock = 0;
+ split_lock = 0;
/* We're shouldn't get called with a clean page, that's an error. */
if (!__wt_page_is_modified(page))
@@ -412,37 +412,19 @@ __wt_reconcile(WT_SESSION_IMPL *session,
r = session->reconcile;
/*
- * The compaction process looks at the page's modification information;
- * if compaction is running, acquire the page's lock.
+ * Reconciliation locks the page for two reasons: reconciliation reads
+ * the lists of page updates, so obsolete updates cannot be discarded
+ * while reconciliation is in progress. Second, the compaction process
+ * reads page modification information, which reconciliation modifies.
*/
- if (conn->compact_in_memory_pass) {
- WT_PAGE_LOCK(session, page);
- page_lock = 1;
- }
-
- /*
- * Reconciliation reads the lists of updates, so obsolete updates cannot
- * be discarded while reconciliation is in progress.
- */
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- scan_lock = 1;
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
/*
* Mark internal pages as splitting to ensure we don't deadlock when
* performing an in-memory split during a checkpoint.
*/
if (WT_PAGE_IS_INTERNAL(page)) {
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_SPLIT_LOCKED);
split_lock = 1;
}
@@ -484,10 +466,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
/* Release the locks we're holding. */
if (split_lock)
F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
- if (scan_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (page_lock)
- WT_PAGE_UNLOCK(session, page);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
/*
* Clean up the boundary structures: some workloads result in millions