author     Michael Cahill <michael.cahill@mongodb.com>   2017-06-02 12:08:34 +1000
committer  GitHub <noreply@github.com>                   2017-06-02 12:08:34 +1000
commit     42daa132f21c1391ae2b2b3d789df85878aca471 (patch)
tree       385f10e3f035a7bea711ed18d6ffdbd77c601160
parent     b3ff7c4ab91d3c5fda64381d8ab5957cb697167d (diff)
download   mongo-42daa132f21c1391ae2b2b3d789df85878aca471.tar.gz
WT-3345 Tune WiredTiger's read/write locks. (#3446)
* Add a workload that stresses rwlock performance under various conditions (including `threads >> cores`), and tune read and write lock operations to only spin when it is likely to help and to back off to a condition variable under heavy contention.

* New rwlock implementation: queue readers and writers separately; don't enforce fairness among readers or when the lock is overwhelmed.

* Switch to a spinlock whenever we need to lock a page. Previously we had a read/write lock in the __wt_page structure that was only ever acquired in write mode, plus a spinlock in the page->modify structure. Switch to using the spinlock for everything (see the sketch below). One slight downside of this change is that we can no longer precisely determine whether a page is locked based on the status of the spinlock (since another page sharing the same lock could be holding it in the places where we used to check). Since that was only ever used for diagnostic / debugging purposes, I think the benefit of the change outweighs this issue.

* Fix a bug where a failure during `__wt_curfile_create` caused a data handle to be released twice. This is caught by the sanity-checking assertions in the new read/write lock code.

* A split may be holding a page lock when restoring updates. Tell the restore code we have the page exclusive and no further locking is required.

* Allocate a spinlock for each modified page. Using shared page locks for multiple operations that need to lock a page (including inserts and reconciliation) resulted in self-deadlock when the lookaside table was used: reconciliation held a page lock, then caused inserts to the lookaside table, which acquired the page lock for a page in the lookaside table. With a shared set of page locks, those could both be the same lock. Switch (back?) to allocating a spinlock per modified page. Earlier in this ticket we saved some space in __wt_page, so growing __wt_page_modify is unlikely to be noticeable.

* Tweak padding and position of the spinlock in WT_PAGE_MODIFY to claw back some bytes. Move evict_pass_gen to the end of WT_PAGE: on inspection, it should be a cold field relative to the others, which now fit in one x86 cache line.
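To illustrate the per-page locking convention this change settles on, here is a minimal sketch (not code from the commit; example_page_operation is a hypothetical caller) using __wt_page_modify_init and the WT_PAGE_LOCK macros as they appear in the diff below:

/*
 * Sketch only: a hypothetical caller showing the new locking convention.
 * The page must have a modify structure (which now owns the spinlock)
 * before it can be locked; exclusive page access then goes through the
 * WT_PAGE_LOCK/WT_PAGE_UNLOCK macros rather than page->page_lock.
 */
static int
example_page_operation(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_DECL_RET;

	/* We can only lock a page that has a modify structure. */
	WT_RET(__wt_page_modify_init(session, page));

	WT_PAGE_LOCK(session, page);
	/* ... operate on the page with exclusive access ... */
	WT_PAGE_UNLOCK(session, page);

	return (ret);
}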
-rw-r--r--src/async/async_api.c2
-rw-r--r--src/btree/bt_compact.c4
-rw-r--r--src/btree/bt_cursor.c4
-rw-r--r--src/btree/bt_debug.c2
-rw-r--r--src/btree/bt_discard.c2
-rw-r--r--src/btree/bt_handle.c2
-rw-r--r--src/btree/bt_read.c4
-rw-r--r--src/btree/bt_split.c22
-rw-r--r--src/btree/col_modify.c9
-rw-r--r--src/btree/row_modify.c16
-rw-r--r--src/conn/conn_cache.c2
-rw-r--r--src/conn/conn_cache_pool.c4
-rw-r--r--src/conn/conn_ckpt.c2
-rw-r--r--src/conn/conn_dhandle.c2
-rw-r--r--src/conn/conn_handle.c15
-rw-r--r--src/conn/conn_log.c12
-rw-r--r--src/conn/conn_stat.c2
-rw-r--r--src/conn/conn_sweep.c2
-rw-r--r--src/cursor/cur_file.c9
-rw-r--r--src/include/btmem.h27
-rw-r--r--src/include/connection.h15
-rw-r--r--src/include/extern.h7
-rw-r--r--src/include/extern_posix.h2
-rw-r--r--src/include/extern_win.h2
-rw-r--r--src/include/mutex.h31
-rw-r--r--src/include/serial.i30
-rw-r--r--src/include/session.h4
-rw-r--r--src/include/verify_build.h1
-rw-r--r--src/include/wt_internal.h4
-rw-r--r--src/lsm/lsm_manager.c2
-rw-r--r--src/lsm/lsm_tree.c2
-rw-r--r--src/os_posix/os_mtx_cond.c16
-rw-r--r--src/os_win/os_mtx_cond.c6
-rw-r--r--src/reconcile/rec_write.c6
-rw-r--r--src/session/session_dhandle.c4
-rw-r--r--src/support/mtx_rw.c405
-rw-r--r--src/support/thread_group.c10
-rw-r--r--src/txn/txn.c8
-rw-r--r--test/csuite/Makefile.am3
-rw-r--r--test/csuite/rwlock/main.c184
40 files changed, 567 insertions, 319 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 1e4bfd51c46..0f3e376fbfd 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -435,7 +435,7 @@ __wt_async_destroy(WT_SESSION_IMPL *session)
F_CLR(conn, WT_CONN_SERVER_ASYNC);
for (i = 0; i < conn->async_workers; i++)
WT_TRET(__wt_thread_join(session, async->worker_tids[i]));
- WT_TRET(__wt_cond_destroy(session, &async->flush_cond));
+ __wt_cond_destroy(session, &async->flush_cond);
/* Close the server threads' sessions. */
for (i = 0; i < conn->async_workers; i++)
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 17308d02d91..c6a412aa84e 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
*/
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writelock(session, &page->page_lock);
+ WT_PAGE_LOCK(session, page);
if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
@@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
return (ret);
}
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 7e415150cc5..52435eeefed 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -346,7 +346,7 @@ __cursor_col_modify(
WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
return (__wt_col_modify(session, cbt,
- cbt->iface.recno, &cbt->iface.value, NULL, modify_type));
+ cbt->iface.recno, &cbt->iface.value, NULL, modify_type, false));
}
/*
@@ -358,7 +358,7 @@ __cursor_row_modify(
WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
return (__wt_row_modify(session, cbt,
- &cbt->iface.key, &cbt->iface.value, NULL, modify_type));
+ &cbt->iface.key, &cbt->iface.value, NULL, modify_type, false));
}
/*
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index c3f98a98ec5..394ac6c7b84 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -689,8 +689,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", entries %" PRIu32, entries));
WT_RET(ds->f(ds,
", %s", __wt_page_is_modified(page) ? "dirty" : "clean"));
- WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked(
- session, &page->page_lock) ? "locked" : "unlocked"));
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
WT_RET(ds->f(ds, ", keys-built"));
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index a04face8f64..bfa8eb25aac 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -98,7 +98,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));
/*
* If a root page split, there may be one or more pages linked from the
@@ -254,6 +253,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_ovfl_discard_free(session, page);
__wt_free(session, page->modify->ovfl_track);
+ __wt_spin_destroy(session, &page->modify->page_lock);
__wt_free(session, page->modify);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index e4780f1bf42..06fbd6b74c7 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -442,7 +442,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/* Initialize locks. */
- __wt_rwlock_init(session, &btree->ovfl_lock);
+ WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));
btree->modified = false; /* Clean */
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index e6a0f53ab40..de84a711019 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -91,7 +91,7 @@ __col_instantiate(WT_SESSION_IMPL *session,
/* Search the page and add updates. */
WT_RET(__wt_col_search(session, recno, ref, cbt));
WT_RET(__wt_col_modify(
- session, cbt, recno, NULL, upd, WT_UPDATE_STANDARD));
+ session, cbt, recno, NULL, upd, WT_UPDATE_STANDARD, false));
return (0);
}
@@ -106,7 +106,7 @@ __row_instantiate(WT_SESSION_IMPL *session,
/* Search the page and add updates. */
WT_RET(__wt_row_search(session, key, ref, cbt, true));
WT_RET(__wt_row_modify(
- session, cbt, key, NULL, upd, WT_UPDATE_STANDARD));
+ session, cbt, key, NULL, upd, WT_UPDATE_STANDARD, false));
return (0);
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index c2c56a18131..71346baee2e 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1166,13 +1166,19 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
+ /*
+ * The page will be marked dirty, and we can only lock a page
+ * with a modify structure.
+ */
+ WT_RET(__wt_page_modify_init(session, parent));
+
if (trylock)
- WT_RET(__wt_try_writelock(session, &parent->page_lock));
+ WT_RET(WT_PAGE_TRYLOCK(session, parent));
else
- __wt_writelock(session, &parent->page_lock);
+ WT_PAGE_LOCK(session, parent);
if (parent == ref->home)
break;
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
}
/*
@@ -1195,7 +1201,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
*parentp = parent;
return (0);
-err: __wt_writeunlock(session, &parent->page_lock);
+err: WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1211,7 +1217,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1425,7 +1431,7 @@ __split_multi_inmem(
/* Apply the modification. */
WT_ERR(__wt_col_modify(session,
- &cbt, recno, NULL, upd, WT_UPDATE_STANDARD));
+ &cbt, recno, NULL, upd, WT_UPDATE_STANDARD, true));
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
@@ -1446,8 +1452,8 @@ __split_multi_inmem(
WT_ERR(__wt_row_search(session, key, ref, &cbt, true));
/* Apply the modification. */
- WT_ERR(__wt_row_modify(
- session, &cbt, key, NULL, upd, WT_UPDATE_STANDARD));
+ WT_ERR(__wt_row_modify(session, &cbt,
+ key, NULL, upd, WT_UPDATE_STANDARD, true));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index c256f03a612..2a64ec03952 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -17,7 +17,8 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type)
+ uint64_t recno, const WT_ITEM *value,
+ WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
WT_BTREE *btree;
@@ -106,7 +107,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, &cbt->ins->upd, &upd, upd_size));
+ session, page, &cbt->ins->upd, &upd, upd_size, false));
} else {
/* Allocate the append/update list reference as necessary. */
if (append) {
@@ -188,11 +189,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (append)
WT_ERR(__wt_col_append_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, &cbt->recno, skipdepth));
+ &ins, ins_size, &cbt->recno, skipdepth, exclusive));
else
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
/* If the update was successful, add it to the in-memory log. */
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 2bf3c2f29bc..c7afdcfcb31 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -15,18 +15,12 @@
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_CONNECTION_IMPL *conn;
WT_PAGE_MODIFY *modify;
- conn = S2C(session);
-
WT_RET(__wt_calloc_one(session, &modify));
- /*
- * Select a spinlock for the page; let the barrier immediately below
- * keep things from racing too badly.
- */
- modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;
+ /* Initialize the spinlock for the page. */
+ WT_RET(__wt_spin_init(session, &modify->page_lock, "btree page"));
/*
* Multiple threads of control may be searching and deciding to modify
@@ -48,7 +42,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
const WT_ITEM *key, const WT_ITEM *value,
- WT_UPDATE *upd_arg, u_int modify_type)
+ WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
WT_DECL_RET;
WT_INSERT *ins;
@@ -129,7 +123,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, upd_entry, &upd, upd_size));
+ session, page, upd_entry, &upd, upd_size, exclusive));
} else {
/*
* Allocate the insert array as necessary.
@@ -204,7 +198,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Insert the WT_INSERT structure. */
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
if (logged && modify_type != WT_UPDATE_RESERVED)
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index ad83f0b2b4a..5515eb026ca 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
cache->bytes_dirty_intl + cache->bytes_dirty_leaf,
cache->pages_dirty_intl + cache->pages_dirty_leaf);
- WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+ __wt_cond_destroy(session, &cache->evict_cond);
__wt_spin_destroy(session, &cache->evict_pass_lock);
__wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index c1c9c98b30c..adc2e2bffc3 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -225,7 +225,7 @@ err: __wt_spin_unlock(session, &__wt_process.spinlock);
__wt_free(session, pool_name);
if (ret != 0 && created) {
__wt_free(session, cp->name);
- WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_cond_destroy(session, &cp->cache_pool_cond);
__wt_free(session, cp);
}
return (ret);
@@ -391,7 +391,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
__wt_free(session, cp->name);
__wt_spin_destroy(session, &cp->cache_pool_lock);
- WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_cond_destroy(session, &cp->cache_pool_cond);
__wt_free(session, cp);
}
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index 43673cd335e..a47524af2d7 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -231,7 +231,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
conn->ckpt_tid_set = false;
}
- WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));
+ __wt_cond_destroy(session, &conn->ckpt_cond);
/* Close the server thread's session. */
if (conn->ckpt_session != NULL) {
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index fa79de0cfbe..d4670562eb8 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -52,7 +52,7 @@ __wt_conn_dhandle_alloc(
WT_RET(__wt_calloc_one(session, &dhandle));
- __wt_rwlock_init(session, &dhandle->rwlock);
+ WT_ERR(__wt_rwlock_init(session, &dhandle->rwlock));
dhandle->name_hash = __wt_hash_city64(uri, strlen(uri));
WT_ERR(__wt_strdup(session, uri, &dhandle->name));
WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint));
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 11b5368e9ad..32a0d80c1f3 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -62,14 +62,9 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file"));
/* Read-write locks */
- __wt_rwlock_init(session, &conn->dhandle_lock);
- __wt_rwlock_init(session, &conn->hot_backup_lock);
- __wt_rwlock_init(session, &conn->table_lock);
-
- WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
- for (i = 0; i < WT_PAGE_LOCKS; ++i)
- WT_RET(
- __wt_spin_init(session, &conn->page_lock[i], "btree page"));
+ WT_RET(__wt_rwlock_init(session, &conn->dhandle_lock));
+ WT_RET(__wt_rwlock_init(session, &conn->hot_backup_lock));
+ WT_RET(__wt_rwlock_init(session, &conn->table_lock));
/* Setup the spin locks for the LSM manager queues. */
WT_RET(__wt_spin_init(session,
@@ -106,7 +101,6 @@ void
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
WT_SESSION_IMPL *session;
- u_int i;
/* Check there's something to destroy. */
if (conn == NULL)
@@ -137,9 +131,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->schema_lock);
__wt_rwlock_destroy(session, &conn->table_lock);
__wt_spin_destroy(session, &conn->turtle_lock);
- for (i = 0; i < WT_PAGE_LOCKS; ++i)
- __wt_spin_destroy(session, &conn->page_lock[i]);
- __wt_free(session, conn->page_lock);
/* Free allocated memory. */
__wt_free(session, conn->cfg);
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index dac16cc9d00..37acbe4a1a4 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -879,7 +879,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
"log write LSN"));
- __wt_rwlock_init(session, &log->log_archive_lock);
+ WT_RET(__wt_rwlock_init(session, &log->log_archive_lock));
if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
log->allocsize = (uint32_t)
WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN);
@@ -1042,12 +1042,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
/* Destroy the condition variables now that all threads are stopped */
- WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+ __wt_cond_destroy(session, &conn->log_cond);
+ __wt_cond_destroy(session, &conn->log_file_cond);
+ __wt_cond_destroy(session, &conn->log_wrlsn_cond);
- WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
+ __wt_cond_destroy(session, &conn->log->log_sync_cond);
+ __wt_cond_destroy(session, &conn->log->log_write_cond);
__wt_rwlock_destroy(session, &conn->log->log_archive_lock);
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 2554083b26c..f38d81a7f7a 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -648,7 +648,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
WT_TRET(__wt_thread_join(session, conn->stat_tid));
conn->stat_tid_set = false;
}
- WT_TRET(__wt_cond_destroy(session, &conn->stat_cond));
+ __wt_cond_destroy(session, &conn->stat_cond);
/* Log a set of statistics on shutdown if configured. */
if (is_close)
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index fbedb938bd8..df60a3c784d 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -429,7 +429,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->sweep_tid));
conn->sweep_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));
+ __wt_cond_destroy(session, &conn->sweep_cond);
if (conn->sweep_session != NULL) {
wt_session = &conn->sweep_session->iface;
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index 4469cac685d..3b6328a2d93 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -520,7 +520,14 @@ __curfile_create(WT_SESSION_IMPL *session,
WT_STAT_DATA_INCR(session, cursor_create);
if (0) {
-err: WT_TRET(__curfile_close(cursor));
+err: /*
+ * Our caller expects to release the data handle if we fail.
+ * Disconnect it from the cursor before closing.
+ */
+ if (session->dhandle != NULL)
+ __wt_cursor_dhandle_decr_use(session);
+ cbt->btree = NULL;
+ WT_TRET(__curfile_close(cursor));
*cursorp = NULL;
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 4e8d3c05d7d..32839192a96 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -414,18 +414,20 @@ struct __wt_page_modify {
size_t discard_allocated;
} *ovfl_track;
+#define WT_PAGE_LOCK(s, p) \
+ __wt_spin_lock((s), &(p)->modify->page_lock)
+#define WT_PAGE_TRYLOCK(s, p) \
+ __wt_spin_trylock((s), &(p)->modify->page_lock)
+#define WT_PAGE_UNLOCK(s, p) \
+ __wt_spin_unlock((s), &(p)->modify->page_lock)
+ WT_SPINLOCK page_lock; /* Page's spinlock */
+
/*
* The write generation is incremented when a page is modified, a page
* is clean if the write generation is 0.
*/
uint32_t write_gen;
-#define WT_PAGE_LOCK(s, p) \
- __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
-#define WT_PAGE_UNLOCK(s, p) \
- __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
- uint8_t page_lock; /* Page's spinlock */
-
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
#define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
@@ -604,13 +606,6 @@ struct __wt_page {
uint8_t unused[2]; /* Unused padding */
/*
- * Used to protect and co-ordinate splits for internal pages and
- * reconciliation for all pages. Only used to co-ordinate among the
- * uncommon cases that require exclusive access to a page.
- */
- WT_RWLOCK page_lock;
-
- /*
* The page's read generation acts as an LRU value for each page in the
* tree; it is used by the eviction server thread to select pages to be
* discarded from the in-memory tree.
@@ -636,8 +631,6 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
- uint64_t evict_pass_gen; /* Eviction pass generation */
-
size_t memory_footprint; /* Memory attached to the page */
/* Page's on-disk representation: NULL for pages created in memory. */
@@ -645,6 +638,10 @@ struct __wt_page {
/* If/when the page is modified, we need lots more information. */
WT_PAGE_MODIFY *modify;
+
+ /* This is the 64 byte boundary, try to keep hot fields above here. */
+
+ uint64_t evict_pass_gen; /* Eviction pass generation */
};
/*
diff --git a/src/include/connection.h b/src/include/connection.h
index 6f656270f38..bf2f8a2c7e1 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -175,21 +175,6 @@ struct __wt_connection_impl {
WT_SPINLOCK turtle_lock; /* Turtle file spinlock */
WT_RWLOCK dhandle_lock; /* Data handle list lock */
- /*
- * We distribute the btree page locks across a set of spin locks. Don't
- * use too many: they are only held for very short operations, each one
- * is 64 bytes, so 256 will fill the L1 cache on most CPUs.
- *
- * Use a prime number of buckets rather than assuming a good hash
- * (Reference Sedgewick, Algorithms in C, "Hash Functions").
- *
- * Note: this can't be an array, we impose cache-line alignment and gcc
- * doesn't support that for arrays smaller than the alignment.
- */
-#define WT_PAGE_LOCKS 17
- WT_SPINLOCK *page_lock; /* Btree page spinlocks */
- u_int page_lock_cnt; /* Next spinlock to use */
-
/* Connection queue */
TAILQ_ENTRY(__wt_connection_impl) q;
/* Cache pool queue */
diff --git a/src/include/extern.h b/src/include/extern.h
index 01c21b188c0..f055e4810b3 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -180,7 +180,7 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *b
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -189,7 +189,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c
extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, const WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_UPDATE **updp, size_t *sizep, u_int modify_type) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
@@ -687,10 +687,9 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l);
+extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h
index 3afffef687b..c0ed056c7b6 100644
--- a/src/include/extern_posix.h
+++ b/src/include/extern_posix.h
@@ -15,7 +15,7 @@ extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *ma
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
-extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_absolute_path(const char *path);
diff --git a/src/include/extern_win.h b/src/include/extern_win.h
index 4e232a2df80..d548ee0b2ec 100644
--- a/src/include/extern_win.h
+++ b/src/include/extern_win.h
@@ -13,7 +13,7 @@ extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, v
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
-extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_absolute_path(const char *path);
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 00babd47fbf..5f814c2799e 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -37,17 +37,21 @@ struct __wt_condvar {
* Don't modify this structure without understanding the read/write locking
* functions.
*/
-union __wt_rwlock { /* Read/write lock */
- uint64_t u;
- struct {
- uint32_t wr; /* Writers and readers */
- } i;
- struct {
- uint16_t writers; /* Now serving for writers */
- uint16_t readers; /* Now serving for readers */
- uint16_t next; /* Next available ticket number */
- uint16_t writers_active;/* Count of active writers */
- } s;
+struct __wt_rwlock { /* Read/write lock */
+ volatile union {
+ uint64_t v; /* Full 64-bit value */
+ struct {
+ uint8_t current; /* Current ticket */
+ uint8_t next; /* Next available ticket */
+ uint8_t reader; /* Read queue ticket */
+ uint8_t __notused; /* Padding */
+ uint16_t readers_active;/* Count of active readers */
+ uint16_t readers_queued;/* Count of queued readers */
+ } s;
+ } u;
+
+ WT_CONDVAR *cond_readers; /* Blocking readers */
+ WT_CONDVAR *cond_writers; /* Blocking writers */
};
/*
@@ -63,8 +67,8 @@ union __wt_rwlock { /* Read/write lock */
#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 3
struct __wt_spinlock {
- WT_CACHE_LINE_PAD_BEGIN
#if SPINLOCK_TYPE == SPINLOCK_GCC
+ WT_CACHE_LINE_PAD_BEGIN
volatile int lock;
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX || \
SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE || \
@@ -87,5 +91,8 @@ struct __wt_spinlock {
int16_t stat_int_usecs_off; /* waiting server threads offset */
int8_t initialized; /* Lock initialized, for cleanup */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
WT_CACHE_LINE_PAD_END
+#endif
};
diff --git a/src/include/serial.i b/src/include/serial.i
index 18ff0bb7ec2..bd0e498f621 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -154,7 +154,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
static inline int
__wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
- size_t new_ins_size, uint64_t *recnop, u_int skipdepth)
+ size_t new_ins_size, uint64_t *recnop, u_int skipdepth, bool exclusive)
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
@@ -165,11 +165,16 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
- /* Acquire the page's spinlock, call the worker function. */
- WT_PAGE_LOCK(session, page);
+ /*
+ * Acquire the page's spinlock unless we already have exclusive access.
+ * Then call the worker function.
+ */
+ if (!exclusive)
+ WT_PAGE_LOCK(session, page);
ret = __col_append_serial_func(
session, ins_head, ins_stack, new_ins, recnop, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_UNLOCK(session, page);
if (ret != 0) {
/* Free unused memory on error. */
@@ -198,7 +203,7 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
static inline int
__wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
- size_t new_ins_size, u_int skipdepth)
+ size_t new_ins_size, u_int skipdepth, bool exclusive)
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
@@ -220,10 +225,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
ret = __insert_simple_func(
session, ins_stack, new_ins, skipdepth);
else {
- WT_PAGE_LOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_LOCK(session, page);
ret = __insert_serial_func(
session, ins_head, ins_stack, new_ins, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_UNLOCK(session, page);
}
if (ret != 0) {
@@ -252,7 +259,8 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
*/
static inline int
__wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
- WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size)
+ WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size,
+ bool exclusive)
{
WT_DECL_RET;
WT_UPDATE *obsolete, *upd = *updp;
@@ -295,7 +303,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
/*
* If there are no subsequent WT_UPDATE structures we are done here.
*/
- if (upd->next == NULL)
+ if (upd->next == NULL || exclusive)
return (0);
/*
@@ -316,11 +324,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/* If we can't lock it, don't scan, that's okay. */
- if (__wt_try_writelock(session, &page->page_lock) != 0)
+ if (WT_PAGE_TRYLOCK(session, page) != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
if (obsolete != NULL)
__wt_update_obsolete_free(session, page, obsolete);
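The serialization helpers above all thread through the new "exclusive" argument; a minimal sketch of the pattern (illustrative only, with a made-up helper name):

/*
 * Illustrative only: the shape of the new "exclusive" argument in the
 * serialization helpers. Callers that already hold the page exclusive
 * (for example, split restoring updates) skip the per-page spinlock.
 */
static inline int
example_serial(WT_SESSION_IMPL *session, WT_PAGE *page, bool exclusive)
{
	WT_DECL_RET;

	if (!exclusive)
		WT_PAGE_LOCK(session, page);
	/* ... update the shared insert/update structure ... */
	if (!exclusive)
		WT_PAGE_UNLOCK(session, page);

	return (ret);
}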
diff --git a/src/include/session.h b/src/include/session.h
index 543063f5a90..dfd84675721 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -98,6 +98,10 @@ struct __wt_session_impl {
*/
TAILQ_HEAD(__tables, __wt_table) tables;
+ /* Current rwlock for callback. */
+ WT_RWLOCK *current_rwlock;
+ uint8_t current_rwticket;
+
WT_ITEM **scratch; /* Temporary memory for any function */
u_int scratch_alloc; /* Currently allocated */
size_t scratch_cached; /* Scratch bytes cached */
diff --git a/src/include/verify_build.h b/src/include/verify_build.h
index e93f5931c21..57189b5c2b2 100644
--- a/src/include/verify_build.h
+++ b/src/include/verify_build.h
@@ -60,7 +60,6 @@ __wt_verify_build(void)
sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \
sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0)
WT_PADDING_CHECK(WT_LOGSLOT);
- WT_PADDING_CHECK(WT_SPINLOCK);
WT_PADDING_CHECK(WT_TXN_STATE);
/*
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index e250cfc33ba..1c9600dd27f 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -268,6 +268,8 @@ struct __wt_ref;
typedef struct __wt_ref WT_REF;
struct __wt_row;
typedef struct __wt_row WT_ROW;
+struct __wt_rwlock;
+ typedef struct __wt_rwlock WT_RWLOCK;
struct __wt_salvage_cookie;
typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
struct __wt_save_upd;
@@ -304,8 +306,6 @@ union __wt_lsn;
typedef union __wt_lsn WT_LSN;
union __wt_rand_state;
typedef union __wt_rand_state WT_RAND_STATE;
-union __wt_rwlock;
- typedef union __wt_rwlock WT_RWLOCK;
/*
* Forward type declarations for internal types: END
* DO NOT EDIT: automatically built by dist/s_typedef.
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index f391c553d2a..b1f775a275e 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -330,7 +330,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
__wt_spin_destroy(session, &manager->switch_lock);
__wt_spin_destroy(session, &manager->app_lock);
__wt_spin_destroy(session, &manager->manager_lock);
- WT_TRET(__wt_cond_destroy(session, &manager->work_cond));
+ __wt_cond_destroy(session, &manager->work_cond);
return (ret);
}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index fe36237969f..9932ba6b5b3 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -472,7 +472,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
/* Try to open the tree. */
WT_RET(__wt_calloc_one(session, &lsm_tree));
- __wt_rwlock_init(session, &lsm_tree->rwlock);
+ WT_RET(__wt_rwlock_init(session, &lsm_tree->rwlock));
WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index 10606e8108e..1018bf860d6 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -153,7 +153,7 @@ err:
* __wt_cond_destroy --
* Destroy a condition variable.
*/
-int
+void
__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
@@ -161,11 +161,15 @@ __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
cond = *condp;
if (cond == NULL)
- return (0);
+ return;
- ret = pthread_cond_destroy(&cond->cond);
- WT_TRET(pthread_mutex_destroy(&cond->mtx));
- __wt_free(session, *condp);
+ if ((ret = pthread_cond_destroy(&cond->cond)) != 0)
+ WT_PANIC_MSG(
+ session, ret, "pthread_cond_destroy: %s", cond->name);
- return (ret);
+ if ((ret = pthread_mutex_destroy(&cond->mtx)) != 0)
+ WT_PANIC_MSG(
+ session, ret, "pthread_mutex_destroy: %s", cond->name);
+
+ __wt_free(session, *condp);
}
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 2002d1e925c..9d4339c8731 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -163,18 +163,16 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
* __wt_cond_destroy --
* Destroy a condition variable.
*/
-int
+void
__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
cond = *condp;
if (cond == NULL)
- return (0);
+ return;
/* Do nothing to delete Condition Variable */
DeleteCriticalSection(&cond->mtx);
__wt_free(session, *condp);
-
- return (0);
}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 1e76f0d84d0..8bff4c630c0 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -386,7 +386,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- __wt_writelock(session, &page->page_lock);
+ WT_PAGE_LOCK(session, page);
oldest_id = __wt_txn_oldest_id(session);
if (LF_ISSET(WT_EVICTING))
@@ -405,7 +405,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
/* Initialize the reconciliation structure for each new run. */
if ((ret = __rec_write_init(
session, ref, flags, salvage, &session->reconcile)) != 0) {
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
return (ret);
}
r = session->reconcile;
@@ -446,7 +446,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
/*
* If our caller can configure lookaside table reconciliation, flag if
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 2d0a2eeb2dc..4565ae71896 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -261,8 +261,8 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
* can get a handle without special flags.
*/
if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) {
- __session_find_dhandle(session,
- dhandle->name, dhandle->checkpoint, &dhandle_cache);
+ WT_SAVE_DHANDLE(session, __session_find_dhandle(session,
+ dhandle->name, dhandle->checkpoint, &dhandle_cache));
if (dhandle_cache != NULL)
__session_discard_dhandle(session, dhandle_cache);
}
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 0126e77e9b8..d86d75a5340 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -27,7 +27,7 @@
*/
/*
- * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
* http://locklessinc.com/articles/locks/
*
* Dr. Fuerst further credits:
@@ -39,77 +39,46 @@
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
*
- * The following is an explanation of this code. First, the underlying lock
- * structure.
+ * The following is an explanation of our interpretation and implementation.
+ * First, the underlying lock structure.
*
+ * volatile union {
+ * uint64_t v; // Full 64-bit value
* struct {
- * uint16_t writers; Now serving for writers
- * uint16_t readers; Now serving for readers
- * uint16_t next; Next available ticket number
- * uint16_t __notused; Padding
- * }
+ * uint8_t current; // Current ticket
+ * uint8_t next; // Next available ticket
+ * uint8_t reader; // Read queue ticket
+ * uint8_t __notused; // Padding
+ * uint16_t readers_active; // Count of active readers
+ * uint16_t readers_queued; // Count of queued readers
+ * } s;
+ * } u;
*
* First, imagine a store's 'take a number' ticket algorithm. A customer takes
* a unique ticket number and customers are served in ticket order. In the data
- * structure, 'writers' is the next writer to be served, 'readers' is the next
- * reader to be served, and 'next' is the next available ticket number.
+ * structure, 'next' is the ticket that will be allocated next, and 'current'
+ * is the ticket being served.
*
- * Next, consider exclusive (write) locks. The 'now serving' number for writers
- * is 'writers'. To lock, 'take a number' and wait until that number is being
- * served; more specifically, atomically copy and increment the current value of
- * 'next', and then wait until 'writers' equals that copied number.
+ * Next, consider exclusive (write) locks. To lock, 'take a number' and wait
+ * until that number is being served; more specifically, atomically increment
+ * 'next', and then wait until 'current' equals that allocated ticket.
*
- * Shared (read) locks are similar. Like writers, readers atomically get the
- * next number available. However, instead of waiting for 'writers' to equal
- * their number, they wait for 'readers' to equal their number.
+ * Shared (read) locks are similar, except that readers can share a ticket
+ * (both with each other and with a single writer). Readers with a given
+ * ticket execute before the writer with that ticket. In other words, writers
+ * wait for both their ticket to become current and for all readers to exit
+ * the lock.
*
- * This has the effect of queuing lock requests in the order they arrive
- * (incidentally avoiding starvation).
+ * If there are no active writers (indicated by 'current' == 'next'), readers
+ * can immediately enter the lock by atomically incrementing 'readers_active'.
+ * When there are writers active, readers form a new queue by first setting
+ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers,
+ * avoiding starvation), then atomically incrementing 'readers_queued'.
*
- * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
- * In the case of a reader, the 'readers' increment happens when the reader
- * acquires the lock (to allow read-lock sharing), and the 'writers' increment
- * happens when the reader releases the lock. In the case of a writer, both
- * 'readers' and 'writers' are incremented when the writer releases the lock.
- *
- * For example, consider the following read (R) and write (W) lock requests:
- *
- * writers readers next
- * 0 0 0
- * R: ticket 0, readers match OK 0 1 1
- * R: ticket 1, readers match OK 0 2 2
- * R: ticket 2, readers match OK 0 3 3
- * W: ticket 3, writers no match block 0 3 4
- * R: ticket 2, unlock 1 3 4
- * R: ticket 0, unlock 2 3 4
- * R: ticket 1, unlock 3 3 4
- * W: ticket 3, writers match OK 3 3 4
- *
- * Note the writer blocks until 'writers' equals its ticket number and it does
- * not matter if readers unlock in order or not.
- *
- * Readers or writers entering the system after the write lock is queued block,
- * and the next ticket holder (reader or writer) will unblock when the writer
- * unlocks. An example, continuing from the last line of the above example:
- *
- * writers readers next
- * W: ticket 3, writers match OK 3 3 4
- * R: ticket 4, readers no match block 3 3 5
- * R: ticket 5, readers no match block 3 3 6
- * W: ticket 6, writers no match block 3 3 7
- * W: ticket 3, unlock 4 4 7
- * R: ticket 4, readers match OK 4 5 7
- * R: ticket 5, readers match OK 4 6 7
- *
- * The 'next' field is a 2-byte value so the available ticket number wraps at
- * 64K requests. If a thread's lock request is not granted until the 'next'
- * field cycles and the same ticket is taken by another thread, we could grant
- * a lock to two separate threads at the same time, and bad things happen: two
- * writer threads or a reader thread and a writer thread would run in parallel,
- * and lock waiters could be skipped if the unlocks race. This is unlikely, it
- * only happens if a lock request is blocked by 64K other requests. The fix is
- * to grow the lock structure fields, but the largest atomic instruction we have
- * is 8 bytes, the structure has no room to grow.
+ * The 'next' field is a 1-byte value so the available ticket number wraps
+ * after 256 requests. If a thread's write lock request would cause the 'next'
+ * field to catch up with 'current', instead it waits to avoid the same ticket
+ * being allocated to multiple threads.
*/
#include "wt_internal.h"
@@ -118,12 +87,14 @@
* __wt_rwlock_init --
* Initialize a read/write lock.
*/
-void
+int
__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ l->u.v = 0;
- l->u = 0;
+ WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers));
+ WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers));
+ return (0);
}
/*
@@ -133,9 +104,10 @@ __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ l->u.v = 0;
- l->u = 0;
+ __wt_cond_destroy(session, &l->cond_readers);
+ __wt_cond_destroy(session, &l->cond_writers);
}
/*
@@ -149,46 +121,35 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_read);
- new = old = *l;
+ new.u.v = old.u.v = l->u.v;
/*
- * This read lock can only be granted if the lock was last granted to
- * a reader and there are no readers or writers blocked on the lock,
- * that is, if this thread's ticket would be the next ticket granted.
- * Do the cheap test to see if this can possibly succeed (and confirm
- * the lock is in the correct state to grant this read lock).
+ * This read lock can only be granted if there are no active writers.
+ *
+ * Also check for overflow in case there are 64K active readers.
*/
- if (old.s.readers != old.s.next)
+ if (old.u.s.current != old.u.s.next ||
+ new.u.s.readers_active == UINT16_MAX)
return (EBUSY);
/*
- * The replacement lock value is a result of allocating a new ticket and
- * incrementing the reader value to match it.
+ * The replacement lock value is a result of adding an active reader.
+ *
+ * We rely on this atomic operation to provide a barrier.
*/
- new.s.readers = new.s.next = old.s.next + 1;
- return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+ new.u.s.readers_active++;
+ return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
}
/*
- * __wt_readlock_spin --
- * Spin to get a read lock: only yield the CPU if the lock is held
- * exclusive.
+ * __read_blocked --
+ * Check whether the current read lock request should keep waiting.
*/
-void
-__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
+static bool
+__read_blocked(WT_SESSION_IMPL *session)
{
- /*
- * Try to get the lock in a single operation if it is available to
- * readers. This avoids the situation where multiple readers arrive
- * concurrently and have to line up in order to enter the lock. For
- * read-heavy workloads it can make a significant difference.
- */
- while (__wt_try_readlock(session, l) != 0) {
- if (l->s.writers_active > 0)
- __wt_yield();
- else
- WT_PAUSE();
- }
+ return (session->current_rwticket !=
+ session->current_rwlock->u.s.current);
}
/*
@@ -198,41 +159,90 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- uint16_t ticket;
+ WT_RWLOCK new, old;
int pause_cnt;
+ int16_t writers_active;
+ uint8_t ticket;
WT_STAT_CONN_INCR(session, rwlock_read);
WT_DIAGNOSTIC_YIELD;
- /*
- * Possibly wrap: if we have more than 64K lockers waiting, the ticket
- * value will wrap and two lockers will simultaneously be granted the
- * lock.
- */
- ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
- for (pause_cnt = 0; ticket != l->s.readers;) {
+ for (;;) {
/*
- * We failed to get the lock; pause before retrying and if we've
- * paused enough, yield so we don't burn CPU to no purpose. This
- * situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources.
+ * Fast path: if there is no active writer, join the current
+ * group.
*/
- if (++pause_cnt < WT_THOUSAND)
+ for (old.u.v = l->u.v;
+ old.u.s.current == old.u.s.next;
+ old.u.v = l->u.v) {
+ new.u.v = old.u.v;
+ /*
+ * Check for overflow: if the maximum number of readers
+ * are already active, wait to try again.
+ */
+ if (++new.u.s.readers_active == 0)
+ goto stall;
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ return;
WT_PAUSE();
- else
+ }
+
+ /*
+ * There is an active writer: join the next group.
+ *
+ * Limit how many readers can queue: don't allow more readers
+ * to queue than there are active writers (calculated as
+ * `next - current`): otherwise, in write-heavy workloads,
+ * readers can keep queuing up in front of writers and
+ * throughput is unstable.
+ *
+ * If the maximum number of readers are already queued, wait
+ * until we can get a valid ticket.
+ */
+ writers_active = old.u.s.next - old.u.s.current;
+ if (old.u.s.readers_queued > writers_active) {
+stall: __wt_cond_wait(
+ session, l->cond_readers, WT_THOUSAND, NULL);
+ continue;
+ }
+
+ /*
+ * If we are the first reader to queue, set the next read
+ * group. Note: don't re-read from the lock or we could race
+ * with a writer unlocking.
+ */
+ new.u.v = old.u.v;
+ if (new.u.s.readers_queued++ == 0)
+ new.u.s.reader = new.u.s.next;
+ ticket = new.u.s.reader;
+
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ break;
+ }
+
+ /* Wait for our group to start. */
+ for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) {
+ if (pause_cnt < 1000)
+ WT_PAUSE();
+ else if (pause_cnt < 1200)
__wt_yield();
+ else {
+ session->current_rwlock = l;
+ session->current_rwticket = ticket;
+ __wt_cond_wait(
+ session, l->cond_readers, 0, __read_blocked);
+ }
}
- /*
- * We're the only writer of the readers field, so the update does not
- * need to be atomic.
- */
- ++l->s.readers;
+ WT_ASSERT(session, l->u.s.readers_active > 0);
/*
* Applications depend on a barrier here so that operations holding the
- * lock see consistent data.
+ * lock see consistent data. The atomic operation above isn't
+ * sufficient here because we don't own the lock until our ticket comes
+ * up and whatever data we are protecting may have changed in the
+ * meantime.
*/
WT_READ_BARRIER();
}
@@ -244,13 +254,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ WT_RWLOCK new, old;
- /*
- * Increment the writers value (other readers are doing the same, make
- * sure we don't race).
- */
- (void)__wt_atomic_add16(&l->s.writers, 1);
+ do {
+ old.u.v = l->u.v;
+ WT_ASSERT(session, old.u.s.readers_active > 0);
+
+ /*
+ * Decrement the active reader count (other readers are doing
+ * the same, make sure we don't race).
+ */
+ new.u.v = old.u.v;
+ --new.u.s.readers_active;
+ } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
+
+ if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next)
+ __wt_cond_signal(session, l->cond_writers);
}
/*
@@ -264,22 +283,44 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_write);
- old = new = *l;
-
/*
- * This write lock can only be granted if the lock was last granted to
- * a writer and there are no readers or writers blocked on the lock,
- * that is, if this thread's ticket would be the next ticket granted.
- * Do the cheap test to see if this can possibly succeed (and confirm
- * the lock is in the correct state to grant this write lock).
+ * This write lock can only be granted if no readers or writers blocked
+ * on the lock, that is, if this thread's ticket would be the next
+ * ticket granted. Check if this can possibly succeed (and confirm the
+ * lock is in the correct state to grant this write lock).
*/
- if (old.s.writers != old.s.next)
+ old.u.v = l->u.v;
+ if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
return (EBUSY);
- /* The replacement lock value is a result of allocating a new ticket. */
- ++new.s.next;
- ++new.s.writers_active;
- return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+ /*
+ * We've checked above that there is no writer active (since
+ * `current == next`), so there should be no readers queued.
+ */
+ WT_ASSERT(session, old.u.s.readers_queued == 0);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket.
+ *
+ * We rely on this atomic operation to provide a barrier.
+ */
+ new.u.v = old.u.v;
+ new.u.s.next++;
+ return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
+}
+
+/*
+ * __write_blocked --
+ * Check whether the current write lock request should keep waiting.
+ */
+static bool
+__write_blocked(WT_SESSION_IMPL *session)
+{
+ WT_RWLOCK *l;
+
+ l = session->current_rwlock;
+ return (session->current_rwticket != l->u.s.current ||
+ l->u.s.readers_active != 0);
}
/*
@@ -289,34 +330,51 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- uint16_t ticket;
+ WT_RWLOCK new, old;
int pause_cnt;
+ uint8_t ticket;
WT_STAT_CONN_INCR(session, rwlock_write);
- /*
- * Possibly wrap: if we have more than 64K lockers waiting, the ticket
- * value will wrap and two lockers will simultaneously be granted the
- * lock.
- */
- ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
- (void)__wt_atomic_add16(&l->s.writers_active, 1);
- for (pause_cnt = 0; ticket != l->s.writers;) {
+ for (;;) {
+ new.u.v = old.u.v = l->u.v;
+ ticket = new.u.s.next++;
+
/*
- * We failed to get the lock; pause before retrying and if we've
- * paused enough, sleep so we don't burn CPU to no purpose. This
- * situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources.
+ * Avoid wrapping: if we allocate more than 256 tickets, two
+ * lockers will simultaneously be granted the lock.
*/
- if (++pause_cnt < WT_THOUSAND)
+ if (new.u.s.current == new.u.s.next) {
+ __wt_cond_wait(
+ session, l->cond_writers, WT_THOUSAND, NULL);
+ continue;
+ }
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ break;
+ }
+
+ /* Wait for our group to start and any readers to drain. */
+ for (pause_cnt = 0;
+ ticket != l->u.s.current || l->u.s.readers_active != 0;
+ pause_cnt++) {
+ if (pause_cnt < 1000)
WT_PAUSE();
- else
- __wt_sleep(0, 10);
+ else if (pause_cnt < 1200)
+ __wt_yield();
+ else {
+ session->current_rwlock = l;
+ session->current_rwticket = ticket;
+ __wt_cond_wait(
+ session, l->cond_writers, 0, __write_blocked);
+ }
}
/*
* Applications depend on a barrier here so that operations holding the
- * lock see consistent data.
+ * lock see consistent data. The atomic operation above isn't
+ * sufficient here because we don't own the lock until our ticket comes
+ * up and whatever data we are protecting may have changed in the
+ * meantime.
*/
WT_READ_BARRIER();
}
@@ -328,29 +386,34 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_RWLOCK new;
-
- WT_UNUSED(session);
+ WT_RWLOCK new, old;
- (void)__wt_atomic_sub16(&l->s.writers_active, 1);
+ do {
+ new.u.v = old.u.v = l->u.v;
- /*
- * Ensure that all updates made while the lock was held are visible to
- * the next thread to acquire the lock.
- */
- WT_WRITE_BARRIER();
+ /*
+ * We're holding the lock exclusive, so there shouldn't be any
+ * active readers.
+ */
+ WT_ASSERT(session, old.u.s.readers_active == 0);
- new = *l;
+ /*
+ * Allow the next batch to start.
+ *
+ * If there are readers in the next group, swap queued readers
+ * to active: this could race with new readlock requests, so we
+ * have to spin.
+ */
+ if (++new.u.s.current == new.u.s.reader) {
+ new.u.s.readers_active = new.u.s.readers_queued;
+ new.u.s.readers_queued = 0;
+ }
+ } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
- /*
- * We're the only writer of the writers/readers fields, so the update
- * does not need to be atomic; we have to update both values at the
- * same time though, otherwise we'd potentially race with the thread
- * next granted the lock.
- */
- ++new.s.writers;
- ++new.s.readers;
- l->i.wr = new.i.wr;
+ if (new.u.s.readers_active != 0)
+ __wt_cond_signal(session, l->cond_readers);
+ else if (new.u.s.current != new.u.s.next)
+ __wt_cond_signal(session, l->cond_writers);
WT_DIAGNOSTIC_YIELD;
}
@@ -365,6 +428,6 @@ __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
WT_UNUSED(session);
- return (l->s.writers != l->s.next || l->s.readers != l->s.next);
+ return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
}
#endif
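Taken together, the rwlock.c hunks above keep the entire lock state in one 64-bit word: a writer may enter only when its ticket (next) is the ticket currently being served (current) and no readers are active, and both the try-lock and the unlock are a single compare-and-swap on that word. Below is a minimal, self-contained sketch of just that write-path fast path. It is not WiredTiger code: the field names and widths follow the PRIu8/PRIu16 formats printed by the new test's dump thread, and the condition-variable backoff, reader queueing, and barrier details of the real implementation are omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* All lock state packed into a single atomically-updated 64-bit word. */
typedef union {
	uint64_t v;
	struct {
		uint8_t current;		/* Ticket currently being served */
		uint8_t next;			/* Next ticket to hand out */
		uint8_t reader;			/* Ticket at which queued readers start */
		uint8_t unused;
		uint16_t readers_active;	/* Readers currently holding the lock */
		uint16_t readers_queued;	/* Readers waiting for their ticket */
	} s;
} ticket_lock;

static bool
try_writelock(_Atomic uint64_t *lock)
{
	ticket_lock old, new;

	old.v = atomic_load(lock);
	/* Only the very next ticket can be granted, and only with no readers. */
	if (old.s.current != old.s.next || old.s.readers_active != 0)
		return (false);

	new = old;
	new.s.next++;		/* Allocate a ticket: we own the lock if the CAS wins. */
	return (atomic_compare_exchange_strong(lock, &old.v, new.v));
}

static void
writeunlock(_Atomic uint64_t *lock)
{
	ticket_lock old, new;

	do {			/* Serve the next ticket; retry if the word changed. */
		old.v = atomic_load(lock);
		new = old;
		new.s.current++;
	} while (!atomic_compare_exchange_strong(lock, &old.v, new.v));
}

int
main(void)
{
	_Atomic uint64_t lock = 0;

	printf("first try:    %d\n", try_writelock(&lock));	/* 1: granted */
	printf("second try:   %d\n", try_writelock(&lock));	/* 0: busy */
	writeunlock(&lock);
	printf("after unlock: %d\n", try_writelock(&lock));	/* 1: granted again */
	return (0);
}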
diff --git a/src/support/thread_group.c b/src/support/thread_group.c
index 5abc3d28cc0..59caaedf5cf 100644
--- a/src/support/thread_group.c
+++ b/src/support/thread_group.c
@@ -104,7 +104,7 @@ __thread_group_shrink(
if (thread == NULL)
continue;
WT_TRET(__wt_thread_join(session, thread->tid));
- WT_TRET(__wt_cond_destroy(session, &thread->pause_cond));
+ __wt_cond_destroy(session, &thread->pause_cond);
}
__wt_writelock(session, &group->lock);
for (current_slot = group->alloc; current_slot > new_count; ) {
@@ -234,7 +234,7 @@ err: /*
wt_session = (WT_SESSION *)thread->session;
WT_TRET(wt_session->close(wt_session, NULL));
}
- WT_TRET(__wt_cond_destroy(session, &thread->pause_cond));
+ __wt_cond_destroy(session, &thread->pause_cond);
__wt_free(session, thread);
}
@@ -290,7 +290,7 @@ __wt_thread_group_create(
__wt_verbose(session, WT_VERB_THREAD_GROUP,
"Creating thread group: %p", (void *)group);
- __wt_rwlock_init(session, &group->lock);
+ WT_RET(__wt_rwlock_init(session, &group->lock));
WT_ERR(__wt_cond_alloc(
session, "thread group cond", &group->wait_cond));
cond_alloced = true;
@@ -307,7 +307,7 @@ __wt_thread_group_create(
/* Cleanup on error to avoid leaking resources */
err: if (ret != 0) {
if (cond_alloced)
- WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
+ __wt_cond_destroy(session, &group->wait_cond);
__wt_rwlock_destroy(session, &group->lock);
}
return (ret);
@@ -332,7 +332,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group)
__wt_free(session, group->threads);
- WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
+ __wt_cond_destroy(session, &group->wait_cond);
__wt_rwlock_destroy(session, &group->lock);
/*
diff --git a/src/txn/txn.c b/src/txn/txn.c
index d9edbb80564..fb77ab4e860 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
n = 0;
/* We're going to scan the table: wait for the lock. */
- __wt_readlock_spin(session, &txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
current_id = pinned_id = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -293,7 +293,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
/* First do a read-only scan. */
if (wait)
- __wt_readlock_spin(session, &txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
else if ((ret =
__wt_try_readlock(session, &txn_global->scan_rwlock)) != 0)
return (ret == EBUSY ? 0 : ret);
@@ -782,8 +782,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
- __wt_rwlock_init(session, &txn_global->scan_rwlock);
- __wt_rwlock_init(session, &txn_global->nsnap_rwlock);
+ WT_RET(__wt_rwlock_init(session, &txn_global->scan_rwlock));
+ WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
TAILQ_INIT(&txn_global->nsnaph);
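The thread_group.c and txn.c hunks above follow from two signature changes: __wt_rwlock_init can now fail (presumably because it allocates the lock's cond_readers/cond_writers condition variables), so callers check it with WT_RET, while __wt_cond_destroy no longer returns a status, so error paths can tear things down unconditionally. A minimal sketch of that caller-side shape, using hypothetical names rather than WiredTiger's API, might look like:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for a lock and a condition variable that must be allocated. */
struct fake_lock { int dummy; };
struct fake_cond { int dummy; };

static int
group_create(struct fake_lock **lockp, struct fake_cond **condp)
{
	struct fake_cond *cond = NULL;
	struct fake_lock *lock = NULL;
	int ret = 0;

	/* Lock creation can fail: check it before doing anything else. */
	if ((lock = calloc(1, sizeof(*lock))) == NULL)
		return (ENOMEM);

	if ((cond = calloc(1, sizeof(*cond))) == NULL) {
		ret = ENOMEM;
		goto err;
	}

	*lockp = lock;
	*condp = cond;
	return (0);

err:	/* Cleanup on error to avoid leaking resources. */
	free(cond);	/* Destroying the condvar cannot fail. */
	free(lock);
	return (ret);
}

int
main(void)
{
	struct fake_cond *cond;
	struct fake_lock *lock;

	printf("group_create: %d\n", group_create(&lock, &cond));
	free(cond);
	free(lock);
	return (0);
}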
diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am
index 10ab890f2f5..f2b4fcacdc8 100644
--- a/test/csuite/Makefile.am
+++ b/test/csuite/Makefile.am
@@ -57,6 +57,9 @@ noinst_PROGRAMS += test_wt3135_search_near_collator
test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c
noinst_PROGRAMS += test_wt3184_dup_index_collator
+test_rwlock_SOURCES = rwlock/main.c
+noinst_PROGRAMS += test_rwlock
+
# Run this during a "make check" smoke test.
TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
diff --git a/test/csuite/rwlock/main.c b/test/csuite/rwlock/main.c
new file mode 100644
index 00000000000..04813182478
--- /dev/null
+++ b/test/csuite/rwlock/main.c
@@ -0,0 +1,184 @@
+/*-
+ * Public Domain 2014-2017 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: HELP-4355
+ * Test rwlock collapse under load.
+ */
+#define MAX_THREADS 1000
+#define READS_PER_WRITE 10000
+//#define READS_PER_WRITE 1000000
+//#define READS_PER_WRITE 100
+
+#define CHECK_CORRECTNESS 1
+//#define USE_POSIX 1
+
+static WT_RWLOCK rwlock;
+static pthread_rwlock_t p_rwlock;
+static bool running;
+static uint64_t shared_counter;
+
+void *thread_rwlock(void *);
+void *thread_dump(void *);
+
+int
+main(int argc, char *argv[])
+{
+ TEST_OPTS *opts, _opts;
+ struct timespec te, ts;
+ pthread_t dump_id, id[MAX_THREADS];
+ int i;
+
+ if (!testutil_enable_long_tests()) /* Ignore unless requested */
+ return (EXIT_SUCCESS);
+
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ opts->nthreads = 100;
+ opts->nops = 1000000; /* per thread */
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ running = true;
+
+ testutil_make_work_dir(opts->home);
+ testutil_check(wiredtiger_open(opts->home, NULL,
+ "create,session_max=1000,statistics=(fast)", &opts->conn));
+
+ testutil_check(__wt_rwlock_init(NULL, &rwlock));
+ testutil_check(pthread_rwlock_init(&p_rwlock, NULL));
+
+ testutil_check(pthread_create(
+ &dump_id, NULL, thread_dump, (void *)opts));
+
+ __wt_epoch(NULL, &ts);
+ for (i = 0; i < (int)opts->nthreads; ++i)
+ testutil_check(pthread_create(
+ &id[i], NULL, thread_rwlock, (void *)opts));
+
+ while (--i >= 0)
+ testutil_check(pthread_join(id[i], NULL));
+ __wt_epoch(NULL, &te);
+ printf("%.2lf\n", WT_TIMEDIFF_MS(te, ts) / 1000.0);
+
+ running = false;
+ testutil_check(pthread_join(dump_id, NULL));
+
+ testutil_check(pthread_rwlock_destroy(&p_rwlock));
+ testutil_cleanup(opts);
+ return (EXIT_SUCCESS);
+}
+
+/*
+ * Acquire a rwlock; every Nth operation, acquire it exclusively.
+ */
+void *
+thread_rwlock(void *arg)
+{
+ TEST_OPTS *opts;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+ uint64_t i, counter = 0;
+ bool writelock;
+
+ opts = (TEST_OPTS *)arg;
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &wt_session));
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ printf("Running rwlock thread\n");
+ for (i = 1; i <= opts->nops; ++i) {
+ writelock = (i % READS_PER_WRITE == 0);
+
+#ifdef USE_POSIX
+ if (writelock)
+ testutil_check(pthread_rwlock_wrlock(&p_rwlock));
+ else
+ testutil_check(pthread_rwlock_rdlock(&p_rwlock));
+#else
+ if (writelock)
+ __wt_writelock(session, &rwlock);
+ else
+ __wt_readlock(session, &rwlock);
+#endif
+
+ /*
+ * Do a tiny amount of work inside the lock so the compiler
+ * can't optimize everything away.
+ */
+ (void)__wt_atomic_add64(&counter, 1);
+
+#ifdef CHECK_CORRECTNESS
+ if (writelock)
+ counter = ++shared_counter;
+ else
+ counter = shared_counter;
+
+ __wt_yield();
+
+ testutil_assert(counter == shared_counter);
+#endif
+
+#ifdef USE_POSIX
+ testutil_check(pthread_rwlock_unlock(&p_rwlock));
+#else
+ if (writelock)
+ __wt_writeunlock(session, &rwlock);
+ else
+ __wt_readunlock(session, &rwlock);
+#endif
+
+ if (i % 10000 == 0) {
+ printf("%s", session->id == 20 ? ".\n" : ".");
+ fflush(stdout);
+ }
+ }
+
+ opts->running = false;
+
+ return (NULL);
+}
+
+void *
+thread_dump(void *arg)
+{
+ WT_UNUSED(arg);
+
+ while (running) {
+ sleep(1);
+ printf("\n"
+ "rwlock { current %" PRIu8 ", next %" PRIu8
+ ", reader %" PRIu8 ", readers_active %" PRIu16
+ ", readers_queued %" PRIu16 " }\n",
+ rwlock.u.s.current,
+ rwlock.u.s.next,
+ rwlock.u.s.reader,
+ rwlock.u.s.readers_active,
+ rwlock.u.s.readers_queued);
+ }
+
+ return (NULL);
+}