-rw-r--r--  src/async/async_api.c              2
-rw-r--r--  src/btree/bt_compact.c             4
-rw-r--r--  src/btree/bt_cursor.c              4
-rw-r--r--  src/btree/bt_debug.c               2
-rw-r--r--  src/btree/bt_discard.c             2
-rw-r--r--  src/btree/bt_handle.c              2
-rw-r--r--  src/btree/bt_read.c                4
-rw-r--r--  src/btree/bt_split.c              20
-rw-r--r--  src/btree/col_modify.c             9
-rw-r--r--  src/btree/row_modify.c            17
-rw-r--r--  src/conn/conn_cache.c              2
-rw-r--r--  src/conn/conn_cache_pool.c         4
-rw-r--r--  src/conn/conn_ckpt.c               2
-rw-r--r--  src/conn/conn_dhandle.c            2
-rw-r--r--  src/conn/conn_handle.c            15
-rw-r--r--  src/conn/conn_log.c               12
-rw-r--r--  src/conn/conn_stat.c               2
-rw-r--r--  src/conn/conn_sweep.c              2
-rw-r--r--  src/cursor/cur_file.c              9
-rw-r--r--  src/include/btmem.h               27
-rw-r--r--  src/include/connection.h          15
-rw-r--r--  src/include/extern.h               7
-rw-r--r--  src/include/extern_posix.h         2
-rw-r--r--  src/include/extern_win.h           2
-rw-r--r--  src/include/mutex.h               31
-rw-r--r--  src/include/serial.i              30
-rw-r--r--  src/include/session.h              4
-rw-r--r--  src/include/verify_build.h         1
-rw-r--r--  src/include/wt_internal.h          4
-rw-r--r--  src/lsm/lsm_manager.c              2
-rw-r--r--  src/lsm/lsm_tree.c                 2
-rw-r--r--  src/os_posix/os_mtx_cond.c        16
-rw-r--r--  src/os_win/os_mtx_cond.c           6
-rw-r--r--  src/reconcile/rec_write.c          6
-rw-r--r--  src/session/session_dhandle.c      4
-rw-r--r--  src/support/mtx_rw.c             405
-rw-r--r--  src/support/thread_group.c         6
-rw-r--r--  src/txn/txn.c                      8
-rw-r--r--  test/csuite/Makefile.am            3
-rw-r--r--  test/csuite/rwlock/main.c        184
40 files changed, 565 insertions, 316 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c
index b9cc995f5a5..3319332aa04 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -440,7 +440,7 @@ __wt_async_destroy(WT_SESSION_IMPL *session)
session, async->worker_tids[i]));
async->worker_tids[i] = 0;
}
- WT_TRET(__wt_cond_destroy(session, &async->flush_cond));
+ __wt_cond_destroy(session, &async->flush_cond);
/* Close the server threads' sessions. */
for (i = 0; i < conn->async_workers; i++)
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 2edcac76d0b..4b40d0aeed0 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
*/
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writelock(session, &page->page_lock);
+ WT_PAGE_LOCK(session, page);
if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
@@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
return (ret);
}
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 944e276fc01..f0aa632551b 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -369,7 +369,7 @@ __cursor_col_modify(
WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove)
{
return (__wt_col_modify(session,
- cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
+ cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove, false));
}
/*
@@ -381,7 +381,7 @@ __cursor_row_modify(
WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove)
{
return (__wt_row_modify(session,
- cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
+ cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove, false));
}
/*
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index d3f02e29b90..fdc33b608ec 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -689,8 +689,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", entries %" PRIu32, entries));
WT_RET(ds->f(ds,
", %s", __wt_page_is_modified(page) ? "dirty" : "clean"));
- WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked(
- session, &page->page_lock) ? "locked" : "unlocked"));
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
WT_RET(ds->f(ds, ", keys-built"));
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index bab7b8145d6..04c0a5d410d 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -98,7 +98,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));
/*
* If a root page split, there may be one or more pages linked from the
@@ -254,6 +253,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_ovfl_discard_free(session, page);
__wt_free(session, page->modify->ovfl_track);
+ __wt_spin_destroy(session, &page->modify->page_lock);
__wt_free(session, page->modify);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index d76720b19ae..a0da7df0998 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -444,7 +444,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/* Initialize locks. */
- __wt_rwlock_init(session, &btree->ovfl_lock);
+ WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));
btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 64874547b9c..ae1f8427b25 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -90,7 +90,7 @@ __col_instantiate(WT_SESSION_IMPL *session,
{
/* Search the page and add updates. */
WT_RET(__wt_col_search(session, recno, ref, cbt));
- WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false));
+ WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false, false));
return (0);
}
@@ -104,7 +104,7 @@ __row_instantiate(WT_SESSION_IMPL *session,
{
/* Search the page and add updates. */
WT_RET(__wt_row_search(session, key, ref, cbt, true));
- WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false));
+ WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false, false));
return (0);
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 49043c8bab4..627e6b9cb48 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1300,13 +1300,19 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
+ /*
+ * The page will be marked dirty, and we can only lock a page
+ * with a modify structure.
+ */
+ WT_RET(__wt_page_modify_init(session, parent));
+
if (trylock)
- WT_RET(__wt_try_writelock(session, &parent->page_lock));
+ WT_RET(WT_PAGE_TRYLOCK(session, parent));
else
- __wt_writelock(session, &parent->page_lock);
+ WT_PAGE_LOCK(session, parent);
if (parent == ref->home)
break;
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
}
/*
@@ -1329,7 +1335,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
*parentp = parent;
return (0);
-err: __wt_writeunlock(session, &parent->page_lock);
+err: WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1345,7 +1351,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1559,7 +1565,7 @@ __split_multi_inmem(
/* Apply the modification. */
WT_ERR(__wt_col_modify(
- session, &cbt, recno, NULL, upd, false));
+ session, &cbt, recno, NULL, upd, false, true));
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
@@ -1581,7 +1587,7 @@ __split_multi_inmem(
/* Apply the modification. */
WT_ERR(__wt_row_modify(
- session, &cbt, key, NULL, upd, false));
+ session, &cbt, key, NULL, upd, false, true));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
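
Aside (not part of the patch): the hunk above calls __wt_page_modify_init() before taking the lock because the page lock now lives in the page's modify structure (see the btmem.h and row_modify.c changes below). Here is a minimal standalone sketch of that pattern, with mock types and a pthread mutex standing in for the WiredTiger spinlock; it is a model under those assumptions, not the WiredTiger code.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct mock_modify {
	pthread_mutex_t page_lock;		/* Per-page lock */
};

struct mock_page {
	struct mock_page *home;			/* Parent page */
	struct mock_modify *modify;		/* NULL until first modified */
};

/* Allocate the modify structure if the page doesn't have one yet. */
static int
page_modify_init(struct mock_page *page)
{
	struct mock_modify *modify;

	if (page->modify != NULL)
		return (0);
	if ((modify = calloc(1, sizeof(*modify))) == NULL)
		return (ENOMEM);
	if (pthread_mutex_init(&modify->page_lock, NULL) != 0) {
		free(modify);
		return (EINVAL);
	}
	page->modify = modify;
	return (0);
}

/* Lock a child's parent, retrying if the parent changes while we wait. */
static int
lock_parent(struct mock_page *child, bool trylock, struct mock_page **parentp)
{
	struct mock_page *parent;
	int ret;

	for (;;) {
		parent = child->home;

		/* The lock lives in the modify structure: allocate it first. */
		if ((ret = page_modify_init(parent)) != 0)
			return (ret);

		if (trylock) {
			if ((ret = pthread_mutex_trylock(
			    &parent->modify->page_lock)) != 0)
				return (ret);
		} else
			(void)pthread_mutex_lock(&parent->modify->page_lock);

		/* Only keep the lock if the parent didn't change meanwhile. */
		if (parent == child->home)
			break;
		(void)pthread_mutex_unlock(&parent->modify->page_lock);
	}

	*parentp = parent;
	return (0);
}

int
main(void)
{
	struct mock_page root = { NULL, NULL };
	struct mock_page child = { &root, NULL };
	struct mock_page *parent;

	if (lock_parent(&child, false, &parent) != 0)
		return (1);
	(void)pthread_mutex_unlock(&parent->modify->page_lock);
	(void)pthread_mutex_destroy(&parent->modify->page_lock);
	free(parent->modify);
	return (0);
}

The retry loop mirrors __split_internal_lock() above: the parent can change while we wait for the lock, so the lock is only kept if the child still points at the same parent.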
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 9ccb9728189..8b758453288 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -17,7 +17,8 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove)
+ uint64_t recno, WT_ITEM *value,
+ WT_UPDATE *upd_arg, bool is_remove, bool exclusive)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -103,7 +104,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, &cbt->ins->upd, &upd, upd_size));
+ session, page, &cbt->ins->upd, &upd, upd_size, false));
} else {
/* Allocate the append/update list reference as necessary. */
if (append) {
@@ -185,11 +186,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (append)
WT_ERR(__wt_col_append_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, &cbt->recno, skipdepth));
+ &ins, ins_size, &cbt->recno, skipdepth, exclusive));
else
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
/* If the update was successful, add it to the in-memory log. */
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index b1a81ca3d9f..6b66c4bcdc4 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -15,18 +15,12 @@
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_CONNECTION_IMPL *conn;
WT_PAGE_MODIFY *modify;
- conn = S2C(session);
-
WT_RET(__wt_calloc_one(session, &modify));
- /*
- * Select a spinlock for the page; let the barrier immediately below
- * keep things from racing too badly.
- */
- modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;
+ /* Initialize the spinlock for the page. */
+ WT_RET(__wt_spin_init(session, &modify->page_lock, "btree page"));
/*
* Multiple threads of control may be searching and deciding to modify
@@ -47,7 +41,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove)
+ WT_ITEM *key, WT_ITEM *value,
+ WT_UPDATE *upd_arg, bool is_remove, bool exclusive)
{
WT_DECL_RET;
WT_INSERT *ins;
@@ -132,7 +127,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, upd_entry, &upd, upd_size));
+ session, page, upd_entry, &upd, upd_size, exclusive));
} else {
/*
* Allocate the insert array as necessary.
@@ -207,7 +202,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Insert the WT_INSERT structure. */
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
if (logged)
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 28dd06332e0..12279709191 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
cache->bytes_dirty_intl + cache->bytes_dirty_leaf,
cache->pages_dirty_intl + cache->pages_dirty_leaf);
- WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+ __wt_cond_destroy(session, &cache->evict_cond);
__wt_spin_destroy(session, &cache->evict_pass_lock);
__wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index ed078991581..3f7a770b762 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -225,7 +225,7 @@ err: __wt_spin_unlock(session, &__wt_process.spinlock);
__wt_free(session, pool_name);
if (ret != 0 && created) {
__wt_free(session, cp->name);
- WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_cond_destroy(session, &cp->cache_pool_cond);
__wt_free(session, cp);
}
return (ret);
@@ -391,7 +391,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
__wt_free(session, cp->name);
__wt_spin_destroy(session, &cp->cache_pool_lock);
- WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_cond_destroy(session, &cp->cache_pool_cond);
__wt_free(session, cp);
}
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index d5a6faf7bd7..dea261162c2 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -233,7 +233,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
conn->ckpt_tid_set = false;
}
- WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));
+ __wt_cond_destroy(session, &conn->ckpt_cond);
/* Close the server thread's session. */
if (conn->ckpt_session != NULL) {
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 657cdebf7ee..41dcfb8fffb 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -52,7 +52,7 @@ __wt_conn_dhandle_alloc(
WT_RET(__wt_calloc_one(session, &dhandle));
- __wt_rwlock_init(session, &dhandle->rwlock);
+ WT_ERR(__wt_rwlock_init(session, &dhandle->rwlock));
dhandle->name_hash = __wt_hash_city64(uri, strlen(uri));
WT_ERR(__wt_strdup(session, uri, &dhandle->name));
WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint));
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 287e9ca7b99..5e4a8c29adc 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -62,14 +62,9 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file"));
/* Read-write locks */
- __wt_rwlock_init(session, &conn->dhandle_lock);
- __wt_rwlock_init(session, &conn->hot_backup_lock);
- __wt_rwlock_init(session, &conn->table_lock);
-
- WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
- for (i = 0; i < WT_PAGE_LOCKS; ++i)
- WT_RET(
- __wt_spin_init(session, &conn->page_lock[i], "btree page"));
+ WT_RET(__wt_rwlock_init(session, &conn->dhandle_lock));
+ WT_RET(__wt_rwlock_init(session, &conn->hot_backup_lock));
+ WT_RET(__wt_rwlock_init(session, &conn->table_lock));
/* Setup the spin locks for the LSM manager queues. */
WT_RET(__wt_spin_init(session,
@@ -113,7 +108,6 @@ void
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
WT_SESSION_IMPL *session;
- u_int i;
/* Check there's something to destroy. */
if (conn == NULL)
@@ -144,9 +138,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->schema_lock);
__wt_rwlock_destroy(session, &conn->table_lock);
__wt_spin_destroy(session, &conn->turtle_lock);
- for (i = 0; i < WT_PAGE_LOCKS; ++i)
- __wt_spin_destroy(session, &conn->page_lock[i]);
- __wt_free(session, conn->page_lock);
/* Free allocated memory. */
__wt_free(session, conn->cfg);
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index b8b5bd2a908..d2ed314fd2e 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -880,7 +880,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
"log write LSN"));
- __wt_rwlock_init(session, &log->log_archive_lock);
+ WT_RET(__wt_rwlock_init(session, &log->log_archive_lock));
if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
log->allocsize = (uint32_t)
WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN);
@@ -1043,12 +1043,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
}
/* Destroy the condition variables now that all threads are stopped */
- WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+ __wt_cond_destroy(session, &conn->log_cond);
+ __wt_cond_destroy(session, &conn->log_file_cond);
+ __wt_cond_destroy(session, &conn->log_wrlsn_cond);
- WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
- WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
+ __wt_cond_destroy(session, &conn->log->log_sync_cond);
+ __wt_cond_destroy(session, &conn->log->log_write_cond);
__wt_rwlock_destroy(session, &conn->log->log_archive_lock);
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index d89392b66c6..3234d51cd4c 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -648,7 +648,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
WT_TRET(__wt_thread_join(session, conn->stat_tid));
conn->stat_tid_set = false;
}
- WT_TRET(__wt_cond_destroy(session, &conn->stat_cond));
+ __wt_cond_destroy(session, &conn->stat_cond);
/* Log a set of statistics on shutdown if configured. */
if (is_close)
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index 22d90b08438..e273f1d08e5 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -432,7 +432,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->sweep_tid));
conn->sweep_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));
+ __wt_cond_destroy(session, &conn->sweep_cond);
if (conn->sweep_session != NULL) {
wt_session = &conn->sweep_session->iface;
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index 205afb607c3..bc54f10f2d6 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -486,7 +486,14 @@ __curfile_create(WT_SESSION_IMPL *session,
WT_STAT_DATA_INCR(session, cursor_create);
if (0) {
-err: WT_TRET(__curfile_close(cursor));
+err: /*
+ * Our caller expects to release the data handle if we fail.
+ * Disconnect it from the cursor before closing.
+ */
+ if (session->dhandle != NULL)
+ __wt_cursor_dhandle_decr_use(session);
+ cbt->btree = NULL;
+ WT_TRET(__curfile_close(cursor));
*cursorp = NULL;
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index f1bb08d2699..eb523c01ad7 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -414,18 +414,20 @@ struct __wt_page_modify {
size_t discard_allocated;
} *ovfl_track;
+#define WT_PAGE_LOCK(s, p) \
+ __wt_spin_lock((s), &(p)->modify->page_lock)
+#define WT_PAGE_TRYLOCK(s, p) \
+ __wt_spin_trylock((s), &(p)->modify->page_lock)
+#define WT_PAGE_UNLOCK(s, p) \
+ __wt_spin_unlock((s), &(p)->modify->page_lock)
+ WT_SPINLOCK page_lock; /* Page's spinlock */
+
/*
* The write generation is incremented when a page is modified, a page
* is clean if the write generation is 0.
*/
uint32_t write_gen;
-#define WT_PAGE_LOCK(s, p) \
- __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
-#define WT_PAGE_UNLOCK(s, p) \
- __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
- uint8_t page_lock; /* Page's spinlock */
-
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
#define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */
@@ -603,13 +605,6 @@ struct __wt_page {
uint8_t unused[2]; /* Unused padding */
/*
- * Used to protect and co-ordinate splits for internal pages and
- * reconciliation for all pages. Only used to co-ordinate among the
- * uncommon cases that require exclusive access to a page.
- */
- WT_RWLOCK page_lock;
-
- /*
* The page's read generation acts as an LRU value for each page in the
* tree; it is used by the eviction server thread to select pages to be
* discarded from the in-memory tree.
@@ -635,8 +630,6 @@ struct __wt_page {
#define WT_READGEN_STEP 100
uint64_t read_gen;
- uint64_t evict_pass_gen; /* Eviction pass generation */
-
size_t memory_footprint; /* Memory attached to the page */
/* Page's on-disk representation: NULL for pages created in memory. */
@@ -644,6 +637,10 @@ struct __wt_page {
/* If/when the page is modified, we need lots more information. */
WT_PAGE_MODIFY *modify;
+
+ /* This is the 64 byte boundary, try to keep hot fields above here. */
+
+ uint64_t evict_pass_gen; /* Eviction pass generation */
};
/*
diff --git a/src/include/connection.h b/src/include/connection.h
index 6c23492e926..f74732684f5 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -175,21 +175,6 @@ struct __wt_connection_impl {
WT_SPINLOCK turtle_lock; /* Turtle file spinlock */
WT_RWLOCK dhandle_lock; /* Data handle list lock */
- /*
- * We distribute the btree page locks across a set of spin locks. Don't
- * use too many: they are only held for very short operations, each one
- * is 64 bytes, so 256 will fill the L1 cache on most CPUs.
- *
- * Use a prime number of buckets rather than assuming a good hash
- * (Reference Sedgewick, Algorithms in C, "Hash Functions").
- *
- * Note: this can't be an array, we impose cache-line alignment and gcc
- * doesn't support that for arrays smaller than the alignment.
- */
-#define WT_PAGE_LOCKS 17
- WT_SPINLOCK *page_lock; /* Btree page spinlocks */
- u_int page_lock_cnt; /* Next spinlock to use */
-
/* Connection queue */
TAILQ_ENTRY(__wt_connection_impl) q;
/* Cache pool queue */
diff --git a/src/include/extern.h b/src/include/extern.h
index 0cfc284b313..bf3279d0f94 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -182,7 +182,7 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *b
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -191,7 +191,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c
extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
@@ -674,10 +674,9 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l);
+extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h
index 3afffef687b..c0ed056c7b6 100644
--- a/src/include/extern_posix.h
+++ b/src/include/extern_posix.h
@@ -15,7 +15,7 @@ extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *ma
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
-extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_absolute_path(const char *path);
diff --git a/src/include/extern_win.h b/src/include/extern_win.h
index 4e232a2df80..d548ee0b2ec 100644
--- a/src/include/extern_win.h
+++ b/src/include/extern_win.h
@@ -13,7 +13,7 @@ extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, v
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
-extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_absolute_path(const char *path);
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 910eb7af5b9..36acea810d9 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -37,17 +37,21 @@ struct __wt_condvar {
* Don't modify this structure without understanding the read/write locking
* functions.
*/
-union __wt_rwlock { /* Read/write lock */
- uint64_t u;
- struct {
- uint32_t wr; /* Writers and readers */
- } i;
- struct {
- uint16_t writers; /* Now serving for writers */
- uint16_t readers; /* Now serving for readers */
- uint16_t next; /* Next available ticket number */
- uint16_t writers_active;/* Count of active writers */
- } s;
+struct __wt_rwlock { /* Read/write lock */
+ volatile union {
+ uint64_t v; /* Full 64-bit value */
+ struct {
+ uint8_t current; /* Current ticket */
+ uint8_t next; /* Next available ticket */
+ uint8_t reader; /* Read queue ticket */
+ uint8_t __notused; /* Padding */
+ uint16_t readers_active;/* Count of active readers */
+ uint16_t readers_queued;/* Count of queued readers */
+ } s;
+ } u;
+
+ WT_CONDVAR *cond_readers; /* Blocking readers */
+ WT_CONDVAR *cond_writers; /* Blocking writers */
};
/*
@@ -63,8 +67,8 @@ union __wt_rwlock { /* Read/write lock */
#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 3
struct __wt_spinlock {
- WT_CACHE_LINE_PAD_BEGIN
#if SPINLOCK_TYPE == SPINLOCK_GCC
+ WT_CACHE_LINE_PAD_BEGIN
volatile int lock;
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
@@ -87,5 +91,8 @@ struct __wt_spinlock {
int16_t stat_int_usecs_off; /* waiting server threads offset */
int8_t initialized; /* Lock initialized, for cleanup */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
WT_CACHE_LINE_PAD_END
+#endif
};
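
Aside (not part of the patch): the point of packing the ticket state into a single 64-bit union is that one compare-and-swap can check and update every field consistently. The following is a standalone sketch of that idea for the try-read-lock case, assuming GCC/Clang __atomic builtins; it is a model, not the WiredTiger implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	union {
		uint64_t v;			/* Full 64-bit value */
		struct {
			uint8_t current;	/* Current ticket */
			uint8_t next;		/* Next available ticket */
			uint8_t reader;		/* Read queue ticket */
			uint8_t unused;		/* Padding */
			uint16_t readers_active;/* Count of active readers */
			uint16_t readers_queued;/* Count of queued readers */
		} s;
	} u;
} ticket_lock;

/* Shared lock, non-blocking: succeeds only if no writer holds a ticket. */
static bool
ticket_try_readlock(ticket_lock *l)
{
	ticket_lock new, old;
	uint64_t expected;

	new.u.v = old.u.v = l->u.v;
	if (old.u.s.current != old.u.s.next ||	/* Writer active or queued */
	    old.u.s.readers_active == UINT16_MAX)	/* Counter would overflow */
		return (false);
	new.u.s.readers_active++;

	/* One CAS on the whole word updates all the fields atomically. */
	expected = old.u.v;
	return (__atomic_compare_exchange_n(&l->u.v, &expected, new.u.v,
	    false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

int
main(void)
{
	ticket_lock l;

	l.u.v = 0;
	printf("no writer:     %s\n",
	    ticket_try_readlock(&l) ? "granted" : "busy");
	l.u.s.next = 1;		/* Pretend a writer took a ticket */
	printf("writer queued: %s\n",
	    ticket_try_readlock(&l) ? "granted" : "busy");
	return (0);
}

The UINT16_MAX check mirrors the overflow guard added to __wt_try_readlock in the mtx_rw.c changes below.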
diff --git a/src/include/serial.i b/src/include/serial.i
index 982f196b0b8..0134e1a9c20 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -154,7 +154,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
static inline int
__wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
- size_t new_ins_size, uint64_t *recnop, u_int skipdepth)
+ size_t new_ins_size, uint64_t *recnop, u_int skipdepth, bool exclusive)
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
@@ -165,11 +165,16 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
- /* Acquire the page's spinlock, call the worker function. */
- WT_PAGE_LOCK(session, page);
+ /*
+ * Acquire the page's spinlock unless we already have exclusive access.
+ * Then call the worker function.
+ */
+ if (!exclusive)
+ WT_PAGE_LOCK(session, page);
ret = __col_append_serial_func(
session, ins_head, ins_stack, new_ins, recnop, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_UNLOCK(session, page);
if (ret != 0) {
/* Free unused memory on error. */
@@ -198,7 +203,7 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
static inline int
__wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
- size_t new_ins_size, u_int skipdepth)
+ size_t new_ins_size, u_int skipdepth, bool exclusive)
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
@@ -220,10 +225,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
ret = __insert_simple_func(
session, ins_stack, new_ins, skipdepth);
else {
- WT_PAGE_LOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_LOCK(session, page);
ret = __insert_serial_func(
session, ins_head, ins_stack, new_ins, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ if (!exclusive)
+ WT_PAGE_UNLOCK(session, page);
}
if (ret != 0) {
@@ -252,7 +259,8 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
*/
static inline int
__wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
- WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size)
+ WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size,
+ bool exclusive)
{
WT_DECL_RET;
WT_UPDATE *obsolete, *upd = *updp;
@@ -295,7 +303,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
/*
* If there are no subsequent WT_UPDATE structures we are done here.
*/
- if (upd->next == NULL)
+ if (upd->next == NULL || exclusive)
return (0);
/*
@@ -316,11 +324,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/* If we can't lock it, don't scan, that's okay. */
- if (__wt_try_writelock(session, &page->page_lock) != 0)
+ if (WT_PAGE_TRYLOCK(session, page) != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
if (obsolete != NULL)
__wt_update_obsolete_free(session, page, obsolete);
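
Aside (not part of the patch): the new "exclusive" flag lets callers that already own the page outright, such as the split code filling in a page no other thread can see yet, skip the page lock around the serialized update. A minimal standalone sketch of the pattern follows, with mock types and a pthread mutex standing in for the page spinlock.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct mock_page {
	pthread_mutex_t lock;		/* Stand-in for the page spinlock */
	uint64_t update_count;		/* Stand-in for the update lists */
};

/* Apply an update, serializing with other threads unless told not to. */
static int
apply_update(struct mock_page *page, bool exclusive)
{
	/*
	 * Callers that already have the page to themselves pass
	 * exclusive=true and skip the lock entirely.
	 */
	if (!exclusive)
		(void)pthread_mutex_lock(&page->lock);

	++page->update_count;		/* The serialized modification */

	if (!exclusive)
		(void)pthread_mutex_unlock(&page->lock);
	return (0);
}

int
main(void)
{
	struct mock_page page;

	(void)pthread_mutex_init(&page.lock, NULL);
	page.update_count = 0;

	(void)apply_update(&page, false);	/* Normal cursor path */
	(void)apply_update(&page, true);	/* Split path: already exclusive */

	(void)pthread_mutex_destroy(&page.lock);
	return (page.update_count == 2 ? 0 : 1);
}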
diff --git a/src/include/session.h b/src/include/session.h
index 674e92671b1..1b2dfd1ed2b 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -97,6 +97,10 @@ struct __wt_session_impl {
*/
TAILQ_HEAD(__tables, __wt_table) tables;
+ /* Current rwlock for callback. */
+ WT_RWLOCK *current_rwlock;
+ uint8_t current_rwticket;
+
WT_ITEM **scratch; /* Temporary memory for any function */
u_int scratch_alloc; /* Currently allocated */
size_t scratch_cached; /* Scratch bytes cached */
diff --git a/src/include/verify_build.h b/src/include/verify_build.h
index 640f5e4cf5f..a657b9ac460 100644
--- a/src/include/verify_build.h
+++ b/src/include/verify_build.h
@@ -59,7 +59,6 @@ __wt_verify_build(void)
sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \
sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0)
WT_PADDING_CHECK(WT_LOGSLOT);
- WT_PADDING_CHECK(WT_SPINLOCK);
WT_PADDING_CHECK(WT_TXN_STATE);
/*
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index da318ad8a86..cf79578985b 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -268,6 +268,8 @@ struct __wt_ref;
typedef struct __wt_ref WT_REF;
struct __wt_row;
typedef struct __wt_row WT_ROW;
+struct __wt_rwlock;
+ typedef struct __wt_rwlock WT_RWLOCK;
struct __wt_salvage_cookie;
typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
struct __wt_save_upd;
@@ -302,8 +304,6 @@ union __wt_lsn;
typedef union __wt_lsn WT_LSN;
union __wt_rand_state;
typedef union __wt_rand_state WT_RAND_STATE;
-union __wt_rwlock;
- typedef union __wt_rwlock WT_RWLOCK;
/*
* Forward type declarations for internal types: END
* DO NOT EDIT: automatically built by dist/s_typedef.
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index e33e119aa41..b7d9086d10e 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -334,7 +334,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
__wt_spin_destroy(session, &manager->switch_lock);
__wt_spin_destroy(session, &manager->app_lock);
__wt_spin_destroy(session, &manager->manager_lock);
- WT_TRET(__wt_cond_destroy(session, &manager->work_cond));
+ __wt_cond_destroy(session, &manager->work_cond);
return (ret);
}
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index a9275976023..77600c1bf34 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -471,7 +471,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
/* Try to open the tree. */
WT_RET(__wt_calloc_one(session, &lsm_tree));
- __wt_rwlock_init(session, &lsm_tree->rwlock);
+ WT_RET(__wt_rwlock_init(session, &lsm_tree->rwlock));
WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index a5ee78f9e3e..fe010b62305 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -153,7 +153,7 @@ err:
* __wt_cond_destroy --
* Destroy a condition variable.
*/
-int
+void
__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
@@ -161,11 +161,15 @@ __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
cond = *condp;
if (cond == NULL)
- return (0);
+ return;
- ret = pthread_cond_destroy(&cond->cond);
- WT_TRET(pthread_mutex_destroy(&cond->mtx));
- __wt_free(session, *condp);
+ if ((ret = pthread_cond_destroy(&cond->cond)) != 0)
+ WT_PANIC_MSG(
+ session, ret, "pthread_cond_destroy: %s", cond->name);
- return (ret);
+ if ((ret = pthread_mutex_destroy(&cond->mtx)) != 0)
+ WT_PANIC_MSG(
+ session, ret, "pthread_mutex_destroy: %s", cond->name);
+
+ __wt_free(session, *condp);
}
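
Aside (not part of the patch): __wt_cond_destroy now returns void because it runs on cleanup paths that cannot act on a failure; rather than threading an error back through every WT_TRET() call, a destroy failure is treated as fatal. A hedged standalone sketch of that pattern follows, with a mock condition variable and fprintf/abort standing in for WT_PANIC_MSG.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mock_cond {
	pthread_mutex_t mtx;
	pthread_cond_t cond;
	const char *name;
};

/* Destroy a condition variable; failure here is unrecoverable. */
static void
cond_destroy(struct mock_cond **condp)
{
	struct mock_cond *cond;
	int ret;

	if ((cond = *condp) == NULL)
		return;

	if ((ret = pthread_cond_destroy(&cond->cond)) != 0 ||
	    (ret = pthread_mutex_destroy(&cond->mtx)) != 0) {
		/* Stand-in for WT_PANIC_MSG: nothing useful can be done. */
		(void)fprintf(stderr,
		    "cond_destroy: %s: error %d\n", cond->name, ret);
		abort();
	}

	free(cond);
	*condp = NULL;
}

int
main(void)
{
	struct mock_cond *cond;

	if ((cond = calloc(1, sizeof(*cond))) == NULL)
		return (1);
	cond->name = "example";
	(void)pthread_mutex_init(&cond->mtx, NULL);
	(void)pthread_cond_init(&cond->cond, NULL);

	cond_destroy(&cond);
	return (cond == NULL ? 0 : 1);
}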
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 0001c6c2322..c1b9f509d33 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -163,18 +163,16 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
* __wt_cond_destroy --
* Destroy a condition variable.
*/
-int
+void
__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
{
WT_CONDVAR *cond;
cond = *condp;
if (cond == NULL)
- return (0);
+ return;
/* Do nothing to delete Condition Variable */
DeleteCriticalSection(&cond->mtx);
__wt_free(session, *condp);
-
- return (0);
}
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 6f95b84d292..e59d9796352 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -386,7 +386,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- __wt_writelock(session, &page->page_lock);
+ WT_PAGE_LOCK(session, page);
oldest_id = __wt_txn_oldest_id(session);
if (LF_ISSET(WT_EVICTING))
@@ -405,7 +405,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
/* Initialize the reconciliation structure for each new run. */
if ((ret = __rec_write_init(
session, ref, flags, salvage, &session->reconcile)) != 0) {
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
return (ret);
}
r = session->reconcile;
@@ -446,7 +446,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
/*
* If our caller can configure lookaside table reconciliation, flag if
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 95fb6a6f90e..9abc7a54f5d 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -261,8 +261,8 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
* can get a handle without special flags.
*/
if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) {
- __session_find_dhandle(session,
- dhandle->name, dhandle->checkpoint, &dhandle_cache);
+ WT_SAVE_DHANDLE(session, __session_find_dhandle(session,
+ dhandle->name, dhandle->checkpoint, &dhandle_cache));
if (dhandle_cache != NULL)
__session_discard_dhandle(session, dhandle_cache);
}
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 35ad5da23f2..b2ab32bdef1 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -27,7 +27,7 @@
*/
/*
- * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
* http://locklessinc.com/articles/locks/
*
* Dr. Fuerst further credits:
@@ -39,77 +39,46 @@
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
*
- * The following is an explanation of this code. First, the underlying lock
- * structure.
+ * The following is an explanation of our interpretation and implementation.
+ * First, the underlying lock structure.
*
+ * volatile union {
+ * uint64_t v; // Full 64-bit value
* struct {
- * uint16_t writers; Now serving for writers
- * uint16_t readers; Now serving for readers
- * uint16_t next; Next available ticket number
- * uint16_t __notused; Padding
- * }
+ * uint8_t current; // Current ticket
+ * uint8_t next; // Next available ticket
+ * uint8_t reader; // Read queue ticket
+ * uint8_t __notused; // Padding
+ * uint16_t readers_active; // Count of active readers
+ * uint16_t readers_queued; // Count of queued readers
+ * } s;
+ * } u;
*
* First, imagine a store's 'take a number' ticket algorithm. A customer takes
* a unique ticket number and customers are served in ticket order. In the data
- * structure, 'writers' is the next writer to be served, 'readers' is the next
- * reader to be served, and 'next' is the next available ticket number.
+ * structure, 'next' is the ticket that will be allocated next, and 'current'
+ * is the ticket being served.
*
- * Next, consider exclusive (write) locks. The 'now serving' number for writers
- * is 'writers'. To lock, 'take a number' and wait until that number is being
- * served; more specifically, atomically copy and increment the current value of
- * 'next', and then wait until 'writers' equals that copied number.
+ * Next, consider exclusive (write) locks. To lock, 'take a number' and wait
+ * until that number is being served; more specifically, atomically increment
+ * 'next', and then wait until 'current' equals that allocated ticket.
*
- * Shared (read) locks are similar. Like writers, readers atomically get the
- * next number available. However, instead of waiting for 'writers' to equal
- * their number, they wait for 'readers' to equal their number.
+ * Shared (read) locks are similar, except that readers can share a ticket
+ * (both with each other and with a single writer). Readers with a given
+ * ticket execute before the writer with that ticket. In other words, writers
+ * wait for both their ticket to become current and for all readers to exit
+ * the lock.
*
- * This has the effect of queuing lock requests in the order they arrive
- * (incidentally avoiding starvation).
+ * If there are no active writers (indicated by 'current' == 'next'), readers
+ * can immediately enter the lock by atomically incrementing 'readers_active'.
+ * When there are writers active, readers form a new queue by first setting
+ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers,
+ * avoiding starvation), then atomically incrementing 'readers_queued'.
*
- * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
- * In the case of a reader, the 'readers' increment happens when the reader
- * acquires the lock (to allow read-lock sharing), and the 'writers' increment
- * happens when the reader releases the lock. In the case of a writer, both
- * 'readers' and 'writers' are incremented when the writer releases the lock.
- *
- * For example, consider the following read (R) and write (W) lock requests:
- *
- * writers readers next
- * 0 0 0
- * R: ticket 0, readers match OK 0 1 1
- * R: ticket 1, readers match OK 0 2 2
- * R: ticket 2, readers match OK 0 3 3
- * W: ticket 3, writers no match block 0 3 4
- * R: ticket 2, unlock 1 3 4
- * R: ticket 0, unlock 2 3 4
- * R: ticket 1, unlock 3 3 4
- * W: ticket 3, writers match OK 3 3 4
- *
- * Note the writer blocks until 'writers' equals its ticket number and it does
- * not matter if readers unlock in order or not.
- *
- * Readers or writers entering the system after the write lock is queued block,
- * and the next ticket holder (reader or writer) will unblock when the writer
- * unlocks. An example, continuing from the last line of the above example:
- *
- * writers readers next
- * W: ticket 3, writers match OK 3 3 4
- * R: ticket 4, readers no match block 3 3 5
- * R: ticket 5, readers no match block 3 3 6
- * W: ticket 6, writers no match block 3 3 7
- * W: ticket 3, unlock 4 4 7
- * R: ticket 4, readers match OK 4 5 7
- * R: ticket 5, readers match OK 4 6 7
- *
- * The 'next' field is a 2-byte value so the available ticket number wraps at
- * 64K requests. If a thread's lock request is not granted until the 'next'
- * field cycles and the same ticket is taken by another thread, we could grant
- * a lock to two separate threads at the same time, and bad things happen: two
- * writer threads or a reader thread and a writer thread would run in parallel,
- * and lock waiters could be skipped if the unlocks race. This is unlikely, it
- * only happens if a lock request is blocked by 64K other requests. The fix is
- * to grow the lock structure fields, but the largest atomic instruction we have
- * is 8 bytes, the structure has no room to grow.
+ * The 'next' field is a 1-byte value so the available ticket number wraps
+ * after 256 requests. If a thread's write lock request would cause the 'next'
+ * field to catch up with 'current', instead it waits to avoid the same ticket
+ * being allocated to multiple threads.
*/
#include "wt_internal.h"
@@ -118,12 +87,14 @@
* __wt_rwlock_init --
* Initialize a read/write lock.
*/
-void
+int
__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ l->u.v = 0;
- l->u = 0;
+ WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers));
+ WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers));
+ return (0);
}
/*
@@ -133,9 +104,10 @@ __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ l->u.v = 0;
- l->u = 0;
+ __wt_cond_destroy(session, &l->cond_readers);
+ __wt_cond_destroy(session, &l->cond_writers);
}
/*
@@ -149,46 +121,35 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_read);
- new = old = *l;
+ new.u.v = old.u.v = l->u.v;
/*
- * This read lock can only be granted if the lock was last granted to
- * a reader and there are no readers or writers blocked on the lock,
- * that is, if this thread's ticket would be the next ticket granted.
- * Do the cheap test to see if this can possibly succeed (and confirm
- * the lock is in the correct state to grant this read lock).
+ * This read lock can only be granted if there are no active writers.
+ *
+ * Also check for overflow in case there are 64K active readers.
*/
- if (old.s.readers != old.s.next)
+ if (old.u.s.current != old.u.s.next ||
+ new.u.s.readers_active == UINT16_MAX)
return (EBUSY);
/*
- * The replacement lock value is a result of allocating a new ticket and
- * incrementing the reader value to match it.
+ * The replacement lock value is a result of adding an active reader.
+ *
+ * We rely on this atomic operation to provide a barrier.
*/
- new.s.readers = new.s.next = old.s.next + 1;
- return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+ new.u.s.readers_active++;
+ return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
}
/*
- * __wt_readlock_spin --
- * Spin to get a read lock: only yield the CPU if the lock is held
- * exclusive.
+ * __read_blocked --
+ * Check whether the current read lock request should keep waiting.
*/
-void
-__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
+static bool
+__read_blocked(WT_SESSION_IMPL *session)
{
- /*
- * Try to get the lock in a single operation if it is available to
- * readers. This avoids the situation where multiple readers arrive
- * concurrently and have to line up in order to enter the lock. For
- * read-heavy workloads it can make a significant difference.
- */
- while (__wt_try_readlock(session, l) != 0) {
- if (l->s.writers_active > 0)
- __wt_yield();
- else
- WT_PAUSE();
- }
+ return (session->current_rwticket !=
+ session->current_rwlock->u.s.current);
}
/*
@@ -198,41 +159,90 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- uint16_t ticket;
+ WT_RWLOCK new, old;
int pause_cnt;
+ int16_t writers_active;
+ uint8_t ticket;
WT_STAT_CONN_INCR(session, rwlock_read);
WT_DIAGNOSTIC_YIELD;
- /*
- * Possibly wrap: if we have more than 64K lockers waiting, the ticket
- * value will wrap and two lockers will simultaneously be granted the
- * lock.
- */
- ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
- for (pause_cnt = 0; ticket != l->s.readers;) {
+ for (;;) {
/*
- * We failed to get the lock; pause before retrying and if we've
- * paused enough, yield so we don't burn CPU to no purpose. This
- * situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources.
+ * Fast path: if there is no active writer, join the current
+ * group.
*/
- if (++pause_cnt < WT_THOUSAND)
+ for (old.u.v = l->u.v;
+ old.u.s.current == old.u.s.next;
+ old.u.v = l->u.v) {
+ new.u.v = old.u.v;
+ /*
+ * Check for overflow: if the maximum number of readers
+ * are already active, wait to try again.
+ */
+ if (++new.u.s.readers_active == 0)
+ goto stall;
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ return;
WT_PAUSE();
- else
+ }
+
+ /*
+ * There is an active writer: join the next group.
+ *
+ * Limit how many readers can queue: don't allow more readers
+ * to queue than there are active writers (calculated as
+ * `next - current`): otherwise, in write-heavy workloads,
+ * readers can keep queuing up in front of writers and
+ * throughput is unstable.
+ *
+ * If the maximum number of readers are already queued, wait
+ * until we can get a valid ticket.
+ */
+ writers_active = old.u.s.next - old.u.s.current;
+ if (old.u.s.readers_queued > writers_active) {
+stall: __wt_cond_wait(
+ session, l->cond_readers, WT_THOUSAND, NULL);
+ continue;
+ }
+
+ /*
+ * If we are the first reader to queue, set the next read
+ * group. Note: don't re-read from the lock or we could race
+ * with a writer unlocking.
+ */
+ new.u.v = old.u.v;
+ if (new.u.s.readers_queued++ == 0)
+ new.u.s.reader = new.u.s.next;
+ ticket = new.u.s.reader;
+
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ break;
+ }
+
+ /* Wait for our group to start. */
+ for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) {
+ if (pause_cnt < 1000)
+ WT_PAUSE();
+ else if (pause_cnt < 1200)
__wt_yield();
+ else {
+ session->current_rwlock = l;
+ session->current_rwticket = ticket;
+ __wt_cond_wait(
+ session, l->cond_readers, 0, __read_blocked);
+ }
}
- /*
- * We're the only writer of the readers field, so the update does not
- * need to be atomic.
- */
- ++l->s.readers;
+ WT_ASSERT(session, l->u.s.readers_active > 0);
/*
* Applications depend on a barrier here so that operations holding the
- * lock see consistent data.
+ * lock see consistent data. The atomic operation above isn't
+ * sufficient here because we don't own the lock until our ticket comes
+ * up and whatever data we are protecting may have changed in the
+ * meantime.
*/
WT_READ_BARRIER();
}
@@ -244,13 +254,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_UNUSED(session);
+ WT_RWLOCK new, old;
- /*
- * Increment the writers value (other readers are doing the same, make
- * sure we don't race).
- */
- (void)__wt_atomic_add16(&l->s.writers, 1);
+ do {
+ old.u.v = l->u.v;
+ WT_ASSERT(session, old.u.s.readers_active > 0);
+
+ /*
+ * Decrement the active reader count (other readers are doing
+ * the same, make sure we don't race).
+ */
+ new.u.v = old.u.v;
+ --new.u.s.readers_active;
+ } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
+
+ if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next)
+ __wt_cond_signal(session, l->cond_writers);
}
/*
@@ -264,22 +283,44 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_write);
- old = new = *l;
-
/*
- * This write lock can only be granted if the lock was last granted to
- * a writer and there are no readers or writers blocked on the lock,
- * that is, if this thread's ticket would be the next ticket granted.
- * Do the cheap test to see if this can possibly succeed (and confirm
- * the lock is in the correct state to grant this write lock).
+ * This write lock can only be granted if no readers or writers blocked
+ * on the lock, that is, if this thread's ticket would be the next
+ * ticket granted. Check if this can possibly succeed (and confirm the
+ * lock is in the correct state to grant this write lock).
*/
- if (old.s.writers != old.s.next)
+ old.u.v = l->u.v;
+ if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
return (EBUSY);
- /* The replacement lock value is a result of allocating a new ticket. */
- ++new.s.next;
- ++new.s.writers_active;
- return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+ /*
+ * We've checked above that there is no writer active (since
+ * `current == next`), so there should be no readers queued.
+ */
+ WT_ASSERT(session, old.u.s.readers_queued == 0);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket.
+ *
+ * We rely on this atomic operation to provide a barrier.
+ */
+ new.u.v = old.u.v;
+ new.u.s.next++;
+ return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
+}
+
+/*
+ * __write_blocked --
+ * Check whether the current write lock request should keep waiting.
+ */
+static bool
+__write_blocked(WT_SESSION_IMPL *session)
+{
+ WT_RWLOCK *l;
+
+ l = session->current_rwlock;
+ return (session->current_rwticket != l->u.s.current ||
+ l->u.s.readers_active != 0);
}
/*
@@ -289,34 +330,51 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- uint16_t ticket;
+ WT_RWLOCK new, old;
int pause_cnt;
+ uint8_t ticket;
WT_STAT_CONN_INCR(session, rwlock_write);
- /*
- * Possibly wrap: if we have more than 64K lockers waiting, the ticket
- * value will wrap and two lockers will simultaneously be granted the
- * lock.
- */
- ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
- (void)__wt_atomic_add16(&l->s.writers_active, 1);
- for (pause_cnt = 0; ticket != l->s.writers;) {
+ for (;;) {
+ new.u.v = old.u.v = l->u.v;
+ ticket = new.u.s.next++;
+
/*
- * We failed to get the lock; pause before retrying and if we've
- * paused enough, sleep so we don't burn CPU to no purpose. This
- * situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources.
+ * Avoid wrapping: if we allocate more than 256 tickets, two
+ * lockers will simultaneously be granted the lock.
*/
- if (++pause_cnt < WT_THOUSAND)
+ if (new.u.s.current == new.u.s.next) {
+ __wt_cond_wait(
+ session, l->cond_writers, WT_THOUSAND, NULL);
+ continue;
+ }
+ if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+ break;
+ }
+
+ /* Wait for our group to start and any readers to drain. */
+ for (pause_cnt = 0;
+ ticket != l->u.s.current || l->u.s.readers_active != 0;
+ pause_cnt++) {
+ if (pause_cnt < 1000)
WT_PAUSE();
- else
- __wt_sleep(0, 10);
+ else if (pause_cnt < 1200)
+ __wt_yield();
+ else {
+ session->current_rwlock = l;
+ session->current_rwticket = ticket;
+ __wt_cond_wait(
+ session, l->cond_writers, 0, __write_blocked);
+ }
}
/*
* Applications depend on a barrier here so that operations holding the
- * lock see consistent data.
+ * lock see consistent data. The atomic operation above isn't
+ * sufficient here because we don't own the lock until our ticket comes
+ * up and whatever data we are protecting may have changed in the
+ * meantime.
*/
WT_READ_BARRIER();
}
@@ -328,29 +386,34 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
void
__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
- WT_RWLOCK new;
-
- WT_UNUSED(session);
+ WT_RWLOCK new, old;
- (void)__wt_atomic_sub16(&l->s.writers_active, 1);
+ do {
+ new.u.v = old.u.v = l->u.v;
- /*
- * Ensure that all updates made while the lock was held are visible to
- * the next thread to acquire the lock.
- */
- WT_WRITE_BARRIER();
+ /*
+ * We're holding the lock exclusive, there shouldn't be any
+ * active readers.
+ */
+ WT_ASSERT(session, old.u.s.readers_active == 0);
- new = *l;
+ /*
+ * Allow the next batch to start.
+ *
+ * If there are readers in the next group, swap queued readers
+ * to active: this could race with new readlock requests, so we
+ * have to spin.
+ */
+ if (++new.u.s.current == new.u.s.reader) {
+ new.u.s.readers_active = new.u.s.readers_queued;
+ new.u.s.readers_queued = 0;
+ }
+ } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
- /*
- * We're the only writer of the writers/readers fields, so the update
- * does not need to be atomic; we have to update both values at the
- * same time though, otherwise we'd potentially race with the thread
- * next granted the lock.
- */
- ++new.s.writers;
- ++new.s.readers;
- l->i.wr = new.i.wr;
+ if (new.u.s.readers_active != 0)
+ __wt_cond_signal(session, l->cond_readers);
+ else if (new.u.s.current != new.u.s.next)
+ __wt_cond_signal(session, l->cond_writers);
WT_DIAGNOSTIC_YIELD;
}
@@ -365,6 +428,6 @@ __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
WT_UNUSED(session);
- return (l->s.writers != l->s.next || l->s.readers != l->s.next);
+ return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
}
#endif
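
Aside (not part of the patch): the rewritten comment drops the old worked example, so here is an illustrative trace of the new fields under the rules described above. Two writers (W1, W2) and two readers (R1, R2) arrive in the order shown; the columns are current, next, reader, readers_active and readers_queued.

                                           current next reader active queued
    initial state                             0     0     0      0      0
    R1: no writer, joins current group        0     0     0      1      0
    W1: takes ticket 0, waits for readers     0     1     0      1      0
    R2: writer queued, queues on ticket 1     0     1     1      1      1
    R1: unlocks, signals writers              0     1     1      0      1
    W1: ticket 0 == current, lock granted     0     1     1      0      1
    W2: takes ticket 1, waits                 0     2     1      0      1
    W1: unlocks, ++current starts read
        group 1 (queued -> active)            1     2     1      1      0
    R2: ticket 1 == current, lock granted     1     2     1      1      0
    W2: ticket 1 == current, but waits
        for readers_active to drain           1     2     1      1      0
    R2: unlocks, signals writers              1     2     1      0      0
    W2: lock granted, then unlocks            2     2     1      0      0

Note how R2 and W2 share ticket 1: readers in a group run before the writer holding the same ticket, which is why __wt_writelock waits for readers_active to reach zero as well as for its ticket to come up.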
diff --git a/src/support/thread_group.c b/src/support/thread_group.c
index 2b4b7ad4e61..090621fa138 100644
--- a/src/support/thread_group.c
+++ b/src/support/thread_group.c
@@ -257,7 +257,7 @@ __wt_thread_group_create(
__wt_verbose(session, WT_VERB_THREAD_GROUP,
"Creating thread group: %p", (void *)group);
- __wt_rwlock_init(session, &group->lock);
+ WT_RET(__wt_rwlock_init(session, &group->lock));
WT_ERR(__wt_cond_alloc(
session, "thread group cond", &group->wait_cond));
cond_alloced = true;
@@ -272,7 +272,7 @@ __wt_thread_group_create(
/* Cleanup on error to avoid leaking resources */
err: if (ret != 0) {
if (cond_alloced)
- WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
+ __wt_cond_destroy(session, &group->wait_cond);
__wt_rwlock_destroy(session, &group->lock);
}
return (ret);
@@ -297,7 +297,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group)
__wt_free(session, group->threads);
- WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
+ __wt_cond_destroy(session, &group->wait_cond);
__wt_rwlock_destroy(session, &group->lock);
/*
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 6eebf5ecf9f..ea7faa2e966 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
n = 0;
/* We're going to scan the table: wait for the lock. */
- __wt_readlock_spin(session, &txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
current_id = pinned_id = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -293,7 +293,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
/* First do a read-only scan. */
if (wait)
- __wt_readlock_spin(session, &txn_global->scan_rwlock);
+ __wt_readlock(session, &txn_global->scan_rwlock);
else if ((ret =
__wt_try_readlock(session, &txn_global->scan_rwlock)) != 0)
return (ret == EBUSY ? 0 : ret);
@@ -768,8 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
- __wt_rwlock_init(session, &txn_global->scan_rwlock);
- __wt_rwlock_init(session, &txn_global->nsnap_rwlock);
+ WT_RET(__wt_rwlock_init(session, &txn_global->scan_rwlock));
+ WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
TAILQ_INIT(&txn_global->nsnaph);
diff --git a/test/csuite/Makefile.am b/test/csuite/Makefile.am
index 10ab890f2f5..f2b4fcacdc8 100644
--- a/test/csuite/Makefile.am
+++ b/test/csuite/Makefile.am
@@ -57,6 +57,9 @@ noinst_PROGRAMS += test_wt3135_search_near_collator
test_wt3184_dup_index_collator_SOURCES = wt3184_dup_index_collator/main.c
noinst_PROGRAMS += test_wt3184_dup_index_collator
+test_rwlock_SOURCES = rwlock/main.c
+noinst_PROGRAMS += test_rwlock
+
# Run this during a "make check" smoke test.
TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
diff --git a/test/csuite/rwlock/main.c b/test/csuite/rwlock/main.c
new file mode 100644
index 00000000000..04813182478
--- /dev/null
+++ b/test/csuite/rwlock/main.c
@@ -0,0 +1,184 @@
+/*-
+ * Public Domain 2014-2017 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+/*
+ * JIRA ticket reference: HELP-4355
+ * Test rwlock collapse under load.
+ */
+#define MAX_THREADS 1000
+#define READS_PER_WRITE 10000
+//#define READS_PER_WRITE 1000000
+//#define READS_PER_WRITE 100
+
+#define CHECK_CORRECTNESS 1
+//#define USE_POSIX 1
+
+static WT_RWLOCK rwlock;
+static pthread_rwlock_t p_rwlock;
+static bool running;
+static uint64_t shared_counter;
+
+void *thread_rwlock(void *);
+void *thread_dump(void *);
+
+int
+main(int argc, char *argv[])
+{
+ TEST_OPTS *opts, _opts;
+ struct timespec te, ts;
+ pthread_t dump_id, id[MAX_THREADS];
+ int i;
+
+ if (!testutil_enable_long_tests()) /* Ignore unless requested */
+ return (EXIT_SUCCESS);
+
+ opts = &_opts;
+ memset(opts, 0, sizeof(*opts));
+ opts->nthreads = 100;
+ opts->nops = 1000000; /* per thread */
+ testutil_check(testutil_parse_opts(argc, argv, opts));
+ running = true;
+
+ testutil_make_work_dir(opts->home);
+ testutil_check(wiredtiger_open(opts->home, NULL,
+ "create,session_max=1000,statistics=(fast)", &opts->conn));
+
+ testutil_check(__wt_rwlock_init(NULL, &rwlock));
+ testutil_check(pthread_rwlock_init(&p_rwlock, NULL));
+
+ testutil_check(pthread_create(
+ &dump_id, NULL, thread_dump, (void *)opts));
+
+ __wt_epoch(NULL, &ts);
+ for (i = 0; i < (int)opts->nthreads; ++i)
+ testutil_check(pthread_create(
+ &id[i], NULL, thread_rwlock, (void *)opts));
+
+ while (--i >= 0)
+ testutil_check(pthread_join(id[i], NULL));
+ __wt_epoch(NULL, &te);
+ printf("%.2lf\n", WT_TIMEDIFF_MS(te, ts) / 1000.0);
+
+ running = false;
+ testutil_check(pthread_join(dump_id, NULL));
+
+ testutil_check(pthread_rwlock_destroy(&p_rwlock));
+ testutil_cleanup(opts);
+ return (EXIT_SUCCESS);
+}
+
+/*
+ * Acquire a rwlock, every Nth operation, acquire exclusive.
+ */
+void *
+thread_rwlock(void *arg)
+{
+ TEST_OPTS *opts;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+ uint64_t i, counter;
+ bool writelock;
+
+ opts = (TEST_OPTS *)arg;
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &wt_session));
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ printf("Running rwlock thread\n");
+ for (i = 1; i <= opts->nops; ++i) {
+ writelock = (i % READS_PER_WRITE == 0);
+
+#ifdef USE_POSIX
+ if (writelock)
+ testutil_check(pthread_rwlock_wrlock(&p_rwlock));
+ else
+ testutil_check(pthread_rwlock_rdlock(&p_rwlock));
+#else
+ if (writelock)
+ __wt_writelock(session, &rwlock);
+ else
+ __wt_readlock(session, &rwlock);
+#endif
+
+ /*
+ * Do a tiny amount of work inside the lock so the compiler
+ * can't optimize everything away.
+ */
+ (void)__wt_atomic_add64(&counter, 1);
+
+#ifdef CHECK_CORRECTNESS
+ if (writelock)
+ counter = ++shared_counter;
+ else
+ counter = shared_counter;
+
+ __wt_yield();
+
+ testutil_assert(counter == shared_counter);
+#endif
+
+#ifdef USE_POSIX
+ testutil_check(pthread_rwlock_unlock(&p_rwlock));
+#else
+ if (writelock)
+ __wt_writeunlock(session, &rwlock);
+ else
+ __wt_readunlock(session, &rwlock);
+#endif
+
+ if (i % 10000 == 0) {
+ printf("%s", session->id == 20 ? ".\n" : ".");
+ fflush(stdout);
+ }
+ }
+
+ opts->running = false;
+
+ return (NULL);
+}
+
+void *
+thread_dump(void *arg) {
+ WT_UNUSED(arg);
+
+ while (running) {
+ sleep(1);
+ printf("\n"
+ "rwlock { current %" PRIu8 ", next %" PRIu8
+ ", reader %" PRIu8 ", readers_active %" PRIu16
+ ", readers_queued %" PRIu16 " }\n",
+ rwlock.u.s.current,
+ rwlock.u.s.next,
+ rwlock.u.s.reader,
+ rwlock.u.s.readers_active,
+ rwlock.u.s.readers_queued);
+ }
+
+ return (NULL);
+}