Diffstat (limited to 'src')
38 files changed, 378 insertions, 316 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c index b9cc995f5a5..3319332aa04 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -440,7 +440,7 @@ __wt_async_destroy(WT_SESSION_IMPL *session) session, async->worker_tids[i])); async->worker_tids[i] = 0; } - WT_TRET(__wt_cond_destroy(session, &async->flush_cond)); + __wt_cond_destroy(session, &async->flush_cond); /* Close the server threads' sessions. */ for (i = 0; i < conn->async_workers; i++) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 2edcac76d0b..4b40d0aeed0 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) */ if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) - __wt_writelock(session, &page->page_lock); + WT_PAGE_LOCK(session, page); if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, @@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); return (ret); } diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 944e276fc01..f0aa632551b 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -369,7 +369,7 @@ __cursor_col_modify( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove) { return (__wt_col_modify(session, - cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove)); + cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove, false)); } /* @@ -381,7 +381,7 @@ __cursor_row_modify( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove) { return (__wt_row_modify(session, - cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove)); + cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove, false)); } /* diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d3f02e29b90..fdc33b608ec 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -689,8 +689,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean")); - WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked( - session, &page->page_lock) ? "locked" : "unlocked")); if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(ds->f(ds, ", keys-built")); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index bab7b8145d6..04c0a5d410d 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -98,7 +98,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock)); /* * If a root page split, there may be one or more pages linked from the @@ -254,6 +253,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_ovfl_discard_free(session, page); __wt_free(session, page->modify->ovfl_track); + __wt_spin_destroy(session, &page->modify->page_lock); __wt_free(session, page->modify); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index d76720b19ae..a0da7df0998 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -444,7 +444,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Initialize locks. 
*/ - __wt_rwlock_init(session, &btree->ovfl_lock); + WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock)); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 64874547b9c..ae1f8427b25 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -90,7 +90,7 @@ __col_instantiate(WT_SESSION_IMPL *session, { /* Search the page and add updates. */ WT_RET(__wt_col_search(session, recno, ref, cbt)); - WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false)); + WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false, false)); return (0); } @@ -104,7 +104,7 @@ __row_instantiate(WT_SESSION_IMPL *session, { /* Search the page and add updates. */ WT_RET(__wt_row_search(session, key, ref, cbt, true)); - WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false)); + WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false, false)); return (0); } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 49043c8bab4..627e6b9cb48 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1300,13 +1300,19 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; + /* + * The page will be marked dirty, and we can only lock a page + * with a modify structure. + */ + WT_RET(__wt_page_modify_init(session, parent)); + if (trylock) - WT_RET(__wt_try_writelock(session, &parent->page_lock)); + WT_RET(WT_PAGE_TRYLOCK(session, parent)); else - __wt_writelock(session, &parent->page_lock); + WT_PAGE_LOCK(session, parent); if (parent == ref->home) break; - __wt_writeunlock(session, &parent->page_lock); + WT_PAGE_UNLOCK(session, parent); } /* @@ -1329,7 +1335,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, *parentp = parent; return (0); -err: __wt_writeunlock(session, &parent->page_lock); +err: WT_PAGE_UNLOCK(session, parent); return (ret); } @@ -1345,7 +1351,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) if (hazard) ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref); - __wt_writeunlock(session, &parent->page_lock); + WT_PAGE_UNLOCK(session, parent); return (ret); } @@ -1559,7 +1565,7 @@ __split_multi_inmem( /* Apply the modification. */ WT_ERR(__wt_col_modify( - session, &cbt, recno, NULL, upd, false)); + session, &cbt, recno, NULL, upd, false, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ @@ -1581,7 +1587,7 @@ __split_multi_inmem( /* Apply the modification. */ WT_ERR(__wt_row_modify( - session, &cbt, key, NULL, upd, false)); + session, &cbt, key, NULL, upd, false, true)); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 9ccb9728189..8b758453288 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -17,7 +17,8 @@ static int __col_insert_alloc( */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) + uint64_t recno, WT_ITEM *value, + WT_UPDATE *upd_arg, bool is_remove, bool exclusive) { WT_BTREE *btree; WT_DECL_RET; @@ -103,7 +104,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Serialize the update. */ WT_ERR(__wt_update_serial( - session, page, &cbt->ins->upd, &upd, upd_size)); + session, page, &cbt->ins->upd, &upd, upd_size, false)); } else { /* Allocate the append/update list reference as necessary. 
*/ if (append) { @@ -185,11 +186,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, &cbt->recno, skipdepth)); + &ins, ins_size, &cbt->recno, skipdepth, exclusive)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, skipdepth)); + &ins, ins_size, skipdepth, exclusive)); } /* If the update was successful, add it to the in-memory log. */ diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index b1a81ca3d9f..6b66c4bcdc4 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -15,18 +15,12 @@ int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_CONNECTION_IMPL *conn; WT_PAGE_MODIFY *modify; - conn = S2C(session); - WT_RET(__wt_calloc_one(session, &modify)); - /* - * Select a spinlock for the page; let the barrier immediately below - * keep things from racing too badly. - */ - modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS; + /* Initialize the spinlock for the page. */ + WT_RET(__wt_spin_init(session, &modify->page_lock, "btree page")); /* * Multiple threads of control may be searching and deciding to modify @@ -47,7 +41,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) */ int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) + WT_ITEM *key, WT_ITEM *value, + WT_UPDATE *upd_arg, bool is_remove, bool exclusive) { WT_DECL_RET; WT_INSERT *ins; @@ -132,7 +127,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Serialize the update. */ WT_ERR(__wt_update_serial( - session, page, upd_entry, &upd, upd_size)); + session, page, upd_entry, &upd, upd_size, exclusive)); } else { /* * Allocate the insert array as necessary. @@ -207,7 +202,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Insert the WT_INSERT structure. 
*/ WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, - &ins, ins_size, skipdepth)); + &ins, ins_size, skipdepth, exclusive)); } if (logged) diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 28dd06332e0..12279709191 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -312,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) cache->bytes_dirty_intl + cache->bytes_dirty_leaf, cache->pages_dirty_intl + cache->pages_dirty_leaf); - WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); + __wt_cond_destroy(session, &cache->evict_cond); __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index ed078991581..3f7a770b762 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -225,7 +225,7 @@ err: __wt_spin_unlock(session, &__wt_process.spinlock); __wt_free(session, pool_name); if (ret != 0 && created) { __wt_free(session, cp->name); - WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond)); + __wt_cond_destroy(session, &cp->cache_pool_cond); __wt_free(session, cp); } return (ret); @@ -391,7 +391,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) __wt_free(session, cp->name); __wt_spin_destroy(session, &cp->cache_pool_lock); - WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond)); + __wt_cond_destroy(session, &cp->cache_pool_cond); __wt_free(session, cp); } diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 7797ed4421c..ac24c8196ca 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -231,7 +231,7 @@ __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->ckpt_tid)); conn->ckpt_tid_set = false; } - WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond)); + __wt_cond_destroy(session, &conn->ckpt_cond); /* Close the server thread's session. */ if (conn->ckpt_session != NULL) { diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 657cdebf7ee..41dcfb8fffb 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -52,7 +52,7 @@ __wt_conn_dhandle_alloc( WT_RET(__wt_calloc_one(session, &dhandle)); - __wt_rwlock_init(session, &dhandle->rwlock); + WT_ERR(__wt_rwlock_init(session, &dhandle->rwlock)); dhandle->name_hash = __wt_hash_city64(uri, strlen(uri)); WT_ERR(__wt_strdup(session, uri, &dhandle->name)); WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint)); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 287e9ca7b99..5e4a8c29adc 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -62,14 +62,9 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ - __wt_rwlock_init(session, &conn->dhandle_lock); - __wt_rwlock_init(session, &conn->hot_backup_lock); - __wt_rwlock_init(session, &conn->table_lock); - - WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); - for (i = 0; i < WT_PAGE_LOCKS; ++i) - WT_RET( - __wt_spin_init(session, &conn->page_lock[i], "btree page")); + WT_RET(__wt_rwlock_init(session, &conn->dhandle_lock)); + WT_RET(__wt_rwlock_init(session, &conn->hot_backup_lock)); + WT_RET(__wt_rwlock_init(session, &conn->table_lock)); /* Setup the spin locks for the LSM manager queues. 
*/ WT_RET(__wt_spin_init(session, @@ -113,7 +108,6 @@ void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) { WT_SESSION_IMPL *session; - u_int i; /* Check there's something to destroy. */ if (conn == NULL) @@ -144,9 +138,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->schema_lock); __wt_rwlock_destroy(session, &conn->table_lock); __wt_spin_destroy(session, &conn->turtle_lock); - for (i = 0; i < WT_PAGE_LOCKS; ++i) - __wt_spin_destroy(session, &conn->page_lock[i]); - __wt_free(session, conn->page_lock); /* Free allocated memory. */ __wt_free(session, conn->cfg); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index b8b5bd2a908..d2ed314fd2e 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -880,7 +880,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN")); - __wt_rwlock_init(session, &log->log_archive_lock); + WT_RET(__wt_rwlock_init(session, &log->log_archive_lock)); if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) log->allocsize = (uint32_t) WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN); @@ -1043,12 +1043,12 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); - WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + __wt_cond_destroy(session, &conn->log_cond); + __wt_cond_destroy(session, &conn->log_file_cond); + __wt_cond_destroy(session, &conn->log_wrlsn_cond); - WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); - WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); + __wt_cond_destroy(session, &conn->log->log_sync_cond); + __wt_cond_destroy(session, &conn->log->log_write_cond); __wt_rwlock_destroy(session, &conn->log->log_archive_lock); __wt_spin_destroy(session, &conn->log->log_lock); __wt_spin_destroy(session, &conn->log->log_slot_lock); diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index d89392b66c6..3234d51cd4c 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -648,7 +648,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close) WT_TRET(__wt_thread_join(session, conn->stat_tid)); conn->stat_tid_set = false; } - WT_TRET(__wt_cond_destroy(session, &conn->stat_cond)); + __wt_cond_destroy(session, &conn->stat_cond); /* Log a set of statistics on shutdown if configured. */ if (is_close) diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 22d90b08438..e273f1d08e5 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -432,7 +432,7 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->sweep_tid)); conn->sweep_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond)); + __wt_cond_destroy(session, &conn->sweep_cond); if (conn->sweep_session != NULL) { wt_session = &conn->sweep_session->iface; diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 205afb607c3..bc54f10f2d6 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -486,7 +486,14 @@ __curfile_create(WT_SESSION_IMPL *session, WT_STAT_DATA_INCR(session, cursor_create); if (0) { -err: WT_TRET(__curfile_close(cursor)); +err: /* + * Our caller expects to release the data handle if we fail. + * Disconnect it from the cursor before closing. 
+ */ + if (session->dhandle != NULL) + __wt_cursor_dhandle_decr_use(session); + cbt->btree = NULL; + WT_TRET(__curfile_close(cursor)); *cursorp = NULL; } diff --git a/src/include/btmem.h b/src/include/btmem.h index b1d5df4e9d2..d0b21b17965 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -414,18 +414,20 @@ struct __wt_page_modify { size_t discard_allocated; } *ovfl_track; +#define WT_PAGE_LOCK(s, p) \ + __wt_spin_lock((s), &(p)->modify->page_lock) +#define WT_PAGE_TRYLOCK(s, p) \ + __wt_spin_trylock((s), &(p)->modify->page_lock) +#define WT_PAGE_UNLOCK(s, p) \ + __wt_spin_unlock((s), &(p)->modify->page_lock) + WT_SPINLOCK page_lock; /* Page's spinlock */ + /* * The write generation is incremented when a page is modified, a page * is clean if the write generation is 0. */ uint32_t write_gen; -#define WT_PAGE_LOCK(s, p) \ - __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock]) -#define WT_PAGE_UNLOCK(s, p) \ - __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock]) - uint8_t page_lock; /* Page's spinlock */ - #define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 3 /* Reconciliation: single block */ @@ -603,13 +605,6 @@ struct __wt_page { uint8_t unused[2]; /* Unused padding */ /* - * Used to protect and co-ordinate splits for internal pages and - * reconciliation for all pages. Only used to co-ordinate among the - * uncommon cases that require exclusive access to a page. - */ - WT_RWLOCK page_lock; - - /* * The page's read generation acts as an LRU value for each page in the * tree; it is used by the eviction server thread to select pages to be * discarded from the in-memory tree. @@ -635,8 +630,6 @@ struct __wt_page { #define WT_READGEN_STEP 100 uint64_t read_gen; - uint64_t evict_pass_gen; /* Eviction pass generation */ - size_t memory_footprint; /* Memory attached to the page */ /* Page's on-disk representation: NULL for pages created in memory. */ @@ -644,6 +637,10 @@ struct __wt_page { /* If/when the page is modified, we need lots more information. */ WT_PAGE_MODIFY *modify; + + /* This is the 64 byte boundary, try to keep hot fields above here. */ + + uint64_t evict_pass_gen; /* Eviction pass generation */ }; /* diff --git a/src/include/connection.h b/src/include/connection.h index 6c23492e926..f74732684f5 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -175,21 +175,6 @@ struct __wt_connection_impl { WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ WT_RWLOCK dhandle_lock; /* Data handle list lock */ - /* - * We distribute the btree page locks across a set of spin locks. Don't - * use too many: they are only held for very short operations, each one - * is 64 bytes, so 256 will fill the L1 cache on most CPUs. - * - * Use a prime number of buckets rather than assuming a good hash - * (Reference Sedgewick, Algorithms in C, "Hash Functions"). - * - * Note: this can't be an array, we impose cache-line alignment and gcc - * doesn't support that for arrays smaller than the alignment. 
- */ -#define WT_PAGE_LOCKS 17 - WT_SPINLOCK *page_lock; /* Btree page spinlocks */ - u_int page_lock_cnt; /* Next spinlock to use */ - /* Connection queue */ TAILQ_ENTRY(__wt_connection_impl) q; /* Cache pool queue */ diff --git a/src/include/extern.h b/src/include/extern.h index 0cfc284b313..bf3279d0f94 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -182,7 +182,7 @@ extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *b extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -191,7 +191,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); @@ -674,10 +674,9 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg); extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol); extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l); +extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index 3afffef687b..c0ed056c7b6 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -15,7 +15,7 @@ extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *ma extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); -extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_absolute_path(const char *path); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 4e232a2df80..d548ee0b2ec 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -13,7 +13,7 @@ extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, v extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); -extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_absolute_path(const char *path); diff --git a/src/include/mutex.h b/src/include/mutex.h index 910eb7af5b9..36acea810d9 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -37,17 +37,21 @@ struct __wt_condvar { * Don't modify this structure without understanding the read/write locking * functions. 
*/ -union __wt_rwlock { /* Read/write lock */ - uint64_t u; - struct { - uint32_t wr; /* Writers and readers */ - } i; - struct { - uint16_t writers; /* Now serving for writers */ - uint16_t readers; /* Now serving for readers */ - uint16_t next; /* Next available ticket number */ - uint16_t writers_active;/* Count of active writers */ - } s; +struct __wt_rwlock { /* Read/write lock */ + volatile union { + uint64_t v; /* Full 64-bit value */ + struct { + uint8_t current; /* Current ticket */ + uint8_t next; /* Next available ticket */ + uint8_t reader; /* Read queue ticket */ + uint8_t __notused; /* Padding */ + uint16_t readers_active;/* Count of active readers */ + uint16_t readers_queued;/* Count of queued readers */ + } s; + } u; + + WT_CONDVAR *cond_readers; /* Blocking readers */ + WT_CONDVAR *cond_writers; /* Blocking writers */ }; /* @@ -63,8 +67,8 @@ union __wt_rwlock { /* Read/write lock */ #define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 3 struct __wt_spinlock { - WT_CACHE_LINE_PAD_BEGIN #if SPINLOCK_TYPE == SPINLOCK_GCC + WT_CACHE_LINE_PAD_BEGIN volatile int lock; #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ @@ -87,5 +91,8 @@ struct __wt_spinlock { int16_t stat_int_usecs_off; /* waiting server threads offset */ int8_t initialized; /* Lock initialized, for cleanup */ + +#if SPINLOCK_TYPE == SPINLOCK_GCC WT_CACHE_LINE_PAD_END +#endif }; diff --git a/src/include/serial.i b/src/include/serial.i index 982f196b0b8..0134e1a9c20 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -154,7 +154,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, static inline int __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp, - size_t new_ins_size, uint64_t *recnop, u_int skipdepth) + size_t new_ins_size, uint64_t *recnop, u_int skipdepth, bool exclusive) { WT_INSERT *new_ins = *new_insp; WT_DECL_RET; @@ -165,11 +165,16 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, /* Clear references to memory we now own and must free on error. */ *new_insp = NULL; - /* Acquire the page's spinlock, call the worker function. */ - WT_PAGE_LOCK(session, page); + /* + * Acquire the page's spinlock unless we already have exclusive access. + * Then call the worker function. + */ + if (!exclusive) + WT_PAGE_LOCK(session, page); ret = __col_append_serial_func( session, ins_head, ins_stack, new_ins, recnop, skipdepth); - WT_PAGE_UNLOCK(session, page); + if (!exclusive) + WT_PAGE_UNLOCK(session, page); if (ret != 0) { /* Free unused memory on error. 
*/ @@ -198,7 +203,7 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, static inline int __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp, - size_t new_ins_size, u_int skipdepth) + size_t new_ins_size, u_int skipdepth, bool exclusive) { WT_INSERT *new_ins = *new_insp; WT_DECL_RET; @@ -220,10 +225,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, ret = __insert_simple_func( session, ins_stack, new_ins, skipdepth); else { - WT_PAGE_LOCK(session, page); + if (!exclusive) + WT_PAGE_LOCK(session, page); ret = __insert_serial_func( session, ins_head, ins_stack, new_ins, skipdepth); - WT_PAGE_UNLOCK(session, page); + if (!exclusive) + WT_PAGE_UNLOCK(session, page); } if (ret != 0) { @@ -252,7 +259,8 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, */ static inline int __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, - WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size) + WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size, + bool exclusive) { WT_DECL_RET; WT_UPDATE *obsolete, *upd = *updp; @@ -295,7 +303,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, /* * If there are no subsequent WT_UPDATE structures we are done here. */ - if (upd->next == NULL) + if (upd->next == NULL || exclusive) return (0); /* @@ -316,11 +324,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, } /* If we can't lock it, don't scan, that's okay. */ - if (__wt_try_writelock(session, &page->page_lock) != 0) + if (WT_PAGE_TRYLOCK(session, page) != 0) return (0); obsolete = __wt_update_obsolete_check(session, page, upd->next); - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); if (obsolete != NULL) __wt_update_obsolete_free(session, page, obsolete); diff --git a/src/include/session.h b/src/include/session.h index 674e92671b1..1b2dfd1ed2b 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -97,6 +97,10 @@ struct __wt_session_impl { */ TAILQ_HEAD(__tables, __wt_table) tables; + /* Current rwlock for callback. */ + WT_RWLOCK *current_rwlock; + uint8_t current_rwticket; + WT_ITEM **scratch; /* Temporary memory for any function */ u_int scratch_alloc; /* Currently allocated */ size_t scratch_cached; /* Scratch bytes cached */ diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 640f5e4cf5f..a657b9ac460 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -59,7 +59,6 @@ __wt_verify_build(void) sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \ sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0) WT_PADDING_CHECK(WT_LOGSLOT); - WT_PADDING_CHECK(WT_SPINLOCK); WT_PADDING_CHECK(WT_TXN_STATE); /* diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index da318ad8a86..cf79578985b 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -268,6 +268,8 @@ struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_row; typedef struct __wt_row WT_ROW; +struct __wt_rwlock; + typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; struct __wt_save_upd; @@ -302,8 +304,6 @@ union __wt_lsn; typedef union __wt_lsn WT_LSN; union __wt_rand_state; typedef union __wt_rand_state WT_RAND_STATE; -union __wt_rwlock; - typedef union __wt_rwlock WT_RWLOCK; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. 
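The btmem.h, serial.i, col_modify.c and row_modify.c changes above move the page lock from the connection-wide array of spinlocks into the page's own WT_PAGE_MODIFY structure, and thread an "exclusive" flag through the serialized-update paths so callers that already hold the page exclusively (for example, the split code rebuilding pages in __split_multi_inmem) can skip the lock. Below is a minimal standalone sketch of that pattern, not WiredTiger code: pthread_mutex_t stands in for WT_SPINLOCK, and the names page_modify, PAGE_LOCK and update_serial are illustrative only.

/*
 * Sketch: a per-structure lock plus an "exclusive" flag that lets a caller
 * already holding the structure exclusively skip the lock/unlock pair.
 */
#include <pthread.h>
#include <stdbool.h>

struct page_modify {
	pthread_mutex_t page_lock;	/* Per-page lock, like WT_PAGE_MODIFY */
	unsigned long write_gen;	/* Data the lock protects */
};

#define PAGE_LOCK(m)	pthread_mutex_lock(&(m)->page_lock)
#define PAGE_TRYLOCK(m)	pthread_mutex_trylock(&(m)->page_lock)
#define PAGE_UNLOCK(m)	pthread_mutex_unlock(&(m)->page_lock)

/* Serialized update: take the lock only if the caller isn't exclusive. */
static void
update_serial(struct page_modify *mod, bool exclusive)
{
	if (!exclusive)
		PAGE_LOCK(mod);
	++mod->write_gen;		/* The serialized work. */
	if (!exclusive)
		PAGE_UNLOCK(mod);
}

Embedding the lock in WT_PAGE_MODIFY also removes the WT_PAGE_LOCKS hashing scheme deleted from connection.h and conn_handle.c, at the cost of one spinlock per modified page (initialized in __wt_page_modify_alloc, destroyed in __free_page_modify).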
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index e33e119aa41..b7d9086d10e 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -334,7 +334,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) __wt_spin_destroy(session, &manager->switch_lock); __wt_spin_destroy(session, &manager->app_lock); __wt_spin_destroy(session, &manager->manager_lock); - WT_TRET(__wt_cond_destroy(session, &manager->work_cond)); + __wt_cond_destroy(session, &manager->work_cond); return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index a9275976023..77600c1bf34 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -471,7 +471,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Try to open the tree. */ WT_RET(__wt_calloc_one(session, &lsm_tree)); - __wt_rwlock_init(session, &lsm_tree->rwlock); + WT_RET(__wt_rwlock_init(session, &lsm_tree->rwlock)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index a5ee78f9e3e..fe010b62305 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -153,7 +153,7 @@ err: * __wt_cond_destroy -- * Destroy a condition variable. */ -int +void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) { WT_CONDVAR *cond; @@ -161,11 +161,15 @@ __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) cond = *condp; if (cond == NULL) - return (0); + return; - ret = pthread_cond_destroy(&cond->cond); - WT_TRET(pthread_mutex_destroy(&cond->mtx)); - __wt_free(session, *condp); + if ((ret = pthread_cond_destroy(&cond->cond)) != 0) + WT_PANIC_MSG( + session, ret, "pthread_cond_destroy: %s", cond->name); - return (ret); + if ((ret = pthread_mutex_destroy(&cond->mtx)) != 0) + WT_PANIC_MSG( + session, ret, "pthread_mutex_destroy: %s", cond->name); + + __wt_free(session, *condp); } diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 0001c6c2322..c1b9f509d33 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -163,18 +163,16 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) * __wt_cond_destroy -- * Destroy a condition variable. */ -int +void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) { WT_CONDVAR *cond; cond = *condp; if (cond == NULL) - return (0); + return; /* Do nothing to delete Condition Variable */ DeleteCriticalSection(&cond->mtx); __wt_free(session, *condp); - - return (0); } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6f95b84d292..e59d9796352 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -386,7 +386,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ - __wt_writelock(session, &page->page_lock); + WT_PAGE_LOCK(session, page); oldest_id = __wt_txn_oldest_id(session); if (LF_ISSET(WT_EVICTING)) @@ -405,7 +405,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Initialize the reconciliation structure for each new run. */ if ((ret = __rec_write_init( session, ref, flags, salvage, &session->reconcile)) != 0) { - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); return (ret); } r = session->reconcile; @@ -446,7 +446,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(__rec_write_wrapup_err(session, r, page)); /* Release the reconciliation lock. 
*/ - __wt_writeunlock(session, &page->page_lock); + WT_PAGE_UNLOCK(session, page); /* * If our caller can configure lookaside table reconciliation, flag if diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 95fb6a6f90e..9abc7a54f5d 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -261,8 +261,8 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) * can get a handle without special flags. */ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) { - __session_find_dhandle(session, - dhandle->name, dhandle->checkpoint, &dhandle_cache); + WT_SAVE_DHANDLE(session, __session_find_dhandle(session, + dhandle->name, dhandle->checkpoint, &dhandle_cache)); if (dhandle_cache != NULL) __session_discard_dhandle(session, dhandle_cache); } diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c index 35ad5da23f2..b2ab32bdef1 100644 --- a/src/support/mtx_rw.c +++ b/src/support/mtx_rw.c @@ -27,7 +27,7 @@ */ /* - * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst: + * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst: * http://locklessinc.com/articles/locks/ * * Dr. Fuerst further credits: @@ -39,77 +39,46 @@ * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable * Reader-Writer Synchronization for Shared-Memory Multiprocessors". * - * The following is an explanation of this code. First, the underlying lock - * structure. + * The following is an explanation of our interpretation and implementation. + * First, the underlying lock structure. * + * volatile union { + * uint64_t v; // Full 64-bit value * struct { - * uint16_t writers; Now serving for writers - * uint16_t readers; Now serving for readers - * uint16_t next; Next available ticket number - * uint16_t __notused; Padding - * } + * uint8_t current; // Current ticket + * uint8_t next; // Next available ticket + * uint8_t reader; // Read queue ticket + * uint8_t __notused; // Padding + * uint16_t readers_active; // Count of active readers + * uint16_t readers_queued; // Count of queued readers + * } s; + * } u; * * First, imagine a store's 'take a number' ticket algorithm. A customer takes * a unique ticket number and customers are served in ticket order. In the data - * structure, 'writers' is the next writer to be served, 'readers' is the next - * reader to be served, and 'next' is the next available ticket number. + * structure, 'next' is the ticket that will be allocated next, and 'current' + * is the ticket being served. * - * Next, consider exclusive (write) locks. The 'now serving' number for writers - * is 'writers'. To lock, 'take a number' and wait until that number is being - * served; more specifically, atomically copy and increment the current value of - * 'next', and then wait until 'writers' equals that copied number. + * Next, consider exclusive (write) locks. To lock, 'take a number' and wait + * until that number is being served; more specifically, atomically increment + * 'next', and then wait until 'current' equals that allocated ticket. * - * Shared (read) locks are similar. Like writers, readers atomically get the - * next number available. However, instead of waiting for 'writers' to equal - * their number, they wait for 'readers' to equal their number. + * Shared (read) locks are similar, except that readers can share a ticket + * (both with each other and with a single writer). Readers with a given + * ticket execute before the writer with that ticket. 
In other words, writers + * wait for both their ticket to become current and for all readers to exit + * the lock. * - * This has the effect of queuing lock requests in the order they arrive - * (incidentally avoiding starvation). + * If there are no active writers (indicated by 'current' == 'next'), readers + * can immediately enter the lock by atomically incrementing 'readers_active'. + * When there are writers active, readers form a new queue by first setting + * 'reader' to 'next' (i.e. readers are scheduled after any queued writers, + * avoiding starvation), then atomically incrementing 'readers_queued'. * - * Each lock/unlock pair requires incrementing both 'readers' and 'writers'. - * In the case of a reader, the 'readers' increment happens when the reader - * acquires the lock (to allow read-lock sharing), and the 'writers' increment - * happens when the reader releases the lock. In the case of a writer, both - * 'readers' and 'writers' are incremented when the writer releases the lock. - * - * For example, consider the following read (R) and write (W) lock requests: - * - * writers readers next - * 0 0 0 - * R: ticket 0, readers match OK 0 1 1 - * R: ticket 1, readers match OK 0 2 2 - * R: ticket 2, readers match OK 0 3 3 - * W: ticket 3, writers no match block 0 3 4 - * R: ticket 2, unlock 1 3 4 - * R: ticket 0, unlock 2 3 4 - * R: ticket 1, unlock 3 3 4 - * W: ticket 3, writers match OK 3 3 4 - * - * Note the writer blocks until 'writers' equals its ticket number and it does - * not matter if readers unlock in order or not. - * - * Readers or writers entering the system after the write lock is queued block, - * and the next ticket holder (reader or writer) will unblock when the writer - * unlocks. An example, continuing from the last line of the above example: - * - * writers readers next - * W: ticket 3, writers match OK 3 3 4 - * R: ticket 4, readers no match block 3 3 5 - * R: ticket 5, readers no match block 3 3 6 - * W: ticket 6, writers no match block 3 3 7 - * W: ticket 3, unlock 4 4 7 - * R: ticket 4, readers match OK 4 5 7 - * R: ticket 5, readers match OK 4 6 7 - * - * The 'next' field is a 2-byte value so the available ticket number wraps at - * 64K requests. If a thread's lock request is not granted until the 'next' - * field cycles and the same ticket is taken by another thread, we could grant - * a lock to two separate threads at the same time, and bad things happen: two - * writer threads or a reader thread and a writer thread would run in parallel, - * and lock waiters could be skipped if the unlocks race. This is unlikely, it - * only happens if a lock request is blocked by 64K other requests. The fix is - * to grow the lock structure fields, but the largest atomic instruction we have - * is 8 bytes, the structure has no room to grow. + * The 'next' field is a 1-byte value so the available ticket number wraps + * after 256 requests. If a thread's write lock request would cause the 'next' + * field to catch up with 'current', instead it waits to avoid the same ticket + * being allocated to multiple threads. */ #include "wt_internal.h" @@ -118,12 +87,14 @@ * __wt_rwlock_init -- * Initialize a read/write lock. 
*/ -void +int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_UNUSED(session); + l->u.v = 0; - l->u = 0; + WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers)); + WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers)); + return (0); } /* @@ -133,9 +104,10 @@ __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_UNUSED(session); + l->u.v = 0; - l->u = 0; + __wt_cond_destroy(session, &l->cond_readers); + __wt_cond_destroy(session, &l->cond_writers); } /* @@ -149,46 +121,35 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_STAT_CONN_INCR(session, rwlock_read); - new = old = *l; + new.u.v = old.u.v = l->u.v; /* - * This read lock can only be granted if the lock was last granted to - * a reader and there are no readers or writers blocked on the lock, - * that is, if this thread's ticket would be the next ticket granted. - * Do the cheap test to see if this can possibly succeed (and confirm - * the lock is in the correct state to grant this read lock). + * This read lock can only be granted if there are no active writers. + * + * Also check for overflow in case there are 64K active readers. */ - if (old.s.readers != old.s.next) + if (old.u.s.current != old.u.s.next || + new.u.s.readers_active == UINT16_MAX) return (EBUSY); /* - * The replacement lock value is a result of allocating a new ticket and - * incrementing the reader value to match it. + * The replacement lock value is a result of adding an active reader. + * + * We rely on this atomic operation to provide a barrier. */ - new.s.readers = new.s.next = old.s.next + 1; - return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY); + new.u.s.readers_active++; + return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY); } /* - * __wt_readlock_spin -- - * Spin to get a read lock: only yield the CPU if the lock is held - * exclusive. + * __read_blocked -- + * Check whether the current read lock request should keep waiting. */ -void -__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) +static bool +__read_blocked(WT_SESSION_IMPL *session) { - /* - * Try to get the lock in a single operation if it is available to - * readers. This avoids the situation where multiple readers arrive - * concurrently and have to line up in order to enter the lock. For - * read-heavy workloads it can make a significant difference. - */ - while (__wt_try_readlock(session, l) != 0) { - if (l->s.writers_active > 0) - __wt_yield(); - else - WT_PAUSE(); - } + return (session->current_rwticket != + session->current_rwlock->u.s.current); } /* @@ -198,41 +159,90 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - uint16_t ticket; + WT_RWLOCK new, old; int pause_cnt; + int16_t writers_active; + uint8_t ticket; WT_STAT_CONN_INCR(session, rwlock_read); WT_DIAGNOSTIC_YIELD; - /* - * Possibly wrap: if we have more than 64K lockers waiting, the ticket - * value will wrap and two lockers will simultaneously be granted the - * lock. - */ - ticket = __wt_atomic_fetch_add16(&l->s.next, 1); - for (pause_cnt = 0; ticket != l->s.readers;) { + for (;;) { /* - * We failed to get the lock; pause before retrying and if we've - * paused enough, yield so we don't burn CPU to no purpose. This - * situation happens if there are more threads than cores in the - * system and we're thrashing on shared resources. + * Fast path: if there is no active writer, join the current + * group. 
*/ - if (++pause_cnt < WT_THOUSAND) + for (old.u.v = l->u.v; + old.u.s.current == old.u.s.next; + old.u.v = l->u.v) { + new.u.v = old.u.v; + /* + * Check for overflow: if the maximum number of readers + * are already active, wait to try again. + */ + if (++new.u.s.readers_active == 0) + goto stall; + if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v)) + return; WT_PAUSE(); - else + } + + /* + * There is an active writer: join the next group. + * + * Limit how many readers can queue: don't allow more readers + * to queue than there are active writers (calculated as + * `next - current`): otherwise, in write-heavy workloads, + * readers can keep queuing up in front of writers and + * throughput is unstable. + * + * If the maximum number of readers are already queued, wait + * until we can get a valid ticket. + */ + writers_active = old.u.s.next - old.u.s.current; + if (old.u.s.readers_queued > writers_active) { +stall: __wt_cond_wait( + session, l->cond_readers, WT_THOUSAND, NULL); + continue; + } + + /* + * If we are the first reader to queue, set the next read + * group. Note: don't re-read from the lock or we could race + * with a writer unlocking. + */ + new.u.v = old.u.v; + if (new.u.s.readers_queued++ == 0) + new.u.s.reader = new.u.s.next; + ticket = new.u.s.reader; + + if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v)) + break; + } + + /* Wait for our group to start. */ + for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) { + if (pause_cnt < 1000) + WT_PAUSE(); + else if (pause_cnt < 1200) __wt_yield(); + else { + session->current_rwlock = l; + session->current_rwticket = ticket; + __wt_cond_wait( + session, l->cond_readers, 0, __read_blocked); + } } - /* - * We're the only writer of the readers field, so the update does not - * need to be atomic. - */ - ++l->s.readers; + WT_ASSERT(session, l->u.s.readers_active > 0); /* * Applications depend on a barrier here so that operations holding the - * lock see consistent data. + * lock see consistent data. The atomic operation above isn't + * sufficient here because we don't own the lock until our ticket comes + * up and whatever data we are protecting may have changed in the + * meantime. */ WT_READ_BARRIER(); } @@ -244,13 +254,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_UNUSED(session); + WT_RWLOCK new, old; - /* - * Increment the writers value (other readers are doing the same, make - * sure we don't race). - */ - (void)__wt_atomic_add16(&l->s.writers, 1); + do { + old.u.v = l->u.v; + WT_ASSERT(session, old.u.s.readers_active > 0); + + /* + * Decrement the active reader count (other readers are doing + * the same, make sure we don't race). + */ + new.u.v = old.u.v; + --new.u.s.readers_active; + } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v)); + + if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next) + __wt_cond_signal(session, l->cond_writers); } /* @@ -264,22 +283,44 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_STAT_CONN_INCR(session, rwlock_write); - old = new = *l; - /* - * This write lock can only be granted if the lock was last granted to - * a writer and there are no readers or writers blocked on the lock, - * that is, if this thread's ticket would be the next ticket granted. - * Do the cheap test to see if this can possibly succeed (and confirm - * the lock is in the correct state to grant this write lock). 
+ * This write lock can only be granted if no readers or writers blocked + * on the lock, that is, if this thread's ticket would be the next + * ticket granted. Check if this can possibly succeed (and confirm the + * lock is in the correct state to grant this write lock). */ - if (old.s.writers != old.s.next) + old.u.v = l->u.v; + if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0) return (EBUSY); - /* The replacement lock value is a result of allocating a new ticket. */ - ++new.s.next; - ++new.s.writers_active; - return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY); + /* + * We've checked above that there is no writer active (since + * `current == next`), so there should be no readers queued. + */ + WT_ASSERT(session, old.u.s.readers_queued == 0); + + /* + * The replacement lock value is a result of allocating a new ticket. + * + * We rely on this atomic operation to provide a barrier. + */ + new.u.v = old.u.v; + new.u.s.next++; + return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY); +} + +/* + * __write_blocked -- + * Check whether the current write lock request should keep waiting. + */ +static bool +__write_blocked(WT_SESSION_IMPL *session) +{ + WT_RWLOCK *l; + + l = session->current_rwlock; + return (session->current_rwticket != l->u.s.current || + l->u.s.readers_active != 0); } /* @@ -289,34 +330,51 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - uint16_t ticket; + WT_RWLOCK new, old; int pause_cnt; + uint8_t ticket; WT_STAT_CONN_INCR(session, rwlock_write); - /* - * Possibly wrap: if we have more than 64K lockers waiting, the ticket - * value will wrap and two lockers will simultaneously be granted the - * lock. - */ - ticket = __wt_atomic_fetch_add16(&l->s.next, 1); - (void)__wt_atomic_add16(&l->s.writers_active, 1); - for (pause_cnt = 0; ticket != l->s.writers;) { + for (;;) { + new.u.v = old.u.v = l->u.v; + ticket = new.u.s.next++; + /* - * We failed to get the lock; pause before retrying and if we've - * paused enough, sleep so we don't burn CPU to no purpose. This - * situation happens if there are more threads than cores in the - * system and we're thrashing on shared resources. + * Avoid wrapping: if we allocate more than 256 tickets, two + * lockers will simultaneously be granted the lock. */ - if (++pause_cnt < WT_THOUSAND) + if (new.u.s.current == new.u.s.next) { + __wt_cond_wait( + session, l->cond_writers, WT_THOUSAND, NULL); + continue; + } + if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v)) + break; + } + + /* Wait for our group to start and any readers to drain. */ + for (pause_cnt = 0; + ticket != l->u.s.current || l->u.s.readers_active != 0; + pause_cnt++) { + if (pause_cnt < 1000) WT_PAUSE(); - else - __wt_sleep(0, 10); + else if (pause_cnt < 1200) + __wt_yield(); + else { + session->current_rwlock = l; + session->current_rwticket = ticket; + __wt_cond_wait( + session, l->cond_writers, 0, __write_blocked); + } } /* * Applications depend on a barrier here so that operations holding the - * lock see consistent data. + * lock see consistent data. The atomic operation above isn't + * sufficient here because we don't own the lock until our ticket comes + * up and whatever data we are protecting may have changed in the + * meantime. 
*/ WT_READ_BARRIER(); } @@ -328,29 +386,34 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_RWLOCK new; - - WT_UNUSED(session); + WT_RWLOCK new, old; - (void)__wt_atomic_sub16(&l->s.writers_active, 1); + do { + new.u.v = old.u.v = l->u.v; - /* - * Ensure that all updates made while the lock was held are visible to - * the next thread to acquire the lock. - */ - WT_WRITE_BARRIER(); + /* + * We're holding the lock exclusive, there shouldn't be any + * active readers. + */ + WT_ASSERT(session, old.u.s.readers_active == 0); - new = *l; + /* + * Allow the next batch to start. + * + * If there are readers in the next group, swap queued readers + * to active: this could race with new readlock requests, so we + * have to spin. + */ + if (++new.u.s.current == new.u.s.reader) { + new.u.s.readers_active = new.u.s.readers_queued; + new.u.s.readers_queued = 0; + } + } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v)); - /* - * We're the only writer of the writers/readers fields, so the update - * does not need to be atomic; we have to update both values at the - * same time though, otherwise we'd potentially race with the thread - * next granted the lock. - */ - ++new.s.writers; - ++new.s.readers; - l->i.wr = new.i.wr; + if (new.u.s.readers_active != 0) + __wt_cond_signal(session, l->cond_readers); + else if (new.u.s.current != new.u.s.next) + __wt_cond_signal(session, l->cond_writers); WT_DIAGNOSTIC_YIELD; } @@ -365,6 +428,6 @@ __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) { WT_UNUSED(session); - return (l->s.writers != l->s.next || l->s.readers != l->s.next); + return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0); } #endif diff --git a/src/support/thread_group.c b/src/support/thread_group.c index 2b4b7ad4e61..090621fa138 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -257,7 +257,7 @@ __wt_thread_group_create( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Creating thread group: %p", (void *)group); - __wt_rwlock_init(session, &group->lock); + WT_RET(__wt_rwlock_init(session, &group->lock)); WT_ERR(__wt_cond_alloc( session, "thread group cond", &group->wait_cond)); cond_alloced = true; @@ -272,7 +272,7 @@ __wt_thread_group_create( /* Cleanup on error to avoid leaking resources */ err: if (ret != 0) { if (cond_alloced) - WT_TRET(__wt_cond_destroy(session, &group->wait_cond)); + __wt_cond_destroy(session, &group->wait_cond); __wt_rwlock_destroy(session, &group->lock); } return (ret); @@ -297,7 +297,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) __wt_free(session, group->threads); - WT_TRET(__wt_cond_destroy(session, &group->wait_cond)); + __wt_cond_destroy(session, &group->wait_cond); __wt_rwlock_destroy(session, &group->lock); /* diff --git a/src/txn/txn.c b/src/txn/txn.c index 6eebf5ecf9f..ea7faa2e966 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) n = 0; /* We're going to scan the table: wait for the lock. */ - __wt_readlock_spin(session, &txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -293,7 +293,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* First do a read-only scan. 
*/ if (wait) - __wt_readlock_spin(session, &txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); else if ((ret = __wt_try_readlock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); @@ -768,8 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); - __wt_rwlock_init(session, &txn_global->scan_rwlock); - __wt_rwlock_init(session, &txn_global->nsnap_rwlock); + WT_RET(__wt_rwlock_init(session, &txn_global->scan_rwlock)); + WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock)); txn_global->nsnap_oldest_id = WT_TXN_NONE; TAILQ_INIT(&txn_global->nsnaph);
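The rewritten lock in mtx_rw.c is easiest to follow from its fast paths. The sketch below is standalone and not the WiredTiger sources: it shows the 64-bit ticket layout and the two try-lock checks, while the queued-reader bookkeeping and the condition-variable wait paths of __wt_readlock and __wt_writelock are omitted. GCC/Clang __atomic builtins stand in for __wt_atomic_casv64, and the names rwlock, cas64, try_readlock and try_writelock are illustrative only.

/*
 * Sketch of the ticket-based read/write lock layout: 'current' is the ticket
 * being served, 'next' is the next ticket to hand out, and readers share a
 * ticket, counted in 'readers_active'.
 */
#include <stdbool.h>
#include <stdint.h>

struct rwlock {
	volatile union {
		uint64_t v;			/* Whole 64-bit value */
		struct {
			uint8_t current;	/* Ticket being served */
			uint8_t next;		/* Next available ticket */
			uint8_t reader;		/* Read-group ticket */
			uint8_t unused;		/* Padding */
			uint16_t readers_active;/* Readers holding the lock */
			uint16_t readers_queued;/* Readers waiting */
		} s;
	} u;
};

static bool
cas64(volatile uint64_t *p, uint64_t oldv, uint64_t newv)
{
	return (__atomic_compare_exchange_n(
	    p, &oldv, newv, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

/* Grant a shared lock only if no writer is active or queued. */
static int
try_readlock(struct rwlock *l)
{
	struct rwlock newl, old;

	newl.u.v = old.u.v = l->u.v;
	if (old.u.s.current != old.u.s.next ||		/* Writer present */
	    old.u.s.readers_active == UINT16_MAX)	/* Counter full */
		return (-1);				/* EBUSY upstream */
	newl.u.s.readers_active++;
	return (cas64(&l->u.v, old.u.v, newl.u.v) ? 0 : -1);
}

/* Grant an exclusive lock only if the lock is completely idle. */
static int
try_writelock(struct rwlock *l)
{
	struct rwlock newl, old;

	newl.u.v = old.u.v = l->u.v;
	if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
		return (-1);
	newl.u.s.next++;				/* Take a ticket */
	return (cas64(&l->u.v, old.u.v, newl.u.v) ? 0 : -1);
}

Because the tickets are now single bytes, the blocking write path in the real code stalls on a condition variable whenever allocating another ticket would make 'next' wrap around onto 'current', which is how the new implementation sidesteps the 64K-request wrap hazard described in the old comment.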