diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2017-08-14 07:25:48 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-08-14 07:25:48 -0400 |
commit | 6e66393bbc3ceba364fa8e42612f0f8caa0dc4ea (patch) | |
tree | b59296d19f20e03ce71e49d93d846b209b96ea97 | |
parent | 71bb828bdca4f0efe1cbbcf717d83791817b0efa (diff) | |
download | mongo-6e66393bbc3ceba364fa8e42612f0f8caa0dc4ea.tar.gz |
WT-3358 LSM will hang if the manager fails to start (#3582)
We increment WT_LSM_MANAGER.lsm_workers to 1 before starting the manager
and don't reset it to 0 on failure, causing __wt_lsm_manager_destroy()
to hang, waiting on the manager thread to set WT_LSM_MANAGER_SHUTDOWN.
Move the increment of WT_LSM_MANAGER.lsm_workers into
__wt_lsm_manager_start() to clarify what's happening, and reset that
value to 0 if we fail to start the manager.
In addition, set WT_LSM_MANAGER_SHUTDOWN on error (that way, even if we
somehow get the test wrong, __wt_lsm_manager_destroy() will proceed).
Finally, test WT_LSM_MANAGER_SHUTDOWN in __wt_lsm_manager_start()
so that once we fail to start the manager, subsequent LSM tree open
calls won't attempt to start the manager again.
-rw-r--r-- | src/lsm/lsm_manager.c | 27 | ||||
-rw-r--r-- | src/lsm/lsm_tree.c | 3 |
2 files changed, 22 insertions, 8 deletions
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 24a0429a184..3949d88cec4 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -208,14 +208,20 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) conn = S2C(session); manager = &conn->lsm_manager; - if (F_ISSET(conn, WT_CONN_READONLY)) { - manager->lsm_workers = 0; - return (0); - } /* - * We need at least a manager, a switch thread and a generic - * worker. + * If readonly or the manager is running, or we've already failed, + * there's no work to do. */ + if (F_ISSET(conn, WT_CONN_READONLY) || + manager->lsm_workers != 0 || + F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) + return (0); + + /* It's possible to race, see if we're the winner. */ + if (!__wt_atomic_cas32(&manager->lsm_workers, 0, 1)) + return (0); + + /* We need at least a manager, a switch thread and a generic worker. */ WT_ASSERT(session, manager->lsm_workers_max > 2); /* @@ -245,6 +251,15 @@ err: for (i = 0; i++) WT_TRET((&worker_session->iface)->close( &worker_session->iface, NULL)); + + /* Make the failure permanent, we won't try again. */ + F_SET(manager, WT_LSM_MANAGER_SHUTDOWN); + + /* + * Reset the workers count (otherwise, LSM destroy will hang + * waiting for threads to exit. + */ + WT_PUBLISH(manager->lsm_workers, 0); } return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 18e1f6d3115..33d9e472df6 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -474,8 +474,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ - if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) - WT_RET(__wt_lsm_manager_start(session)); + WT_RET(__wt_lsm_manager_start(session)); /* Make sure no one beat us to it. */ if ((ret = __lsm_tree_find( |