diff options
author | Paul J. Davis <paul.joseph.davis@gmail.com> | 2018-09-06 11:01:07 -0500 |
---|---|---|
committer | Paul J. Davis <paul.joseph.davis@gmail.com> | 2018-09-06 14:12:34 -0500 |
commit | f66c734d3c056b6b391f8f8487a7d81c1d55b87f (patch) | |
tree | bc83b4a3162a986242e3123bbae64452ecefb6eb | |
parent | 1c25163d3782faa18aae0bea2be0d18e7b0b7aaa (diff) | |
download | couchdb-f66c734d3c056b6b391f8f8487a7d81c1d55b87f.tar.gz |
Fix couch_server concurrency error
Its possible that a busy couch_server and a specific ordering and timing
of events can end up with an open_async message in the mailbox while a
new and unrelated open_async process is spawned. This change just ensure
that if we encounter any old messages in the mailbox that we ignore
them.
The underlying issue here is that a delete request clears out the state
in our couch_dbs ets table while not clearing out state in the message
queue. In some fairly specific circumstances this leads to the message
on in the mailbox satisfying an ets entry for a newer open_async
process. This change just includes a match on the opener process.
Anything unmatched came before the current open_async request which
means it should be ignored.
-rw-r--r-- | src/couch/src/couch_server.erl | 26 | ||||
-rw-r--r-- | src/couch/test/couch_server_tests.erl | 6 |
2 files changed, 21 insertions, 11 deletions
diff --git a/src/couch/src/couch_server.erl b/src/couch/src/couch_server.erl index af728fa01..72105202c 100644 --- a/src/couch/src/couch_server.erl +++ b/src/couch/src/couch_server.erl @@ -390,8 +390,8 @@ handle_call(reload_engines, _From, Server) -> {reply, ok, Server#server{engines = get_configured_engines()}}; handle_call(get_server, _From, Server) -> {reply, {ok, Server}, Server}; -handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) -> - true = ets:delete(couch_dbs_pid_to_name, FromPid), +handle_call({open_result, T0, DbName, {ok, Db}}, {Opener, _}, Server) -> + true = ets:delete(couch_dbs_pid_to_name, Opener), OpenTime = timer:now_diff(os:timestamp(), T0) / 1000, couch_stats:update_histogram([couchdb, db_open_time], OpenTime), DbPid = couch_db:get_pid(Db), @@ -400,7 +400,7 @@ handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) -> % db was deleted during async open exit(DbPid, kill), {reply, ok, Server}; - [#entry{req_type = ReqType, waiters = Waiters} = Entry] -> + [#entry{pid = Opener, req_type = ReqType, waiters = Waiters} = Entry] -> link(DbPid), [gen_server:reply(Waiter, {ok, Db}) || Waiter <- Waiters], % Cancel the creation request if it exists. @@ -425,27 +425,37 @@ handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) -> true -> Server#server.lru end, - {reply, ok, Server#server{lru = Lru}} + {reply, ok, Server#server{lru = Lru}}; + [#entry{}] -> + % A mismatched opener pid means that this open_result message + % was in our mailbox but is now stale. Mostly ignore + % it except to ensure that the db pid is super dead. + exit(couch_db:get_pid(Db), kill), + {reply, ok, Server} end; handle_call({open_result, T0, DbName, {error, eexist}}, From, Server) -> handle_call({open_result, T0, DbName, file_exists}, From, Server); -handle_call({open_result, _T0, DbName, Error}, {FromPid, _Tag}, Server) -> +handle_call({open_result, _T0, DbName, Error}, {Opener, _}, Server) -> case ets:lookup(couch_dbs, DbName) of [] -> % db was deleted during async open {reply, ok, Server}; - [#entry{req_type = ReqType, waiters = Waiters} = Entry] -> + [#entry{pid = Opener, req_type = ReqType, waiters = Waiters} = Entry] -> [gen_server:reply(Waiter, Error) || Waiter <- Waiters], couch_log:info("open_result error ~p for ~s", [Error, DbName]), true = ets:delete(couch_dbs, DbName), - true = ets:delete(couch_dbs_pid_to_name, FromPid), + true = ets:delete(couch_dbs_pid_to_name, Opener), NewServer = case ReqType of {create, DbName, Engine, Options, CrFrom} -> open_async(Server, CrFrom, DbName, Engine, Options); _ -> Server end, - {reply, ok, db_closed(NewServer, Entry#entry.db_options)} + {reply, ok, db_closed(NewServer, Entry#entry.db_options)}; + [#entry{}] -> + % A mismatched pid means that this open_result message + % was in our mailbox and is now stale. Ignore it. + {reply, ok, Server} end; handle_call({open, DbName, Options}, From, Server) -> case ets:lookup(couch_dbs, DbName) of diff --git a/src/couch/test/couch_server_tests.erl b/src/couch/test/couch_server_tests.erl index f2a98e74c..fdbc175ce 100644 --- a/src/couch/test/couch_server_tests.erl +++ b/src/couch/test/couch_server_tests.erl @@ -224,9 +224,9 @@ t_interleaved_create_delete_open(DbName) -> % Our delete request was processed normally ?assertEqual({DelRef, ok}, get_next_message()), - % TODO: This assertion will change after I fix the bug as - % its incorrectly receiving the create message's response - ?assertMatch({OpenRef, {ok, _}}, get_next_message()), + % The db was deleted thus it should be not found + % when we try and open it. + ?assertMatch({OpenRef, {not_found, no_db_file}}, get_next_message()), % And finally assert that couch_server is still % alive. |