summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul J. Davis <paul.joseph.davis@gmail.com>2018-09-06 11:01:07 -0500
committerPaul J. Davis <paul.joseph.davis@gmail.com>2018-09-06 11:21:53 -0500
commita2e41a83e1f2921fa0350f2171cc6a88abd54dbe (patch)
treebc83b4a3162a986242e3123bbae64452ecefb6eb
parent690353c20ddae6e4b280eadfa862813764c60ab8 (diff)
downloadcouchdb-fix-couch-server-race-condition.tar.gz
Fix couch_server concurrency errorfix-couch-server-race-condition
Its possible that a busy couch_server and a specific ordering and timing of events can end up with an open_async message in the mailbox while a new and unrelated open_async process is spawned. This change just ensure that if we encounter any old messages in the mailbox that we ignore them. The underlying issue here is that a delete request clears out the state in our couch_dbs ets table while not clearing out state in the message queue. In some fairly specific circumstances this leads to the message on in the mailbox satisfying an ets entry for a newer open_async process. This change just includes a match on the opener process. Anything unmatched came before the current open_async request which means it should be ignored.
-rw-r--r--src/couch/src/couch_server.erl26
-rw-r--r--src/couch/test/couch_server_tests.erl6
2 files changed, 21 insertions, 11 deletions
diff --git a/src/couch/src/couch_server.erl b/src/couch/src/couch_server.erl
index af728fa01..72105202c 100644
--- a/src/couch/src/couch_server.erl
+++ b/src/couch/src/couch_server.erl
@@ -390,8 +390,8 @@ handle_call(reload_engines, _From, Server) ->
{reply, ok, Server#server{engines = get_configured_engines()}};
handle_call(get_server, _From, Server) ->
{reply, {ok, Server}, Server};
-handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) ->
- true = ets:delete(couch_dbs_pid_to_name, FromPid),
+handle_call({open_result, T0, DbName, {ok, Db}}, {Opener, _}, Server) ->
+ true = ets:delete(couch_dbs_pid_to_name, Opener),
OpenTime = timer:now_diff(os:timestamp(), T0) / 1000,
couch_stats:update_histogram([couchdb, db_open_time], OpenTime),
DbPid = couch_db:get_pid(Db),
@@ -400,7 +400,7 @@ handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) ->
% db was deleted during async open
exit(DbPid, kill),
{reply, ok, Server};
- [#entry{req_type = ReqType, waiters = Waiters} = Entry] ->
+ [#entry{pid = Opener, req_type = ReqType, waiters = Waiters} = Entry] ->
link(DbPid),
[gen_server:reply(Waiter, {ok, Db}) || Waiter <- Waiters],
% Cancel the creation request if it exists.
@@ -425,27 +425,37 @@ handle_call({open_result, T0, DbName, {ok, Db}}, {FromPid, _Tag}, Server) ->
true ->
Server#server.lru
end,
- {reply, ok, Server#server{lru = Lru}}
+ {reply, ok, Server#server{lru = Lru}};
+ [#entry{}] ->
+ % A mismatched opener pid means that this open_result message
+ % was in our mailbox but is now stale. Mostly ignore
+ % it except to ensure that the db pid is super dead.
+ exit(couch_db:get_pid(Db), kill),
+ {reply, ok, Server}
end;
handle_call({open_result, T0, DbName, {error, eexist}}, From, Server) ->
handle_call({open_result, T0, DbName, file_exists}, From, Server);
-handle_call({open_result, _T0, DbName, Error}, {FromPid, _Tag}, Server) ->
+handle_call({open_result, _T0, DbName, Error}, {Opener, _}, Server) ->
case ets:lookup(couch_dbs, DbName) of
[] ->
% db was deleted during async open
{reply, ok, Server};
- [#entry{req_type = ReqType, waiters = Waiters} = Entry] ->
+ [#entry{pid = Opener, req_type = ReqType, waiters = Waiters} = Entry] ->
[gen_server:reply(Waiter, Error) || Waiter <- Waiters],
couch_log:info("open_result error ~p for ~s", [Error, DbName]),
true = ets:delete(couch_dbs, DbName),
- true = ets:delete(couch_dbs_pid_to_name, FromPid),
+ true = ets:delete(couch_dbs_pid_to_name, Opener),
NewServer = case ReqType of
{create, DbName, Engine, Options, CrFrom} ->
open_async(Server, CrFrom, DbName, Engine, Options);
_ ->
Server
end,
- {reply, ok, db_closed(NewServer, Entry#entry.db_options)}
+ {reply, ok, db_closed(NewServer, Entry#entry.db_options)};
+ [#entry{}] ->
+ % A mismatched pid means that this open_result message
+ % was in our mailbox and is now stale. Ignore it.
+ {reply, ok, Server}
end;
handle_call({open, DbName, Options}, From, Server) ->
case ets:lookup(couch_dbs, DbName) of
diff --git a/src/couch/test/couch_server_tests.erl b/src/couch/test/couch_server_tests.erl
index f2a98e74c..fdbc175ce 100644
--- a/src/couch/test/couch_server_tests.erl
+++ b/src/couch/test/couch_server_tests.erl
@@ -224,9 +224,9 @@ t_interleaved_create_delete_open(DbName) ->
% Our delete request was processed normally
?assertEqual({DelRef, ok}, get_next_message()),
- % TODO: This assertion will change after I fix the bug as
- % its incorrectly receiving the create message's response
- ?assertMatch({OpenRef, {ok, _}}, get_next_message()),
+ % The db was deleted thus it should be not found
+ % when we try and open it.
+ ?assertMatch({OpenRef, {not_found, no_db_file}}, get_next_message()),
% And finally assert that couch_server is still
% alive.