summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@apache.org>2020-01-10 12:53:46 -0500
committerNick Vatamaniuc <nickva@users.noreply.github.com>2020-01-10 14:58:05 -0500
commitdd1b2817bbf7a0efce858414310a0c822ce89468 (patch)
tree9f68e1f92ccb6ea32ad7b8670e6dd7b17479d37f
parent24201a34288b6d28c2153f6f9dc7f81bd50f8353 (diff)
downloadcouchdb-dd1b2817bbf7a0efce858414310a0c822ce89468.tar.gz
When shard splitting make sure to reset the targets before any retries
Previously the target was reset only when the whole job started, but not when the initial copy phase restarted on its own. If that happened, we left the target around so the retry failed always with the `eexist` error. Target reset has a check to make sure the shards are not in the global shard map, in case someone manually added them, for example. If they are found there the job panics and exists.
-rw-r--r--src/mem3/src/mem3_reshard_job.erl14
-rw-r--r--src/mem3/test/eunit/mem3_reshard_test.erl6
2 files changed, 5 insertions, 15 deletions
diff --git a/src/mem3/src/mem3_reshard_job.erl b/src/mem3/src/mem3_reshard_job.erl
index d3a33d3f6..aedca21bb 100644
--- a/src/mem3/src/mem3_reshard_job.erl
+++ b/src/mem3/src/mem3_reshard_job.erl
@@ -184,19 +184,12 @@ run(#job{split_state = CurrState} = Job) ->
set_start_state(#job{split_state = State} = Job) ->
- case {State, maps:get(State, ?STATE_RESTART, undefined)} of
- {_, undefined} ->
+ case maps:get(State, ?STATE_RESTART, undefined) of
+ undefined ->
Fmt1 = "~p recover : unknown state ~s",
couch_log:error(Fmt1, [?MODULE, jobfmt(Job)]),
erlang:error({invalid_split_job_recover_state, Job});
- {initial_copy, initial_copy} ->
- % Since we recover from initial_copy to initial_copy, we need
- % to reset the target state as initial_copy expects to
- % create a new target
- Fmt2 = "~p recover : resetting target ~s",
- couch_log:notice(Fmt2, [?MODULE, jobfmt(Job)]),
- reset_target(Job);
- {_, StartState} ->
+ StartState->
Job#job{split_state = StartState}
end.
@@ -403,6 +396,7 @@ initial_copy_impl(#job{source = Source, target = Targets0} = Job) ->
LogMsg1 = "~p initial_copy started ~s",
LogArgs1 = [?MODULE, shardsstr(Source, Targets0)],
couch_log:notice(LogMsg1, LogArgs1),
+ reset_target(Job),
case couch_db_split:split(SourceName, TMap, fun pickfun/3) of
{ok, Seq} ->
LogMsg2 = "~p initial_copy of ~s finished @ seq:~p",
diff --git a/src/mem3/test/eunit/mem3_reshard_test.erl b/src/mem3/test/eunit/mem3_reshard_test.erl
index 1e89755a9..7cd6b1fe6 100644
--- a/src/mem3/test/eunit/mem3_reshard_test.erl
+++ b/src/mem3/test/eunit/mem3_reshard_test.erl
@@ -453,19 +453,15 @@ target_reset_in_initial_copy(#{db1 := Db}) ->
job_state = running,
split_state = initial_copy
},
- BogusParent = spawn(fun() -> receive {ack, _, _} -> ok end end),
- put('$ancestors', [BogusParent]), % make prock_lib:ack not blow up
- meck:expect(mem3_reshard, checkpoint, 2, ok),
meck:expect(couch_db_split, cleanup_target, 2, ok),
meck:expect(couch_server, exists, fun
(<<"t1">>) -> true;
(<<"t2">>) -> true;
(DbName) -> meck:passthrough([DbName])
end),
- JobPid = spawn(fun() -> mem3_reshard_job:init(Job) end),
+ JobPid = spawn(fun() -> mem3_reshard_job:initial_copy_impl(Job) end),
meck:wait(2, couch_db_split, cleanup_target, ['_', '_'], 5000),
exit(JobPid, kill),
- exit(BogusParent, kill),
?assertEqual(2, meck:num_calls(couch_db_split, cleanup_target, 2))
end)}.