Fix fabric_open_doc_revs

There was a subtle bug when opening specific revisions in fabric_doc_open_revs due to a race condition between updates being applied across a cluster. The underlying cause here was due to the stemming after a document had been updated more than revs_limit number of times along with concurrent reads to a node that had not yet made the update. To illustrate lets consider a document A which has a revision history from `{N, RevN}` to `{N+1000, RevN+1000}` (assuming revs_limit is the default 1000). If we consider a single node perspective when an update comes in we added the new revision and stem the oldest revision. The docs the revisions on the node would be `{N+1, RevN+1}` to `{N+1001, RevN+1001}`. The bug exists when we attempt to open revisions on a different node that has yet to apply the new update. In this case when fabric_doc_open_revs could be called with `{N+1000, RevN+1000}`. This results in a response from fabric_doc_open_revs that includes two different `{ok, Doc}` results instead of the expected one instance. The reason for this is that one document has revisions `{N+1, RevN+1}` to `{N+1000, RevN+1000}` from the node that has applied the update, while the node without the update responds with revisions `{N, RevN}` to {N+1000, RevN+1000}`. To rephrase that, a node that has applied an update can end up returning a revision path that contains `revs_limit - 1` revisions while a node wihtout the update returns all `revs_limit` revisions. This slight change in the path prevented the responses from being properly combined into a single response. This bug has existed for many years. However, read repair effectively prevents it from being a significant issue by immediately fixing the revision history discrepancy. This was discovered due to the recent bug in read repair during a mixed cluster upgrade to a release including clustered purge. In this situation we end up crashing the design document cache which then leads to all of the design document requests being direct reads which can end up causing cluster nodes to OOM and die. The conditions require a significant number of design document edits coupled with already significant load to those modified design documents. The most direct example observed was a clustered that had a significant number of filtered replications in and out of the cluster.
author: Paul J. Davis <paul.joseph.davis@gmail.com> 2019-01-14 18:32:34 -0600
committer: Jan Lehnardt <jan@apache.org> 2019-02-17 18:33:19 +0100
commit: 1f7a775633c2b5d7b8537c7c86e5b9094ca4e0a4 (patch)
tree: ff68c4fe3bd676f4acb9bf0ef95e63bc5dc9d631
parent: a50bff3c0fc55c39efe6fe036adee91081e80b45 (diff)
download: couchdb-1f7a775633c2b5d7b8537c7c86e5b9094ca4e0a4.tar.gz
1 files changed, 91 insertions, 25 deletions
diff --git a/src/fabric/src/fabric_doc_open_revs.erl b/src/fabric/src/fabric_doc_open_revs.erl
index 234b108ef..dc03f3df6 100644
--- a/src/fabric/src/fabric_doc_open_revs.erl
+++ b/src/fabric/src/fabric_doc_open_revs.erl
@@ -243,8 +243,7 @@ format_reply(true, Replies, _) ->
     tree_format_replies(Replies);
 
 format_reply(false, Replies, _) ->
-    Filtered = filter_reply(Replies),
-    dict_format_replies(Filtered).
+    dict_format_replies(Replies).
 
 
 tree_format_replies(RevTree) ->
@@ -260,22 +259,59 @@ tree_format_replies(RevTree) ->
 
 
 dict_format_replies(Dict) ->
-    lists:sort([Reply || {_, {Reply, _}} <- Dict]).
-
-filter_reply(Replies) ->
-    AllFoundRevs = lists:foldl(fun
-        ({{{not_found, missing}, _}, _}, Acc) ->
-            Acc;
-        ({{_, {Pos, [Rev | _]}}, _}, Acc) ->
-            [{Pos, Rev} | Acc]
-    end, [], Replies),
-    %% keep not_found replies only for the revs that don't also have doc reply
-    lists:filter(fun
-        ({{{not_found, missing}, Rev}, _}) ->
-            not lists:member(Rev, AllFoundRevs);
-        (_) ->
-            true
-    end, Replies).
+    Replies0 = [Reply || {_, {Reply, _}} <- Dict],
+
+    AllFoundRevs = lists:foldl(fun(Reply, Acc) ->
+        case Reply of
+            {ok, #doc{revs = {Pos, [RevId | _]}}} ->
+                [{Pos, RevId} | Acc];
+            _ ->
+                Acc
+        end
+    end, [], Replies0),
+
+    %% Drop any not_found replies for which we
+    %% found the revision on a different node.
+    Replies1 = lists:filter(fun(Reply) ->
+        case Reply of
+            {{not_found, missing}, Rev} ->
+                not lists:member(Rev, AllFoundRevs);
+            _ ->
+                true
+        end
+    end, Replies0),
+
+    % Remove replies with shorter revision
+    % paths for a given revision.
+    collapse_duplicate_revs(Replies1).
+
+
+collapse_duplicate_revs(Replies) ->
+    % The collapse logic requires that replies are
+    % sorted so that shorter rev paths are in
+    % the list just before longer lists.
+    %
+    % This somewhat implicitly relies on Erlang's
+    % sorting of [A, B] < [A, B, C] for all values
+    % of C.
+    collapse_duplicate_revs_int(lists:sort(Replies)).
+
+
+collapse_duplicate_revs_int([]) ->
+    [];
+
+collapse_duplicate_revs_int([{ok, Doc1}, {ok, Doc2} | Rest]) ->
+    {D1, R1} = Doc1#doc.revs,
+    {D2, R2} = Doc2#doc.revs,
+    Head = case D1 == D2 andalso lists:prefix(R1, R2) of
+        true -> [];
+        false -> [{ok, Doc1}]
+    end,
+    Head ++ collapse_duplicate_revs([{ok, Doc2} | Rest]);
+
+collapse_duplicate_revs_int([Reply | Rest]) ->
+    [Reply | collapse_duplicate_revs(Rest)].
+
 
 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").
@@ -313,7 +349,9 @@ revs() -> [{1,<<"foo">>}, {1,<<"bar">>}, {1,<<"baz">>}].
 
 foo1() -> {ok, #doc{revs = {1, [<<"foo">>]}}}.
 foo2() -> {ok, #doc{revs = {2, [<<"foo2">>, <<"foo">>]}}}.
+foo2stemmed() -> {ok, #doc{revs = {2, [<<"foo2">>]}}}.
 fooNF() -> {{not_found, missing}, {1,<<"foo">>}}.
+foo2NF() -> {{not_found, missing}, {2, <<"foo2">>}}.
 bar1() -> {ok, #doc{revs = {1, [<<"bar">>]}}}.
 barNF() -> {{not_found, missing}, {1,<<"bar">>}}.
 bazNF() -> {{not_found, missing}, {1,<<"baz">>}}.
@@ -351,7 +389,10 @@ open_doc_revs_test_() ->
             check_node_rev_unmodified_on_down_or_exit(),
             check_not_found_replies_are_removed_when_doc_found(),
             check_not_found_returned_when_one_of_docs_not_found(),
-            check_not_found_returned_when_doc_not_found()
+            check_not_found_returned_when_doc_not_found(),
+            check_longer_rev_list_returned(),
+            check_longer_rev_list_not_combined(),
+            check_not_found_removed_and_longer_rev_list()
         ]
     }.
 
@@ -685,24 +726,49 @@ check_node_rev_unmodified_on_down_or_exit() ->
 check_not_found_replies_are_removed_when_doc_found() ->
     ?_test(begin
         Replies = replies_to_dict([foo1(), bar1(), fooNF()]),
-        Expect = replies_to_dict([foo1(), bar1()]),
-        ?assertEqual(Expect, filter_reply(Replies))
+        Expect = [bar1(), foo1()],
+        ?assertEqual(Expect, dict_format_replies(Replies))
     end).
 
 check_not_found_returned_when_one_of_docs_not_found() ->
     ?_test(begin
         Replies = replies_to_dict([foo1(), foo2(), barNF()]),
-        Expect = replies_to_dict([foo1(), foo2(), barNF()]),
-        ?assertEqual(Expect, filter_reply(Replies))
+        Expect = [foo1(), foo2(), barNF()],
+        ?assertEqual(Expect, dict_format_replies(Replies))
     end).
 
 check_not_found_returned_when_doc_not_found() ->
     ?_test(begin
         Replies = replies_to_dict([fooNF(), barNF(), bazNF()]),
-        Expect = replies_to_dict([fooNF(), barNF(), bazNF()]),
-        ?assertEqual(Expect, filter_reply(Replies))
+        Expect = [barNF(), bazNF(), fooNF()],
+        ?assertEqual(Expect, dict_format_replies(Replies))
     end).
 
+check_longer_rev_list_returned() ->
+    ?_test(begin
+        Replies = replies_to_dict([foo2(), foo2stemmed()]),
+        Expect = [foo2()],
+        ?assertEqual(2, length(Replies)),
+        ?assertEqual(Expect, dict_format_replies(Replies))
+    end).
+
+check_longer_rev_list_not_combined() ->
+    ?_test(begin
+        Replies = replies_to_dict([foo2(), foo2stemmed(), bar1()]),
+        Expect = [bar1(), foo2()],
+        ?assertEqual(3, length(Replies)),
+        ?assertEqual(Expect, dict_format_replies(Replies))
+    end).
+
+check_not_found_removed_and_longer_rev_list() ->
+    ?_test(begin
+        Replies = replies_to_dict([foo2(), foo2stemmed(), foo2NF()]),
+        Expect = [foo2()],
+        ?assertEqual(3, length(Replies)),
+        ?assertEqual(Expect, dict_format_replies(Replies))
+    end).
+
+
 replies_to_dict(Replies) ->
     [reply_to_element(R) || R <- Replies].
author	Paul J. Davis <paul.joseph.davis@gmail.com>	2019-01-14 18:32:34 -0600
committer	Jan Lehnardt <jan@apache.org>	2019-02-17 18:33:19 +0100
commit	1f7a775633c2b5d7b8537c7c86e5b9094ca4e0a4 (patch)
tree	ff68c4fe3bd676f4acb9bf0ef95e63bc5dc9d631
parent	a50bff3c0fc55c39efe6fe036adee91081e80b45 (diff)
download	couchdb-1f7a775633c2b5d7b8537c7c86e5b9094ca4e0a4.tar.gz