summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul J. Davis <paul.joseph.davis@gmail.com>2018-06-13 13:15:11 -0500
committerPaul J. Davis <paul.joseph.davis@gmail.com>2018-06-16 16:56:53 -0500
commitaebdbc452573f70f4e50d88af5814d0fbe936333 (patch)
tree3592cf5288ad409a6e2e6da18378102d622f4a73
parent000766c4f8b51562757818dfa36b3e6579d16309 (diff)
downloadcouchdb-aebdbc452573f70f4e50d88af5814d0fbe936333.tar.gz
Optimize couch_key_tree:stem/2
This is two related optimizations for stemming revisions. The first optimization re-writes the stemming algorithm to drop it from an O(N^2) to O(N) operation by using a depth first search through the tree and tracking which revisions are within `revs_limit` revs from a leaf dropping any revision that exceeds that limit. The second optimization is just that we avoid calling stemming more often than necessary by switching away from using `merge/3` to `merge/2` and then calling `stem/2` only when necessary. Co-Authored-By: Nick Vatamaniuc <vatamane@apache.org>
-rw-r--r--src/couch/src/couch_db.erl10
-rw-r--r--src/couch/src/couch_db_updater.erl33
-rw-r--r--src/couch/src/couch_key_tree.erl74
3 files changed, 95 insertions, 22 deletions
diff --git a/src/couch/src/couch_db.erl b/src/couch/src/couch_db.erl
index 93ea07e65..b47cc7ece 100644
--- a/src/couch/src/couch_db.erl
+++ b/src/couch/src/couch_db.erl
@@ -870,16 +870,10 @@ prep_and_validate_replicated_updates(Db, [Bucket|RestBuckets], [OldInfo|RestOldI
{[], AccErrors}, Bucket),
prep_and_validate_replicated_updates(Db, RestBuckets, RestOldInfo, [ValidatedBucket | AccPrepped], AccErrors3);
#full_doc_info{rev_tree=OldTree} ->
- RevsLimit = get_revs_limit(Db),
OldLeafs = couch_key_tree:get_all_leafs_full(OldTree),
OldLeafsLU = [{Start, RevId} || {Start, [{RevId, _}|_]} <- OldLeafs],
- NewRevTree = lists:foldl(
- fun(NewDoc, AccTree) ->
- {NewTree, _} = couch_key_tree:merge(AccTree,
- couch_doc:to_path(NewDoc), RevsLimit),
- NewTree
- end,
- OldTree, Bucket),
+ NewPaths = lists:map(fun couch_doc:to_path/1, Bucket),
+ NewRevTree = couch_key_tree:multi_merge(OldTree, NewPaths),
Leafs = couch_key_tree:get_all_leafs_full(NewRevTree),
LeafRevsFullDict = dict:from_list( [{{Start, RevId}, FullPath} || {Start, [{RevId, _}|_]}=FullPath <- Leafs]),
{ValidatedBucket, AccErrors3} =
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index a2de3bc60..fba99a773 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -504,23 +504,24 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
[OldDocInfo|RestOldInfo], AccNewInfos, AccRemoveSeqs, AccSeq) ->
erlang:put(last_id_merged, OldDocInfo#full_doc_info.id), % for debugging
NewDocInfo0 = lists:foldl(fun({Client, NewDoc}, OldInfoAcc) ->
- merge_rev_tree(OldInfoAcc, NewDoc, Client, Limit, MergeConflicts)
+ merge_rev_tree(OldInfoAcc, NewDoc, Client, MergeConflicts)
end, OldDocInfo, NewDocs),
+ NewDocInfo1 = stem_full_doc_info(NewDocInfo0, Limit),
% When MergeConflicts is false, we updated #full_doc_info.deleted on every
% iteration of merge_rev_tree. However, merge_rev_tree does not update
% #full_doc_info.deleted when MergeConflicts is true, since we don't need
% to know whether the doc is deleted between iterations. Since we still
% need to know if the doc is deleted after the merge happens, we have to
% set it here.
- NewDocInfo1 = case MergeConflicts of
+ NewDocInfo2 = case MergeConflicts of
true ->
- NewDocInfo0#full_doc_info{
- deleted = couch_doc:is_deleted(NewDocInfo0)
+ NewDocInfo1#full_doc_info{
+ deleted = couch_doc:is_deleted(NewDocInfo1)
};
false ->
- NewDocInfo0
+ NewDocInfo1
end,
- if NewDocInfo1 == OldDocInfo ->
+ if NewDocInfo2 == OldDocInfo ->
% nothing changed
merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
AccNewInfos, AccRemoveSeqs, AccSeq);
@@ -529,7 +530,7 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
% important to note that the update_seq on OldDocInfo should
% be identical to the value on NewDocInfo1.
OldSeq = OldDocInfo#full_doc_info.update_seq,
- NewDocInfo2 = NewDocInfo1#full_doc_info{
+ NewDocInfo3 = NewDocInfo2#full_doc_info{
update_seq = AccSeq + 1
},
RemoveSeqs = case OldSeq of
@@ -537,10 +538,10 @@ merge_rev_trees(Limit, MergeConflicts, [NewDocs|RestDocsList],
_ -> [OldSeq | AccRemoveSeqs]
end,
merge_rev_trees(Limit, MergeConflicts, RestDocsList, RestOldInfo,
- [NewDocInfo2|AccNewInfos], RemoveSeqs, AccSeq+1)
+ [NewDocInfo3|AccNewInfos], RemoveSeqs, AccSeq+1)
end.
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
+merge_rev_tree(OldInfo, NewDoc, Client, false)
when OldInfo#full_doc_info.deleted ->
% We're recreating a document that was previously
% deleted. To check that this is a recreation from
@@ -574,7 +575,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
% Merge our modified new doc into the tree
#full_doc_info{rev_tree=OldTree} = OldInfo,
NewTree0 = couch_doc:to_path(NewDoc2),
- case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+ case couch_key_tree:merge(OldTree, NewTree0) of
{NewTree1, new_leaf} ->
% We changed the revision id so inform the caller
send_result(Client, NewDoc, {ok, {OldPos+1, NewRevId}}),
@@ -589,7 +590,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false)
send_result(Client, NewDoc, conflict),
OldInfo
end;
-merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
+merge_rev_tree(OldInfo, NewDoc, Client, false) ->
% We're attempting to merge a new revision into an
% undeleted document. To not be a conflict we require
% that the merge results in extending a branch.
@@ -597,7 +598,7 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
OldTree = OldInfo#full_doc_info.rev_tree,
NewTree0 = couch_doc:to_path(NewDoc),
NewDeleted = NewDoc#doc.deleted,
- case couch_key_tree:merge(OldTree, NewTree0, Limit) of
+ case couch_key_tree:merge(OldTree, NewTree0) of
{NewTree, new_leaf} when not NewDeleted ->
OldInfo#full_doc_info{
rev_tree = NewTree,
@@ -615,14 +616,18 @@ merge_rev_tree(OldInfo, NewDoc, Client, Limit, false) ->
send_result(Client, NewDoc, conflict),
OldInfo
end;
-merge_rev_tree(OldInfo, NewDoc, _Client, Limit, true) ->
+merge_rev_tree(OldInfo, NewDoc, _Client, true) ->
% We're merging in revisions without caring about
% conflicts. Most likely this is a replication update.
OldTree = OldInfo#full_doc_info.rev_tree,
NewTree0 = couch_doc:to_path(NewDoc),
- {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0, Limit),
+ {NewTree, _} = couch_key_tree:merge(OldTree, NewTree0),
OldInfo#full_doc_info{rev_tree = NewTree}.
+stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
+ Stemmed = couch_key_tree:stem(Tree, Limit),
+ Info#full_doc_info{rev_tree = Stemmed}.
+
update_docs_int(Db, DocsList, LocalDocs, MergeConflicts, FullCommit) ->
UpdateSeq = couch_db_engine:get_update_seq(Db),
RevsLimit = couch_db_engine:get_revs_limit(Db),
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index cd661e29a..5d5361507 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -59,6 +59,7 @@ get_key_leafs/2,
map/2,
map_leafs/2,
mapfold/3,
+multi_merge/2,
merge/3,
merge/2,
remove_leafs/2,
@@ -71,6 +72,15 @@ stem/2
-type revtree() :: [tree()].
+%% @doc Merge multiple paths into the given tree.
+-spec multi_merge(revtree(), tree()) -> revtree().
+multi_merge(RevTree, Trees) ->
+ lists:foldl(fun(Tree, RevTreeAcc) ->
+ {NewRevTree, _} = merge(RevTreeAcc, Tree),
+ NewRevTree
+ end, RevTree, lists:sort(Trees)).
+
+
%% @doc Merge a path into the given tree and then stem the result.
%% Although Tree is of type tree(), it must not contain any branches.
-spec merge(revtree(), tree() | path(), pos_integer()) ->
@@ -470,6 +480,70 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
stem(Trees, Limit) ->
+ try
+ {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
+ {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
+ {NewSeen, NewBranches ++ TreeAcc}
+ end, {sets:new(), []}, Trees),
+ lists:sort(Branches)
+ catch throw:dupe_keys ->
+ repair_tree(Trees, Limit)
+ end.
+
+
+stem_tree({Depth, Child}, Limit, Seen) ->
+ case stem_tree(Depth, Child, Limit, Seen) of
+ {NewSeen, _, NewChild, NewBranches} ->
+ {NewSeen, [{Depth, NewChild} | NewBranches]};
+ {NewSeen, _, NewBranches} ->
+ {NewSeen, NewBranches}
+ end.
+
+
+stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) ->
+ {check_key(Key, Seen), Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
+ Seen1 = check_key(Key, Seen0),
+ FinalAcc = lists:foldl(fun(Child, Acc) ->
+ {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
+ case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
+ {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewChildAcc = [NewChild | ChildAcc],
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+ {NewSeenAcc, LimitPos, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
+ end
+ end, {Seen1, -1, [], []}, Children),
+ {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+ case FinalLimitPos of
+ N when N > 0, length(FinalChildren) > 0 ->
+ FinalNode = {Key, Val, lists:reverse(FinalChildren)},
+ {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
+ 0 when length(FinalChildren) > 0 ->
+ NewBranches = lists:map(fun(Child) ->
+ {Depth + 1, Child}
+ end, lists:reverse(FinalChildren)),
+ {FinalSeen, -1, NewBranches ++ FinalBranches};
+ N when N < 0, length(FinalChildren) == 0 ->
+ {FinalSeen, FinalLimitPos - 1, FinalBranches}
+ end.
+
+
+check_key(Key, Seen) ->
+ case sets:is_element(Key, Seen) of
+ true ->
+ throw(dupe_keys);
+ false ->
+ sets:add_element(Key, Seen)
+ end.
+
+
+repair_tree(Trees, Limit) ->
% flatten each branch in a tree into a tree path, sort by starting rev #
Paths = lists:sort(lists:map(fun({Pos, Path}) ->
StemmedPath = lists:sublist(Path, Limit),