summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul J. Davis <paul.joseph.davis@gmail.com>2017-11-02 14:30:14 -0500
committerNick Vatamaniuc <vatamane@apache.org>2018-01-15 17:37:46 -0500
commit4c490b0792381dfa6c219ccbb246b1a9dcc106be (patch)
treeb23ef3536cab04dd3a005b6c7bbe2f864cd0fa7a
parente4ff49bb7e0f09a4df34c883e27c1e9673773c25 (diff)
downloadcouchdb-4c490b0792381dfa6c219ccbb246b1a9dcc106be.tar.gz
Optimize couch_key_tree:stem/2
This changes the stemming algorithm to be a single pass O(N) operation rather than the existing O(N^2) operation. The old stemming algorithm works by taking the entire tree apart and then merging the resulting paths back into a single tree. The merging operation ends up causing the O(N^2) aspect because every branch has to be compared to every exisitng sub-tree.
-rw-r--r--src/couch/src/couch_db_updater.erl9
-rw-r--r--src/couch/src/couch_key_tree.erl50
2 files changed, 55 insertions, 4 deletions
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl
index bcddbe04a..f437426c8 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -876,8 +876,11 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) ->
Stemmed = couch_key_tree:stem(Tree, Limit),
Info#full_doc_info{rev_tree = Stemmed}.
-stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
- lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos).
+full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) ->
+ lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) ->
+ Stemmed = couch_key_tree:full_stem(Tree, Limit),
+ FDI#full_doc_info{rev_tree=Stemmed}
+ end, DocInfos).
update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) ->
#db{
@@ -1125,7 +1128,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) ->
}
end, NewInfos0),
- NewInfos = stem_full_doc_infos(Db, NewInfos1),
+ NewInfos = full_stem_full_doc_infos(Db, NewInfos1),
RemoveSeqs =
case Retry of
nil ->
diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl
index e2e187eb1..eda466467 100644
--- a/src/couch/src/couch_key_tree.erl
+++ b/src/couch/src/couch_key_tree.erl
@@ -62,7 +62,8 @@ mapfold/3,
merge/3,
merge/2,
remove_leafs/2,
-stem/2
+stem/2,
+full_stem/2
]).
-include_lib("couch/include/couch_db.hrl").
@@ -470,6 +471,53 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
stem(Trees, Limit) ->
+ lists:sort(lists:flatmap(fun(Tree) ->
+ stem_tree(Tree, Limit)
+ end, Trees)).
+
+
+stem_tree({Depth, Child}, Limit) ->
+ case stem_tree(Depth, Child, Limit) of
+ {_, NewChild, NewBranches} ->
+ [{Depth, NewChild} | NewBranches];
+ {_, NewBranches} ->
+ NewBranches
+ end.
+
+
+stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) ->
+ {Limit - 1, Leaf, []};
+
+stem_tree(Depth, {Key, Val, Children}, Limit) ->
+ FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) ->
+ case stem_tree(Depth + 1, Child, Limit) of
+ {LimitPos, NewChild, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewChildAcc = [NewChild | ChildAcc],
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewLimitPosAcc, NewChildAcc, NewBranchAcc};
+ {LimitPos, NewBranches} ->
+ NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
+ NewBranchAcc = NewBranches ++ BranchAcc,
+ {NewLimitPosAcc, ChildAcc, NewBranchAcc}
+ end
+ end, {-1, [], []}, Children),
+ {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
+ case FinalLimitPos of
+ N when N > 0, length(FinalChildren) > 0 ->
+ FinalNode = {Key, Val, lists:reverse(FinalChildren)},
+ {FinalLimitPos - 1, FinalNode, FinalBranches};
+ 0 when length(FinalChildren) > 0 ->
+ NewBranches = lists:map(fun(Child) ->
+ {Depth + 1, Child}
+ end, lists:reverse(FinalChildren)),
+ {-1, NewBranches ++ FinalBranches};
+ N when N < 0, length(FinalChildren) == 0 ->
+ {FinalLimitPos - 1, FinalBranches}
+ end.
+
+
+full_stem(Trees, Limit) ->
% flatten each branch in a tree into a tree path, sort by starting rev #
Paths = lists:sort(lists:map(fun({Pos, Path}) ->
StemmedPath = lists:sublist(Path, Limit),