diff options
author | Paul J. Davis <paul.joseph.davis@gmail.com> | 2017-11-02 14:30:14 -0500 |
---|---|---|
committer | Nick Vatamaniuc <vatamane@apache.org> | 2018-01-15 17:37:46 -0500 |
commit | 4c490b0792381dfa6c219ccbb246b1a9dcc106be (patch) | |
tree | b23ef3536cab04dd3a005b6c7bbe2f864cd0fa7a | |
parent | e4ff49bb7e0f09a4df34c883e27c1e9673773c25 (diff) | |
download | couchdb-4c490b0792381dfa6c219ccbb246b1a9dcc106be.tar.gz |
Optimize couch_key_tree:stem/2
This changes the stemming algorithm to be a single pass O(N) operation
rather than the existing O(N^2) operation.
The old stemming algorithm works by taking the entire tree apart and
then merging the resulting paths back into a single tree. The merging
operation ends up causing the O(N^2) aspect because every branch has to
be compared to every exisitng sub-tree.
-rw-r--r-- | src/couch/src/couch_db_updater.erl | 9 | ||||
-rw-r--r-- | src/couch/src/couch_key_tree.erl | 50 |
2 files changed, 55 insertions, 4 deletions
diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index bcddbe04a..f437426c8 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -876,8 +876,11 @@ stem_full_doc_info(#full_doc_info{rev_tree = Tree} = Info, Limit) -> Stemmed = couch_key_tree:stem(Tree, Limit), Info#full_doc_info{rev_tree = Stemmed}. -stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> - lists:map(fun(FDI) -> stem_full_doc_info(FDI, Limit) end, DocInfos). +full_stem_full_doc_infos(#db{revs_limit=Limit}, DocInfos) -> + lists:map(fun(#full_doc_info{rev_tree=Tree}=FDI) -> + Stemmed = couch_key_tree:full_stem(Tree, Limit), + FDI#full_doc_info{rev_tree=Stemmed} + end, DocInfos). update_docs_int(Db, DocsList, NonRepDocs, MergeConflicts, FullCommit) -> #db{ @@ -1125,7 +1128,7 @@ copy_docs(Db, #db{fd = DestFd} = NewDb, MixedInfos, Retry) -> } end, NewInfos0), - NewInfos = stem_full_doc_infos(Db, NewInfos1), + NewInfos = full_stem_full_doc_infos(Db, NewInfos1), RemoveSeqs = case Retry of nil -> diff --git a/src/couch/src/couch_key_tree.erl b/src/couch/src/couch_key_tree.erl index e2e187eb1..eda466467 100644 --- a/src/couch/src/couch_key_tree.erl +++ b/src/couch/src/couch_key_tree.erl @@ -62,7 +62,8 @@ mapfold/3, merge/3, merge/2, remove_leafs/2, -stem/2 +stem/2, +full_stem/2 ]). -include_lib("couch/include/couch_db.hrl"). @@ -470,6 +471,53 @@ map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) -> stem(Trees, Limit) -> + lists:sort(lists:flatmap(fun(Tree) -> + stem_tree(Tree, Limit) + end, Trees)). + + +stem_tree({Depth, Child}, Limit) -> + case stem_tree(Depth, Child, Limit) of + {_, NewChild, NewBranches} -> + [{Depth, NewChild} | NewBranches]; + {_, NewBranches} -> + NewBranches + end. + + +stem_tree(_Depth, {_Key, _Val, []} = Leaf, Limit) -> + {Limit - 1, Leaf, []}; + +stem_tree(Depth, {Key, Val, Children}, Limit) -> + FinalAcc = lists:foldl(fun(Child, {LimitPosAcc, ChildAcc, BranchAcc}) -> + case stem_tree(Depth + 1, Child, Limit) of + {LimitPos, NewChild, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewChildAcc = [NewChild | ChildAcc], + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, NewChildAcc, NewBranchAcc}; + {LimitPos, NewBranches} -> + NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), + NewBranchAcc = NewBranches ++ BranchAcc, + {NewLimitPosAcc, ChildAcc, NewBranchAcc} + end + end, {-1, [], []}, Children), + {FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc, + case FinalLimitPos of + N when N > 0, length(FinalChildren) > 0 -> + FinalNode = {Key, Val, lists:reverse(FinalChildren)}, + {FinalLimitPos - 1, FinalNode, FinalBranches}; + 0 when length(FinalChildren) > 0 -> + NewBranches = lists:map(fun(Child) -> + {Depth + 1, Child} + end, lists:reverse(FinalChildren)), + {-1, NewBranches ++ FinalBranches}; + N when N < 0, length(FinalChildren) == 0 -> + {FinalLimitPos - 1, FinalBranches} + end. + + +full_stem(Trees, Limit) -> % flatten each branch in a tree into a tree path, sort by starting rev # Paths = lists:sort(lists:map(fun({Pos, Path}) -> StemmedPath = lists:sublist(Path, Limit), |