summaryrefslogtreecommitdiff
path: root/src/couch/src/couch_key_tree.erl
blob: 94150418e9eb24ee3d176cbae58f129a6127edae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
%   http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.

%% @doc Data structure used to represent document edit histories.

%% A key tree is used to represent the edit history of a document. Each node of
%% the tree represents a particular version. Relations between nodes represent
%% the order that these edits were applied. For instance, a set of three edits
%% would produce a tree of versions A->B->C indicating that edit C was based on
%% version B which was in turn based on A. In a world without replication (and
%% no ability to disable MVCC checks), all histories would be forced to be
%% linear lists of edits due to constraints imposed by MVCC (ie, new edits must
%% be based on the current version). However, we have replication, so we must
%% deal with not so easy cases, which lead to trees.
%%
%% Consider a document in state A. This doc is replicated to a second node. We
%% then edit the document on each node leaving it in two different states, B
%% and C. We now have two key trees, A->B and A->C. When we go to replicate a
%% second time, the key tree must combine these two trees which gives us
%% A->(B|C). This is how conflicts are introduced. In terms of the key tree, we
%% say that we have two leaves (B and C) that are not deleted. The presense of
%% the multiple leaves indicate conflict. To remove a conflict, one of the
%% edits (B or C) can be deleted, which results in, A->(B|C->D) where D is an
%% edit that is specially marked with the a deleted=true flag.
%%
%% What makes this a bit more complicated is that there is a limit to the
%% number of revisions kept, specified in couch_db.hrl (default is 1000). When
%% this limit is exceeded only the last 1000 are kept. This comes in to play
%% when branches are merged. The comparison has to begin at the same place in
%% the branches. A revision id is of the form N-XXXXXXX where N is the current
%% revision depth. So each path will have a start number, calculated in
%% couch_doc:to_path using the formula N - length(RevIds) + 1 So, .eg. if a doc
%% was edit 1003 times this start number would be 4, indicating that 3
%% revisions were truncated.
%%
%% This comes into play in @see merge_at/3 which recursively walks down one
%% tree or the other until they begin at the same revision.

-module(couch_key_tree).

-export([
count_leafs/1,
find_missing/2,
fold/3,
get/2,
get_all_leafs/1,
get_all_leafs_full/1,
get_full_key_paths/2,
get_key_leafs/2,
map/2,
map_leafs/2,
mapfold/3,
multi_merge/2,
merge/2,
remove_leafs/2,
stem/2
]).

-include_lib("couch/include/couch_db.hrl").
-type treenode() :: {Key::term(), Value::term(), [Node::treenode()]}.
-type tree() :: {Depth::pos_integer(), [treenode()]}.
-type revtree() :: [tree()].


%% @doc Merge multiple paths into the given tree.
-spec multi_merge(revtree(), tree()) -> revtree().
multi_merge(RevTree, Trees) ->
    lists:foldl(fun(Tree, RevTreeAcc) ->
        {NewRevTree, _} = merge(RevTreeAcc, Tree),
        NewRevTree
    end, RevTree, lists:sort(Trees)).


%% @doc Merge a path into a tree.
-spec merge(revtree(), tree() | path()) ->
                {revtree(), new_leaf | new_branch | internal_node}.
merge(RevTree, Tree) ->
    {Merged, Result} = merge_tree(RevTree, Tree, []),
    {lists:sort(Merged), Result}.

%% @private
%% @doc Attempt to merge Tree into each branch of the RevTree.
%% If it can't find a branch that the new tree merges into, add it as a
%% new branch in the RevTree.
-spec merge_tree(revtree(), tree() | path(), revtree()) ->
                {revtree(), new_leaf | new_branch | internal_node}.
merge_tree([], Tree, []) ->
    {[Tree], new_leaf};
merge_tree([], Tree, MergeAcc) ->
    {[Tree|MergeAcc], new_branch};
merge_tree([{Depth, Nodes} | Rest], {IDepth, INodes}=Tree, MergeAcc) ->
    % For the intrepid observer following along at home, notice what we're
    % doing here with (Depth - IDepth). This tells us which of the two
    % branches (Nodes or INodes) we need to seek into. If Depth > IDepth
    % that means we need go into INodes to find where we line up with
    % Nodes. If Depth < IDepth, its obviously the other way. If it turns
    % out that (Depth - IDepth) == 0, then we know that this is where
    % we begin our actual merge operation (ie, looking for key matches).
    % Its helpful to note that this whole moving into sub-branches is due
    % to how we store trees that have been stemmed. When a path is
    % stemmed so that the root node is lost, we wrap it in a tuple with
    % the number keys that have been droped. This number is the depth
    % value that's used throughout this module.
    case merge_at([Nodes], Depth - IDepth, [INodes]) of
        {[Merged], Result} ->
            NewDepth = erlang:min(Depth, IDepth),
            {Rest ++ [{NewDepth, Merged} | MergeAcc], Result};
        fail ->
            merge_tree(Rest, Tree, [{Depth, Nodes} | MergeAcc])
    end.

%% @private
%% @doc Locate the point at which merging can start.
%% Because of stemming we may need to seek into one of the branches
%% before we can start comparing node keys. If one of the branches
%% ends up running out of nodes we know that these two branches can
%% not be merged.
-spec merge_at([node()], integer(), [node()]) ->
                {revtree(), new_leaf | new_branch | internal_node} | fail.
merge_at(_Nodes, _Pos, []) ->
    fail;
merge_at([], _Pos, _INodes) ->
    fail;
merge_at(Nodes, Pos, [{IK, IV, [NextINode]}]) when Pos > 0 ->
    % Depth was bigger than IDepth, so we need to discard from the
    % insert path to find where it might start matching.
    case merge_at(Nodes, Pos - 1, [NextINode]) of
        {Merged, Result} -> {[{IK, IV, Merged}], Result};
        fail -> fail
    end;
merge_at(_Nodes, Pos, [{_IK, _IV, []}]) when Pos > 0 ->
    % We've run out of path on the insert side, there's no way we can
    % merge with this branch
    fail;
merge_at([{K, V, SubTree} | Sibs], Pos, INodes) when Pos < 0 ->
    % When Pos is negative, Depth was less than IDepth, so we
    % need to discard from the revision tree path
    case merge_at(SubTree, Pos + 1, INodes) of
        {Merged, Result} ->
            {[{K, V, Merged} | Sibs], Result};
        fail ->
            % Merging along the subtree failed. We need to also try
            % merging the insert branch against the siblings of this
            % node.
            case merge_at(Sibs, Pos, INodes) of
                {Merged, Result} -> {[{K, V, SubTree} | Merged], Result};
                fail -> fail
            end
    end;
merge_at([{K, V1, Nodes} | Sibs], 0, [{K, V2, INodes}]) ->
    % Keys are equal. At this point we have found a possible starting
    % position for our merge to take place.
    {Merged, Result} = merge_extend(Nodes, INodes),
    {[{K, value_pref(V1, V2), Merged} | Sibs], Result};
merge_at([{K1, _, _} | _], 0, [{K2, _, _}]) when K1 > K2 ->
    % Siblings keys are ordered, no point in continuing
    fail;
merge_at([Tree | Sibs], 0, INodes) ->
    % INodes key comes after this key, so move on to the next sibling.
    case merge_at(Sibs, 0, INodes) of
        {Merged, Result} -> {[Tree | Merged], Result};
        fail -> fail
    end.

-spec merge_extend(revtree(), revtree()) ->
                {revtree(), new_leaf | new_branch | internal_node}.
merge_extend([], B) when B =/= [] ->
    % Most likely the insert branch simply extends this one, so the new
    % branch is exactly B. Its also possible that B is a branch because
    % its key sorts greater than all siblings of an internal node. This
    % condition is checked in the last clause of this function and the
    % new_leaf result is fixed to be new_branch.
    {B, new_leaf};
merge_extend(A, []) ->
    % Insert branch ends an internal node in our original revtree()
    % so the end result is exactly our original revtree.
    {A, internal_node};
merge_extend([{K, V1, SubA} | NextA], [{K, V2, SubB}]) ->
    % Here we're simply extending the path to the next deeper
    % level in the two branches.
    {Merged, Result} = merge_extend(SubA, SubB),
    {[{K, value_pref(V1, V2), Merged} | NextA], Result};
merge_extend([{K1, _, _}=NodeA | Rest], [{K2, _, _}=NodeB]) when K1 > K2 ->
    % Keys are ordered so we know this is where the insert branch needs
    % to be inserted into the tree. We also know that this creates a new
    % branch so we have a new leaf to report.
    {[NodeB, NodeA | Rest], new_branch};
merge_extend([Tree | RestA], NextB) ->
    % Here we're moving on to the next sibling to try and extend our
    % merge even deeper. The length check is due to the fact that the
    % key in NextB might be larger than the largest key in RestA which
    % means we've created a new branch.
    {Merged, Result0} = merge_extend(RestA, NextB),
    Result = case length(Merged) == length(RestA) of
        true -> Result0;
        false -> new_branch
    end,
    {[Tree | Merged], Result}.

find_missing(_Tree, []) ->
    [];
find_missing([], SeachKeys) ->
    SeachKeys;
find_missing([{Start, {Key, Value, SubTree}} | RestTree], SeachKeys) ->
    PossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos >= Start],
    ImpossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos < Start],
    Missing = find_missing_simple(Start, [{Key, Value, SubTree}], PossibleKeys),
    find_missing(RestTree, ImpossibleKeys ++ Missing).

find_missing_simple(_Pos, _Tree, []) ->
    [];
find_missing_simple(_Pos, [], SeachKeys) ->
    SeachKeys;
find_missing_simple(Pos, [{Key, _, SubTree} | RestTree], SeachKeys) ->
    PossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos >= Pos],
    ImpossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos < Pos],

    SrcKeys2 = PossibleKeys -- [{Pos, Key}],
    SrcKeys3 = find_missing_simple(Pos + 1, SubTree, SrcKeys2),
    ImpossibleKeys ++ find_missing_simple(Pos, RestTree, SrcKeys3).


filter_leafs([], _Keys, FilteredAcc, RemovedKeysAcc) ->
    {FilteredAcc, RemovedKeysAcc};
filter_leafs([{Pos, [{LeafKey, _}|_]} = Path |Rest], Keys, FilteredAcc, RemovedKeysAcc) ->
    FilteredKeys = lists:delete({Pos, LeafKey}, Keys),
    if FilteredKeys == Keys ->
        % this leaf is not a key we are looking to remove
        filter_leafs(Rest, Keys, [Path | FilteredAcc], RemovedKeysAcc);
    true ->
        % this did match a key, remove both the node and the input key
        filter_leafs(Rest, FilteredKeys, FilteredAcc, [{Pos, LeafKey} | RemovedKeysAcc])
    end.

% Removes any branches from the tree whose leaf node(s) are in the Keys
remove_leafs(Trees, Keys) ->
    % flatten each branch in a tree into a tree path
    Paths = get_all_leafs_full(Trees),

    % filter out any that are in the keys list.
    {FilteredPaths, RemovedKeys} = filter_leafs(Paths, Keys, [], []),

    SortedPaths = lists:sort(
        [{Pos + 1 - length(Path), Path} || {Pos, Path} <- FilteredPaths]
    ),

    % convert paths back to trees
    NewTree = lists:foldl(
        fun({StartPos, Path},TreeAcc) ->
            [SingleTree] = lists:foldl(
                fun({K,V},NewTreeAcc) -> [{K,V,NewTreeAcc}] end, [], Path),
            {NewTrees, _} = merge(TreeAcc, {StartPos, SingleTree}),
            NewTrees
        end, [], SortedPaths),
    {NewTree, RemovedKeys}.


% get the leafs in the tree matching the keys. The matching key nodes can be
% leafs or an inner nodes. If an inner node, then the leafs for that node
% are returned.
get_key_leafs(Tree, Keys) ->
    get_key_leafs(Tree, Keys, []).

get_key_leafs(_, [], Acc) ->
    {Acc, []};
get_key_leafs([], Keys, Acc) ->
    {Acc, Keys};
get_key_leafs([{Pos, Tree}|Rest], Keys, Acc) ->
    {Gotten, RemainingKeys} = get_key_leafs_simple(Pos, [Tree], Keys, []),
    get_key_leafs(Rest, RemainingKeys, Gotten ++ Acc).

get_key_leafs_simple(_Pos, _Tree, [], _PathAcc) ->
    {[], []};
get_key_leafs_simple(_Pos, [], Keys, _PathAcc) ->
    {[], Keys};
get_key_leafs_simple(Pos, [{Key, _, SubTree}=Tree | RestTree], Keys, PathAcc) ->
    case lists:delete({Pos, Key}, Keys) of
        Keys ->
            % Same list, key not found
            NewPathAcc = [Key | PathAcc],
            {ChildLeafs, Keys2} = get_key_leafs_simple(Pos + 1, SubTree, Keys, NewPathAcc),
            {SiblingLeafs, Keys3} = get_key_leafs_simple(Pos, RestTree, Keys2, PathAcc),
            {ChildLeafs ++ SiblingLeafs, Keys3};
        Keys2 ->
            % This is a key we were looking for, get all descendant
            % leafs while removing any requested key we find. Notice
            % that this key will be returned by get_key_leafs_simple2
            % if it's a leaf so there's no need to return it here.
            {ChildLeafs, Keys3} = get_key_leafs_simple2(Pos, [Tree], Keys2, PathAcc),
            {SiblingLeafs, Keys4} = get_key_leafs_simple(Pos, RestTree, Keys3, PathAcc),
            {ChildLeafs ++ SiblingLeafs, Keys4}
    end.


get_key_leafs_simple2(_Pos, [], Keys, _PathAcc) ->
    % No more tree to deal with so no more keys to return.
    {[], Keys};
get_key_leafs_simple2(Pos, [{Key, Value, []} | RestTree], Keys, PathAcc) ->
    % This is a leaf as defined by having an empty list of
    % child nodes. The assertion is a bit subtle but the function
    % clause match means its a leaf.
    Keys2 = lists:delete({Pos, Key}, Keys),
    {SiblingLeafs, Keys3} = get_key_leafs_simple2(Pos, RestTree, Keys2, PathAcc),
    {[{Value, {Pos, [Key | PathAcc]}} | SiblingLeafs], Keys3};
get_key_leafs_simple2(Pos, [{Key, _Value, SubTree} | RestTree], Keys, PathAcc) ->
    % This isn't a leaf. Recurse into the subtree and then
    % process any sibling branches.
    Keys2 = lists:delete({Pos, Key}, Keys),
    NewPathAcc = [Key | PathAcc],
    {ChildLeafs, Keys3} = get_key_leafs_simple2(Pos + 1, SubTree, Keys2, NewPathAcc),
    {SiblingLeafs, Keys4} = get_key_leafs_simple2(Pos, RestTree, Keys3, PathAcc),
    {ChildLeafs ++ SiblingLeafs, Keys4}.


get(Tree, KeysToGet) ->
    {KeyPaths, KeysNotFound} = get_full_key_paths(Tree, KeysToGet),
    FixedResults = [ {Value, {Pos, [Key0 || {Key0, _} <- Path]}} || {Pos, [{_Key, Value}|_]=Path} <- KeyPaths],
    {FixedResults, KeysNotFound}.

get_full_key_paths(Tree, Keys) ->
    get_full_key_paths(Tree, Keys, []).

get_full_key_paths(_, [], Acc) ->
    {Acc, []};
get_full_key_paths([], Keys, Acc) ->
    {Acc, Keys};
get_full_key_paths([{Pos, Tree}|Rest], Keys, Acc) ->
    {Gotten, RemainingKeys} = get_full_key_paths(Pos, [Tree], Keys, []),
    get_full_key_paths(Rest, RemainingKeys, Gotten ++ Acc).


get_full_key_paths(_Pos, _Tree, [], _KeyPathAcc) ->
    {[], []};
get_full_key_paths(_Pos, [], KeysToGet, _KeyPathAcc) ->
    {[], KeysToGet};
get_full_key_paths(Pos, [{KeyId, Value, SubTree} | RestTree], KeysToGet, KeyPathAcc) ->
    KeysToGet2 = KeysToGet -- [{Pos, KeyId}],
    CurrentNodeResult =
    case length(KeysToGet2) =:= length(KeysToGet) of
    true -> % not in the key list.
        [];
    false -> % this node is the key list. return it
        [{Pos, [{KeyId, Value} | KeyPathAcc]}]
    end,
    {KeysGotten, KeysRemaining} = get_full_key_paths(Pos + 1, SubTree, KeysToGet2, [{KeyId, Value} | KeyPathAcc]),
    {KeysGotten2, KeysRemaining2} = get_full_key_paths(Pos, RestTree, KeysRemaining, KeyPathAcc),
    {CurrentNodeResult ++ KeysGotten ++ KeysGotten2, KeysRemaining2}.

get_all_leafs_full(Tree) ->
    get_all_leafs_full(Tree, []).

get_all_leafs_full([], Acc) ->
    Acc;
get_all_leafs_full([{Pos, Tree} | Rest], Acc) ->
    get_all_leafs_full(Rest, get_all_leafs_full_simple(Pos, [Tree], []) ++ Acc).

get_all_leafs_full_simple(_Pos, [], _KeyPathAcc) ->
    [];
get_all_leafs_full_simple(Pos, [{KeyId, Value, []} | RestTree], KeyPathAcc) ->
    [{Pos, [{KeyId, Value} | KeyPathAcc]} | get_all_leafs_full_simple(Pos, RestTree, KeyPathAcc)];
get_all_leafs_full_simple(Pos, [{KeyId, Value, SubTree} | RestTree], KeyPathAcc) ->
    get_all_leafs_full_simple(Pos + 1, SubTree, [{KeyId, Value} | KeyPathAcc]) ++ get_all_leafs_full_simple(Pos, RestTree, KeyPathAcc).

get_all_leafs(Trees) ->
    get_all_leafs(Trees, []).

get_all_leafs([], Acc) ->
    Acc;
get_all_leafs([{Pos, Tree}|Rest], Acc) ->
    get_all_leafs(Rest, get_all_leafs_simple(Pos, [Tree], []) ++ Acc).

get_all_leafs_simple(_Pos, [], _KeyPathAcc) ->
    [];
get_all_leafs_simple(Pos, [{KeyId, Value, []} | RestTree], KeyPathAcc) ->
    [{Value, {Pos, [KeyId | KeyPathAcc]}} | get_all_leafs_simple(Pos, RestTree, KeyPathAcc)];
get_all_leafs_simple(Pos, [{KeyId, _Value, SubTree} | RestTree], KeyPathAcc) ->
    get_all_leafs_simple(Pos + 1, SubTree, [KeyId | KeyPathAcc]) ++ get_all_leafs_simple(Pos, RestTree, KeyPathAcc).


count_leafs([]) ->
    0;
count_leafs([{_Pos,Tree}|Rest]) ->
    count_leafs_simple([Tree]) + count_leafs(Rest).

count_leafs_simple([]) ->
    0;
count_leafs_simple([{_Key, _Value, []} | RestTree]) ->
    1 + count_leafs_simple(RestTree);
count_leafs_simple([{_Key, _Value, SubTree} | RestTree]) ->
    count_leafs_simple(SubTree) + count_leafs_simple(RestTree).


fold(_Fun, Acc, []) ->
    Acc;
fold(Fun, Acc0, [{Pos, Tree}|Rest]) ->
    Acc1 = fold_simple(Fun, Acc0, Pos, [Tree]),
    fold(Fun, Acc1, Rest).

fold_simple(_Fun, Acc, _Pos, []) ->
    Acc;
fold_simple(Fun, Acc0, Pos, [{Key, Value, SubTree} | RestTree]) ->
    Type = if SubTree == [] -> leaf; true -> branch end,
    Acc1 = Fun({Pos, Key}, Value, Type, Acc0),
    Acc2 = fold_simple(Fun, Acc1, Pos+1, SubTree),
    fold_simple(Fun, Acc2, Pos, RestTree).


map(_Fun, []) ->
    [];
map(Fun, [{Pos, Tree}|Rest]) ->
    case erlang:fun_info(Fun, arity) of
    {arity, 2} ->
        [NewTree] = map_simple(fun(A,B,_C) -> Fun(A,B) end, Pos, [Tree]),
        [{Pos, NewTree} | map(Fun, Rest)];
    {arity, 3} ->
        [NewTree] = map_simple(Fun, Pos, [Tree]),
        [{Pos, NewTree} | map(Fun, Rest)]
    end.

map_simple(_Fun, _Pos, []) ->
    [];
map_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
    Value2 = Fun({Pos, Key}, Value,
            if SubTree == [] -> leaf; true -> branch end),
    [{Key, Value2, map_simple(Fun, Pos + 1, SubTree)} | map_simple(Fun, Pos, RestTree)].


mapfold(_Fun, Acc, []) ->
    {[], Acc};
mapfold(Fun, Acc, [{Pos, Tree} | Rest]) ->
    {[NewTree], Acc2} = mapfold_simple(Fun, Acc, Pos, [Tree]),
    {Rest2, Acc3} = mapfold(Fun, Acc2, Rest),
    {[{Pos, NewTree} | Rest2], Acc3}.

mapfold_simple(_Fun, Acc, _Pos, []) ->
    {[], Acc};
mapfold_simple(Fun, Acc, Pos, [{Key, Value, SubTree} | RestTree]) ->
    {Value2, Acc2} = Fun({Pos, Key}, Value,
            if SubTree == [] -> leaf; true -> branch end, Acc),
    {SubTree2, Acc3} = mapfold_simple(Fun, Acc2, Pos + 1, SubTree),
    {RestTree2, Acc4} = mapfold_simple(Fun, Acc3, Pos, RestTree),
    {[{Key, Value2, SubTree2} | RestTree2], Acc4}.


map_leafs(_Fun, []) ->
    [];
map_leafs(Fun, [{Pos, Tree}|Rest]) ->
    [NewTree] = map_leafs_simple(Fun, Pos, [Tree]),
    [{Pos, NewTree} | map_leafs(Fun, Rest)].

map_leafs_simple(_Fun, _Pos, []) ->
    [];
map_leafs_simple(Fun, Pos, [{Key, Value, []} | RestTree]) ->
    Value2 = Fun({Pos, Key}, Value),
    [{Key, Value2, []} | map_leafs_simple(Fun, Pos, RestTree)];
map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) ->
    [{Key, Value, map_leafs_simple(Fun, Pos + 1, SubTree)} | map_leafs_simple(Fun, Pos, RestTree)].


stem(Trees, Limit) ->
    try
        {_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) ->
            {NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen),
            {NewSeen, NewBranches ++ TreeAcc}
        end, {sets:new(), []}, Trees),
        lists:sort(Branches)
    catch throw:dupe_keys ->
        repair_tree(Trees, Limit)
    end.


stem_tree({Depth, Child}, Limit, Seen) ->
    case stem_tree(Depth, Child, Limit, Seen) of
        {NewSeen, _, NewChild, NewBranches} ->
            {NewSeen, [{Depth, NewChild} | NewBranches]};
        {NewSeen, _, NewBranches} ->
            {NewSeen, NewBranches}
    end.


stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) ->
    {check_key(Key, Seen), Limit - 1, Leaf, []};

stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) ->
    Seen1 = check_key(Key, Seen0),
    FinalAcc = lists:foldl(fun(Child, Acc) ->
        {SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc,
        case stem_tree(Depth + 1, Child, Limit, SeenAcc) of
            {NewSeenAcc, LimitPos, NewChild, NewBranches} ->
                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
                NewChildAcc = [NewChild | ChildAcc],
                NewBranchAcc = NewBranches ++ BranchAcc,
                {NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc};
            {NewSeenAcc, LimitPos, NewBranches} ->
                NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc),
                NewBranchAcc = NewBranches ++ BranchAcc,
                {NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc}
        end
    end, {Seen1, -1, [], []}, Children),
    {FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc,
    case FinalLimitPos of
        N when N > 0, length(FinalChildren) > 0 ->
            FinalNode = {Key, Val, lists:reverse(FinalChildren)},
            {FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches};
        0 when length(FinalChildren) > 0 ->
            NewBranches = lists:map(fun(Child) ->
                {Depth + 1, Child}
            end, lists:reverse(FinalChildren)),
            {FinalSeen, -1, NewBranches ++ FinalBranches};
        N when N < 0, length(FinalChildren) == 0 ->
            {FinalSeen, FinalLimitPos - 1, FinalBranches}
    end.


check_key(Key, Seen) ->
    case sets:is_element(Key, Seen) of
        true ->
            throw(dupe_keys);
        false ->
            sets:add_element(Key, Seen)
    end.


repair_tree(Trees, Limit) ->
    % flatten each branch in a tree into a tree path, sort by starting rev #
    Paths = lists:sort(lists:map(fun({Pos, Path}) ->
        StemmedPath = lists:sublist(Path, Limit),
        {Pos + 1 - length(StemmedPath), StemmedPath}
    end, get_all_leafs_full(Trees))),

    % convert paths back to trees
    lists:foldl(
        fun({StartPos, Path},TreeAcc) ->
            [SingleTree] = lists:foldl(
                fun({K,V},NewTreeAcc) -> [{K,V,NewTreeAcc}] end, [], Path),
            {NewTrees, _} = merge(TreeAcc, {StartPos, SingleTree}),
            NewTrees
        end, [], Paths).


value_pref(Tuple, _) when is_tuple(Tuple),
        (tuple_size(Tuple) == 3 orelse tuple_size(Tuple) == 4) ->
    Tuple;
value_pref(_, Tuple) when is_tuple(Tuple),
        (tuple_size(Tuple) == 3 orelse tuple_size(Tuple) == 4) ->
    Tuple;
value_pref(?REV_MISSING, Other) ->
    Other;
value_pref(Other, ?REV_MISSING) ->
    Other;
value_pref(Last, _) ->
    Last.