summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul J. Davis <paul.joseph.davis@gmail.com>2017-05-03 12:27:08 -0500
committerPaul J. Davis <paul.joseph.davis@gmail.com>2017-06-08 12:44:45 -0500
commit4296ec0604bad0db3d82eee690512f87a792a76b (patch)
treef4e55469b4904de24f1e0aab7294b6ea8ab7c7ee
parent4d192ecc36e0912d62be2825917430b8ac325c27 (diff)
downloadcouchdb-COUCHDB-3298-optimize-writing-btree-nodes.tar.gz
As it turns out, the original change in COUCHDB-3298 ends up hurting disk usage when a view emits large amounts of data (i.e., more than half of the btree chunk size). The cause for this is that instead of writing single element nodes it would instead prefer to write kv nodes with three elements. While normally we might prefer this in memory, it turns out that our append only storage this causes a significantly more amount of trash on disk. We can show this with a few trivial examples. Imagine we write KV's a through f. The two following patterns show the nodes as we write each new kv. Before 3298: [] [a] [a, b] [a, b]', [c] [a, b]', [c, d] [a, b]', [c, d]', [e] [a, b]', [c, d]', [e, f] After 3298: [] [a] [a, b] [a, b, c] [a, b]', [c, d] [a, b]', [c, d, e] [a, b]', [c, d]', [e, f] The thing to realize here is which of these nodes end up as garbage. In the first example we end up with [a], [a, b], [c], [c, d], and [e] nodes that have been orphaned. Where as in the second case we end up with [a], [a, b], [a, b, c], [c, d], [c, d, e] as nodes that have been orphaned. A quick aside, the reason that [a, b] and [c, d] are orphaned is due to how a btree update works. For instance, when adding c, we read [a, b] into memory, append c, and then during our node write we call chunkify which gives us back [a, b], [c] which leads us to writing [a, b] a second time. The main benefit of this patch is to realize when its possible to reuse a node that already exists on disk. It achieves this by looking at the list of key/values when writing new nodes and comparing it to the old list of key/values for the node read from disk. By checking to see if the old list exists unchanged in the new list we can just reuse the old node. Node reuse is limited to when the old node is larger than 50% of the chunk threshold to maintain the B+Tree properties. The disk usage improvements this gives can also be quite dramatic. In the case above when we have ordered keys with large values (> 50% of the btree chunk size) we find upwards of 50% less disk usage. Random keys also benefit as well though to a lesser extent depending on disk size (as they will often be in the middle of an existing node which prevents our optimization). COUCHDB-3298
-rw-r--r--src/couch/src/couch_btree.erl64
1 files changed, 63 insertions, 1 deletions
diff --git a/src/couch/src/couch_btree.erl b/src/couch/src/couch_btree.erl
index d61daf1c6..ea224b1ab 100644
--- a/src/couch/src/couch_btree.erl
+++ b/src/couch/src/couch_btree.erl
@@ -19,6 +19,8 @@
-include_lib("couch/include/couch_db.hrl").
+-define(FILL_RATIO, 0.5).
+
extract(#btree{extract_kv=undefined}, Value) ->
Value;
extract(#btree{extract_kv=Extract}, Value) ->
@@ -398,7 +400,14 @@ modify_node(Bt, RootPointerInfo, Actions, QueryOutput) ->
{LastKey, _LastValue} = element(tuple_size(NodeTuple), NodeTuple),
{ok, [{LastKey, RootPointerInfo}], QueryOutput2};
_Else2 ->
- {ok, ResultList} = write_node(Bt, NodeType, NewNodeList),
+ {ok, ResultList} = case RootPointerInfo of
+ nil ->
+ write_node(Bt, NodeType, NewNodeList);
+ _ ->
+ {LastKey, _LastValue} = element(tuple_size(NodeTuple), NodeTuple),
+ OldNode = {LastKey, RootPointerInfo},
+ write_node(Bt, OldNode, NodeType, NodeList, NewNodeList)
+ end,
{ok, ResultList, QueryOutput2}
end.
@@ -442,6 +451,59 @@ write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
],
{ok, ResultList}.
+
+write_node(Bt, _OldNode, NodeType, [], NewList) ->
+ write_node(Bt, NodeType, NewList);
+write_node(Bt, _OldNode, NodeType, [_], NewList) ->
+ write_node(Bt, NodeType, NewList);
+write_node(Bt, OldNode, NodeType, OldList, NewList) ->
+ case can_reuse_old_node(OldList, NewList) of
+ {true, Prefix, Suffix} ->
+ {ok, PrefixKVs} = case Prefix of
+ [] -> {ok, []};
+ _ -> write_node(Bt, NodeType, Prefix)
+ end,
+ {ok, SuffixKVs} = case Suffix of
+ [] -> {ok, []};
+ _ -> write_node(Bt, NodeType, Suffix)
+ end,
+ Result = PrefixKVs ++ [OldNode] ++ SuffixKVs,
+ {ok, Result};
+ false ->
+ write_node(Bt, NodeType, NewList)
+ end.
+
+can_reuse_old_node(OldList, NewList) ->
+ {Prefix, RestNewList} = remove_prefix_kvs(hd(OldList), NewList),
+ case old_list_is_prefix(OldList, RestNewList, 0) of
+ {true, Size, Suffix} ->
+ ReuseThreshold = get_chunk_size() * ?FILL_RATIO,
+ if Size < ReuseThreshold -> false; true ->
+ {true, Prefix, Suffix}
+ end;
+ false ->
+ false
+ end.
+
+remove_prefix_kvs(KV1, [KV2 | Rest]) when KV2 < KV1 ->
+ {Prefix, RestNewList} = remove_prefix_kvs(KV1, Rest),
+ {[KV2 | Prefix], RestNewList};
+remove_prefix_kvs(_, RestNewList) ->
+ {[], RestNewList}.
+
+% No more KV's in the old node so its a prefix
+old_list_is_prefix([], Suffix, Size) ->
+ {true, Size, Suffix};
+% Some KV's have been removed from the old node
+old_list_is_prefix(_OldList, [], _Size) ->
+ false;
+% KV is equal in both old and new node so continue
+old_list_is_prefix([KV | Rest1], [KV | Rest2], Acc) ->
+ old_list_is_prefix(Rest1, Rest2, ?term_size(KV) + Acc);
+% KV mismatch between old and new node so not a prefix
+old_list_is_prefix(_OldList, _NewList, _Acc) ->
+ false.
+
modify_kpnode(Bt, {}, _LowerBound, Actions, [], QueryOutput) ->
modify_node(Bt, nil, Actions, QueryOutput);
modify_kpnode(_Bt, NodeTuple, LowerBound, [], ResultNode, QueryOutput) ->