From a369c5285f6b9de91d723a962a907db27bae02a4 Mon Sep 17 00:00:00 2001 From: Nick Vatamaniuc Date: Sat, 4 Apr 2020 16:45:52 -0400 Subject: Compress doc bodies and attachments In CouchDB < 4.x we compressed document bodies by default, so enable it for 4.x as well. Use the basic term_to_binary compression mechanism for: - Document bodies - Local document bodies - Attachments, but only if they have not already been compressed. --- src/fabric/include/fabric2.hrl | 4 +++ src/fabric/src/fabric2_db.erl | 3 +- src/fabric/src/fabric2_fdb.erl | 42 ++++++++++++++++++++----- src/fabric/test/fabric2_doc_att_tests.erl | 51 +++++++++++++++++++++++++++++-- 4 files changed, 89 insertions(+), 11 deletions(-) diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl index 0c0757567..99ac87497 100644 --- a/src/fabric/include/fabric2.hrl +++ b/src/fabric/include/fabric2.hrl @@ -54,6 +54,10 @@ -define(CURR_LDOC_FORMAT, 0). +% 0 - Attachment storage version + +-define(CURR_ATT_STORAGE_VER, 0). + % Misc constants -define(PDICT_DB_KEY, '$fabric_db_handle'). diff --git a/src/fabric/src/fabric2_db.erl b/src/fabric/src/fabric2_db.erl index fb6ae5176..54c6b37de 100644 --- a/src/fabric/src/fabric2_db.erl +++ b/src/fabric/src/fabric2_db.erl @@ -809,7 +809,8 @@ read_attachment(Db, DocId, AttId) -> write_attachment(Db, DocId, Att) -> Data = couch_att:fetch(data, Att), - {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data), + Encoding = couch_att:fetch(encoding, Att), + {ok, AttId} = fabric2_fdb:write_attachment(Db, DocId, Data, Encoding), couch_att:store(data, {loc, Db, DocId, AttId}, Att). 
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl index 2295a5648..5e5ba3e5f 100644 --- a/src/fabric/src/fabric2_fdb.erl +++ b/src/fabric/src/fabric2_fdb.erl @@ -54,7 +54,7 @@ write_local_doc/2, read_attachment/3, - write_attachment/3, + write_attachment/4, get_last_change/1, @@ -902,26 +902,53 @@ read_attachment(#{} = Db, DocId, AttId) -> } = ensure_current(Db), AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix), - case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of + Data = case erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)) of not_found -> throw({not_found, missing}); KVs -> Vs = [V || {_K, V} <- KVs], iolist_to_binary(Vs) + end, + + IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix), + case erlfdb:wait(erlfdb:get(Tx, IdKey)) of + <<>> -> + Data; % Old format, before CURR_ATT_STORAGE_VER = 0 + <<_/binary>> = InfoBin -> + {?CURR_ATT_STORAGE_VER, Compressed} = erlfdb_tuple:unpack(InfoBin), + case Compressed of + true -> binary_to_term(Data, [safe]); + false -> Data + end end. 
-write_attachment(#{} = Db, DocId, Data) when is_binary(Data) -> +write_attachment(#{} = Db, DocId, Data, Encoding) + when is_binary(Data), is_atom(Encoding) -> #{ tx := Tx, db_prefix := DbPrefix } = ensure_current(Db), AttId = fabric2_util:uuid(), - Chunks = chunkify_binary(Data), + + {Data1, Compressed} = case Encoding of + gzip -> + {Data, false}; + _ -> + Opts = [{minor_version, 1}, {compressed, 6}], + CompressedData = term_to_binary(Data, Opts), + case size(CompressedData) < size(Data) of + true -> {CompressedData, true}; + false -> {Data, false} + end + end, IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix), - ok = erlfdb:set(Tx, IdKey, <<>>), + InfoVal = erlfdb_tuple:pack({?CURR_ATT_STORAGE_VER, Compressed}), + ok = erlfdb:set(Tx, IdKey, InfoVal), + + Chunks = chunkify_binary(Data1), lists:foldl(fun(Chunk, ChunkId) -> AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId, ChunkId}, DbPrefix), @@ -1366,7 +1393,8 @@ doc_to_fdb(Db, #doc{} = Doc) -> DiskAtts = lists:map(fun couch_att:to_disk_term/1, Atts), - Value = term_to_binary({Body, DiskAtts, Deleted}, [{minor_version, 1}]), + Opts = [{minor_version, 1}, {compressed, 6}], + Value = term_to_binary({Body, DiskAtts, Deleted}, Opts), Chunks = chunkify_binary(Value), {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) -> @@ -1418,7 +1446,7 @@ local_doc_to_fdb(Db, #doc{} = Doc) -> _ when is_binary(Rev) -> Rev end, - BVal = term_to_binary(Body, [{minor_version, 1}]), + BVal = term_to_binary(Body, [{minor_version, 1}, {compressed, 6}]), {Rows, _} = lists:mapfoldl(fun(Chunk, ChunkId) -> K = erlfdb_tuple:pack({?DB_LOCAL_DOC_BODIES, Id, ChunkId}, DbPrefix), {{K, Chunk}, ChunkId + 1} diff --git a/src/fabric/test/fabric2_doc_att_tests.erl b/src/fabric/test/fabric2_doc_att_tests.erl index ac531e913..fc7bbccbc 100644 --- a/src/fabric/test/fabric2_doc_att_tests.erl +++ b/src/fabric/test/fabric2_doc_att_tests.erl @@ -29,6 +29,7 @@ doc_crud_test_() -> fun cleanup/1, with([ ?TDEF(create_att), + 
?TDEF(create_att_already_compressed), ?TDEF(delete_att), ?TDEF(multiple_atts), ?TDEF(delete_one_att), @@ -84,7 +85,47 @@ create_att({Db, _}) -> IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)), AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)), - ?assertEqual(<<>>, IdVal), + ?assertEqual(erlfdb_tuple:pack({0, true}), IdVal), + Opts = [{minor_version, 1}, {compressed, 6}], + Expect = term_to_binary(<<"foobar">>, Opts), + ?assertMatch([{_, Expect}], AttVals) + end). + +create_att_already_compressed({Db, _}) -> + DocId = fabric2_util:uuid(), + Att1 = couch_att:new([ + {name, <<"foo.txt">>}, + {type, <<"application/octet-stream">>}, + {att_len, 6}, + {data, <<"foobar">>}, + {encoding, gzip}, + {md5, <<>>} + ]), + Doc1 = #doc{ + id = DocId, + atts = [Att1] + }, + {ok, _} = fabric2_db:update_doc(Db, Doc1), + {ok, Doc2} = fabric2_db:open_doc(Db, DocId), + #doc{ + atts = [Att2] + } = Doc2, + {loc, _Db, DocId, AttId} = couch_att:fetch(data, Att2), + AttData = fabric2_db:read_attachment(Db, DocId, AttId), + ?assertEqual(<<"foobar">>, AttData), + + % Check that the raw keys exist + #{ + db_prefix := DbPrefix + } = Db, + IdKey = erlfdb_tuple:pack({?DB_ATT_NAMES, DocId, AttId}, DbPrefix), + AttKey = erlfdb_tuple:pack({?DB_ATTS, DocId, AttId}, DbPrefix), + + fabric2_fdb:transactional(fun(Tx) -> + IdVal = erlfdb:wait(erlfdb:get(Tx, IdKey)), + AttVals = erlfdb:wait(erlfdb:get_range_startswith(Tx, AttKey)), + + ?assertEqual(erlfdb_tuple:pack({0, false}), IdVal), ?assertMatch([{_, <<"foobar">>}], AttVals) end). @@ -175,7 +216,7 @@ large_att({Db, _}) -> AttData = iolist_to_binary([ <<"foobar">> || _ <- lists:seq(1, 60000) ]), - Att1 = mk_att(<<"long.txt">>, AttData), + Att1 = mk_att(<<"long.txt">>, AttData, gzip), {ok, _} = create_doc(Db, DocId, [Att1]), ?assertEqual(#{<<"long.txt">> => AttData}, read_atts(Db, DocId)), @@ -204,12 +245,16 @@ att_on_conflict_isolation({Db, _}) -> mk_att(Name, Data) -> + mk_att(Name, Data, identity). 
+ + +mk_att(Name, Data, Encoding) -> couch_att:new([ {name, Name}, {type, <<"application/octet-stream">>}, {att_len, size(Data)}, {data, Data}, - {encoding, identity}, + {encoding, Encoding}, {md5, <<>>} ]). -- cgit v1.2.1