From 52b69f12e597a69843457888f6660a3bef8934f7 Mon Sep 17 00:00:00 2001 From: Nick Vatamaniuc Date: Sat, 6 May 2023 02:20:43 -0400 Subject: Use xxHash for couch_file checksums Check xxhash first, since it's faster [1], and if that fails, check the slower md5 version. Bump a stats counter to indicate if there are still any md5 checksums found during normal cluster operation. [1] Comparison of hashing a 4KB block (units are microseconds). ``` (node1@127.0.0.1)20> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> do_nothing_overhead end, lists:seq(1, 1000000)) end), (T/1000000.0). 0.167425 (node1@127.0.0.1)21> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> exxhash:xxhash128(B) end, lists:seq(1, 1000000)) end), (T/1000000). 0.770687 (node1@127.0.0.1)22> f(T), {T, ok} = timer:tc(fun() -> lists:foreach(fun (_) -> crypto:hash(md5, B) end, lists:seq(1, 1000000)) end), (T/1000000). 6.205445 ``` --- src/couch/priv/stats_descriptions.cfg | 4 ++ src/couch/src/couch_file.erl | 72 +++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/src/couch/priv/stats_descriptions.cfg b/src/couch/priv/stats_descriptions.cfg index 7c8fd94cb..2dae98954 100644 --- a/src/couch/priv/stats_descriptions.cfg +++ b/src/couch/priv/stats_descriptions.cfg @@ -298,6 +298,10 @@ {type, counter}, {desc, <<"number of the attempts to read beyond set limit">>} ]}. +{[couch_file, old_digests], [ + {type, counter}, + {desc, <<"number of old digests found in couch_file instances">>} +]}. {[mango, unindexed_queries], [ {type, counter}, {desc, <<"number of mango queries that could not use an index">>} diff --git a/src/couch/src/couch_file.erl b/src/couch/src/couch_file.erl index 514d4e3d9..afa848efa 100644 --- a/src/couch/src/couch_file.erl +++ b/src/couch/src/couch_file.erl @@ -142,8 +142,8 @@ assemble_file_chunk(Bin) -> [<<0:1/integer, (iolist_size(Bin)):31/integer>>, Bin]. assemble_file_chunk_and_checksum(Bin) -> - Md5 = couch_hash:md5_hash(Bin), - [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Md5, Bin]. + Digest = exxhash:xxhash128(Bin), + [<<1:1/integer, (iolist_size(Bin)):31/integer>>, Digest, Bin]. %%---------------------------------------------------------------------- %% Purpose: Reads a term from a file that was written with append_term @@ -169,8 +169,8 @@ pread_binary(Fd, Pos) -> pread_iolist(Fd, Pos) -> case ioq:call(Fd, {pread_iolist, Pos}, erlang:get(io_priority)) of - {ok, IoList, Md5} -> - {ok, verify_md5(Fd, Pos, IoList, Md5)}; + {ok, IoList, Digest} -> + {ok, verify_digest(Fd, Pos, IoList, Digest)}; Error -> Error end. @@ -191,13 +191,13 @@ pread_binaries(Fd, PosList) -> pread_iolists(Fd, PosList) -> case ioq:call(Fd, {pread_iolists, PosList}, erlang:get(io_priority)) of - {ok, DataMd5s} -> + {ok, DataAndDigests} -> Data = lists:zipwith( - fun(Pos, {IoList, Md5}) -> - verify_md5(Fd, Pos, IoList, Md5) + fun(Pos, {IoList, Digest}) -> + verify_digest(Fd, Pos, IoList, Digest) end, PosList, - DataMd5s + DataAndDigests ), {ok, Data}; Error -> @@ -400,9 +400,9 @@ read_header(Fd) -> write_header(Fd, Data) -> Bin = term_to_binary(Data), - Md5 = couch_hash:md5_hash(Bin), + Digest = exxhash:xxhash128(Bin), % now we assemble the final header binary and write to disk - FinalBin = <>, + FinalBin = <>, ioq:call(Fd, {write_header, FinalBin}, erlang:get(io_priority)). init_status_error(ReturnPid, Ref, Error) -> @@ -504,11 +504,11 @@ handle_call({pread_iolist, Pos}, _From, File) -> update_read_timestamp(), {LenIolist, NextPos} = read_raw_iolist_int(File, Pos, 4), case iolist_to_binary(LenIolist) of - % an MD5-prefixed term + % an digest-prefixed term <<1:1/integer, Len:31/integer>> -> - {Md5AndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16), - {Md5, IoList} = extract_md5(Md5AndIoList), - {reply, {ok, IoList, Md5}, File}; + {DigestAndIoList, _} = read_raw_iolist_int(File, NextPos, Len + 16), + {Digest, IoList} = extract_digest(DigestAndIoList), + {reply, {ok, IoList, Digest}, File}; <<0:1/integer, Len:31/integer>> -> {Iolist, _} = read_raw_iolist_int(File, NextPos, Len), {reply, {ok, Iolist, <<>>}, File} @@ -520,7 +520,7 @@ handle_call({pread_iolists, PosL}, _From, File) -> LocNums2 = lists:map( fun({LenIoList, NextPos}) -> case iolist_to_binary(LenIoList) of - % an MD5-prefixed term + % a digest-prefixed term <<1:1/integer, Len:31/integer>> -> {NextPos, Len + 16}; <<0:1/integer, Len:31/integer>> -> @@ -534,8 +534,8 @@ handle_call({pread_iolists, PosL}, _From, File) -> fun({LenIoList, _}, {IoList, _}) -> case iolist_to_binary(LenIoList) of <<1:1/integer, _:31/integer>> -> - {Md5, IoList} = extract_md5(IoList), - {IoList, Md5}; + {Digest, IoList} = extract_digest(IoList), + {IoList, Digest}; <<0:1/integer, _:31/integer>> -> {IoList, <<>>} end @@ -674,9 +674,15 @@ load_header(Fd, Pos, HeaderLen, RestBlock) -> {ok, Missing} = file:pread(Fd, ReadStart, ReadLen), <> end, - <> = + <> = iolist_to_binary(remove_block_prefixes(?PREFIX_SIZE, RawBin)), - Md5Sig = couch_hash:md5_hash(HeaderBin), + case exxhash:xxhash128(HeaderBin) of + Digest -> + ok; + <<_/binary>> -> + couch_stats:increment_counter([couch_file, old_digests]), + Digest = couch_hash:md5_hash(HeaderBin) + end, {ok, HeaderBin}. %% Read multiple block locations using a single file:pread/2. @@ -779,10 +785,10 @@ get_pread_locnum(File, Pos, Len) -> {Pos, TotalBytes} end. --spec extract_md5(iolist()) -> {binary(), iolist()}. -extract_md5(FullIoList) -> - {Md5List, IoList} = split_iolist(FullIoList, 16, []), - {iolist_to_binary(Md5List), IoList}. +-spec extract_digest(iolist()) -> {binary(), iolist()}. +extract_digest(FullIoList) -> + {DigestList, IoList} = split_iolist(FullIoList, 16, []), + {iolist_to_binary(DigestList), IoList}. calculate_total_read_len(0, FinalLen) -> calculate_total_read_len(1, FinalLen) + 1; @@ -852,15 +858,23 @@ monitored_by_pids() -> {monitored_by, PidsAndRefs} = process_info(self(), monitored_by), lists:filter(fun is_pid/1, PidsAndRefs). -verify_md5(_Fd, _Pos, IoList, <<>>) -> +verify_digest(_Fd, _Pos, IoList, <<>>) -> IoList; -verify_md5(Fd, Pos, IoList, Md5) -> - case couch_hash:md5_hash(IoList) of - Md5 -> IoList; - _ -> report_md5_error(Fd, Pos) +verify_digest(Fd, Pos, IoList, Digest) -> + case exxhash:xxhash128(iolist_to_binary(IoList)) of + Digest -> + IoList; + <<_/binary>> -> + case couch_hash:md5_hash(IoList) of + Digest -> + couch_stats:increment_counter([couch_file, old_digests]), + IoList; + _ -> + report_digest_error(Fd, Pos) + end end. -report_md5_error(Fd, Pos) -> +report_digest_error(Fd, Pos) -> couch_log:emergency("File corruption in ~p at position ~B", [Fd, Pos]), exit({file_corruption, <<"file corruption">>}). -- cgit v1.2.1