diff options
author | Jay Doane <jaydoane@apache.org> | 2021-05-27 22:17:15 -0700 |
---|---|---|
committer | Jay Doane <jay.s.doane@gmail.com> | 2021-06-02 12:04:07 -0700 |
commit | 97a136adec57f76b22a0ec5d3a4900b16bb2c286 (patch) | |
tree | 64cb4cd2faa06490bfddb154f39a32f734c0eadc | |
parent | 8903216ba503a0bda4be3ab46be8ce7cfbd92046 (diff) | |
download | couchdb-97a136adec57f76b22a0ec5d3a4900b16bb2c286.tar.gz |
Grace period before deleting aegis cached items
On a heavily loaded system, it is possible for `is_key_fresh/1` to
return true, but subsequently for `lookup/2` to return `{error, not_found}`.
An example occurred during performance testing, where a `badmatch`
error in aegis_server generated this stack trace:
```
May 6 21:34:48 c-fdbcore-perf-api-5bc54ff569-jkjxx db error [error] 2021-05-06T21:34:41.337533Z dbcore@127.0.0.1 <0.18553.273> 6adfd54aa5 req_err(3074846293) {{badmatch,{error,not_found}},
[{aegis_server,handle_call,3,[{file,"src/aegis_server.erl"},{line,170}]},
{gen_server,try_handle_call,4,[{file,"gen_server.erl"},{line,636}]},
{gen_server,handle_msg,6,[{file,"gen_server.erl"},{line,665}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,247}]}]} : {gen_server,call,
[aegis_server,
{encrypt,
#{after_doc_read => undefined,before_doc_update => undefined,
check_current_ts => -576456460633,db_options => [],
db_prefix => <<21,22,21,15,1,23,5,202,10,0>>,
db_version => <<"f518950b94a759bb410f86d32126f421">>,
interactive => true,is_encrypted => true,
layer_prefix => <<21,22>>,
md_version => <<0,0,12,155,121,36,204,80,0,0>>,
name =>
<<"test-user-standard-0002/ccm_capacity_unit_1620334990_test-user-standard-1512">>,
revs_limit => 1000,
security_doc => {[]},
security_fun => undefined,
tx => {erlfdb_transaction,#Ref<0.4001737934.1270480897.28885>},
user_ctx =>
{user_ctx,<<"test-user-standard-0002">>,
[<<"_admin">>,<<"_reader">>,<<"_writer">>],
<<"cookie">>},
uuid => <<"97e681cfd7224680f72213b8bf59a25c">>,
validate_doc_update_funs => []},
[<<"gen_server:call/2 L206">>,<<"aegis_server:encrypt/3 L93">>,<<"fabric2_fdb:-write_doc_body/2-fun-0-/3 L1430">>,<<"lists:foreach/2 L1338">>,<<"fabric2_fdb:write_doc/6 L851">>,<<"fabric2_db:update_doc_interactive/4 L1944">>,<<"fabric2_db:update_docs_interactive/5 L1808">>,<<"fabric2_db:batch_update_interactive_tx/1 L1752">>]
```
The crash occurred in aegis_server [1] (now at a slightly different
line from the trace) because `lookup/2` returned `{error, not_found}`
[2] when unable to find the UUID in the cache.
This commit introduces a configurable grace period, defaulting to 5
seconds, which expired entries remain in the cache, so that such a
race is less likely to occur. It also handles the error case,
preventing a badmatch, and finally DRYs out the code by factoring a
`handle_crypto_call/6` function for both encryption operations.
[1] https://github.com/apache/couchdb/blob/main/src/aegis/src/aegis_server.erl#L173
[2] https://github.com/apache/couchdb/blob/main/src/aegis/src/aegis_server.erl#L339
-rw-r--r-- | src/aegis/src/aegis_server.erl | 72 | ||||
-rw-r--r-- | src/aegis/test/aegis_server_test.erl | 10 |
2 files changed, 41 insertions, 41 deletions
diff --git a/src/aegis/src/aegis_server.erl b/src/aegis/src/aegis_server.erl index 92ba7e80d..508e4531a 100644 --- a/src/aegis/src/aegis_server.erl +++ b/src/aegis/src/aegis_server.erl @@ -49,6 +49,7 @@ -define(CACHE_MAX_AGE_SEC, 1800). -define(CACHE_EXPIRATION_CHECK_SEC, 10). -define(LAST_ACCESSED_INACTIVITY_SEC, 10). +-define(CACHE_DELETION_GRACE_SEC, 5). % Keep in cache after expiration -record(entry, {uuid, encryption_key, counter, last_accessed, expires_at}). @@ -168,43 +169,11 @@ handle_call({insert_key, UUID, DbKey}, _From, #{cache := Cache} = St) -> NewSt = insert(St, UUID, DbKey), {reply, ok, NewSt, ?TIMEOUT}; -handle_call({encrypt, #{uuid := UUID} = Db, Key, Value}, From, St) -> +handle_call({encrypt, Db, Key, Value}, From, St) -> + handle_crypto_call(fun do_encrypt/4, Db, Key, Value, From, St); - {ok, DbKey} = lookup(St, UUID), - - erlang:spawn(fun() -> - process_flag(sensitive, true), - try - do_encrypt(DbKey, Db, Key, Value) - of - Resp -> - gen_server:reply(From, Resp) - catch - _:Error -> - gen_server:reply(From, {error, Error}) - end - end), - - {noreply, St, ?TIMEOUT}; - -handle_call({decrypt, #{uuid := UUID} = Db, Key, Value}, From, St) -> - - {ok, DbKey} = lookup(St, UUID), - - erlang:spawn(fun() -> - process_flag(sensitive, true), - try - do_decrypt(DbKey, Db, Key, Value) - of - Resp -> - gen_server:reply(From, Resp) - catch - _:Error -> - gen_server:reply(From, {error, Error}) - end - end), - - {noreply, St, ?TIMEOUT}; +handle_call({decrypt, Db, Key, Value}, From, St) -> + handle_crypto_call(fun do_decrypt/4, Db, Key, Value, From, St); handle_call(_Msg, _From, St) -> {noreply, St}. @@ -236,6 +205,29 @@ code_change(_OldVsn, St, _Extra) -> %% private functions + +handle_crypto_call(DoCryptoOp, Db, Key, Value, From, St) -> + #{uuid := UUID} = Db, + case lookup(St, UUID) of + {error, not_found} -> + gen_server:reply(From, {error, db_encryption_key_not_cached}); + {ok, DbKey} -> + erlang:spawn(fun() -> + process_flag(sensitive, true), + try + DoCryptoOp(DbKey, Db, Key, Value) + of + Resp -> + gen_server:reply(From, Resp) + catch + _:Error -> + gen_server:reply(From, {error, Error}) + end + end), + {noreply, St, ?TIMEOUT} + end. + + do_open_db(#{uuid := UUID} = Db) -> case ?AEGIS_KEY_MANAGER:open_db(Db) of {ok, DbKey} -> @@ -389,7 +381,8 @@ remove_expired_entries(St) -> by_access := ByAccess } = St, - MatchConditions = [{'=<', '$1', fabric2_util:now(sec)}], + DeleteEntriesUntil = fabric2_util:now(sec) - cache_deletion_grace(), + MatchConditions = [{'<', '$1', DeleteEntriesUntil}], KeyCheckMatchHead = {'_', '$1'}, KeyCheckExpired = [{KeyCheckMatchHead, MatchConditions, [true]}], @@ -415,6 +408,11 @@ cache_limit() -> config:get_integer("aegis", "cache_limit", ?CACHE_LIMIT). +cache_deletion_grace() -> + config:get_integer( + "aegis", "cache_deletion_grace_sec", ?CACHE_DELETION_GRACE_SEC). + + sensitive(Fun) when is_function(Fun, 0) -> OldValue = process_flag(sensitive, true), try diff --git a/src/aegis/test/aegis_server_test.erl b/src/aegis/test/aegis_server_test.erl index 281dce59d..69ad1ed56 100644 --- a/src/aegis/test/aegis_server_test.erl +++ b/src/aegis/test/aegis_server_test.erl @@ -139,9 +139,9 @@ disabled_test_() -> end, fun teardown/1, [ - {"init_db returns false when encryptions disabled", + {"init_db returns false when encryption disabled", {timeout, ?TIMEOUT, fun test_disabled_init_db/0}}, - {"open_db returns false when encryptions disabled", + {"open_db returns false when encryption disabled", {timeout, ?TIMEOUT, fun test_disabled_open_db/0}}, {"pass through on encrypt when encryption disabled", {timeout, ?TIMEOUT, fun test_disabled_encrypt/0}}, @@ -185,6 +185,7 @@ lru_cache_with_expiration_test_() -> ("aegis", "cache_limit", _) -> 5; ("aegis", "cache_max_age_sec", _) -> 130; ("aegis", "cache_expiration_check_sec", _) -> 1; + ("aegis", "cache_deletion_grace_sec", _) -> 1; (_, _, Default) -> Default end), Ctx = setup(), @@ -313,10 +314,11 @@ test_remove_expired() -> meck:reset(aegis_server), meck:wait(aegis_server, handle_info, [maybe_remove_expired, '_'], 2500), - %% 3 "oldest" entries should be removed, 2 yet to expire still in cache + %% 2 "oldest" entries should be removed, 2 yet to expire still in cache, + %% and one remaining in cache due to grace period lists:foreach(fun(I) -> Db = ?DB#{uuid => <<I:64>>}, aegis_server:encrypt(Db, <<I:64>>, ?VALUE) end, lists:seq(1, 5)), - ?assertEqual(8, meck:num_calls(?AEGIS_KEY_MANAGER, open_db, 1)). + ?assertEqual(7, meck:num_calls(?AEGIS_KEY_MANAGER, open_db, 1)). |