summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJay Doane <jaydoane@apache.org>2021-05-27 22:17:15 -0700
committerJay Doane <jay.s.doane@gmail.com>2021-06-02 12:04:07 -0700
commit97a136adec57f76b22a0ec5d3a4900b16bb2c286 (patch)
tree64cb4cd2faa06490bfddb154f39a32f734c0eadc
parent8903216ba503a0bda4be3ab46be8ce7cfbd92046 (diff)
downloadcouchdb-97a136adec57f76b22a0ec5d3a4900b16bb2c286.tar.gz
Grace period before deleting aegis cached items
On a heavily loaded system, it is possible for `is_key_fresh/1` to return true, but subsequently for `lookup/2` to return `{error, not_found}`. An example occurred during performance testing, where a `badmatch` error in aegis_server generated this stack trace: ``` May 6 21:34:48 c-fdbcore-perf-api-5bc54ff569-jkjxx db error [error] 2021-05-06T21:34:41.337533Z dbcore@127.0.0.1 <0.18553.273> 6adfd54aa5 req_err(3074846293) {{badmatch,{error,not_found}}, [{aegis_server,handle_call,3,[{file,"src/aegis_server.erl"},{line,170}]}, {gen_server,try_handle_call,4,[{file,"gen_server.erl"},{line,636}]}, {gen_server,handle_msg,6,[{file,"gen_server.erl"},{line,665}]}, {proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,247}]}]} : {gen_server,call, [aegis_server, {encrypt, #{after_doc_read => undefined,before_doc_update => undefined, check_current_ts => -576456460633,db_options => [], db_prefix => <<21,22,21,15,1,23,5,202,10,0>>, db_version => <<"f518950b94a759bb410f86d32126f421">>, interactive => true,is_encrypted => true, layer_prefix => <<21,22>>, md_version => <<0,0,12,155,121,36,204,80,0,0>>, name => <<"test-user-standard-0002/ccm_capacity_unit_1620334990_test-user-standard-1512">>, revs_limit => 1000, security_doc => {[]}, security_fun => undefined, tx => {erlfdb_transaction,#Ref<0.4001737934.1270480897.28885>}, user_ctx => {user_ctx,<<"test-user-standard-0002">>, [<<"_admin">>,<<"_reader">>,<<"_writer">>], <<"cookie">>}, uuid => <<"97e681cfd7224680f72213b8bf59a25c">>, validate_doc_update_funs => []}, [<<"gen_server:call/2 L206">>,<<"aegis_server:encrypt/3 L93">>,<<"fabric2_fdb:-write_doc_body/2-fun-0-/3 L1430">>,<<"lists:foreach/2 L1338">>,<<"fabric2_fdb:write_doc/6 L851">>,<<"fabric2_db:update_doc_interactive/4 L1944">>,<<"fabric2_db:update_docs_interactive/5 L1808">>,<<"fabric2_db:batch_update_interactive_tx/1 L1752">>] ``` The crash occurred in aegis_server [1] (now at a slightly different line from the trace) because `lookup/2` returned `{error, not_found}` [2] when unable to find the UUID in the cache. This commit introduces a configurable grace period, defaulting to 5 seconds, which expired entries remain in the cache, so that such a race is less likely to occur. It also handles the error case, preventing a badmatch, and finally DRYs out the code by factoring a `handle_crypto_call/6` function for both encryption operations. [1] https://github.com/apache/couchdb/blob/main/src/aegis/src/aegis_server.erl#L173 [2] https://github.com/apache/couchdb/blob/main/src/aegis/src/aegis_server.erl#L339
-rw-r--r--src/aegis/src/aegis_server.erl72
-rw-r--r--src/aegis/test/aegis_server_test.erl10
2 files changed, 41 insertions, 41 deletions
diff --git a/src/aegis/src/aegis_server.erl b/src/aegis/src/aegis_server.erl
index 92ba7e80d..508e4531a 100644
--- a/src/aegis/src/aegis_server.erl
+++ b/src/aegis/src/aegis_server.erl
@@ -49,6 +49,7 @@
-define(CACHE_MAX_AGE_SEC, 1800).
-define(CACHE_EXPIRATION_CHECK_SEC, 10).
-define(LAST_ACCESSED_INACTIVITY_SEC, 10).
+-define(CACHE_DELETION_GRACE_SEC, 5). % Keep in cache after expiration
-record(entry, {uuid, encryption_key, counter, last_accessed, expires_at}).
@@ -168,43 +169,11 @@ handle_call({insert_key, UUID, DbKey}, _From, #{cache := Cache} = St) ->
NewSt = insert(St, UUID, DbKey),
{reply, ok, NewSt, ?TIMEOUT};
-handle_call({encrypt, #{uuid := UUID} = Db, Key, Value}, From, St) ->
+handle_call({encrypt, Db, Key, Value}, From, St) ->
+ handle_crypto_call(fun do_encrypt/4, Db, Key, Value, From, St);
- {ok, DbKey} = lookup(St, UUID),
-
- erlang:spawn(fun() ->
- process_flag(sensitive, true),
- try
- do_encrypt(DbKey, Db, Key, Value)
- of
- Resp ->
- gen_server:reply(From, Resp)
- catch
- _:Error ->
- gen_server:reply(From, {error, Error})
- end
- end),
-
- {noreply, St, ?TIMEOUT};
-
-handle_call({decrypt, #{uuid := UUID} = Db, Key, Value}, From, St) ->
-
- {ok, DbKey} = lookup(St, UUID),
-
- erlang:spawn(fun() ->
- process_flag(sensitive, true),
- try
- do_decrypt(DbKey, Db, Key, Value)
- of
- Resp ->
- gen_server:reply(From, Resp)
- catch
- _:Error ->
- gen_server:reply(From, {error, Error})
- end
- end),
-
- {noreply, St, ?TIMEOUT};
+handle_call({decrypt, Db, Key, Value}, From, St) ->
+ handle_crypto_call(fun do_decrypt/4, Db, Key, Value, From, St);
handle_call(_Msg, _From, St) ->
{noreply, St}.
@@ -236,6 +205,29 @@ code_change(_OldVsn, St, _Extra) ->
%% private functions
+
+handle_crypto_call(DoCryptoOp, Db, Key, Value, From, St) ->
+ #{uuid := UUID} = Db,
+ case lookup(St, UUID) of
+ {error, not_found} ->
+ gen_server:reply(From, {error, db_encryption_key_not_cached});
+ {ok, DbKey} ->
+ erlang:spawn(fun() ->
+ process_flag(sensitive, true),
+ try
+ DoCryptoOp(DbKey, Db, Key, Value)
+ of
+ Resp ->
+ gen_server:reply(From, Resp)
+ catch
+ _:Error ->
+ gen_server:reply(From, {error, Error})
+ end
+ end),
+ {noreply, St, ?TIMEOUT}
+ end.
+
+
do_open_db(#{uuid := UUID} = Db) ->
case ?AEGIS_KEY_MANAGER:open_db(Db) of
{ok, DbKey} ->
@@ -389,7 +381,8 @@ remove_expired_entries(St) ->
by_access := ByAccess
} = St,
- MatchConditions = [{'=<', '$1', fabric2_util:now(sec)}],
+ DeleteEntriesUntil = fabric2_util:now(sec) - cache_deletion_grace(),
+ MatchConditions = [{'<', '$1', DeleteEntriesUntil}],
KeyCheckMatchHead = {'_', '$1'},
KeyCheckExpired = [{KeyCheckMatchHead, MatchConditions, [true]}],
@@ -415,6 +408,11 @@ cache_limit() ->
config:get_integer("aegis", "cache_limit", ?CACHE_LIMIT).
+cache_deletion_grace() ->
+ config:get_integer(
+ "aegis", "cache_deletion_grace_sec", ?CACHE_DELETION_GRACE_SEC).
+
+
sensitive(Fun) when is_function(Fun, 0) ->
OldValue = process_flag(sensitive, true),
try
diff --git a/src/aegis/test/aegis_server_test.erl b/src/aegis/test/aegis_server_test.erl
index 281dce59d..69ad1ed56 100644
--- a/src/aegis/test/aegis_server_test.erl
+++ b/src/aegis/test/aegis_server_test.erl
@@ -139,9 +139,9 @@ disabled_test_() ->
end,
fun teardown/1,
[
- {"init_db returns false when encryptions disabled",
+ {"init_db returns false when encryption disabled",
{timeout, ?TIMEOUT, fun test_disabled_init_db/0}},
- {"open_db returns false when encryptions disabled",
+ {"open_db returns false when encryption disabled",
{timeout, ?TIMEOUT, fun test_disabled_open_db/0}},
{"pass through on encrypt when encryption disabled",
{timeout, ?TIMEOUT, fun test_disabled_encrypt/0}},
@@ -185,6 +185,7 @@ lru_cache_with_expiration_test_() ->
("aegis", "cache_limit", _) -> 5;
("aegis", "cache_max_age_sec", _) -> 130;
("aegis", "cache_expiration_check_sec", _) -> 1;
+ ("aegis", "cache_deletion_grace_sec", _) -> 1;
(_, _, Default) -> Default
end),
Ctx = setup(),
@@ -313,10 +314,11 @@ test_remove_expired() ->
meck:reset(aegis_server),
meck:wait(aegis_server, handle_info, [maybe_remove_expired, '_'], 2500),
- %% 3 "oldest" entries should be removed, 2 yet to expire still in cache
+ %% 2 "oldest" entries should be removed, 2 yet to expire still in cache,
+ %% and one remaining in cache due to grace period
lists:foreach(fun(I) ->
Db = ?DB#{uuid => <<I:64>>},
aegis_server:encrypt(Db, <<I:64>>, ?VALUE)
end, lists:seq(1, 5)),
- ?assertEqual(8, meck:num_calls(?AEGIS_KEY_MANAGER, open_db, 1)).
+ ?assertEqual(7, meck:num_calls(?AEGIS_KEY_MANAGER, open_db, 1)).