summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJay Doane <jaydoane@apache.org>2020-04-26 15:22:04 -0700
committerJay Doane <jaydoane@apache.org>2020-04-26 15:22:04 -0700
commit042e401cc92fd74c3e7e60521ff392d136fa782f (patch)
treedb1132a0a323748633ac4da29e01502779bc5683
parent232e1d51fd95c111dd81a19112de56a59e3f2a74 (diff)
downloadcouchdb-improve-expiring-cache-test-robustness.tar.gz
Improve robustness of couch expiring cache testimprove-expiring-cache-test-robustness
In its current incarnation, the so-called "simple lifecycle" test is prone to numerous failures in the CI system [1], doubtless because it's riddled with race conditions. The original author makes many assumptions about how quickly an (actual, unmocked) FDB instance will respond to a request. The primary goal is to stop failing CI builds, while other considerations include: keeping the run time of the test as low as possible, keeping the code coverage high, and documenting the known races. Specifically: - Increase the `stale` and `expired` times by a factor of 5 to decrease sensitivity to poor FDB performance. - Change default timer from `erlang:system_time/1` to `os:timestamp` on the assumption that the latter is less prone to warping [2]. - Decrease the period of the cache server reaper by half to increase accuracy of eviction time. - Inline and modify the `test_util:wait` code to make the timer explicit, and emphasize that `timer:delay/1` only works with millisecond resolution. - Don't fail the test if it can't get a fresh lookup immediately after insertion, but let it continue on to the next race, at least to the point of expiration and deletion, which continue to be asserted. - Factor `Timeout` and `Interval` to allow declarations near the other hard-coded parameters. - Move cache server `Opts` into `setup/0` and eliminate `start_link/0`. - Double the overall test timeout to 20 seconds. This has soaked for hundreds of runs on a 5 year old laptop, but the real test is the CI system. Should this test continue to fail CI builds, additional improvements could include mocking the timer and/or FDB layer to eliminate the variability of an integrated system. [1] https://ci-couchdb.apache.org/blue/organizations/jenkins/jenkins-cm1%2FPullRequests/detail/PR-2813/10/pipeline [2] http://erlang.org/doc/apps/erts/time_correction.html#terminology
-rw-r--r--src/couch_expiring_cache/src/couch_expiring_cache_server.erl10
-rw-r--r--src/couch_expiring_cache/test/couch_expiring_cache_tests.erl101
2 files changed, 77 insertions, 34 deletions
diff --git a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
index eb74e6eb2..74c432e25 100644
--- a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
+++ b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
@@ -17,6 +17,7 @@
-callback start_link() -> {ok, pid()} | ignore | {error, term()}.
-export([
+ now_ts/0,
start_link/2
]).
@@ -80,10 +81,10 @@ handle_info(remove_expired, St) ->
largest_elapsed := LargestElapsed
} = St,
- NowTS = erlang:system_time(?TIME_UNIT),
+ NowTS = now_ts(),
OldestTS = max(OldestTS0,
couch_expiring_cache_fdb:clear_range_to(Name, NowTS, BatchSize)),
- Elapsed = erlang:system_time(?TIME_UNIT) - NowTS,
+ Elapsed = now_ts() - NowTS,
{noreply, St#{
timer_ref := schedule_remove_expired(Period, MaxJitter),
@@ -108,6 +109,11 @@ code_change(_OldVsn, St, _Extra) ->
{ok, St}.
+now_ts() ->
+ {Mega, Sec, Micro} = os:timestamp(),
+ ((Mega * 1000000) + Sec) * 1000 + Micro div 1000.
+
+
%% Private
diff --git a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
index 2e06fcc5a..0780b8847 100644
--- a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
+++ b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
@@ -18,16 +18,9 @@
-include_lib("couch_expiring_cache/include/couch_expiring_cache.hrl").
--define(CACHE_NAME, <<"test">>).
+-define(CACHE_NAME, atom_to_binary(?MODULE, utf8)).
-
-start_link() ->
- Opts = #{
- cache_name => ?CACHE_NAME,
- period => 20,
- max_jitter => 0
- },
- couch_expiring_cache_server:start_link(?MODULE, Opts).
+-define(FOREVER, 576460752303423488). % max int 64 bit
couch_expiring_cache_basic_test_() ->
@@ -56,7 +49,11 @@ teardown_couch(Ctx) ->
setup() ->
- {ok, Pid} = start_link(),
+ Opts = #{
+ cache_name => ?CACHE_NAME,
+ period => 10,
+ max_jitter => 0},
+ {ok, Pid} = couch_expiring_cache_server:start_link(?MODULE, Opts),
true = unlink(Pid),
#{pid => Pid}.
@@ -66,10 +63,18 @@ teardown(#{pid := Pid}) ->
simple_lifecycle(_) ->
- {timeout, 10, ?_test(begin
- Now = erlang:system_time(?TIME_UNIT),
- StaleTS = Now + 100,
- ExpiresTS = Now + 200,
+ % The entire test is racing against FDB being faster than timeout seconds
+ {timeout, 20, ?_test(begin
+ Start = couch_expiring_cache_server:now_ts(),
+ % Race Alert!
+ % We're betting on FDB returning a lookup faster than these:
+ Stale = 500,
+ Expires = 1000,
+ Timeout = 5000,
+ Interval = 5,
+
+ StaleTS = Start + Stale,
+ ExpiresTS = Start + Expires,
Name = ?CACHE_NAME,
Key = <<"key">>,
Val = <<"val">>,
@@ -77,34 +82,66 @@ simple_lifecycle(_) ->
?assertEqual(ok, couch_expiring_cache_fdb:clear_all(Name)),
?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key)),
?assertEqual([], entries(Name)),
- ?assertEqual(ok,
- couch_expiring_cache:insert(Name, Key, Val, StaleTS, ExpiresTS)),
- ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)),
- ok = wait_lookup(Name, Key, {stale, Val}),
+ ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val,
+ StaleTS, ExpiresTS)),
+ ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval),
% Refresh the existing key with updated timestamps
- ?assertEqual(ok,
- couch_expiring_cache:insert(Name, Key, Val,
- StaleTS + 100, ExpiresTS + 100)),
- ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)),
+ Refresh = couch_expiring_cache_server:now_ts(),
+ ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val,
+ Refresh + Stale, Refresh + Expires)),
+ ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval),
?assertEqual(1, length(entries(Name))),
- ok = wait_lookup(Name, Key, {stale, Val}),
- ok = wait_lookup(Name, Key, expired),
- ok = wait_lookup(Name, Key, not_found),
- ?assertEqual([], entries(Name)),
- ?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key))
+ % These last 2 are also races, betting on FDB to be reasonably
+ % fast on the home stretch
+ ok = wait_lookup(Name, Key, expired, Timeout, Interval),
+ ok = wait_lookup(Name, Key, not_found, Timeout, Interval),
+ ?assertEqual([], entries(Name))
end)}.
+% In this race we're betting on FDB to take less than `Stale` and then
+% `Expired` milliseconds to respond
+attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval) ->
+ case couch_expiring_cache:lookup(Name, Key) of
+ {fresh, Val} ->
+ % We won that race, let's bet on another!
+ ok = wait_lookup(Name, Key, {stale, Val}, Timeout, Interval);
+ _ ->
+ % Unlucky! But don't fail the test just yet...
+ ok
+ end.
+
+
entries(Name) ->
- FarFuture = erlang:system_time(?TIME_UNIT) * 2,
- couch_expiring_cache_fdb:get_range_to(Name, FarFuture, _Limit=100).
+ couch_expiring_cache_fdb:get_range_to(Name, ?FOREVER, _Limit=100).
-wait_lookup(Name, Key, Expect) ->
- test_util:wait(fun() ->
+% This lookup races against Timeout
+wait_lookup(Name, Key, Expect, Timeout, Interval) ->
+ wait(fun() ->
case couch_expiring_cache:lookup(Name, Key) of
Expect -> ok;
_ -> wait
end
- end, _Timeout = 1000, _PollingInterval = 10).
+ end, Timeout, Interval).
+
+
+wait(Fun, Timeout, Delay) ->
+ Now = couch_expiring_cache_server:now_ts(),
+ wait(Fun, Timeout, Delay, Now, Now).
+
+
+wait(_Fun, Timeout, _Delay, Started, Prev) when Prev - Started > Timeout ->
+ timeout;
+
+wait(Fun, Timeout, Delay, Started, _Prev) ->
+ case Fun() of
+ wait ->
+ % http://erlang.org/doc/man/timer.html#sleep-1
+ ok = timer:sleep(Delay), % always millisecond
+ wait(Fun, Timeout, Delay, Started,
+ couch_expiring_cache_server:now_ts());
+ Else ->
+ Else
+ end.