From 97227c4f3a8cdfdcf15fd4ff4a6983873595544b Mon Sep 17 00:00:00 2001 From: Jay Doane Date: Sun, 26 Apr 2020 15:22:04 -0700 Subject: Improve robustness of couch expiring cache test In its current incarnation, the so-called "simple lifecycle" test is prone to numerous failures in the CI system [1], doubtless because it's riddled with race conditions. The original author makes many assumptions about how quickly an (actual, unmocked) FDB instance will respond to a request. The primary goal is to stop failing CI builds, while other considerations include: keeping the run time of the test as low as possible, keeping the code coverage high, and documenting the known races. Specifically: - Increase the `stale` and `expired` times by a factor of 5 to decrease sensitivity to poor FDB performance. - Change default timer from `erlang:system_time/1` to `os:timestamp` on the assumption that the latter is less prone to warping [2]. - Decrease the period of the cache server reaper by half to increase accuracy of eviction time. - Inline and modify the `test_util:wait` code to make the timer explicit, and emphasize that `timer:delay/1` only works with millisecond resolution. - Don't fail the test if it can't get a fresh lookup immediately after insertion, but let it continue on to the next race, at least to the point of expiration and deletion, which continue to be asserted. - Factor `Timeout` and `Interval` to allow declarations near the other hard-coded parameters. - Move cache server `Opts` into `setup/0` and eliminate `start_link/0`. - Double the overall test timeout to 20 seconds. This has soaked for hundreds of runs on a 5 year old laptop, but the real test is the CI system. Should this test continue to fail CI builds, additional improvements could include mocking the timer and/or FDB layer to eliminate the variability of an integrated system. [1] https://ci-couchdb.apache.org/blue/organizations/jenkins/jenkins-cm1%2FPullRequests/detail/PR-2813/10/pipeline [2] http://erlang.org/doc/apps/erts/time_correction.html#terminology --- .../src/couch_expiring_cache_server.erl | 10 +- .../test/couch_expiring_cache_tests.erl | 101 ++++++++++++++------- 2 files changed, 77 insertions(+), 34 deletions(-) diff --git a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl index eb74e6eb2..74c432e25 100644 --- a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl +++ b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl @@ -17,6 +17,7 @@ -callback start_link() -> {ok, pid()} | ignore | {error, term()}. -export([ + now_ts/0, start_link/2 ]). @@ -80,10 +81,10 @@ handle_info(remove_expired, St) -> largest_elapsed := LargestElapsed } = St, - NowTS = erlang:system_time(?TIME_UNIT), + NowTS = now_ts(), OldestTS = max(OldestTS0, couch_expiring_cache_fdb:clear_range_to(Name, NowTS, BatchSize)), - Elapsed = erlang:system_time(?TIME_UNIT) - NowTS, + Elapsed = now_ts() - NowTS, {noreply, St#{ timer_ref := schedule_remove_expired(Period, MaxJitter), @@ -108,6 +109,11 @@ code_change(_OldVsn, St, _Extra) -> {ok, St}. +now_ts() -> + {Mega, Sec, Micro} = os:timestamp(), + ((Mega * 1000000) + Sec) * 1000 + Micro div 1000. + + %% Private diff --git a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl index 2e06fcc5a..0780b8847 100644 --- a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl +++ b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl @@ -18,16 +18,9 @@ -include_lib("couch_expiring_cache/include/couch_expiring_cache.hrl"). --define(CACHE_NAME, <<"test">>). +-define(CACHE_NAME, atom_to_binary(?MODULE, utf8)). - -start_link() -> - Opts = #{ - cache_name => ?CACHE_NAME, - period => 20, - max_jitter => 0 - }, - couch_expiring_cache_server:start_link(?MODULE, Opts). +-define(FOREVER, 576460752303423488). % max int 64 bit couch_expiring_cache_basic_test_() -> @@ -56,7 +49,11 @@ teardown_couch(Ctx) -> setup() -> - {ok, Pid} = start_link(), + Opts = #{ + cache_name => ?CACHE_NAME, + period => 10, + max_jitter => 0}, + {ok, Pid} = couch_expiring_cache_server:start_link(?MODULE, Opts), true = unlink(Pid), #{pid => Pid}. @@ -66,10 +63,18 @@ teardown(#{pid := Pid}) -> simple_lifecycle(_) -> - {timeout, 10, ?_test(begin - Now = erlang:system_time(?TIME_UNIT), - StaleTS = Now + 100, - ExpiresTS = Now + 200, + % The entire test is racing against FDB being faster than timeout seconds + {timeout, 20, ?_test(begin + Start = couch_expiring_cache_server:now_ts(), + % Race Alert! + % We're betting on FDB returning a lookup faster than these: + Stale = 500, + Expires = 1000, + Timeout = 5000, + Interval = 5, + + StaleTS = Start + Stale, + ExpiresTS = Start + Expires, Name = ?CACHE_NAME, Key = <<"key">>, Val = <<"val">>, @@ -77,34 +82,66 @@ simple_lifecycle(_) -> ?assertEqual(ok, couch_expiring_cache_fdb:clear_all(Name)), ?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key)), ?assertEqual([], entries(Name)), - ?assertEqual(ok, - couch_expiring_cache:insert(Name, Key, Val, StaleTS, ExpiresTS)), - ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)), - ok = wait_lookup(Name, Key, {stale, Val}), + ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val, + StaleTS, ExpiresTS)), + ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval), % Refresh the existing key with updated timestamps - ?assertEqual(ok, - couch_expiring_cache:insert(Name, Key, Val, - StaleTS + 100, ExpiresTS + 100)), - ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)), + Refresh = couch_expiring_cache_server:now_ts(), + ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val, + Refresh + Stale, Refresh + Expires)), + ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval), ?assertEqual(1, length(entries(Name))), - ok = wait_lookup(Name, Key, {stale, Val}), - ok = wait_lookup(Name, Key, expired), - ok = wait_lookup(Name, Key, not_found), - ?assertEqual([], entries(Name)), - ?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key)) + % These last 2 are also races, betting on FDB to be reasonably + % fast on the home stretch + ok = wait_lookup(Name, Key, expired, Timeout, Interval), + ok = wait_lookup(Name, Key, not_found, Timeout, Interval), + ?assertEqual([], entries(Name)) end)}. +% In this race we're betting on FDB to take less than `Stale` and then +% `Expired` milliseconds to respond +attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval) -> + case couch_expiring_cache:lookup(Name, Key) of + {fresh, Val} -> + % We won that race, let's bet on another! + ok = wait_lookup(Name, Key, {stale, Val}, Timeout, Interval); + _ -> + % Unlucky! But don't fail the test just yet... + ok + end. + + entries(Name) -> - FarFuture = erlang:system_time(?TIME_UNIT) * 2, - couch_expiring_cache_fdb:get_range_to(Name, FarFuture, _Limit=100). + couch_expiring_cache_fdb:get_range_to(Name, ?FOREVER, _Limit=100). -wait_lookup(Name, Key, Expect) -> - test_util:wait(fun() -> +% This lookup races against Timeout +wait_lookup(Name, Key, Expect, Timeout, Interval) -> + wait(fun() -> case couch_expiring_cache:lookup(Name, Key) of Expect -> ok; _ -> wait end - end, _Timeout = 1000, _PollingInterval = 10). + end, Timeout, Interval). + + +wait(Fun, Timeout, Delay) -> + Now = couch_expiring_cache_server:now_ts(), + wait(Fun, Timeout, Delay, Now, Now). + + +wait(_Fun, Timeout, _Delay, Started, Prev) when Prev - Started > Timeout -> + timeout; + +wait(Fun, Timeout, Delay, Started, _Prev) -> + case Fun() of + wait -> + % http://erlang.org/doc/man/timer.html#sleep-1 + ok = timer:sleep(Delay), % always millisecond + wait(Fun, Timeout, Delay, Started, + couch_expiring_cache_server:now_ts()); + Else -> + Else + end. -- cgit v1.2.1