From e1de2b9c938158b185afb35c480486ba3ad6b90b Mon Sep 17 00:00:00 2001 From: Nick Vatamaniuc Date: Thu, 25 Mar 2021 19:17:46 -0400 Subject: Improve retryable FDB error handling After running the Elixir suite under the buggify" mode, we noticed retryable errors were not handled well. In some cases we handled some errors (1009) but not others (1007, 1031). Some other retryable errors were not considered at all. So here we use the newly defined constants and `ERLFDB_IS_RETRYABLE/2` guard from erlfdb v1.3.1 to make handling of these errors a bit more consistent. --- rebar.config.script | 2 +- src/couch_jobs/src/couch_jobs.erl | 9 +++++++-- src/couch_jobs/src/couch_jobs_activity_monitor.erl | 7 ++++++- src/couch_jobs/src/couch_jobs_notifier.erl | 8 ++++++-- src/couch_jobs/src/couch_jobs_type_monitor.erl | 5 ++++- src/couch_views/src/couch_views_indexer.erl | 10 +++++----- src/fabric/include/fabric2.hrl | 11 +++-------- src/fabric/src/fabric2_fdb.erl | 9 +++++---- src/fabric/test/fabric2_test_util.erl | 5 ++++- src/fabric/test/fabric2_tx_options_tests.erl | 6 +++--- 10 files changed, 44 insertions(+), 28 deletions(-) diff --git a/rebar.config.script b/rebar.config.script index 9e872be36..a2da2d8c9 100644 --- a/rebar.config.script +++ b/rebar.config.script @@ -153,7 +153,7 @@ DepDescs = [ %% Independent Apps {config, "config", {tag, "2.1.8"}}, {b64url, "b64url", {tag, "1.0.2"}}, -{erlfdb, "erlfdb", {tag, "v1.3.0"}}, +{erlfdb, "erlfdb", {tag, "v1.3.1"}}, {ets_lru, "ets-lru", {tag, "1.1.0"}}, {khash, "khash", {tag, "1.1.0"}}, {snappy, "snappy", {tag, "CouchDB-1.0.4"}}, diff --git a/src/couch_jobs/src/couch_jobs.erl b/src/couch_jobs/src/couch_jobs.erl index 6c40f5dff..8d2fcd807 100644 --- a/src/couch_jobs/src/couch_jobs.erl +++ b/src/couch_jobs/src/couch_jobs.erl @@ -351,7 +351,9 @@ accept_loop(Type, NoSched, MaxSchedTime, Timeout) -> catch error:{timeout, _} -> retry; - error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 -> + error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} -> + retry; + error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) -> retry end, case AcceptResult of @@ -387,7 +389,10 @@ wait_pending(PendingWatch, MaxSTime, UserTimeout, NoSched) -> erlfdb:wait(PendingWatch, [{timeout, Timeout}]), ok catch - error:{erlfdb_error, ?FUTURE_VERSION} -> + error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} -> + erlfdb:cancel(PendingWatch, [flush]), + retry; + error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) -> erlfdb:cancel(PendingWatch, [flush]), retry; error:{timeout, _} -> diff --git a/src/couch_jobs/src/couch_jobs_activity_monitor.erl b/src/couch_jobs/src/couch_jobs_activity_monitor.erl index 9802f5798..d5dfa41d3 100644 --- a/src/couch_jobs/src/couch_jobs_activity_monitor.erl +++ b/src/couch_jobs/src/couch_jobs_activity_monitor.erl @@ -28,6 +28,10 @@ code_change/3 ]). + +-include("couch_jobs.hrl"). + + -record(st, { jtx, type, @@ -68,7 +72,8 @@ handle_info(check_activity, St) -> St1 = try check_activity(St) catch - error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 -> + error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) orelse + Err =:= ?ERLFDB_TRANSACTION_TIMED_OUT -> LogMsg = "~p : type:~p got ~p error, possibly from overload", couch_log:error(LogMsg, [?MODULE, St#st.type, Err]), St diff --git a/src/couch_jobs/src/couch_jobs_notifier.erl b/src/couch_jobs/src/couch_jobs_notifier.erl index 99581cb79..db5d5e0e5 100644 --- a/src/couch_jobs/src/couch_jobs_notifier.erl +++ b/src/couch_jobs/src/couch_jobs_notifier.erl @@ -252,8 +252,12 @@ try_notify_subscribers(ActiveVS, #st{} = St) -> try notify_subscribers(ActiveVS, St) catch - error:{timeout, _} -> try_notify_subscribers(ActiveVS, St); - error:{erlfdb_error, 1031} -> try_notify_subscribers(ActiveVS, St) + error:{timeout, _} -> + try_notify_subscribers(ActiveVS, St); + error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} -> + try_notify_subscribers(ActiveVS, St); + error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) -> + try_notify_subscribers(ActiveVS, St) end. diff --git a/src/couch_jobs/src/couch_jobs_type_monitor.erl b/src/couch_jobs/src/couch_jobs_type_monitor.erl index 04ad60acc..a62eb6217 100644 --- a/src/couch_jobs/src/couch_jobs_type_monitor.erl +++ b/src/couch_jobs/src/couch_jobs_type_monitor.erl @@ -55,7 +55,10 @@ loop(#st{vs = VS, timeout = Timeout} = St) -> try erlfdb:wait(Watch, [{timeout, Timeout}]) catch - error:{erlfdb_error, ?FUTURE_VERSION} -> + error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} -> + erlfdb:cancel(Watch, [flush]), + ok; + error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) -> erlfdb:cancel(Watch, [flush]), ok; error:{timeout, _} -> diff --git a/src/couch_views/src/couch_views_indexer.erl b/src/couch_views/src/couch_views_indexer.erl index 8556b9946..df537977f 100644 --- a/src/couch_views/src/couch_views_indexer.erl +++ b/src/couch_views/src/couch_views_indexer.erl @@ -43,10 +43,10 @@ % These are all of the errors that we can fix by using % a smaller batch size. -define(IS_RECOVERABLE_ERROR(Code), ( - (Code == 1004) % timed_out - orelse (Code == 1007) % transaction_too_old - orelse (Code == 1031) % transaction_timed_out - orelse (Code == 2101) % transaction_too_large + (Code == ?ERLFDB_TIMED_OUT) orelse + (Code == ?ERLFDB_TRANSACTION_TOO_OLD) orelse + (Code == ?ERLFDB_TRANSACTION_TIMED_OUT) orelse + (Code == ?ERLFDB_TRANSACTION_TOO_LARGE) )). @@ -161,7 +161,7 @@ upgrade_data(Data) -> % Transaction limit exceeded don't retry -should_retry(_, _, {erlfdb_error, 2101}) -> +should_retry(_, _, {erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}) -> false; should_retry(Retries, RetryLimit, _) when Retries < RetryLimit -> diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl index ebbb7c7c5..f0d789940 100644 --- a/src/fabric/include/fabric2.hrl +++ b/src/fabric/include/fabric2.hrl @@ -11,6 +11,9 @@ % the License. +-include_lib("erlfdb/include/erlfdb.hrl"). + + -define(uint2bin(I), binary:encode_unsigned(I, little)). -define(bin2uint(I), binary:decode_unsigned(I, little)). -define(bin2int(V), binary_to_integer(V)). @@ -77,12 +80,4 @@ -define(PDICT_TX_RES_KEY, '$fabric_tx_result'). -define(PDICT_FOLD_ACC_STATE, '$fabric_fold_acc_state'). -% Let's keep these in ascending order --define(TRANSACTION_TOO_OLD, 1007). --define(FUTURE_VERSION, 1009). --define(COMMIT_UNKNOWN_RESULT, 1021). --define(TRANSACTION_CANCELLED, 1025). --define(TRANSACTION_TOO_LARGE, 2101). - - -define(DEFAULT_BINARY_CHUNK_SIZE, 100000). diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl index 8c60d960c..bccf1d635 100644 --- a/src/fabric/src/fabric2_fdb.erl +++ b/src/fabric/src/fabric2_fdb.erl @@ -567,10 +567,10 @@ get_info_wait(#info_future{tx = Tx, retries = Retries} = Future) -> try get_info_wait_int(Future) catch - error:{erlfdb_error, ?TRANSACTION_CANCELLED} -> + error:{erlfdb_error, ?ERLFDB_TRANSACTION_CANCELLED} -> Future1 = get_info_future(Tx, Future#info_future.db_prefix), get_info_wait(Future1#info_future{retries = Retries + 1}); - error:{erlfdb_error, ?TRANSACTION_TOO_OLD} -> + error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) -> ok = erlfdb:reset(Tx), Future1 = get_info_future(Tx, Future#info_future.db_prefix), get_info_wait(Future1#info_future{retries = Retries + 1}) @@ -1161,7 +1161,8 @@ fold_range(Tx, FAcc) -> user_acc = FinalUserAcc } = erlfdb:fold_range(Tx, Start, End, Callback, FAcc, Opts), FinalUserAcc - catch error:{erlfdb_error, ?TRANSACTION_TOO_OLD} when DoRestart -> + catch error:{erlfdb_error, Error} when + ?ERLFDB_IS_RETRYABLE(Error) andalso DoRestart -> % Possibly handle cluster_version_changed and future_version as well to % continue iteration instead fallback to transactional and retrying % from the beginning which is bound to fail when streaming data out to a @@ -1991,7 +1992,7 @@ clear_transaction() -> is_commit_unknown_result() -> - erlfdb:get_last_error() == ?COMMIT_UNKNOWN_RESULT. + erlfdb:get_last_error() == ?ERLFDB_COMMIT_UNKNOWN_RESULT. has_transaction_id() -> diff --git a/src/fabric/test/fabric2_test_util.erl b/src/fabric/test/fabric2_test_util.erl index acbe252b1..3d3477c5d 100644 --- a/src/fabric/test/fabric2_test_util.erl +++ b/src/fabric/test/fabric2_test_util.erl @@ -21,6 +21,9 @@ ]). +-include_lib("fabric/include/fabric2.hrl"). + + -define(PDICT_ERROR_IN_FOLD_RANGE, '$fabric2_error_in_fold_range'). -define(PDICT_ERROR_IN_USER_FUN, '$fabric2_error_throw_in_user_fun'). @@ -68,7 +71,7 @@ maybe_tx_too_old(Key) -> put(Key, {Skip - 1, Count}); {0, Count} when is_integer(Count), Count > 0 -> put(Key, {0, Count - 1}), - error({erlfdb_error, 1007}); + error({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_OLD}); {0, 0} -> ok; undefined -> diff --git a/src/fabric/test/fabric2_tx_options_tests.erl b/src/fabric/test/fabric2_tx_options_tests.erl index cff383e57..78428c6e4 100644 --- a/src/fabric/test/fabric2_tx_options_tests.erl +++ b/src/fabric/test/fabric2_tx_options_tests.erl @@ -63,7 +63,7 @@ options_take_effect(_) -> DbName = ?tempdb(), {ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]), - ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE}, + ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}, add_large_doc(Db, 200000)), ok = fabric2_db:delete(DbName, [?ADMIN_CTX]). @@ -81,7 +81,7 @@ can_configure_options_at_runtime(_) -> DbName = ?tempdb(), {ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]), - ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE}, + ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}, add_large_doc(Db, 200000)), meck:reset(erlfdb), @@ -125,7 +125,7 @@ can_apply_options_to_db_handle_transactions(_) -> fabric2_db:update_doc(TxDb, large_doc(200000)) end, TxOpts = #{size_limit => 150000}, - ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE}, + ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}, fabric2_fdb:transactional(Db, TxOpts, TxFun)), ok = fabric2_db:delete(DbName, [?ADMIN_CTX]). -- cgit v1.2.1