summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@gmail.com>2021-03-25 19:17:46 -0400
committerNick Vatamaniuc <vatamane@apache.org>2021-03-25 23:47:14 -0400
commite1de2b9c938158b185afb35c480486ba3ad6b90b (patch)
treec326e84ad3356cabef8b8b7a213270df52050b77
parent0059b8f90e58e10b199a4b768a06a762d12a30d3 (diff)
downloadcouchdb-improve-resilience-to-retriable-errors.tar.gz
Improve retryable FDB error handlingimprove-resilience-to-retriable-errors
After running the Elixir suite under the buggify" mode, we noticed retryable errors were not handled well. In some cases we handled some errors (1009) but not others (1007, 1031). Some other retryable errors were not considered at all. So here we use the newly defined constants and `ERLFDB_IS_RETRYABLE/2` guard from erlfdb v1.3.1 to make handling of these errors a bit more consistent.
-rw-r--r--rebar.config.script2
-rw-r--r--src/couch_jobs/src/couch_jobs.erl9
-rw-r--r--src/couch_jobs/src/couch_jobs_activity_monitor.erl7
-rw-r--r--src/couch_jobs/src/couch_jobs_notifier.erl8
-rw-r--r--src/couch_jobs/src/couch_jobs_type_monitor.erl5
-rw-r--r--src/couch_views/src/couch_views_indexer.erl10
-rw-r--r--src/fabric/include/fabric2.hrl11
-rw-r--r--src/fabric/src/fabric2_fdb.erl9
-rw-r--r--src/fabric/test/fabric2_test_util.erl5
-rw-r--r--src/fabric/test/fabric2_tx_options_tests.erl6
10 files changed, 44 insertions, 28 deletions
diff --git a/rebar.config.script b/rebar.config.script
index 9e872be36..a2da2d8c9 100644
--- a/rebar.config.script
+++ b/rebar.config.script
@@ -153,7 +153,7 @@ DepDescs = [
%% Independent Apps
{config, "config", {tag, "2.1.8"}},
{b64url, "b64url", {tag, "1.0.2"}},
-{erlfdb, "erlfdb", {tag, "v1.3.0"}},
+{erlfdb, "erlfdb", {tag, "v1.3.1"}},
{ets_lru, "ets-lru", {tag, "1.1.0"}},
{khash, "khash", {tag, "1.1.0"}},
{snappy, "snappy", {tag, "CouchDB-1.0.4"}},
diff --git a/src/couch_jobs/src/couch_jobs.erl b/src/couch_jobs/src/couch_jobs.erl
index 6c40f5dff..8d2fcd807 100644
--- a/src/couch_jobs/src/couch_jobs.erl
+++ b/src/couch_jobs/src/couch_jobs.erl
@@ -351,7 +351,9 @@ accept_loop(Type, NoSched, MaxSchedTime, Timeout) ->
catch
error:{timeout, _} ->
retry;
- error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ retry;
+ error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) ->
retry
end,
case AcceptResult of
@@ -387,7 +389,10 @@ wait_pending(PendingWatch, MaxSTime, UserTimeout, NoSched) ->
erlfdb:wait(PendingWatch, [{timeout, Timeout}]),
ok
catch
- error:{erlfdb_error, ?FUTURE_VERSION} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ erlfdb:cancel(PendingWatch, [flush]),
+ retry;
+ error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) ->
erlfdb:cancel(PendingWatch, [flush]),
retry;
error:{timeout, _} ->
diff --git a/src/couch_jobs/src/couch_jobs_activity_monitor.erl b/src/couch_jobs/src/couch_jobs_activity_monitor.erl
index 9802f5798..d5dfa41d3 100644
--- a/src/couch_jobs/src/couch_jobs_activity_monitor.erl
+++ b/src/couch_jobs/src/couch_jobs_activity_monitor.erl
@@ -28,6 +28,10 @@
code_change/3
]).
+
+-include("couch_jobs.hrl").
+
+
-record(st, {
jtx,
type,
@@ -68,7 +72,8 @@ handle_info(check_activity, St) ->
St1 = try
check_activity(St)
catch
- error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 ->
+ error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) orelse
+ Err =:= ?ERLFDB_TRANSACTION_TIMED_OUT ->
LogMsg = "~p : type:~p got ~p error, possibly from overload",
couch_log:error(LogMsg, [?MODULE, St#st.type, Err]),
St
diff --git a/src/couch_jobs/src/couch_jobs_notifier.erl b/src/couch_jobs/src/couch_jobs_notifier.erl
index 99581cb79..db5d5e0e5 100644
--- a/src/couch_jobs/src/couch_jobs_notifier.erl
+++ b/src/couch_jobs/src/couch_jobs_notifier.erl
@@ -252,8 +252,12 @@ try_notify_subscribers(ActiveVS, #st{} = St) ->
try
notify_subscribers(ActiveVS, St)
catch
- error:{timeout, _} -> try_notify_subscribers(ActiveVS, St);
- error:{erlfdb_error, 1031} -> try_notify_subscribers(ActiveVS, St)
+ error:{timeout, _} ->
+ try_notify_subscribers(ActiveVS, St);
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ try_notify_subscribers(ActiveVS, St);
+ error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) ->
+ try_notify_subscribers(ActiveVS, St)
end.
diff --git a/src/couch_jobs/src/couch_jobs_type_monitor.erl b/src/couch_jobs/src/couch_jobs_type_monitor.erl
index 04ad60acc..a62eb6217 100644
--- a/src/couch_jobs/src/couch_jobs_type_monitor.erl
+++ b/src/couch_jobs/src/couch_jobs_type_monitor.erl
@@ -55,7 +55,10 @@ loop(#st{vs = VS, timeout = Timeout} = St) ->
try
erlfdb:wait(Watch, [{timeout, Timeout}])
catch
- error:{erlfdb_error, ?FUTURE_VERSION} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ erlfdb:cancel(Watch, [flush]),
+ ok;
+ error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) ->
erlfdb:cancel(Watch, [flush]),
ok;
error:{timeout, _} ->
diff --git a/src/couch_views/src/couch_views_indexer.erl b/src/couch_views/src/couch_views_indexer.erl
index 8556b9946..df537977f 100644
--- a/src/couch_views/src/couch_views_indexer.erl
+++ b/src/couch_views/src/couch_views_indexer.erl
@@ -43,10 +43,10 @@
% These are all of the errors that we can fix by using
% a smaller batch size.
-define(IS_RECOVERABLE_ERROR(Code), (
- (Code == 1004) % timed_out
- orelse (Code == 1007) % transaction_too_old
- orelse (Code == 1031) % transaction_timed_out
- orelse (Code == 2101) % transaction_too_large
+ (Code == ?ERLFDB_TIMED_OUT) orelse
+ (Code == ?ERLFDB_TRANSACTION_TOO_OLD) orelse
+ (Code == ?ERLFDB_TRANSACTION_TIMED_OUT) orelse
+ (Code == ?ERLFDB_TRANSACTION_TOO_LARGE)
)).
@@ -161,7 +161,7 @@ upgrade_data(Data) ->
% Transaction limit exceeded don't retry
-should_retry(_, _, {erlfdb_error, 2101}) ->
+should_retry(_, _, {erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}) ->
false;
should_retry(Retries, RetryLimit, _) when Retries < RetryLimit ->
diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index ebbb7c7c5..f0d789940 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -11,6 +11,9 @@
% the License.
+-include_lib("erlfdb/include/erlfdb.hrl").
+
+
-define(uint2bin(I), binary:encode_unsigned(I, little)).
-define(bin2uint(I), binary:decode_unsigned(I, little)).
-define(bin2int(V), binary_to_integer(V)).
@@ -77,12 +80,4 @@
-define(PDICT_TX_RES_KEY, '$fabric_tx_result').
-define(PDICT_FOLD_ACC_STATE, '$fabric_fold_acc_state').
-% Let's keep these in ascending order
--define(TRANSACTION_TOO_OLD, 1007).
--define(FUTURE_VERSION, 1009).
--define(COMMIT_UNKNOWN_RESULT, 1021).
--define(TRANSACTION_CANCELLED, 1025).
--define(TRANSACTION_TOO_LARGE, 2101).
-
-
-define(DEFAULT_BINARY_CHUNK_SIZE, 100000).
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index 8c60d960c..bccf1d635 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -567,10 +567,10 @@ get_info_wait(#info_future{tx = Tx, retries = Retries} = Future) ->
try
get_info_wait_int(Future)
catch
- error:{erlfdb_error, ?TRANSACTION_CANCELLED} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_CANCELLED} ->
Future1 = get_info_future(Tx, Future#info_future.db_prefix),
get_info_wait(Future1#info_future{retries = Retries + 1});
- error:{erlfdb_error, ?TRANSACTION_TOO_OLD} ->
+ error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) ->
ok = erlfdb:reset(Tx),
Future1 = get_info_future(Tx, Future#info_future.db_prefix),
get_info_wait(Future1#info_future{retries = Retries + 1})
@@ -1161,7 +1161,8 @@ fold_range(Tx, FAcc) ->
user_acc = FinalUserAcc
} = erlfdb:fold_range(Tx, Start, End, Callback, FAcc, Opts),
FinalUserAcc
- catch error:{erlfdb_error, ?TRANSACTION_TOO_OLD} when DoRestart ->
+ catch error:{erlfdb_error, Error} when
+ ?ERLFDB_IS_RETRYABLE(Error) andalso DoRestart ->
% Possibly handle cluster_version_changed and future_version as well to
% continue iteration instead fallback to transactional and retrying
% from the beginning which is bound to fail when streaming data out to a
@@ -1991,7 +1992,7 @@ clear_transaction() ->
is_commit_unknown_result() ->
- erlfdb:get_last_error() == ?COMMIT_UNKNOWN_RESULT.
+ erlfdb:get_last_error() == ?ERLFDB_COMMIT_UNKNOWN_RESULT.
has_transaction_id() ->
diff --git a/src/fabric/test/fabric2_test_util.erl b/src/fabric/test/fabric2_test_util.erl
index acbe252b1..3d3477c5d 100644
--- a/src/fabric/test/fabric2_test_util.erl
+++ b/src/fabric/test/fabric2_test_util.erl
@@ -21,6 +21,9 @@
]).
+-include_lib("fabric/include/fabric2.hrl").
+
+
-define(PDICT_ERROR_IN_FOLD_RANGE, '$fabric2_error_in_fold_range').
-define(PDICT_ERROR_IN_USER_FUN, '$fabric2_error_throw_in_user_fun').
@@ -68,7 +71,7 @@ maybe_tx_too_old(Key) ->
put(Key, {Skip - 1, Count});
{0, Count} when is_integer(Count), Count > 0 ->
put(Key, {0, Count - 1}),
- error({erlfdb_error, 1007});
+ error({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_OLD});
{0, 0} ->
ok;
undefined ->
diff --git a/src/fabric/test/fabric2_tx_options_tests.erl b/src/fabric/test/fabric2_tx_options_tests.erl
index cff383e57..78428c6e4 100644
--- a/src/fabric/test/fabric2_tx_options_tests.erl
+++ b/src/fabric/test/fabric2_tx_options_tests.erl
@@ -63,7 +63,7 @@ options_take_effect(_) ->
DbName = ?tempdb(),
{ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]),
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
add_large_doc(Db, 200000)),
ok = fabric2_db:delete(DbName, [?ADMIN_CTX]).
@@ -81,7 +81,7 @@ can_configure_options_at_runtime(_) ->
DbName = ?tempdb(),
{ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]),
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
add_large_doc(Db, 200000)),
meck:reset(erlfdb),
@@ -125,7 +125,7 @@ can_apply_options_to_db_handle_transactions(_) ->
fabric2_db:update_doc(TxDb, large_doc(200000))
end,
TxOpts = #{size_limit => 150000},
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
fabric2_fdb:transactional(Db, TxOpts, TxFun)),
ok = fabric2_db:delete(DbName, [?ADMIN_CTX]).