summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@gmail.com>2021-03-25 19:17:46 -0400
committerNick Vatamaniuc <nickva@users.noreply.github.com>2021-03-26 15:13:58 -0400
commitacb43e12fd7fddc6f606246875909f7c7df27324 (patch)
tree48f5a7e203563a284498ba1ef17456d93288b1ef
parent4e6117eb0e144aad04cf91444e87e645ac27b215 (diff)
downloadcouchdb-acb43e12fd7fddc6f606246875909f7c7df27324.tar.gz
Improve retryable FDB error handling
After running the Elixir suite under the buggify" mode, we noticed retryable errors were not handled well. In some cases we handled some errors (1009) but not others (1007, 1031). Some other retryable errors were not considered at all. So here we use the newly defined constants and `ERLFDB_IS_RETRYABLE/2` guard from erlfdb v1.3.1 to make handling of these errors a bit more consistent.
-rw-r--r--rebar.config.script2
-rw-r--r--src/couch_jobs/src/couch_jobs.erl9
-rw-r--r--src/couch_jobs/src/couch_jobs_activity_monitor.erl7
-rw-r--r--src/couch_jobs/src/couch_jobs_notifier.erl8
-rw-r--r--src/couch_jobs/src/couch_jobs_type_monitor.erl5
-rw-r--r--src/couch_views/src/couch_views_indexer.erl10
-rw-r--r--src/fabric/include/fabric2.hrl11
-rw-r--r--src/fabric/src/fabric2_fdb.erl9
-rw-r--r--src/fabric/test/fabric2_test_util.erl5
-rw-r--r--src/fabric/test/fabric2_tx_options_tests.erl6
10 files changed, 44 insertions, 28 deletions
diff --git a/rebar.config.script b/rebar.config.script
index 9e872be36..a2da2d8c9 100644
--- a/rebar.config.script
+++ b/rebar.config.script
@@ -153,7 +153,7 @@ DepDescs = [
%% Independent Apps
{config, "config", {tag, "2.1.8"}},
{b64url, "b64url", {tag, "1.0.2"}},
-{erlfdb, "erlfdb", {tag, "v1.3.0"}},
+{erlfdb, "erlfdb", {tag, "v1.3.1"}},
{ets_lru, "ets-lru", {tag, "1.1.0"}},
{khash, "khash", {tag, "1.1.0"}},
{snappy, "snappy", {tag, "CouchDB-1.0.4"}},
diff --git a/src/couch_jobs/src/couch_jobs.erl b/src/couch_jobs/src/couch_jobs.erl
index 6c40f5dff..8d2fcd807 100644
--- a/src/couch_jobs/src/couch_jobs.erl
+++ b/src/couch_jobs/src/couch_jobs.erl
@@ -351,7 +351,9 @@ accept_loop(Type, NoSched, MaxSchedTime, Timeout) ->
catch
error:{timeout, _} ->
retry;
- error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ retry;
+ error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) ->
retry
end,
case AcceptResult of
@@ -387,7 +389,10 @@ wait_pending(PendingWatch, MaxSTime, UserTimeout, NoSched) ->
erlfdb:wait(PendingWatch, [{timeout, Timeout}]),
ok
catch
- error:{erlfdb_error, ?FUTURE_VERSION} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ erlfdb:cancel(PendingWatch, [flush]),
+ retry;
+ error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) ->
erlfdb:cancel(PendingWatch, [flush]),
retry;
error:{timeout, _} ->
diff --git a/src/couch_jobs/src/couch_jobs_activity_monitor.erl b/src/couch_jobs/src/couch_jobs_activity_monitor.erl
index 9802f5798..d5dfa41d3 100644
--- a/src/couch_jobs/src/couch_jobs_activity_monitor.erl
+++ b/src/couch_jobs/src/couch_jobs_activity_monitor.erl
@@ -28,6 +28,10 @@
code_change/3
]).
+
+-include("couch_jobs.hrl").
+
+
-record(st, {
jtx,
type,
@@ -68,7 +72,8 @@ handle_info(check_activity, St) ->
St1 = try
check_activity(St)
catch
- error:{erlfdb_error, Err} when Err =:= 1020 orelse Err =:= 1031 ->
+ error:{erlfdb_error, Err} when ?ERLFDB_IS_RETRYABLE(Err) orelse
+ Err =:= ?ERLFDB_TRANSACTION_TIMED_OUT ->
LogMsg = "~p : type:~p got ~p error, possibly from overload",
couch_log:error(LogMsg, [?MODULE, St#st.type, Err]),
St
diff --git a/src/couch_jobs/src/couch_jobs_notifier.erl b/src/couch_jobs/src/couch_jobs_notifier.erl
index 99581cb79..db5d5e0e5 100644
--- a/src/couch_jobs/src/couch_jobs_notifier.erl
+++ b/src/couch_jobs/src/couch_jobs_notifier.erl
@@ -252,8 +252,12 @@ try_notify_subscribers(ActiveVS, #st{} = St) ->
try
notify_subscribers(ActiveVS, St)
catch
- error:{timeout, _} -> try_notify_subscribers(ActiveVS, St);
- error:{erlfdb_error, 1031} -> try_notify_subscribers(ActiveVS, St)
+ error:{timeout, _} ->
+ try_notify_subscribers(ActiveVS, St);
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ try_notify_subscribers(ActiveVS, St);
+ error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) ->
+ try_notify_subscribers(ActiveVS, St)
end.
diff --git a/src/couch_jobs/src/couch_jobs_type_monitor.erl b/src/couch_jobs/src/couch_jobs_type_monitor.erl
index 04ad60acc..a62eb6217 100644
--- a/src/couch_jobs/src/couch_jobs_type_monitor.erl
+++ b/src/couch_jobs/src/couch_jobs_type_monitor.erl
@@ -55,7 +55,10 @@ loop(#st{vs = VS, timeout = Timeout} = St) ->
try
erlfdb:wait(Watch, [{timeout, Timeout}])
catch
- error:{erlfdb_error, ?FUTURE_VERSION} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_TIMED_OUT} ->
+ erlfdb:cancel(Watch, [flush]),
+ ok;
+ error:{erlfdb_error, Code} when ?ERLFDB_IS_RETRYABLE(Code) ->
erlfdb:cancel(Watch, [flush]),
ok;
error:{timeout, _} ->
diff --git a/src/couch_views/src/couch_views_indexer.erl b/src/couch_views/src/couch_views_indexer.erl
index 8556b9946..df537977f 100644
--- a/src/couch_views/src/couch_views_indexer.erl
+++ b/src/couch_views/src/couch_views_indexer.erl
@@ -43,10 +43,10 @@
% These are all of the errors that we can fix by using
% a smaller batch size.
-define(IS_RECOVERABLE_ERROR(Code), (
- (Code == 1004) % timed_out
- orelse (Code == 1007) % transaction_too_old
- orelse (Code == 1031) % transaction_timed_out
- orelse (Code == 2101) % transaction_too_large
+ (Code == ?ERLFDB_TIMED_OUT) orelse
+ (Code == ?ERLFDB_TRANSACTION_TOO_OLD) orelse
+ (Code == ?ERLFDB_TRANSACTION_TIMED_OUT) orelse
+ (Code == ?ERLFDB_TRANSACTION_TOO_LARGE)
)).
@@ -161,7 +161,7 @@ upgrade_data(Data) ->
% Transaction limit exceeded don't retry
-should_retry(_, _, {erlfdb_error, 2101}) ->
+should_retry(_, _, {erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE}) ->
false;
should_retry(Retries, RetryLimit, _) when Retries < RetryLimit ->
diff --git a/src/fabric/include/fabric2.hrl b/src/fabric/include/fabric2.hrl
index ebbb7c7c5..f0d789940 100644
--- a/src/fabric/include/fabric2.hrl
+++ b/src/fabric/include/fabric2.hrl
@@ -11,6 +11,9 @@
% the License.
+-include_lib("erlfdb/include/erlfdb.hrl").
+
+
-define(uint2bin(I), binary:encode_unsigned(I, little)).
-define(bin2uint(I), binary:decode_unsigned(I, little)).
-define(bin2int(V), binary_to_integer(V)).
@@ -77,12 +80,4 @@
-define(PDICT_TX_RES_KEY, '$fabric_tx_result').
-define(PDICT_FOLD_ACC_STATE, '$fabric_fold_acc_state').
-% Let's keep these in ascending order
--define(TRANSACTION_TOO_OLD, 1007).
--define(FUTURE_VERSION, 1009).
--define(COMMIT_UNKNOWN_RESULT, 1021).
--define(TRANSACTION_CANCELLED, 1025).
--define(TRANSACTION_TOO_LARGE, 2101).
-
-
-define(DEFAULT_BINARY_CHUNK_SIZE, 100000).
diff --git a/src/fabric/src/fabric2_fdb.erl b/src/fabric/src/fabric2_fdb.erl
index 8c60d960c..bccf1d635 100644
--- a/src/fabric/src/fabric2_fdb.erl
+++ b/src/fabric/src/fabric2_fdb.erl
@@ -567,10 +567,10 @@ get_info_wait(#info_future{tx = Tx, retries = Retries} = Future) ->
try
get_info_wait_int(Future)
catch
- error:{erlfdb_error, ?TRANSACTION_CANCELLED} ->
+ error:{erlfdb_error, ?ERLFDB_TRANSACTION_CANCELLED} ->
Future1 = get_info_future(Tx, Future#info_future.db_prefix),
get_info_wait(Future1#info_future{retries = Retries + 1});
- error:{erlfdb_error, ?TRANSACTION_TOO_OLD} ->
+ error:{erlfdb_error, Error} when ?ERLFDB_IS_RETRYABLE(Error) ->
ok = erlfdb:reset(Tx),
Future1 = get_info_future(Tx, Future#info_future.db_prefix),
get_info_wait(Future1#info_future{retries = Retries + 1})
@@ -1161,7 +1161,8 @@ fold_range(Tx, FAcc) ->
user_acc = FinalUserAcc
} = erlfdb:fold_range(Tx, Start, End, Callback, FAcc, Opts),
FinalUserAcc
- catch error:{erlfdb_error, ?TRANSACTION_TOO_OLD} when DoRestart ->
+ catch error:{erlfdb_error, Error} when
+ ?ERLFDB_IS_RETRYABLE(Error) andalso DoRestart ->
% Possibly handle cluster_version_changed and future_version as well to
% continue iteration instead fallback to transactional and retrying
% from the beginning which is bound to fail when streaming data out to a
@@ -1991,7 +1992,7 @@ clear_transaction() ->
is_commit_unknown_result() ->
- erlfdb:get_last_error() == ?COMMIT_UNKNOWN_RESULT.
+ erlfdb:get_last_error() == ?ERLFDB_COMMIT_UNKNOWN_RESULT.
has_transaction_id() ->
diff --git a/src/fabric/test/fabric2_test_util.erl b/src/fabric/test/fabric2_test_util.erl
index acbe252b1..3d3477c5d 100644
--- a/src/fabric/test/fabric2_test_util.erl
+++ b/src/fabric/test/fabric2_test_util.erl
@@ -21,6 +21,9 @@
]).
+-include_lib("fabric/include/fabric2.hrl").
+
+
-define(PDICT_ERROR_IN_FOLD_RANGE, '$fabric2_error_in_fold_range').
-define(PDICT_ERROR_IN_USER_FUN, '$fabric2_error_throw_in_user_fun').
@@ -68,7 +71,7 @@ maybe_tx_too_old(Key) ->
put(Key, {Skip - 1, Count});
{0, Count} when is_integer(Count), Count > 0 ->
put(Key, {0, Count - 1}),
- error({erlfdb_error, 1007});
+ error({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_OLD});
{0, 0} ->
ok;
undefined ->
diff --git a/src/fabric/test/fabric2_tx_options_tests.erl b/src/fabric/test/fabric2_tx_options_tests.erl
index cff383e57..78428c6e4 100644
--- a/src/fabric/test/fabric2_tx_options_tests.erl
+++ b/src/fabric/test/fabric2_tx_options_tests.erl
@@ -63,7 +63,7 @@ options_take_effect(_) ->
DbName = ?tempdb(),
{ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]),
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
add_large_doc(Db, 200000)),
ok = fabric2_db:delete(DbName, [?ADMIN_CTX]).
@@ -81,7 +81,7 @@ can_configure_options_at_runtime(_) ->
DbName = ?tempdb(),
{ok, Db} = fabric2_db:create(DbName, [?ADMIN_CTX]),
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
add_large_doc(Db, 200000)),
meck:reset(erlfdb),
@@ -125,7 +125,7 @@ can_apply_options_to_db_handle_transactions(_) ->
fabric2_db:update_doc(TxDb, large_doc(200000))
end,
TxOpts = #{size_limit => 150000},
- ?assertError({erlfdb_error, ?TRANSACTION_TOO_LARGE},
+ ?assertError({erlfdb_error, ?ERLFDB_TRANSACTION_TOO_LARGE},
fabric2_fdb:transactional(Db, TxOpts, TxFun)),
ok = fabric2_db:delete(DbName, [?ADMIN_CTX]).