summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Bakken <luke@bakken.io>2021-12-14 17:35:14 -0800
committermergify-bot <noreply@mergify.com>2021-12-21 14:53:33 +0000
commit4f85b921ee21371294d7982cd87c828e0727771a (patch)
treec87a6c95a0256c3afc14638c4d338b26ba4ec90d
parentc5aab11ba3b62f865e731032c9b0c297dceead51 (diff)
downloadrabbitmq-server-git-4f85b921ee21371294d7982cd87c828e0727771a.tar.gz
Disk monitor improvements
Related to VESC-1015. * Remove `infinity` timeouts. * Improve free disk space retrieval on win32. Run commands with a timeout. This PR fixes an issue I observed while reproducing VESC-1015 on Windows 10. Within an hour or so of running a 3-node cluster that has health checks being run against it, one or more nodes' memory use would spike. I would see that the rabbit_disk_monitor process was stuck executing os:cmd to retrieve free disk space information. Thus, all gen_server:call calls to the process would never return, especially since they used an infinity timeout. Do something with timeout. Fix unit_disk_monitor_mocks_SUITE. (cherry picked from commit e0c4681eba7f5619674939639f540bfc34c89e1f) (cherry picked from commit 33cf06c966a2369146e087f8ac5b6ebeffbcf4fe)
-rw-r--r--deps/rabbit/src/rabbit_disk_monitor.erl112
1 files changed, 75 insertions, 37 deletions
diff --git a/deps/rabbit/src/rabbit_disk_monitor.erl b/deps/rabbit/src/rabbit_disk_monitor.erl
index 4f50489ad6..3a11952421 100644
--- a/deps/rabbit/src/rabbit_disk_monitor.erl
+++ b/deps/rabbit/src/rabbit_disk_monitor.erl
@@ -75,41 +75,41 @@
-spec get_disk_free_limit() -> integer().
get_disk_free_limit() ->
- gen_server:call(?MODULE, get_disk_free_limit, infinity).
+ gen_server:call(?MODULE, get_disk_free_limit).
-spec set_disk_free_limit(disk_free_limit()) -> 'ok'.
set_disk_free_limit(Limit) ->
- gen_server:call(?MODULE, {set_disk_free_limit, Limit}, infinity).
+ gen_server:call(?MODULE, {set_disk_free_limit, Limit}).
-spec get_min_check_interval() -> integer().
get_min_check_interval() ->
- gen_server:call(?MODULE, get_min_check_interval, infinity).
+ gen_server:call(?MODULE, get_min_check_interval).
-spec set_min_check_interval(integer()) -> 'ok'.
set_min_check_interval(Interval) ->
- gen_server:call(?MODULE, {set_min_check_interval, Interval}, infinity).
+ gen_server:call(?MODULE, {set_min_check_interval, Interval}).
-spec get_max_check_interval() -> integer().
get_max_check_interval() ->
- gen_server:call(?MODULE, get_max_check_interval, infinity).
+ gen_server:call(?MODULE, get_max_check_interval).
-spec set_max_check_interval(integer()) -> 'ok'.
set_max_check_interval(Interval) ->
- gen_server:call(?MODULE, {set_max_check_interval, Interval}, infinity).
+ gen_server:call(?MODULE, {set_max_check_interval, Interval}).
-spec get_disk_free() -> (integer() | 'unknown').
-spec set_enabled(string()) -> 'ok'.
get_disk_free() ->
- gen_server:call(?MODULE, get_disk_free, infinity).
+ gen_server:call(?MODULE, get_disk_free).
set_enabled(Enabled) ->
- gen_server:call(?MODULE, {set_enabled, Enabled}, infinity).
+ gen_server:call(?MODULE, {set_enabled, Enabled}).
%%----------------------------------------------------------------------------
%% gen_server callbacks
@@ -226,33 +226,19 @@ get_disk_free(Dir) ->
get_disk_free(Dir, {unix, Sun})
when Sun =:= sunos; Sun =:= sunos4; Sun =:= solaris ->
Df = os:find_executable("df"),
- parse_free_unix(rabbit_misc:os_cmd(Df ++ " -k " ++ Dir));
+ parse_free_unix(run_cmd(Df ++ " -k " ++ Dir));
get_disk_free(Dir, {unix, _}) ->
Df = os:find_executable("df"),
- parse_free_unix(rabbit_misc:os_cmd(Df ++ " -kP " ++ Dir));
+ parse_free_unix(run_cmd(Df ++ " -kP " ++ Dir));
get_disk_free(Dir, {win32, _}) ->
- %% On Windows, the Win32 API enforces a limit of 260 characters
- %% (MAX_PATH). If we call `dir` with a path longer than that, it
- %% fails with "File not found". Starting with Windows 10 version
- %% 1607, this limit was removed, but the administrator has to
- %% configure that.
- %%
- %% NTFS supports paths up to 32767 characters. Therefore, paths
- %% longer than 260 characters exist but they are "inaccessible" to
- %% `dir`.
- %%
- %% A workaround is to tell the Win32 API to not parse a path and
- %% just pass it raw to the underlying filesystem. To do this, the
- %% path must be prepended with "\\?\". That's what we do here.
- %%
- %% However, the underlying filesystem may not support forward
- %% slashes transparently, as the Win32 API does. Therefore, we
- %% convert all forward slashes to backslashes.
- %%
- %% See the following page to learn more about this:
- %% https://ss64.com/nt/syntax-filenames.html
- RawDir = "\\\\?\\" ++ string:replace(Dir, "/", "\\", all),
- parse_free_win32(rabbit_misc:os_cmd("dir /-C /W \"" ++ RawDir ++ "\"")).
+ case win32_get_disk_free_fsutil(Dir) of
+ {ok, Free0} -> Free0;
+ error ->
+ case win32_get_disk_free_pwsh(Dir) of
+ {ok, Free1} -> Free1;
+ _ -> exit(could_not_determine_disk_free)
+ end
+ end.
parse_free_unix(Str) ->
case string:tokens(Str, "\n") of
@@ -263,11 +249,46 @@ parse_free_unix(Str) ->
_ -> exit({unparseable, Str})
end.
-parse_free_win32(CommandResult) ->
- LastLine = lists:last(string:tokens(CommandResult, "\r\n")),
- {match, [Free]} = re:run(lists:reverse(LastLine), "(\\d+)",
- [{capture, all_but_first, list}]),
- list_to_integer(lists:reverse(Free)).
+%% Asks `fsutil.exe' for the free space of the drive that `Dir' lives on.
+%%
+%% Returns `{ok, FreeBytes}' when the output could be parsed, and `error' when
+%% the command timed out or printed anything this function does not
+%% understand, so that the caller can fall back to another method instead of
+%% crashing.
+win32_get_disk_free_fsutil(Dir) ->
+    % Dir:
+    % "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia"
+    Drive = string:slice(Dir, 0, 2),
+
+    % Drive: c:
+    FsutilCmd = "fsutil.exe volume diskfree " ++ Drive,
+
+    % C:\windows\system32>fsutil volume diskfree c:
+    % Total free bytes : 812,733,878,272 (756.9 GB)
+    % Total bytes : 1,013,310,287,872 (943.7 GB)
+    % Total quota free bytes : 812,733,878,272 (756.9 GB)
+    case run_cmd(FsutilCmd) of
+        {error, timeout} ->
+            error;
+        FsutilResult ->
+            win32_parse_fsutil_output(FsutilResult)
+    end.
+
+%% Extracts the free byte count from the first line of `fsutil' output.
+%% Only output starting with "Total" is accepted; an "Error ..." message or
+%% any other unexpected text yields `error'.
+win32_parse_fsutil_output(Output = "Total" ++ _) ->
+    [FirstLine | _] = string:tokens(Output, "\r\n"),
+    case re:run(FirstLine, "(\\d+,?)+", [{capture, first, list}]) of
+        {match, [FreeStr]} ->
+            % FreeStr holds only digits and thousands separators: drop the
+            % commas before converting.
+            {ok, list_to_integer([C || C <- FreeStr, C =/= $,])};
+        nomatch ->
+            error
+    end;
+win32_parse_fsutil_output(_Unexpected) ->
+    error.
+
+
+%% Asks PowerShell (`Get-PSDrive') for the free space of the drive that `Dir'
+%% lives on.
+%%
+%% Returns `{ok, FreeBytes}' when the command printed a single non-negative
+%% integer, and `error' when it timed out or printed anything else (empty
+%% output, an error message, ...), so that the caller decides how to fail.
+win32_get_disk_free_pwsh(Dir) ->
+    % Dir:
+    % "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia"
+    Drive = string:slice(Dir, 0, 1),
+    PoshCmd = "powershell.exe -NoLogo -NoProfile -NonInteractive -Command (Get-PSDrive " ++ Drive ++ ").Free",
+    case run_cmd(PoshCmd) of
+        {error, timeout} ->
+            error;
+        PoshResultStr ->
+            % Trim the trailing \r\n (and any other surrounding whitespace)
+            % rather than blindly cutting two characters, then insist that
+            % nothing but the number is left.
+            case string:to_integer(string:trim(PoshResultStr)) of
+                {Free, []} when is_integer(Free), Free >= 0 ->
+                    {ok, Free};
+                _ ->
+                    error
+            end
+    end.
interpret_limit({mem_relative, Relative})
when is_number(Relative) ->
@@ -317,3 +338,20 @@ enable(#state{dir = Dir, interval = Interval, limit = Limit, retries = Retries}
erlang:send_after(Interval, self(), try_enable),
State#state{enabled = false}
end.
+
+%% Runs `Cmd' via rabbit_misc:os_cmd/1 in a monitored helper process so that
+%% a hung command cannot block the caller for more than five seconds.
+%%
+%% Returns whatever rabbit_misc:os_cmd/1 returned, or `{error, timeout}' when
+%% no result was produced. A helper that dies before replying is reported
+%% with the same `{error, timeout}' term (the only error shape callers in
+%% this module match on), but immediately and with its exit reason logged,
+%% instead of after the full timeout.
+run_cmd(Cmd) ->
+    Parent = self(),
+    Ref = make_ref(),
+    {CmdPid, MRef} = spawn_monitor(
+                       fun() ->
+                               Parent ! {Parent, Ref, rabbit_misc:os_cmd(Cmd)}
+                       end),
+    receive
+        {Parent, Ref, CmdResult} ->
+            % Drop the monitor together with any 'DOWN' already queued.
+            erlang:demonitor(MRef, [flush]),
+            CmdResult;
+        {'DOWN', MRef, process, CmdPid, Reason} ->
+            % A result sent by the helper would have been delivered before
+            % its 'DOWN' message, so reaching this clause means it died
+            % without replying.
+            rabbit_log:error("Command '~s' failed: ~p", [Cmd, Reason]),
+            {error, timeout}
+    after 5000 ->
+        exit(CmdPid, kill),
+        % Wait until the helper is really gone, then discard a result it
+        % may have sent just before being killed, so that no stray message
+        % is left in the caller's mailbox.
+        receive {'DOWN', MRef, process, CmdPid, _} -> ok end,
+        receive {Parent, Ref, _Late} -> ok after 0 -> ok end,
+        rabbit_log:error("Command timed out: '~s'", [Cmd]),
+        {error, timeout}
+    end.