Diffstat (limited to 'deps/rabbit/src/rabbit_msg_store.erl')
-rw-r--r--  deps/rabbit/src/rabbit_msg_store.erl  2245
1 file changed, 2245 insertions, 0 deletions
diff --git a/deps/rabbit/src/rabbit_msg_store.erl b/deps/rabbit/src/rabbit_msg_store.erl
new file mode 100644
index 0000000000..4851e56248
--- /dev/null
+++ b/deps/rabbit/src/rabbit_msg_store.erl
@@ -0,0 +1,2245 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2007-2020 VMware, Inc. or its affiliates. All rights reserved.
+%%
+
+-module(rabbit_msg_store).
+
+-behaviour(gen_server2).
+
+-export([start_link/4, start_global_store_link/4, successfully_recovered_state/1,
+ client_init/4, client_terminate/1, client_delete_and_terminate/1,
+ client_ref/1, close_all_indicated/1,
+ write/3, write_flow/3, read/2, contains/2, remove/2]).
+
+-export([set_maximum_since_use/2, combine_files/3,
+ delete_file/2]). %% internal
+
+-export([scan_file_for_valid_messages/1]). %% salvage tool
+
+-export([transform_dir/3, force_recovery/2]). %% upgrade
+
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+ code_change/3, prioritise_call/4, prioritise_cast/3,
+ prioritise_info/3, format_message_queue/2]).
+
+%%----------------------------------------------------------------------------
+
+-include("rabbit_msg_store.hrl").
+
+-define(SYNC_INTERVAL, 25). %% milliseconds
+-define(CLEAN_FILENAME, "clean.dot").
+-define(FILE_SUMMARY_FILENAME, "file_summary.ets").
+-define(TRANSFORM_TMP, "transform_tmp").
+
+-define(BINARY_MODE, [raw, binary]).
+-define(READ_MODE, [read]).
+-define(READ_AHEAD_MODE, [read_ahead | ?READ_MODE]).
+-define(WRITE_MODE, [write]).
+
+-define(FILE_EXTENSION, ".rdq").
+-define(FILE_EXTENSION_TMP, ".rdt").
+
+-define(HANDLE_CACHE_BUFFER_SIZE, 1048576). %% 1MB
+
+ %% i.e. two pairs, so GC does not go idle when busy
+-define(MAXIMUM_SIMULTANEOUS_GC_FILES, 4).
+
+%%----------------------------------------------------------------------------
+
+-record(msstate,
+ {
+ %% store directory
+ dir,
+ %% the module for index ops,
+ %% rabbit_msg_store_ets_index by default
+ index_module,
+ %% where are messages?
+ index_state,
+ %% current file name as number
+ current_file,
+ %% handle of the current file
+ current_file_handle,
+ %% file handle cache
+ file_handle_cache,
+ %% TRef for our interval timer
+ sync_timer_ref,
+ %% sum of valid data in all files
+ sum_valid_data,
+ %% sum of file sizes
+ sum_file_size,
+ %% things to do once GC completes
+ pending_gc_completion,
+ %% pid of our GC
+ gc_pid,
+ %% tid of the shared file handles table
+ file_handles_ets,
+ %% tid of the file summary table
+ file_summary_ets,
+ %% tid of current file cache table
+ cur_file_cache_ets,
+ %% tid of writes/removes in flight
+ flying_ets,
+ %% set of dying clients
+ dying_clients,
+ %% map of references of all registered clients
+ %% to callbacks
+ clients,
+ %% boolean: did we recover state?
+ successfully_recovered,
+ %% how big are our files allowed to get?
+ file_size_limit,
+ %% client ref to synced messages mapping
+ cref_to_msg_ids,
+ %% See CREDIT_DISC_BOUND in rabbit.hrl
+ credit_disc_bound
+ }).
+
+-record(client_msstate,
+ { server,
+ client_ref,
+ file_handle_cache,
+ index_state,
+ index_module,
+ dir,
+ gc_pid,
+ file_handles_ets,
+ file_summary_ets,
+ cur_file_cache_ets,
+ flying_ets,
+ credit_disc_bound
+ }).
+
+-record(file_summary,
+ {file, valid_total_size, left, right, file_size, locked, readers}).
+
+-record(gc_state,
+ { dir,
+ index_module,
+ index_state,
+ file_summary_ets,
+ file_handles_ets,
+ msg_store
+ }).
+
+-record(dying_client,
+ { client_ref,
+ file,
+ offset
+ }).
+
+%%----------------------------------------------------------------------------
+
+-export_type([gc_state/0, file_num/0]).
+
+-type gc_state() :: #gc_state { dir :: file:filename(),
+ index_module :: atom(),
+ index_state :: any(),
+ file_summary_ets :: ets:tid(),
+ file_handles_ets :: ets:tid(),
+ msg_store :: server()
+ }.
+
+-type server() :: pid() | atom().
+-type client_ref() :: binary().
+-type file_num() :: non_neg_integer().
+-type client_msstate() :: #client_msstate {
+ server :: server(),
+ client_ref :: client_ref(),
+ file_handle_cache :: map(),
+ index_state :: any(),
+ index_module :: atom(),
+ dir :: file:filename(),
+ gc_pid :: pid(),
+ file_handles_ets :: ets:tid(),
+ file_summary_ets :: ets:tid(),
+ cur_file_cache_ets :: ets:tid(),
+ flying_ets :: ets:tid(),
+ credit_disc_bound :: {pos_integer(), pos_integer()}}.
+-type msg_ref_delta_gen(A) ::
+ fun ((A) -> 'finished' |
+ {rabbit_types:msg_id(), non_neg_integer(), A}).
+-type maybe_msg_id_fun() ::
+ 'undefined' | fun ((gb_sets:set(), 'written' | 'ignored') -> any()).
+-type maybe_close_fds_fun() :: 'undefined' | fun (() -> 'ok').
+-type deletion_thunk() :: fun (() -> boolean()).
+
+%%----------------------------------------------------------------------------
+
+%% We run GC whenever (garbage / sum_file_size) > ?GARBAGE_FRACTION
+%% It is not recommended to set this to < 0.5
+-define(GARBAGE_FRACTION, 0.5).
+
+%% The message store is responsible for storing messages
+%% on disk and loading them back. The store handles both
+%% persistent messages and transient ones (when a node
+%% is under RAM pressure and needs to page messages out
+%% to disk). It is also responsible for locating messages
+%% on disk and maintaining an index.
+%%
+%% There are two message stores per node: one for transient
+%% and one for persistent messages.
+%%
+%% Queue processes interact with the stores via clients.
+%%
+%% The components:
+%%
+%% Index: this is a mapping from MsgId to #msg_location{}.
+%% By default, it's in ETS, but other implementations can
+%% be used.
+%% FileSummary: this maps File to #file_summary{} and is stored
+%% in ETS.
+%%
+%% The basic idea is that messages are appended to the current file up
+%% until that file becomes too big (> file_size_limit). At that point,
+%% the file is closed and a new file is created on the _right_ of the
+%% old file which is used for new messages. Files are named
+%% numerically ascending, thus the file with the lowest name is the
+%% eldest file.
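+%%
+%% For illustration (a sketch; cf. filenum_to_name/1 below):
+%%
+%%   filenum_to_name(0) -> "0.rdq"  %% eldest, leftmost file
+%%   filenum_to_name(1) -> "1.rdq"
+%%   filenum_to_name(2) -> "2.rdq"  %% current, rightmost file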
+%%
+%% We need to keep track of which messages are in which files (this is
+%% the index); how much useful data is in each file and which files
+%% are on the left and right of each other. This is the purpose of the
+%% file summary ETS table.
+%%
+%% As messages are removed from files, holes appear in these
+%% files. The field ValidTotalSize contains the total amount of useful
+%% data left in the file. This is needed for garbage collection.
+%%
+%% When we discover that a file is now empty, we delete it. When we
+%% discover that it can be combined with the useful data in either its
+%% left or right neighbour, and overall, across all the files, we have
+%% ((the amount of garbage) / (the sum of all file sizes)) >
+%% ?GARBAGE_FRACTION, we start a garbage collection run concurrently,
+%% which will compact the two files together. This keeps disk
+%% utilisation high and aids performance. We deliberately do this
+%% lazily in order to prevent doing GC on files which are soon to be
+%% emptied (and hence deleted).
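+%%
+%% As a sketch, the trigger condition amounts to the following
+%% (illustrative only - the real check, in maybe_compact/1, adds
+%% further conditions; SumFileSize and SumValid stand for the
+%% sum_file_size and sum_valid_data fields of #msstate{}):
+%%
+%%   needs_gc(SumFileSize, SumValid) ->
+%%       (SumFileSize - SumValid) > ?GARBAGE_FRACTION * SumFileSize.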
+%%
+%% Given the compaction between two files, the left file (i.e. elder
+%% file) is considered the ultimate destination for the good data in
+%% the right file. If necessary, the good data in the left file which
+%% is fragmented throughout the file is written out to a temporary
+%% file, then read back in to form a contiguous chunk of good data at
+%% the start of the left file. Thus the left file is garbage collected
+%% and compacted. Then the good data from the right file is copied
+%% onto the end of the left file. Index and file summary tables are
+%% updated.
+%%
+%% On non-clean startup, we scan the files we discover, dealing with
+%% the possibilities of a crash having occurred during a compaction
+%% (this consists of tidyup - the compaction is deliberately designed
+%% such that data is duplicated on disk rather than risking it being
+%% lost), and rebuild the file summary and index ETS table.
+%%
+%% So, with this design, messages move to the left. Eventually, they
+%% should end up in a contiguous block on the left and are then never
+%% rewritten. But this isn't quite the case. If in a file there is one
+%% message that is being ignored, for some reason, and messages in the
+%% file to the right and in the current block are being read all the
+%% time then it will repeatedly be the case that the good data from
+%% both files can be combined and will be written out to a new
+%% file. Whenever this happens, our shunned message will be rewritten.
+%%
+%% So, provided that we combine messages in the right order,
+%% (i.e. left file, bottom to top, right file, bottom to top),
+%% eventually our shunned message will end up at the bottom of the
+%% left file. The compaction/combining algorithm is smart enough to
+%% read in good data from the left file that is scattered throughout
+%% (i.e. C and D in the below diagram), then truncate the file to just
+%% above B (i.e. truncate to the limit of the good contiguous region
+%% at the start of the file), then write C and D on top and then write
+%% E, F and G from the right file on top. Thus contiguous blocks of
+%% good data at the bottom of files are not rewritten.
+%%
+%%        +-------+    +-------+         +-------+
+%%        |   X   |    |   G   |         |   G   |
+%%        +-------+    +-------+         +-------+
+%%        |   D   |    |   X   |         |   F   |
+%%        +-------+    +-------+         +-------+
+%%        |   X   |    |   X   |         |   E   |
+%%        +-------+    +-------+         +-------+
+%%        |   C   |    |   F   | ===>    |   D   |
+%%        +-------+    +-------+         +-------+
+%%        |   X   |    |   X   |         |   C   |
+%%        +-------+    +-------+         +-------+
+%%        |   B   |    |   X   |         |   B   |
+%%        +-------+    +-------+         +-------+
+%%        |   A   |    |   E   |         |   A   |
+%%        +-------+    +-------+         +-------+
+%%        left         right             left
+%%
+%% From this reasoning, we do have a bound on the number of times the
+%% message is rewritten. From when it is inserted, there can be no
+%% files inserted between it and the head of the queue, and the worst
+%% case is that every time it is rewritten, it moves one position lower
+%% in the file (for it to stay at the same position requires that
+%% there are no holes beneath it, which means truncate would be used
+%% and so it would not be rewritten at all). Thus this seems to
+%% suggest the limit is the number of messages ahead of it in the
+%% queue, though it's likely that that's pessimistic, given the
+%% requirements for compaction/combination of files.
+%%
+%% The other property that we have is the bound on the lowest
+%% utilisation, which should be 50% - worst case is that all files are
+%% fractionally over half full and can't be combined (an equivalent
+%% worst case is alternating full files and files containing only one
+%% tiny message).
+%%
+%% Messages are reference-counted. When a message with the same msg id
+%% is written several times we only store it once, and only remove it
+%% from the store when it has been removed the same number of times.
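+%%
+%% A sketch of the effect, using the client API defined below (two
+%% different clients, e.g. two queues, write the same message):
+%%
+%%   ok = write(MsgId, Msg, CStateQ1),  %% stored on disk, ref count 1
+%%   ok = write(MsgId, Msg, CStateQ2),  %% no second copy, ref count 2
+%%   ok = remove([MsgId], CStateQ1),    %% ref count 1, msg kept
+%%   ok = remove([MsgId], CStateQ2),    %% ref count 0, msg is garbage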
+%%
+%% The reference counts do not persist. Therefore the initialisation
+%% function must be provided with a generator that produces ref count
+%% deltas for all recovered messages. This is only used on startup
+%% when the shutdown was non-clean.
+%%
+%% Messages with a reference count greater than one are, on being read,
+%% entered into a message cache. The purpose of the cache is not
+%% primarily performance, though it helps there too, but the prevention
+%% of memory explosion: it ensures that when messages with a high
+%% reference count are read by several processes, they are read back as
+%% the same binary object rather than as multiple identical copies.
+%%
+%% Reads can be performed directly by clients without calling to the
+%% server. This is safe because multiple file handles can be used to
+%% read files. However, locking is used by the concurrent GC to make
+%% sure that reads are not attempted from files which are in the
+%% process of being garbage collected.
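+%%
+%% In outline, the client-side read path (read/2 and client_read1/3
+%% below) is: serve from the current file cache if possible; otherwise
+%% look the location up in the index and read directly from disk,
+%% unless the containing file is locked by the GC, in which case the
+%% read is deferred to the server.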
+%%
+%% When a message is removed, its reference count is decremented. Even
+%% if the reference count becomes 0, its entry is not removed. This is
+%% because in the event of the same message being sent to several
+%% different queues, there is the possibility of one queue writing and
+%% removing the message before other queues write it at all. Thus
+%% accommodating 0-reference counts allows us to avoid unnecessary
+%% writes here. Of course, there are complications: the file to which
+%% the message has already been written could be locked pending
+%% deletion or GC, which means we have to rewrite the message as the
+%% original copy will now be lost.
+%%
+%% The server automatically defers reads, removes and contains calls
+%% that refer to files currently being GC'd. Contains calls are
+%% deferred only to ensure they do not overtake removes.
+%%
+%% The current file to which messages are being written has a
+%% write-back cache. This is written to immediately by clients and can
+%% be read from by clients too. This means that there are only ever
+%% writes made to the current file, thus eliminating delays due to
+%% flushing write buffers in order to be able to safely read from the
+%% current file. The one exception to this is that on start up, the
+%% cache is not populated with msgs found in the current file, and
+%% thus in this case only, reads may have to come from the file
+%% itself. The effect of this is that even if the msg_store process is
+%% heavily overloaded, clients can still write and read messages with
+%% very low latency and not block at all.
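+%%
+%% Sketch of the client-side write path (essentially the body of
+%% client_write/4 below):
+%%
+%%   ok = client_update_flying(+1, MsgId, CState),
+%%   ok = update_msg_cache(CurFileCacheEts, MsgId, Msg),
+%%   ok = server_cast(CState, {write, CRef, MsgId, Flow}).
+%%
+%% A subsequent read/2 can thus be served from cur_file_cache_ets
+%% without waiting for the server to append the msg to the current
+%% file.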
+%%
+%% Clients of the msg_store are required to register before using the
+%% msg_store. This provides them with the necessary client-side state
+%% to allow them to directly access the various caches and files. When
+%% they terminate, they should deregister. They can do this by calling
+%% either client_terminate/1 or client_delete_and_terminate/1. The
+%% differences are: (a) client_terminate is synchronous. As a result,
+%% if the msg_store is badly overloaded and has lots of in-flight
+%% writes and removes to process, this will take some time to
+%% return. However, once it does return, you can be sure that all the
+%% actions you've issued to the msg_store have been processed. (b) Not
+%% only is client_delete_and_terminate/1 asynchronous, but it also
+%% permits writes and subsequent removes from the current
+%% (terminating) client which are still in flight to be safely
+%% ignored. Thus from the point of view of the msg_store itself, and
+%% all from the same client:
+%%
+%% (T) = termination; (WN) = write of msg N; (RN) = remove of msg N
+%% --> W1, W2, W1, R1, T, W3, R2, W2, R1, R2, R3, W4 -->
+%%
+%% The client obviously sent T after all the other messages (up to
+%% W4), but because the msg_store prioritises messages, the T can be
+%% promoted and thus received early.
+%%
+%% Thus at the point of the msg_store receiving T, we have messages 1
+%% and 2 with a refcount of 1. After T, W3 will be ignored because
+%% it's an unknown message, as will R3, and W4. W2, R1 and R2 won't be
+%% ignored because the messages that they refer to were already known
+%% to the msg_store prior to T. However, it can be a little more
+%% complex: after the first R2, the refcount of msg 2 is 0. At that
+%% point, if a GC occurs or file deletion, msg 2 could vanish, which
+%% would then mean that the subsequent W2 and R2 are then ignored.
+%%
+%% The use case then for client_delete_and_terminate/1 is if the
+%% client wishes to remove everything it's written to the msg_store:
+%% it issues removes for all messages it's written and not removed,
+%% and then calls client_delete_and_terminate/1. At that point, any
+%% in-flight writes (and subsequent removes) can be ignored, but
+%% removes and writes for messages the msg_store already knows about
+%% will continue to be processed normally (which will normally just
+%% involve modifying the reference count, which is fast). Thus we save
+%% disk bandwidth for writes which are going to be immediately removed
+%% again by the terminating client.
+%%
+%% We use a separate set to keep track of the dying clients in order
+%% to keep that set, which is inspected on every write and remove, as
+%% small as possible. Inspecting the set of all clients would degrade
+%% performance with many healthy clients and few, if any, dying
+%% clients, which is the typical case.
+%%
+%% Client termination messages are stored in a separate ets index to
+%% avoid filling the primary message store index and the message files
+%% with them.
+%%
+%% When the msg_store has a backlog (i.e. it has unprocessed messages
+%% in its mailbox / gen_server priority queue), a further optimisation
+%% opportunity arises: we can eliminate pairs of 'write' and 'remove'
+%% from the same client for the same message. A typical occurrence of
+%% these is when an empty durable queue delivers persistent messages
+%% to ack'ing consumers. The queue will asynchronously ask the
+%% msg_store to 'write' such messages, and when they are acknowledged
+%% it will issue a 'remove'. That 'remove' may be issued before the
+%% msg_store has processed the 'write'. There is then no point going
+%% ahead with the processing of that 'write'.
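+%%
+%% Sketch of such an elimination (server mailbox, oldest first):
+%%
+%%   ... {write, CRef, MsgId, Flow} ... {remove, CRef, [MsgId]} ...
+%%
+%% By the time the server dequeues the 'write', the writes and removes
+%% in flight for that {MsgId, CRef} pair add up to zero, so both the
+%% 'write' and the subsequent 'remove' are skipped rather than touching
+%% the disk.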
+%%
+%% To detect this situation a 'flying_ets' table is shared between the
+%% clients and the server. The table is keyed on the combination of
+%% client (reference) and msg id, and the value represents an
+%% integration of all the writes and removes currently "in flight" for
+%% that message between the client and server - '+1' means all the
+%% writes/removes add up to a single 'write', '-1' to a 'remove', and
+%% '0' to nothing. (NB: the integration can never add up to more than
+%% one 'write' or 'remove' since clients must not write/remove a message
+%% more than once without first removing/writing it).
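+%%
+%% Sketch of the bookkeeping for one Key = {MsgId, CRef} (see
+%% client_update_flying/3 and update_flying/4 below):
+%%
+%%   client issues write  -> counter +1 ; server processes write  -> -1
+%%   client issues remove -> counter -1 ; server processes remove -> +1
+%%
+%% A surviving value of +1 thus means a 'write' is still to be
+%% processed, -1 means a 'remove' is, and 0 means there is nothing left
+%% to do; the server deletes 0-valued entries as it encounters them.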
+%%
+%% Maintaining this table poses two challenges: 1) both the clients
+%% and the server access and update the table, which causes
+%% concurrency issues, 2) we must ensure that entries do not stay in
+%% the table forever, since that would constitute a memory leak. We
+%% address the former by carefully modelling all operations as
+%% sequences of atomic actions that produce valid results in all
+%% possible interleavings. We address the latter by deleting table
+%% entries whenever the server finds a 0-valued entry during the
+%% processing of a write/remove. 0 is essentially equivalent to "no
+%% entry". If, OTOH, the value is non-zero we know there is at least
+%% one other 'write' or 'remove' in flight, so we get an opportunity
+%% later to delete the table entry when processing these.
+%%
+%% There are two further complications. We need to ensure that 1)
+%% eliminated writes still get confirmed, and 2) the write-back cache
+%% doesn't grow unbounded. These are quite straightforward to
+%% address. See the comments in the code.
+%%
+%% For notes on Clean Shutdown and startup, see documentation in
+%% rabbit_variable_queue.
+
+%%----------------------------------------------------------------------------
+%% public API
+%%----------------------------------------------------------------------------
+
+-spec start_link
+ (atom(), file:filename(), [binary()] | 'undefined',
+ {msg_ref_delta_gen(A), A}) -> rabbit_types:ok_pid_or_error().
+
+start_link(Type, Dir, ClientRefs, StartupFunState) when is_atom(Type) ->
+ gen_server2:start_link(?MODULE,
+ [Type, Dir, ClientRefs, StartupFunState],
+ [{timeout, infinity}]).
+
+start_global_store_link(Type, Dir, ClientRefs, StartupFunState) when is_atom(Type) ->
+ gen_server2:start_link({local, Type}, ?MODULE,
+ [Type, Dir, ClientRefs, StartupFunState],
+ [{timeout, infinity}]).
+
+-spec successfully_recovered_state(server()) -> boolean().
+
+successfully_recovered_state(Server) ->
+ gen_server2:call(Server, successfully_recovered_state, infinity).
+
+-spec client_init(server(), client_ref(), maybe_msg_id_fun(),
+ maybe_close_fds_fun()) -> client_msstate().
+
+client_init(Server, Ref, MsgOnDiskFun, CloseFDsFun) when is_pid(Server); is_atom(Server) ->
+ {IState, IModule, Dir, GCPid,
+ FileHandlesEts, FileSummaryEts, CurFileCacheEts, FlyingEts} =
+ gen_server2:call(
+ Server, {new_client_state, Ref, self(), MsgOnDiskFun, CloseFDsFun},
+ infinity),
+ CreditDiscBound = rabbit_misc:get_env(rabbit, msg_store_credit_disc_bound,
+ ?CREDIT_DISC_BOUND),
+ #client_msstate { server = Server,
+ client_ref = Ref,
+ file_handle_cache = #{},
+ index_state = IState,
+ index_module = IModule,
+ dir = Dir,
+ gc_pid = GCPid,
+ file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts,
+ flying_ets = FlyingEts,
+ credit_disc_bound = CreditDiscBound }.
+
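+%% A minimal client session, as a sketch (Server is the pid or
+%% registered name of a running msg store; Ref is any fresh binary,
+%% e.g. from rabbit_guid:gen(); no callbacks are registered):
+%%
+%%   CState0 = rabbit_msg_store:client_init(Server, Ref,
+%%                                          undefined, undefined),
+%%   ok = rabbit_msg_store:write(MsgId, Msg, CState0),
+%%   {{ok, Msg}, CState1} = rabbit_msg_store:read(MsgId, CState0),
+%%   ok = rabbit_msg_store:remove([MsgId], CState1),
+%%   ok = rabbit_msg_store:client_terminate(CState1).
+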
+-spec client_terminate(client_msstate()) -> 'ok'.
+
+client_terminate(CState = #client_msstate { client_ref = Ref }) ->
+ close_all_handles(CState),
+ ok = server_call(CState, {client_terminate, Ref}).
+
+-spec client_delete_and_terminate(client_msstate()) -> 'ok'.
+
+client_delete_and_terminate(CState = #client_msstate { client_ref = Ref }) ->
+ close_all_handles(CState),
+ ok = server_cast(CState, {client_dying, Ref}),
+ ok = server_cast(CState, {client_delete, Ref}).
+
+-spec client_ref(client_msstate()) -> client_ref().
+
+client_ref(#client_msstate { client_ref = Ref }) -> Ref.
+
+-spec write_flow(rabbit_types:msg_id(), msg(), client_msstate()) -> 'ok'.
+
+write_flow(MsgId, Msg,
+ CState = #client_msstate {
+ server = Server,
+ credit_disc_bound = CreditDiscBound }) ->
+ %% Here we are tracking messages sent by the
+ %% rabbit_amqqueue_process process via the
+ %% rabbit_variable_queue. We are accessing the
+ %% rabbit_amqqueue_process process dictionary.
+ credit_flow:send(Server, CreditDiscBound),
+ client_write(MsgId, Msg, flow, CState).
+
+-spec write(rabbit_types:msg_id(), msg(), client_msstate()) -> 'ok'.
+
+write(MsgId, Msg, CState) -> client_write(MsgId, Msg, noflow, CState).
+
+-spec read(rabbit_types:msg_id(), client_msstate()) ->
+ {rabbit_types:ok(msg()) | 'not_found', client_msstate()}.
+
+read(MsgId,
+ CState = #client_msstate { cur_file_cache_ets = CurFileCacheEts }) ->
+ file_handle_cache_stats:update(msg_store_read),
+ %% Check the cur file cache
+ case ets:lookup(CurFileCacheEts, MsgId) of
+ [] ->
+ Defer = fun() -> {server_call(CState, {read, MsgId}), CState} end,
+ case index_lookup_positive_ref_count(MsgId, CState) of
+ not_found -> Defer();
+ MsgLocation -> client_read1(MsgLocation, Defer, CState)
+ end;
+ [{MsgId, Msg, _CacheRefCount}] ->
+ {{ok, Msg}, CState}
+ end.
+
+-spec contains(rabbit_types:msg_id(), client_msstate()) -> boolean().
+
+contains(MsgId, CState) -> server_call(CState, {contains, MsgId}).
+
+-spec remove([rabbit_types:msg_id()], client_msstate()) -> 'ok'.
+
+remove([], _CState) -> ok;
+remove(MsgIds, CState = #client_msstate { client_ref = CRef }) ->
+ [client_update_flying(-1, MsgId, CState) || MsgId <- MsgIds],
+ server_cast(CState, {remove, CRef, MsgIds}).
+
+-spec set_maximum_since_use(server(), non_neg_integer()) -> 'ok'.
+
+set_maximum_since_use(Server, Age) when is_pid(Server); is_atom(Server) ->
+ gen_server2:cast(Server, {set_maximum_since_use, Age}).
+
+%%----------------------------------------------------------------------------
+%% Client-side-only helpers
+%%----------------------------------------------------------------------------
+
+server_call(#client_msstate { server = Server }, Msg) ->
+ gen_server2:call(Server, Msg, infinity).
+
+server_cast(#client_msstate { server = Server }, Msg) ->
+ gen_server2:cast(Server, Msg).
+
+client_write(MsgId, Msg, Flow,
+ CState = #client_msstate { cur_file_cache_ets = CurFileCacheEts,
+ client_ref = CRef }) ->
+ file_handle_cache_stats:update(msg_store_write),
+ ok = client_update_flying(+1, MsgId, CState),
+ ok = update_msg_cache(CurFileCacheEts, MsgId, Msg),
+ ok = server_cast(CState, {write, CRef, MsgId, Flow}).
+
+client_read1(#msg_location { msg_id = MsgId, file = File } = MsgLocation, Defer,
+ CState = #client_msstate { file_summary_ets = FileSummaryEts }) ->
+ case ets:lookup(FileSummaryEts, File) of
+ [] -> %% File has been GC'd and no longer exists. Go around again.
+ read(MsgId, CState);
+ [#file_summary { locked = Locked, right = Right }] ->
+ client_read2(Locked, Right, MsgLocation, Defer, CState)
+ end.
+
+client_read2(false, undefined, _MsgLocation, Defer, _CState) ->
+ %% Although we've already checked both caches and not found the
+ %% message there, the message is apparently in the
+ %% current_file. We can only arrive here if we are trying to read
+ %% a message which we have not written, which is very odd, so just
+ %% defer.
+ %%
+ %% OR, on startup, the cur_file_cache is not populated with the
+ %% contents of the current file, thus reads from the current file
+ %% will end up here and will need to be deferred.
+ Defer();
+client_read2(true, _Right, _MsgLocation, Defer, _CState) ->
+ %% Of course, in the mean time, the GC could have run and our msg
+ %% is actually in a different file, unlocked. However, deferring is
+ %% the safest and simplest thing to do.
+ Defer();
+client_read2(false, _Right,
+ MsgLocation = #msg_location { msg_id = MsgId, file = File },
+ Defer,
+ CState = #client_msstate { file_summary_ets = FileSummaryEts }) ->
+ %% It's entirely possible that everything we're doing from here on
+ %% is for the wrong file, or a non-existent file, as a GC may have
+ %% finished.
+ safe_ets_update_counter(
+ FileSummaryEts, File, {#file_summary.readers, +1},
+ fun (_) -> client_read3(MsgLocation, Defer, CState) end,
+ fun () -> read(MsgId, CState) end).
+
+client_read3(#msg_location { msg_id = MsgId, file = File }, Defer,
+ CState = #client_msstate { file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ gc_pid = GCPid,
+ client_ref = Ref }) ->
+ Release =
+ fun() -> ok = case ets:update_counter(FileSummaryEts, File,
+ {#file_summary.readers, -1}) of
+ 0 -> case ets:lookup(FileSummaryEts, File) of
+ [#file_summary { locked = true }] ->
+ rabbit_msg_store_gc:no_readers(
+ GCPid, File);
+ _ -> ok
+ end;
+ _ -> ok
+ end
+ end,
+ %% If a GC involving the file hasn't already started, it won't
+ %% start now. Need to check again to see if we've been locked in
+ %% the meantime, between lookup and update_counter (thus GC
+ %% started before our +1. In fact, it could have finished by now
+ %% too).
+ case ets:lookup(FileSummaryEts, File) of
+ [] -> %% GC has deleted our file, just go round again.
+ read(MsgId, CState);
+ [#file_summary { locked = true }] ->
+ %% If we get a badarg here, then the GC has finished and
+ %% deleted our file. Try going around again. Otherwise,
+ %% just defer.
+ %%
+ %% badarg scenario: we lookup, msg_store locks, GC starts,
+ %% GC ends, we +1 readers, msg_store ets:deletes (and
+ %% unlocks the dest)
+ try Release(),
+ Defer()
+ catch error:badarg -> read(MsgId, CState)
+ end;
+ [#file_summary { locked = false }] ->
+ %% Ok, we're definitely safe to continue - a GC involving
+ %% the file cannot start up now, and isn't running, so
+ %% nothing will tell us from now on to close the handle if
+ %% it's already open.
+ %%
+ %% Finally, we need to recheck that the msg is still at
+ %% the same place - it's possible an entire GC ran between
+ %% us doing the lookup and the +1 on the readers. (Same as
+ %% badarg scenario above, but we don't have a missing file
+ %% - we just have the /wrong/ file).
+ case index_lookup(MsgId, CState) of
+ #msg_location { file = File } = MsgLocation ->
+ %% Still the same file.
+ {ok, CState1} = close_all_indicated(CState),
+ %% We are now guaranteed that the mark_handle_open
+ %% call will either insert_new correctly, or will
+ %% fail, but find the value is open, not close.
+ mark_handle_open(FileHandlesEts, File, Ref),
+ %% Could the msg_store now mark the file to be
+ %% closed? No: marks for closing are issued only
+ %% when the msg_store has locked the file.
+ %% This will never be the current file
+ {Msg, CState2} = read_from_disk(MsgLocation, CState1),
+ Release(), %% this MUST NOT fail with badarg
+ {{ok, Msg}, CState2};
+ #msg_location {} = MsgLocation -> %% different file!
+ Release(), %% this MUST NOT fail with badarg
+ client_read1(MsgLocation, Defer, CState);
+ not_found -> %% it seems not to exist. Defer, just to be sure.
+ try Release() %% this can badarg, same as locked case, above
+ catch error:badarg -> ok
+ end,
+ Defer()
+ end
+ end.
+
+client_update_flying(Diff, MsgId, #client_msstate { flying_ets = FlyingEts,
+ client_ref = CRef }) ->
+ Key = {MsgId, CRef},
+ case ets:insert_new(FlyingEts, {Key, Diff}) of
+ true -> ok;
+ false -> try ets:update_counter(FlyingEts, Key, {2, Diff}) of
+ 0 -> ok;
+ Diff -> ok;
+ Err -> throw({bad_flying_ets_update, Diff, Err, Key})
+ catch error:badarg ->
+ %% this is guaranteed to succeed since the
+ %% server only removes and updates flying_ets
+ %% entries; it never inserts them
+ true = ets:insert_new(FlyingEts, {Key, Diff})
+ end,
+ ok
+ end.
+
+clear_client(CRef, State = #msstate { cref_to_msg_ids = CTM,
+ dying_clients = DyingClients }) ->
+ State #msstate { cref_to_msg_ids = maps:remove(CRef, CTM),
+ dying_clients = maps:remove(CRef, DyingClients) }.
+
+
+%%----------------------------------------------------------------------------
+%% gen_server callbacks
+%%----------------------------------------------------------------------------
+
+
+init([Type, BaseDir, ClientRefs, StartupFunState]) ->
+ process_flag(trap_exit, true),
+
+ ok = file_handle_cache:register_callback(?MODULE, set_maximum_since_use,
+ [self()]),
+
+ Dir = filename:join(BaseDir, atom_to_list(Type)),
+ Name = filename:join(filename:basename(BaseDir), atom_to_list(Type)),
+
+ {ok, IndexModule} = application:get_env(rabbit, msg_store_index_module),
+ rabbit_log:info("Message store ~tp: using ~p to provide index~n", [Name, IndexModule]),
+
+ AttemptFileSummaryRecovery =
+ case ClientRefs of
+ undefined -> ok = rabbit_file:recursive_delete([Dir]),
+ ok = filelib:ensure_dir(filename:join(Dir, "nothing")),
+ false;
+ _ -> ok = filelib:ensure_dir(filename:join(Dir, "nothing")),
+ recover_crashed_compactions(Dir)
+ end,
+ %% if we found crashed compactions we trust neither the
+ %% file_summary nor the location index. Note the file_summary is
+ %% left empty here if it can't be recovered.
+ {FileSummaryRecovered, FileSummaryEts} =
+ recover_file_summary(AttemptFileSummaryRecovery, Dir),
+ {CleanShutdown, IndexState, ClientRefs1} =
+ recover_index_and_client_refs(IndexModule, FileSummaryRecovered,
+ ClientRefs, Dir, Name),
+ Clients = maps:from_list(
+ [{CRef, {undefined, undefined, undefined}} ||
+ CRef <- ClientRefs1]),
+ %% CleanShutdown => msg location index and file_summary both
+ %% recovered correctly.
+ true = case {FileSummaryRecovered, CleanShutdown} of
+ {true, false} -> ets:delete_all_objects(FileSummaryEts);
+ _ -> true
+ end,
+ %% CleanShutdown <=> msg location index and file_summary both
+ %% recovered correctly.
+
+ FileHandlesEts = ets:new(rabbit_msg_store_shared_file_handles,
+ [ordered_set, public]),
+ CurFileCacheEts = ets:new(rabbit_msg_store_cur_file, [set, public]),
+ FlyingEts = ets:new(rabbit_msg_store_flying, [set, public]),
+
+ {ok, FileSizeLimit} = application:get_env(rabbit, msg_store_file_size_limit),
+
+ {ok, GCPid} = rabbit_msg_store_gc:start_link(
+ #gc_state { dir = Dir,
+ index_module = IndexModule,
+ index_state = IndexState,
+ file_summary_ets = FileSummaryEts,
+ file_handles_ets = FileHandlesEts,
+ msg_store = self()
+ }),
+
+ CreditDiscBound = rabbit_misc:get_env(rabbit, msg_store_credit_disc_bound,
+ ?CREDIT_DISC_BOUND),
+
+ State = #msstate { dir = Dir,
+ index_module = IndexModule,
+ index_state = IndexState,
+ current_file = 0,
+ current_file_handle = undefined,
+ file_handle_cache = #{},
+ sync_timer_ref = undefined,
+ sum_valid_data = 0,
+ sum_file_size = 0,
+ pending_gc_completion = maps:new(),
+ gc_pid = GCPid,
+ file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts,
+ flying_ets = FlyingEts,
+ dying_clients = #{},
+ clients = Clients,
+ successfully_recovered = CleanShutdown,
+ file_size_limit = FileSizeLimit,
+ cref_to_msg_ids = #{},
+ credit_disc_bound = CreditDiscBound
+ },
+ %% If we didn't recover the msg location index then we need to
+ %% rebuild it now.
+ Cleanliness = case CleanShutdown of
+ true -> "clean";
+ false -> "unclean"
+ end,
+ rabbit_log:debug("Rebuilding message location index after ~s shutdown...~n",
+ [Cleanliness]),
+ {Offset, State1 = #msstate { current_file = CurFile }} =
+ build_index(CleanShutdown, StartupFunState, State),
+ rabbit_log:debug("Finished rebuilding index~n", []),
+ %% the 'read' open mode is only needed so that we can seek
+ {ok, CurHdl} = open_file(Dir, filenum_to_name(CurFile),
+ [read | ?WRITE_MODE]),
+ {ok, Offset} = file_handle_cache:position(CurHdl, Offset),
+ ok = file_handle_cache:truncate(CurHdl),
+
+ {ok, maybe_compact(State1 #msstate { current_file_handle = CurHdl }),
+ hibernate,
+ {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
+
+prioritise_call(Msg, _From, _Len, _State) ->
+ case Msg of
+ successfully_recovered_state -> 7;
+ {new_client_state, _Ref, _Pid, _MODC, _CloseFDsFun} -> 7;
+ {read, _MsgId} -> 2;
+ _ -> 0
+ end.
+
+prioritise_cast(Msg, _Len, _State) ->
+ case Msg of
+ {combine_files, _Source, _Destination, _Reclaimed} -> 8;
+ {delete_file, _File, _Reclaimed} -> 8;
+ {set_maximum_since_use, _Age} -> 8;
+ {client_dying, _Pid} -> 7;
+ _ -> 0
+ end.
+
+prioritise_info(Msg, _Len, _State) ->
+ case Msg of
+ sync -> 8;
+ _ -> 0
+ end.
+
+handle_call(successfully_recovered_state, _From, State) ->
+ reply(State #msstate.successfully_recovered, State);
+
+handle_call({new_client_state, CRef, CPid, MsgOnDiskFun, CloseFDsFun}, _From,
+ State = #msstate { dir = Dir,
+ index_state = IndexState,
+ index_module = IndexModule,
+ file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts,
+ flying_ets = FlyingEts,
+ clients = Clients,
+ gc_pid = GCPid }) ->
+ Clients1 = maps:put(CRef, {CPid, MsgOnDiskFun, CloseFDsFun}, Clients),
+ erlang:monitor(process, CPid),
+ reply({IndexState, IndexModule, Dir, GCPid, FileHandlesEts, FileSummaryEts,
+ CurFileCacheEts, FlyingEts},
+ State #msstate { clients = Clients1 });
+
+handle_call({client_terminate, CRef}, _From, State) ->
+ reply(ok, clear_client(CRef, State));
+
+handle_call({read, MsgId}, From, State) ->
+ State1 = read_message(MsgId, From, State),
+ noreply(State1);
+
+handle_call({contains, MsgId}, From, State) ->
+ State1 = contains_message(MsgId, From, State),
+ noreply(State1).
+
+handle_cast({client_dying, CRef},
+ State = #msstate { dying_clients = DyingClients,
+ current_file_handle = CurHdl,
+ current_file = CurFile }) ->
+ {ok, CurOffset} = file_handle_cache:current_virtual_offset(CurHdl),
+ DyingClients1 = maps:put(CRef,
+ #dying_client{client_ref = CRef,
+ file = CurFile,
+ offset = CurOffset},
+ DyingClients),
+ noreply(State #msstate { dying_clients = DyingClients1 });
+
+handle_cast({client_delete, CRef},
+ State = #msstate { clients = Clients }) ->
+ State1 = State #msstate { clients = maps:remove(CRef, Clients) },
+ noreply(clear_client(CRef, State1));
+
+handle_cast({write, CRef, MsgId, Flow},
+ State = #msstate { cur_file_cache_ets = CurFileCacheEts,
+ clients = Clients,
+ credit_disc_bound = CreditDiscBound }) ->
+ case Flow of
+ flow -> {CPid, _, _} = maps:get(CRef, Clients),
+ %% We are going to process a message sent by the
+ %% rabbit_amqqueue_process. Now we are accessing the
+ %% msg_store process dictionary.
+ credit_flow:ack(CPid, CreditDiscBound);
+ noflow -> ok
+ end,
+ true = 0 =< ets:update_counter(CurFileCacheEts, MsgId, {3, -1}),
+ case update_flying(-1, MsgId, CRef, State) of
+ process ->
+ [{MsgId, Msg, _PWC}] = ets:lookup(CurFileCacheEts, MsgId),
+ noreply(write_message(MsgId, Msg, CRef, State));
+ ignore ->
+ %% A 'remove' has already been issued and eliminated the
+ %% 'write'.
+ State1 = blind_confirm(CRef, gb_sets:singleton(MsgId),
+ ignored, State),
+ %% If all writes get eliminated, cur_file_cache_ets could
+ %% grow unbounded. To prevent that we delete the cache
+ %% entry here, but only if the message isn't in the
+ %% current file. That way reads of the message can
+ %% continue to be done client side, from either the cache
+ %% or the non-current files. If the message *is* in the
+ %% current file then the cache entry will be removed by
+ %% the normal logic for that in write_message/4 and
+ %% maybe_roll_to_new_file/2.
+ case index_lookup(MsgId, State1) of
+ #msg_location { file = File }
+ when File == State1 #msstate.current_file ->
+ ok;
+ _ ->
+ true = ets:match_delete(CurFileCacheEts, {MsgId, '_', 0})
+ end,
+ noreply(State1)
+ end;
+
+handle_cast({remove, CRef, MsgIds}, State) ->
+ {RemovedMsgIds, State1} =
+ lists:foldl(
+ fun (MsgId, {Removed, State2}) ->
+ case update_flying(+1, MsgId, CRef, State2) of
+ process -> {[MsgId | Removed],
+ remove_message(MsgId, CRef, State2)};
+ ignore -> {Removed, State2}
+ end
+ end, {[], State}, MsgIds),
+ noreply(maybe_compact(client_confirm(CRef, gb_sets:from_list(RemovedMsgIds),
+ ignored, State1)));
+
+handle_cast({combine_files, Source, Destination, Reclaimed},
+ State = #msstate { sum_file_size = SumFileSize,
+ file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ clients = Clients }) ->
+ ok = cleanup_after_file_deletion(Source, State),
+ %% see comment in cleanup_after_file_deletion, and client_read3
+ true = mark_handle_to_close(Clients, FileHandlesEts, Destination, false),
+ true = ets:update_element(FileSummaryEts, Destination,
+ {#file_summary.locked, false}),
+ State1 = State #msstate { sum_file_size = SumFileSize - Reclaimed },
+ noreply(maybe_compact(run_pending([Source, Destination], State1)));
+
+handle_cast({delete_file, File, Reclaimed},
+ State = #msstate { sum_file_size = SumFileSize }) ->
+ ok = cleanup_after_file_deletion(File, State),
+ State1 = State #msstate { sum_file_size = SumFileSize - Reclaimed },
+ noreply(maybe_compact(run_pending([File], State1)));
+
+handle_cast({set_maximum_since_use, Age}, State) ->
+ ok = file_handle_cache:set_maximum_since_use(Age),
+ noreply(State).
+
+handle_info(sync, State) ->
+ noreply(internal_sync(State));
+
+handle_info(timeout, State) ->
+ noreply(internal_sync(State));
+
+handle_info({'DOWN', _MRef, process, Pid, _Reason}, State) ->
+ %% similar to what happens in
+ %% rabbit_amqqueue_process:handle_ch_down but with a relation of
+ %% msg_store -> rabbit_amqqueue_process instead of
+ %% rabbit_amqqueue_process -> rabbit_channel.
+ credit_flow:peer_down(Pid),
+ noreply(State);
+
+handle_info({'EXIT', _Pid, Reason}, State) ->
+ {stop, Reason, State}.
+
+terminate(_Reason, State = #msstate { index_state = IndexState,
+ index_module = IndexModule,
+ current_file_handle = CurHdl,
+ gc_pid = GCPid,
+ file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts,
+ flying_ets = FlyingEts,
+ clients = Clients,
+ dir = Dir }) ->
+ rabbit_log:info("Stopping message store for directory '~s'", [Dir]),
+ %% stop the gc first, otherwise it could be working and we pull
+ %% out the ets tables from under it.
+ ok = rabbit_msg_store_gc:stop(GCPid),
+ State1 = case CurHdl of
+ undefined -> State;
+ _ -> State2 = internal_sync(State),
+ ok = file_handle_cache:close(CurHdl),
+ State2
+ end,
+ State3 = close_all_handles(State1),
+ case store_file_summary(FileSummaryEts, Dir) of
+ ok -> ok;
+ {error, FSErr} ->
+ rabbit_log:error("Unable to store file summary"
+ " for vhost message store for directory ~p~n"
+ "Error: ~p~n",
+ [Dir, FSErr])
+ end,
+ [true = ets:delete(T) || T <- [FileSummaryEts, FileHandlesEts,
+ CurFileCacheEts, FlyingEts]],
+ IndexModule:terminate(IndexState),
+ case store_recovery_terms([{client_refs, maps:keys(Clients)},
+ {index_module, IndexModule}], Dir) of
+ ok ->
+ rabbit_log:info("Message store for directory '~s' is stopped", [Dir]),
+ ok;
+ {error, RTErr} ->
+ rabbit_log:error("Unable to save message store recovery terms"
+ " for directory ~p~nError: ~p~n",
+ [Dir, RTErr])
+ end,
+ State3 #msstate { index_state = undefined,
+ current_file_handle = undefined }.
+
+code_change(_OldVsn, State, _Extra) ->
+ {ok, State}.
+
+format_message_queue(Opt, MQ) -> rabbit_misc:format_message_queue(Opt, MQ).
+
+%%----------------------------------------------------------------------------
+%% general helper functions
+%%----------------------------------------------------------------------------
+
+noreply(State) ->
+ {State1, Timeout} = next_state(State),
+ {noreply, State1, Timeout}.
+
+reply(Reply, State) ->
+ {State1, Timeout} = next_state(State),
+ {reply, Reply, State1, Timeout}.
+
+next_state(State = #msstate { sync_timer_ref = undefined,
+ cref_to_msg_ids = CTM }) ->
+ case maps:size(CTM) of
+ 0 -> {State, hibernate};
+ _ -> {start_sync_timer(State), 0}
+ end;
+next_state(State = #msstate { cref_to_msg_ids = CTM }) ->
+ case maps:size(CTM) of
+ 0 -> {stop_sync_timer(State), hibernate};
+ _ -> {State, 0}
+ end.
+
+start_sync_timer(State) ->
+ rabbit_misc:ensure_timer(State, #msstate.sync_timer_ref,
+ ?SYNC_INTERVAL, sync).
+
+stop_sync_timer(State) ->
+ rabbit_misc:stop_timer(State, #msstate.sync_timer_ref).
+
+internal_sync(State = #msstate { current_file_handle = CurHdl,
+ cref_to_msg_ids = CTM }) ->
+ State1 = stop_sync_timer(State),
+ CGs = maps:fold(fun (CRef, MsgIds, NS) ->
+ case gb_sets:is_empty(MsgIds) of
+ true -> NS;
+ false -> [{CRef, MsgIds} | NS]
+ end
+ end, [], CTM),
+ ok = case CGs of
+ [] -> ok;
+ _ -> file_handle_cache:sync(CurHdl)
+ end,
+ lists:foldl(fun ({CRef, MsgIds}, StateN) ->
+ client_confirm(CRef, MsgIds, written, StateN)
+ end, State1, CGs).
+
+update_flying(Diff, MsgId, CRef, #msstate { flying_ets = FlyingEts }) ->
+ Key = {MsgId, CRef},
+ NDiff = -Diff,
+ case ets:lookup(FlyingEts, Key) of
+ [] -> ignore;
+ [{_, Diff}] -> ignore; %% [1]
+ [{_, NDiff}] -> ets:update_counter(FlyingEts, Key, {2, Diff}),
+ true = ets:delete_object(FlyingEts, {Key, 0}),
+ process;
+ [{_, 0}] -> true = ets:delete_object(FlyingEts, {Key, 0}),
+ ignore;
+ [{_, Err}] -> throw({bad_flying_ets_record, Diff, Err, Key})
+ end.
+%% [1] We can get here, for example, in the following scenario: There
+%% is a write followed by a remove in flight. The counter will be 0,
+%% so on processing the write the server attempts to delete the
+%% entry. If at that point the client injects another write it will
+%% either insert a new entry, containing +1, or increment the existing
+%% entry to +1, thus preventing its removal. Either way, when the
+%% server processes the remove, the counter will be +1.
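+%%
+%% As a timeline (a sketch of the scenario above):
+%%
+%%   client: W1(+1)  R1(-1)  ...............  W2(+1)
+%%   server: processes W1 at counter 0  -> deletes entry, ignores
+%%           processes R1 at counter +1 -> counter =:= Diff, case [1]
+%%           processes W2 at counter +1 -> counter drops to 0, processes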
+
+write_action({true, not_found}, _MsgId, State) ->
+ {ignore, undefined, State};
+write_action({true, #msg_location { file = File }}, _MsgId, State) ->
+ {ignore, File, State};
+write_action({false, not_found}, _MsgId, State) ->
+ {write, State};
+write_action({Mask, #msg_location { ref_count = 0, file = File,
+ total_size = TotalSize }},
+ MsgId, State = #msstate { file_summary_ets = FileSummaryEts }) ->
+ case {Mask, ets:lookup(FileSummaryEts, File)} of
+ {false, [#file_summary { locked = true }]} ->
+ ok = index_delete(MsgId, State),
+ {write, State};
+ {false_if_increment, [#file_summary { locked = true }]} ->
+ %% The msg for MsgId is older than the client death
+ %% message, but as it is being GC'd currently we'll have
+ %% to write a new copy, which will then be younger, so
+ %% ignore this write.
+ {ignore, File, State};
+ {_Mask, [#file_summary {}]} ->
+ ok = index_update_ref_count(MsgId, 1, State),
+ State1 = adjust_valid_total_size(File, TotalSize, State),
+ {confirm, File, State1}
+ end;
+write_action({_Mask, #msg_location { ref_count = RefCount, file = File }},
+ MsgId, State) ->
+ ok = index_update_ref_count(MsgId, RefCount + 1, State),
+ %% We already know about it, just update counter. Only update
+ %% field otherwise bad interaction with concurrent GC
+ {confirm, File, State}.
+
+write_message(MsgId, Msg, CRef,
+ State = #msstate { cur_file_cache_ets = CurFileCacheEts }) ->
+ case write_action(should_mask_action(CRef, MsgId, State), MsgId, State) of
+ {write, State1} ->
+ write_message(MsgId, Msg,
+ record_pending_confirm(CRef, MsgId, State1));
+ {ignore, CurFile, State1 = #msstate { current_file = CurFile }} ->
+ State1;
+ {ignore, _File, State1} ->
+ true = ets:delete_object(CurFileCacheEts, {MsgId, Msg, 0}),
+ State1;
+ {confirm, CurFile, State1 = #msstate { current_file = CurFile }}->
+ record_pending_confirm(CRef, MsgId, State1);
+ {confirm, _File, State1} ->
+ true = ets:delete_object(CurFileCacheEts, {MsgId, Msg, 0}),
+ update_pending_confirms(
+ fun (MsgOnDiskFun, CTM) ->
+ MsgOnDiskFun(gb_sets:singleton(MsgId), written),
+ CTM
+ end, CRef, State1)
+ end.
+
+remove_message(MsgId, CRef,
+ State = #msstate { file_summary_ets = FileSummaryEts }) ->
+ case should_mask_action(CRef, MsgId, State) of
+ {true, _Location} ->
+ State;
+ {false_if_increment, #msg_location { ref_count = 0 }} ->
+ %% CRef has tried to both write and remove this msg whilst
+ %% it's being GC'd.
+ %%
+ %% ASSERTION: [#file_summary { locked = true }] =
+ %% ets:lookup(FileSummaryEts, File),
+ State;
+ {_Mask, #msg_location { ref_count = RefCount, file = File,
+ total_size = TotalSize }}
+ when RefCount > 0 ->
+ %% only update field, otherwise bad interaction with
+ %% concurrent GC
+ Dec = fun () -> index_update_ref_count(
+ MsgId, RefCount - 1, State) end,
+ case RefCount of
+ %% don't remove from cur_file_cache_ets here because
+ %% there may be further writes in the mailbox for the
+ %% same msg.
+ 1 -> case ets:lookup(FileSummaryEts, File) of
+ [#file_summary { locked = true }] ->
+ add_to_pending_gc_completion(
+ {remove, MsgId, CRef}, File, State);
+ [#file_summary {}] ->
+ ok = Dec(),
+ delete_file_if_empty(
+ File, adjust_valid_total_size(
+ File, -TotalSize, State))
+ end;
+ _ -> ok = Dec(),
+ State
+ end
+ end.
+
+write_message(MsgId, Msg,
+ State = #msstate { current_file_handle = CurHdl,
+ current_file = CurFile,
+ sum_valid_data = SumValid,
+ sum_file_size = SumFileSize,
+ file_summary_ets = FileSummaryEts }) ->
+ {ok, CurOffset} = file_handle_cache:current_virtual_offset(CurHdl),
+ {ok, TotalSize} = rabbit_msg_file:append(CurHdl, MsgId, Msg),
+ ok = index_insert(
+ #msg_location { msg_id = MsgId, ref_count = 1, file = CurFile,
+ offset = CurOffset, total_size = TotalSize }, State),
+ [#file_summary { right = undefined, locked = false }] =
+ ets:lookup(FileSummaryEts, CurFile),
+ [_,_] = ets:update_counter(FileSummaryEts, CurFile,
+ [{#file_summary.valid_total_size, TotalSize},
+ {#file_summary.file_size, TotalSize}]),
+ maybe_roll_to_new_file(CurOffset + TotalSize,
+ State #msstate {
+ sum_valid_data = SumValid + TotalSize,
+ sum_file_size = SumFileSize + TotalSize }).
+
+read_message(MsgId, From, State) ->
+ case index_lookup_positive_ref_count(MsgId, State) of
+ not_found -> gen_server2:reply(From, not_found),
+ State;
+ MsgLocation -> read_message1(From, MsgLocation, State)
+ end.
+
+read_message1(From, #msg_location { msg_id = MsgId, file = File,
+ offset = Offset } = MsgLoc,
+ State = #msstate { current_file = CurFile,
+ current_file_handle = CurHdl,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts }) ->
+ case File =:= CurFile of
+ true -> {Msg, State1} =
+ %% the lookup can return [] if the msg was already in the file on startup
+ case ets:lookup(CurFileCacheEts, MsgId) of
+ [] ->
+ {ok, RawOffSet} =
+ file_handle_cache:current_raw_offset(CurHdl),
+ ok = case Offset >= RawOffSet of
+ true -> file_handle_cache:flush(CurHdl);
+ false -> ok
+ end,
+ read_from_disk(MsgLoc, State);
+ [{MsgId, Msg1, _CacheRefCount}] ->
+ {Msg1, State}
+ end,
+ gen_server2:reply(From, {ok, Msg}),
+ State1;
+ false -> [#file_summary { locked = Locked }] =
+ ets:lookup(FileSummaryEts, File),
+ case Locked of
+ true -> add_to_pending_gc_completion({read, MsgId, From},
+ File, State);
+ false -> {Msg, State1} = read_from_disk(MsgLoc, State),
+ gen_server2:reply(From, {ok, Msg}),
+ State1
+ end
+ end.
+
+read_from_disk(#msg_location { msg_id = MsgId, file = File, offset = Offset,
+ total_size = TotalSize }, State) ->
+ {Hdl, State1} = get_read_handle(File, State),
+ {ok, Offset} = file_handle_cache:position(Hdl, Offset),
+ {ok, {MsgId, Msg}} =
+ case rabbit_msg_file:read(Hdl, TotalSize) of
+ {ok, {MsgId, _}} = Obj ->
+ Obj;
+ Rest ->
+ {error, {misread, [{old_state, State},
+ {file_num, File},
+ {offset, Offset},
+ {msg_id, MsgId},
+ {read, Rest},
+ {proc_dict, get()}
+ ]}}
+ end,
+ {Msg, State1}.
+
+contains_message(MsgId, From,
+ State = #msstate { pending_gc_completion = Pending }) ->
+ case index_lookup_positive_ref_count(MsgId, State) of
+ not_found ->
+ gen_server2:reply(From, false),
+ State;
+ #msg_location { file = File } ->
+ case maps:is_key(File, Pending) of
+ true -> add_to_pending_gc_completion(
+ {contains, MsgId, From}, File, State);
+ false -> gen_server2:reply(From, true),
+ State
+ end
+ end.
+
+add_to_pending_gc_completion(
+ Op, File, State = #msstate { pending_gc_completion = Pending }) ->
+ State #msstate { pending_gc_completion =
+ rabbit_misc:maps_cons(File, Op, Pending) }.
+
+run_pending(Files, State) ->
+ lists:foldl(
+ fun (File, State1 = #msstate { pending_gc_completion = Pending }) ->
+ Pending1 = maps:remove(File, Pending),
+ lists:foldl(
+ fun run_pending_action/2,
+ State1 #msstate { pending_gc_completion = Pending1 },
+ lists:reverse(maps:get(File, Pending)))
+ end, State, Files).
+
+run_pending_action({read, MsgId, From}, State) ->
+ read_message(MsgId, From, State);
+run_pending_action({contains, MsgId, From}, State) ->
+ contains_message(MsgId, From, State);
+run_pending_action({remove, MsgId, CRef}, State) ->
+ remove_message(MsgId, CRef, State).
+
+safe_ets_update_counter(Tab, Key, UpdateOp, SuccessFun, FailThunk) ->
+ try
+ SuccessFun(ets:update_counter(Tab, Key, UpdateOp))
+ catch error:badarg -> FailThunk()
+ end.
+
+update_msg_cache(CacheEts, MsgId, Msg) ->
+ case ets:insert_new(CacheEts, {MsgId, Msg, 1}) of
+ true -> ok;
+ false -> safe_ets_update_counter(
+ CacheEts, MsgId, {3, +1}, fun (_) -> ok end,
+ fun () -> update_msg_cache(CacheEts, MsgId, Msg) end)
+ end.
+
+adjust_valid_total_size(File, Delta, State = #msstate {
+ sum_valid_data = SumValid,
+ file_summary_ets = FileSummaryEts }) ->
+ [_] = ets:update_counter(FileSummaryEts, File,
+ [{#file_summary.valid_total_size, Delta}]),
+ State #msstate { sum_valid_data = SumValid + Delta }.
+
+maps_store(Key, Val, Dict) ->
+ false = maps:is_key(Key, Dict),
+ maps:put(Key, Val, Dict).
+
+update_pending_confirms(Fun, CRef,
+ State = #msstate { clients = Clients,
+ cref_to_msg_ids = CTM }) ->
+ case maps:get(CRef, Clients) of
+ {_CPid, undefined, _CloseFDsFun} -> State;
+ {_CPid, MsgOnDiskFun, _CloseFDsFun} -> CTM1 = Fun(MsgOnDiskFun, CTM),
+ State #msstate {
+ cref_to_msg_ids = CTM1 }
+ end.
+
+record_pending_confirm(CRef, MsgId, State) ->
+ update_pending_confirms(
+ fun (_MsgOnDiskFun, CTM) ->
+ NewMsgIds = case maps:find(CRef, CTM) of
+ error -> gb_sets:singleton(MsgId);
+ {ok, MsgIds} -> gb_sets:add(MsgId, MsgIds)
+ end,
+ maps:put(CRef, NewMsgIds, CTM)
+ end, CRef, State).
+
+client_confirm(CRef, MsgIds, ActionTaken, State) ->
+ update_pending_confirms(
+ fun (MsgOnDiskFun, CTM) ->
+ case maps:find(CRef, CTM) of
+ {ok, Gs} -> MsgOnDiskFun(gb_sets:intersection(Gs, MsgIds),
+ ActionTaken),
+ MsgIds1 = rabbit_misc:gb_sets_difference(
+ Gs, MsgIds),
+ case gb_sets:is_empty(MsgIds1) of
+ true -> maps:remove(CRef, CTM);
+ false -> maps:put(CRef, MsgIds1, CTM)
+ end;
+ error -> CTM
+ end
+ end, CRef, State).
+
+blind_confirm(CRef, MsgIds, ActionTaken, State) ->
+ update_pending_confirms(
+ fun (MsgOnDiskFun, CTM) -> MsgOnDiskFun(MsgIds, ActionTaken), CTM end,
+ CRef, State).
+
+%% Detect whether the MsgId is older or younger than the client's death
+%% msg (if there is one). If the msg is older than the client death
+%% msg, and it has a 0 ref_count, we must only alter the ref_count, not
+%% rewrite the msg - rewriting it would make it younger than the death
+%% msg, so such a write must be ignored. Note that this (correctly)
+%% returns false when testing to remove the death msg itself.
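+%%
+%% For example (a sketch): with a death msg recorded at {File = 3,
+%% Offset = 1024}, a msg located at {3, 512} compares smaller - it is
+%% older than the death msg and is subject to the ref_count rules above
+%% - whereas a msg at {4, 0} compares greater, i.e. younger, and the
+%% action on it is masked.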
+should_mask_action(CRef, MsgId,
+ State = #msstate{dying_clients = DyingClients}) ->
+ case {maps:find(CRef, DyingClients), index_lookup(MsgId, State)} of
+ {error, Location} ->
+ {false, Location};
+ {{ok, _}, not_found} ->
+ {true, not_found};
+ {{ok, Client}, #msg_location { file = File, offset = Offset,
+ ref_count = RefCount } = Location} ->
+ #dying_client{file = DeathFile, offset = DeathOffset} = Client,
+ {case {{DeathFile, DeathOffset} < {File, Offset}, RefCount} of
+ {true, _} -> true;
+ {false, 0} -> false_if_increment;
+ {false, _} -> false
+ end, Location}
+ end.
+
+%%----------------------------------------------------------------------------
+%% file helper functions
+%%----------------------------------------------------------------------------
+
+open_file(File, Mode) ->
+ file_handle_cache:open_with_absolute_path(
+ File, ?BINARY_MODE ++ Mode,
+ [{write_buffer, ?HANDLE_CACHE_BUFFER_SIZE},
+ {read_buffer, ?HANDLE_CACHE_BUFFER_SIZE}]).
+
+open_file(Dir, FileName, Mode) ->
+ open_file(form_filename(Dir, FileName), Mode).
+
+close_handle(Key, CState = #client_msstate { file_handle_cache = FHC }) ->
+ CState #client_msstate { file_handle_cache = close_handle(Key, FHC) };
+
+close_handle(Key, State = #msstate { file_handle_cache = FHC }) ->
+ State #msstate { file_handle_cache = close_handle(Key, FHC) };
+
+close_handle(Key, FHC) ->
+ case maps:find(Key, FHC) of
+ {ok, Hdl} -> ok = file_handle_cache:close(Hdl),
+ maps:remove(Key, FHC);
+ error -> FHC
+ end.
+
+mark_handle_open(FileHandlesEts, File, Ref) ->
+ %% This is fine to fail (the entry may already exist). Note it could
+ %% fail with the existing value being 'close', in which case the value
+ %% is not updated to 'open'.
+ ets:insert_new(FileHandlesEts, {{Ref, File}, open}),
+ true.
+
+%% See comment in client_read3 - only call this when the file is locked
+mark_handle_to_close(ClientRefs, FileHandlesEts, File, Invoke) ->
+ [ begin
+ case (ets:update_element(FileHandlesEts, Key, {2, close})
+ andalso Invoke) of
+ true -> case maps:get(Ref, ClientRefs) of
+ {_CPid, _MsgOnDiskFun, undefined} ->
+ ok;
+ {_CPid, _MsgOnDiskFun, CloseFDsFun} ->
+ ok = CloseFDsFun()
+ end;
+ false -> ok
+ end
+ end || {{Ref, _File} = Key, open} <-
+ ets:match_object(FileHandlesEts, {{'_', File}, open}) ],
+ true.
+
+safe_file_delete_fun(File, Dir, FileHandlesEts) ->
+ fun () -> safe_file_delete(File, Dir, FileHandlesEts) end.
+
+safe_file_delete(File, Dir, FileHandlesEts) ->
+ %% do not match on any value - it's the absence of the row that
+ %% indicates the client has really closed the file.
+ case ets:match_object(FileHandlesEts, {{'_', File}, '_'}, 1) of
+ {[_|_], _Cont} -> false;
+ _ -> ok = file:delete(
+ form_filename(Dir, filenum_to_name(File))),
+ true
+ end.
+
+-spec close_all_indicated
+ (client_msstate()) -> rabbit_types:ok(client_msstate()).
+
+close_all_indicated(#client_msstate { file_handles_ets = FileHandlesEts,
+ client_ref = Ref } =
+ CState) ->
+ Objs = ets:match_object(FileHandlesEts, {{Ref, '_'}, close}),
+ {ok, lists:foldl(fun ({Key = {_Ref, File}, close}, CStateM) ->
+ true = ets:delete(FileHandlesEts, Key),
+ close_handle(File, CStateM)
+ end, CState, Objs)}.
+
+close_all_handles(CState = #client_msstate { file_handles_ets = FileHandlesEts,
+ file_handle_cache = FHC,
+ client_ref = Ref }) ->
+ ok = maps:fold(fun (File, Hdl, ok) ->
+ true = ets:delete(FileHandlesEts, {Ref, File}),
+ file_handle_cache:close(Hdl)
+ end, ok, FHC),
+ CState #client_msstate { file_handle_cache = #{} };
+
+close_all_handles(State = #msstate { file_handle_cache = FHC }) ->
+ ok = maps:fold(fun (_Key, Hdl, ok) -> file_handle_cache:close(Hdl) end,
+ ok, FHC),
+ State #msstate { file_handle_cache = #{} }.
+
+get_read_handle(FileNum, CState = #client_msstate { file_handle_cache = FHC,
+ dir = Dir }) ->
+ {Hdl, FHC2} = get_read_handle(FileNum, FHC, Dir),
+ {Hdl, CState #client_msstate { file_handle_cache = FHC2 }};
+
+get_read_handle(FileNum, State = #msstate { file_handle_cache = FHC,
+ dir = Dir }) ->
+ {Hdl, FHC2} = get_read_handle(FileNum, FHC, Dir),
+ {Hdl, State #msstate { file_handle_cache = FHC2 }}.
+
+get_read_handle(FileNum, FHC, Dir) ->
+ case maps:find(FileNum, FHC) of
+ {ok, Hdl} -> {Hdl, FHC};
+ error -> {ok, Hdl} = open_file(Dir, filenum_to_name(FileNum),
+ ?READ_MODE),
+ {Hdl, maps:put(FileNum, Hdl, FHC)}
+ end.
+
+preallocate(Hdl, FileSizeLimit, FinalPos) ->
+ {ok, FileSizeLimit} = file_handle_cache:position(Hdl, FileSizeLimit),
+ ok = file_handle_cache:truncate(Hdl),
+ {ok, FinalPos} = file_handle_cache:position(Hdl, FinalPos),
+ ok.
+
+truncate_and_extend_file(Hdl, Lowpoint, Highpoint) ->
+ {ok, Lowpoint} = file_handle_cache:position(Hdl, Lowpoint),
+ ok = file_handle_cache:truncate(Hdl),
+ ok = preallocate(Hdl, Highpoint, Lowpoint).
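+
+%% An illustrative call (hypothetical values): truncate_and_extend_file(Hdl,
+%% 4096, 1048576) truncates the file to 4096 bytes, preallocates it out to
+%% 1048576 bytes by positioning at the limit and truncating there, and
+%% leaves the handle positioned back at 4096, ready for the next append.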
+
+form_filename(Dir, Name) -> filename:join(Dir, Name).
+
+filenum_to_name(File) -> integer_to_list(File) ++ ?FILE_EXTENSION.
+
+filename_to_num(FileName) -> list_to_integer(filename:rootname(FileName)).
+
+list_sorted_filenames(Dir, Ext) ->
+ lists:sort(fun (A, B) -> filename_to_num(A) < filename_to_num(B) end,
+ filelib:wildcard("*" ++ Ext, Dir)).
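+
+%% Note the numeric (not lexicographic) sort: a directory containing, say,
+%% "1.rdq", "2.rdq" and "10.rdq" is returned as ["1.rdq", "2.rdq", "10.rdq"],
+%% whereas a plain lists:sort/1 on the names would yield
+%% ["1.rdq", "10.rdq", "2.rdq"].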
+
+%%----------------------------------------------------------------------------
+%% index
+%%----------------------------------------------------------------------------
+
+index_lookup_positive_ref_count(Key, State) ->
+ case index_lookup(Key, State) of
+ not_found -> not_found;
+ #msg_location { ref_count = 0 } -> not_found;
+ #msg_location {} = MsgLocation -> MsgLocation
+ end.
+
+index_update_ref_count(Key, RefCount, State) ->
+ index_update_fields(Key, {#msg_location.ref_count, RefCount}, State).
+
+index_lookup(Key, #gc_state { index_module = Index,
+ index_state = State }) ->
+ Index:lookup(Key, State);
+
+index_lookup(Key, #client_msstate { index_module = Index,
+ index_state = State }) ->
+ Index:lookup(Key, State);
+
+index_lookup(Key, #msstate { index_module = Index, index_state = State }) ->
+ Index:lookup(Key, State).
+
+index_insert(Obj, #msstate { index_module = Index, index_state = State }) ->
+ Index:insert(Obj, State).
+
+index_update(Obj, #msstate { index_module = Index, index_state = State }) ->
+ Index:update(Obj, State).
+
+index_update_fields(Key, Updates, #msstate{ index_module = Index,
+ index_state = State }) ->
+ Index:update_fields(Key, Updates, State);
+index_update_fields(Key, Updates, #gc_state{ index_module = Index,
+ index_state = State }) ->
+ Index:update_fields(Key, Updates, State).
+
+index_delete(Key, #msstate { index_module = Index, index_state = State }) ->
+ Index:delete(Key, State).
+
+index_delete_object(Obj, #gc_state{ index_module = Index,
+ index_state = State }) ->
+ Index:delete_object(Obj, State).
+
+index_clean_up_temporary_reference_count_entries(
+ #msstate { index_module = Index,
+ index_state = State }) ->
+ Index:clean_up_temporary_reference_count_entries_without_file(State).
+
+%%----------------------------------------------------------------------------
+%% shutdown and recovery
+%%----------------------------------------------------------------------------
+
+recover_index_and_client_refs(IndexModule, _Recover, undefined, Dir, _Name) ->
+ {false, IndexModule:new(Dir), []};
+recover_index_and_client_refs(IndexModule, false, _ClientRefs, Dir, Name) ->
+ rabbit_log:warning("Message store ~tp: rebuilding indices from scratch~n", [Name]),
+ {false, IndexModule:new(Dir), []};
+recover_index_and_client_refs(IndexModule, true, ClientRefs, Dir, Name) ->
+ Fresh = fun (ErrorMsg, ErrorArgs) ->
+ rabbit_log:warning("Message store ~tp : " ++ ErrorMsg ++ "~n"
+ "rebuilding indices from scratch~n",
+ [Name | ErrorArgs]),
+ {false, IndexModule:new(Dir), []}
+ end,
+ case read_recovery_terms(Dir) of
+ {false, Error} ->
+ Fresh("failed to read recovery terms: ~p", [Error]);
+ {true, Terms} ->
+ RecClientRefs = proplists:get_value(client_refs, Terms, []),
+ RecIndexModule = proplists:get_value(index_module, Terms),
+ case (lists:sort(ClientRefs) =:= lists:sort(RecClientRefs)
+ andalso IndexModule =:= RecIndexModule) of
+ true -> case IndexModule:recover(Dir) of
+ {ok, IndexState1} ->
+ {true, IndexState1, ClientRefs};
+ {error, Error} ->
+ Fresh("failed to recover index: ~p", [Error])
+ end;
+ false -> Fresh("recovery terms differ from present", [])
+ end
+ end.
+
+store_recovery_terms(Terms, Dir) ->
+ rabbit_file:write_term_file(filename:join(Dir, ?CLEAN_FILENAME), Terms).
+
+read_recovery_terms(Dir) ->
+ Path = filename:join(Dir, ?CLEAN_FILENAME),
+ case rabbit_file:read_term_file(Path) of
+ {ok, Terms} -> case file:delete(Path) of
+ ok -> {true, Terms};
+ {error, Error} -> {false, Error}
+ end;
+ {error, Error} -> {false, Error}
+ end.
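+
+%% For illustration only (the exact terms are whatever was passed to
+%% store_recovery_terms/2 at shutdown, and the ref values below are
+%% hypothetical), a clean-shutdown ?CLEAN_FILENAME might contain
+%%   [{client_refs, [<<"client-ref-1">>, <<"client-ref-2">>]},
+%%    {index_module, rabbit_msg_store_ets_index}].
+%% Deleting the file on read means a crash before the next clean shutdown
+%% cannot mistake stale terms for a clean state.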
+
+store_file_summary(Tid, Dir) ->
+ ets:tab2file(Tid, filename:join(Dir, ?FILE_SUMMARY_FILENAME),
+ [{extended_info, [object_count]}]).
+
+recover_file_summary(false, _Dir) ->
+ %% TODO: the only reason for this to be an *ordered*_set is so
+ %% that a) maybe_compact can start a traversal from the eldest
+ %% file, and b) build_index in fast recovery mode can easily
+ %% identify the current file. It's awkward to have both that
+ %% ordering and the left/right pointers in the entries - replacing
+ %% the former with some additional bit of state would be easy, but
+ %% ditching the latter would be neater.
+ {false, ets:new(rabbit_msg_store_file_summary,
+ [ordered_set, public, {keypos, #file_summary.file}])};
+recover_file_summary(true, Dir) ->
+ Path = filename:join(Dir, ?FILE_SUMMARY_FILENAME),
+ case ets:file2tab(Path) of
+ {ok, Tid} -> ok = file:delete(Path),
+ {true, Tid};
+ {error, _Error} -> recover_file_summary(false, Dir)
+ end.
+
+count_msg_refs(Gen, Seed, State) ->
+ case Gen(Seed) of
+ finished ->
+ ok;
+ {_MsgId, 0, Next} ->
+ count_msg_refs(Gen, Next, State);
+ {MsgId, Delta, Next} ->
+ ok = case index_lookup(MsgId, State) of
+ not_found ->
+ index_insert(#msg_location { msg_id = MsgId,
+ file = undefined,
+ ref_count = Delta },
+ State);
+ #msg_location { ref_count = RefCount } = StoreEntry ->
+ NewRefCount = RefCount + Delta,
+ case NewRefCount of
+ 0 -> index_delete(MsgId, State);
+ _ -> index_update(StoreEntry #msg_location {
+ ref_count = NewRefCount },
+ State)
+ end
+ end,
+ count_msg_refs(Gen, Next, State)
+ end.
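+
+%% A minimal sketch of the generator contract assumed by count_msg_refs/3:
+%% Gen(Seed) returns either 'finished' or {MsgId, Delta, NextSeed}. E.g. a
+%% generator over a hypothetical list of {MsgId, Delta} pairs:
+%%   Gen = fun ([])                      -> finished;
+%%             ([{MsgId, Delta} | Rest]) -> {MsgId, Delta, Rest}
+%%         end,
+%%   ok = count_msg_refs(Gen, Deltas, State).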
+
+recover_crashed_compactions(Dir) ->
+ FileNames = list_sorted_filenames(Dir, ?FILE_EXTENSION),
+ TmpFileNames = list_sorted_filenames(Dir, ?FILE_EXTENSION_TMP),
+ lists:foreach(
+ fun (TmpFileName) ->
+ NonTmpRelatedFileName =
+ filename:rootname(TmpFileName) ++ ?FILE_EXTENSION,
+ true = lists:member(NonTmpRelatedFileName, FileNames),
+ ok = recover_crashed_compaction(
+ Dir, TmpFileName, NonTmpRelatedFileName)
+ end, TmpFileNames),
+ TmpFileNames == [].
+
+recover_crashed_compaction(Dir, TmpFileName, NonTmpRelatedFileName) ->
+ %% Because a msg can legitimately appear multiple times in the
+ %% same file, identifying the contents of the tmp file and where
+ %% they came from is non-trivial. If we are recovering a crashed
+ %% compaction then we will be rebuilding the index, which can cope
+ %% with duplicates appearing. Thus the simplest and safest thing
+ %% to do is to append the contents of the tmp file to its main
+ %% file.
+ {ok, TmpHdl} = open_file(Dir, TmpFileName, ?READ_MODE),
+ {ok, MainHdl} = open_file(Dir, NonTmpRelatedFileName,
+ ?READ_MODE ++ ?WRITE_MODE),
+ {ok, _End} = file_handle_cache:position(MainHdl, eof),
+ Size = filelib:file_size(form_filename(Dir, TmpFileName)),
+ {ok, Size} = file_handle_cache:copy(TmpHdl, MainHdl, Size),
+ ok = file_handle_cache:close(MainHdl),
+ ok = file_handle_cache:delete(TmpHdl),
+ ok.
+
+scan_file_for_valid_messages(File) ->
+ case open_file(File, ?READ_MODE) of
+ {ok, Hdl} -> Valid = rabbit_msg_file:scan(
+ Hdl, filelib:file_size(File),
+ fun scan_fun/2, []),
+ ok = file_handle_cache:close(Hdl),
+ Valid;
+ {error, enoent} -> {ok, [], 0};
+ {error, Reason} -> {error, {unable_to_scan_file,
+ filename:basename(File),
+ Reason}}
+ end.
+
+scan_file_for_valid_messages(Dir, FileName) ->
+ scan_file_for_valid_messages(form_filename(Dir, FileName)).
+
+scan_fun({MsgId, TotalSize, Offset, _Msg}, Acc) ->
+ [{MsgId, TotalSize, Offset} | Acc].
+
+%% Takes the list in *ascending* offset order (i.e. eldest message
+%% first); note that this is the opposite of what
+%% scan_file_for_valid_messages produces, which is youngest first.
+drop_contiguous_block_prefix(L) -> drop_contiguous_block_prefix(L, 0).
+
+drop_contiguous_block_prefix([], ExpectedOffset) ->
+ {ExpectedOffset, []};
+drop_contiguous_block_prefix([#msg_location { offset = ExpectedOffset,
+ total_size = TotalSize } | Tail],
+ ExpectedOffset) ->
+ ExpectedOffset1 = ExpectedOffset + TotalSize,
+ drop_contiguous_block_prefix(Tail, ExpectedOffset1);
+drop_contiguous_block_prefix(MsgsAfterGap, ExpectedOffset) ->
+ {ExpectedOffset, MsgsAfterGap}.
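+
+%% Worked example (hypothetical offsets/sizes): for locations at
+%% {offset 0, size 10}, {offset 10, size 5} and {offset 20, size 8}, the
+%% first two form a contiguous prefix but the third leaves a gap at 15, so
+%% the result is {15, [the location at offset 20]}; with no gaps at all the
+%% whole list is dropped and the returned offset is just past the last
+%% message.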
+
+build_index(true, _StartupFunState,
+ State = #msstate { file_summary_ets = FileSummaryEts }) ->
+ ets:foldl(
+ fun (#file_summary { valid_total_size = ValidTotalSize,
+ file_size = FileSize,
+ file = File },
+ {_Offset, State1 = #msstate { sum_valid_data = SumValid,
+ sum_file_size = SumFileSize }}) ->
+ {FileSize, State1 #msstate {
+ sum_valid_data = SumValid + ValidTotalSize,
+ sum_file_size = SumFileSize + FileSize,
+ current_file = File }}
+ end, {0, State}, FileSummaryEts);
+build_index(false, {MsgRefDeltaGen, MsgRefDeltaGenInit},
+ State = #msstate { dir = Dir }) ->
+ rabbit_log:debug("Rebuilding message refcount...~n", []),
+ ok = count_msg_refs(MsgRefDeltaGen, MsgRefDeltaGenInit, State),
+ rabbit_log:debug("Done rebuilding message refcount~n", []),
+ {ok, Pid} = gatherer:start_link(),
+ case [filename_to_num(FileName) ||
+ FileName <- list_sorted_filenames(Dir, ?FILE_EXTENSION)] of
+ [] -> rebuild_index(Pid, [State #msstate.current_file],
+ State);
+ Files -> {Offset, State1} = rebuild_index(Pid, Files, State),
+ {Offset, lists:foldl(fun delete_file_if_empty/2,
+ State1, Files)}
+ end.
+
+build_index_worker(Gatherer, State = #msstate { dir = Dir },
+ Left, File, Files) ->
+ FileName = filenum_to_name(File),
+ rabbit_log:debug("Rebuilding message location index from ~p (~B file(s) remaining)~n",
+ [form_filename(Dir, FileName), length(Files)]),
+ {ok, Messages, FileSize} =
+ scan_file_for_valid_messages(Dir, FileName),
+ {ValidMessages, ValidTotalSize} =
+ lists:foldl(
+ fun (Obj = {MsgId, TotalSize, Offset}, {VMAcc, VTSAcc}) ->
+ case index_lookup(MsgId, State) of
+ #msg_location { file = undefined } = StoreEntry ->
+ ok = index_update(StoreEntry #msg_location {
+ file = File, offset = Offset,
+ total_size = TotalSize },
+ State),
+ {[Obj | VMAcc], VTSAcc + TotalSize};
+ _ ->
+ {VMAcc, VTSAcc}
+ end
+ end, {[], 0}, Messages),
+ {Right, FileSize1} =
+ case Files of
+ %% if it's the last file, we'll truncate to remove any
+ %% rubbish above the last valid message. This affects the
+ %% file size.
+ [] -> {undefined, case ValidMessages of
+ [] -> 0;
+ _ -> {_MsgId, TotalSize, Offset} =
+ lists:last(ValidMessages),
+ Offset + TotalSize
+ end};
+ [F|_] -> {F, FileSize}
+ end,
+ ok = gatherer:in(Gatherer, #file_summary {
+ file = File,
+ valid_total_size = ValidTotalSize,
+ left = Left,
+ right = Right,
+ file_size = FileSize1,
+ locked = false,
+ readers = 0 }),
+ ok = gatherer:finish(Gatherer).
+
+enqueue_build_index_workers(_Gatherer, _Left, [], _State) ->
+ exit(normal);
+enqueue_build_index_workers(Gatherer, Left, [File|Files], State) ->
+ ok = worker_pool:dispatch_sync(
+ fun () ->
+ link(Gatherer),
+ ok = build_index_worker(Gatherer, State,
+ Left, File, Files),
+ unlink(Gatherer),
+ ok
+ end),
+ enqueue_build_index_workers(Gatherer, File, Files, State).
+
+reduce_index(Gatherer, LastFile,
+ State = #msstate { file_summary_ets = FileSummaryEts,
+ sum_valid_data = SumValid,
+ sum_file_size = SumFileSize }) ->
+ case gatherer:out(Gatherer) of
+ empty ->
+ ok = gatherer:stop(Gatherer),
+ ok = index_clean_up_temporary_reference_count_entries(State),
+ Offset = case ets:lookup(FileSummaryEts, LastFile) of
+ [] -> 0;
+ [#file_summary { file_size = FileSize }] -> FileSize
+ end,
+ {Offset, State #msstate { current_file = LastFile }};
+ {value, #file_summary { valid_total_size = ValidTotalSize,
+ file_size = FileSize } = FileSummary} ->
+ true = ets:insert_new(FileSummaryEts, FileSummary),
+ reduce_index(Gatherer, LastFile,
+ State #msstate {
+ sum_valid_data = SumValid + ValidTotalSize,
+ sum_file_size = SumFileSize + FileSize })
+ end.
+
+rebuild_index(Gatherer, Files, State) ->
+ lists:foreach(fun (_File) ->
+ ok = gatherer:fork(Gatherer)
+ end, Files),
+ Pid = spawn(
+ fun () ->
+ enqueue_build_index_workers(Gatherer, undefined,
+ Files, State)
+ end),
+ erlang:monitor(process, Pid),
+ reduce_index(Gatherer, lists:last(Files), State).
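+
+%% The gatherer protocol assumed here: one gatherer:fork/1 per file is
+%% balanced by the gatherer:finish/1 each worker calls when done; workers
+%% hand their #file_summary{} over via gatherer:in/2 and reduce_index/3
+%% drains them with gatherer:out/1, which only returns 'empty' once every
+%% forked producer has finished.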
+
+%%----------------------------------------------------------------------------
+%% garbage collection / compaction / aggregation -- internal
+%%----------------------------------------------------------------------------
+
+maybe_roll_to_new_file(
+ Offset,
+ State = #msstate { dir = Dir,
+ current_file_handle = CurHdl,
+ current_file = CurFile,
+ file_summary_ets = FileSummaryEts,
+ cur_file_cache_ets = CurFileCacheEts,
+ file_size_limit = FileSizeLimit })
+ when Offset >= FileSizeLimit ->
+ State1 = internal_sync(State),
+ ok = file_handle_cache:close(CurHdl),
+ NextFile = CurFile + 1,
+ {ok, NextHdl} = open_file(Dir, filenum_to_name(NextFile), ?WRITE_MODE),
+ true = ets:insert_new(FileSummaryEts, #file_summary {
+ file = NextFile,
+ valid_total_size = 0,
+ left = CurFile,
+ right = undefined,
+ file_size = 0,
+ locked = false,
+ readers = 0 }),
+ true = ets:update_element(FileSummaryEts, CurFile,
+ {#file_summary.right, NextFile}),
+ true = ets:match_delete(CurFileCacheEts, {'_', '_', 0}),
+ maybe_compact(State1 #msstate { current_file_handle = NextHdl,
+ current_file = NextFile });
+maybe_roll_to_new_file(_, State) ->
+ State.
+
+maybe_compact(State = #msstate { sum_valid_data = SumValid,
+ sum_file_size = SumFileSize,
+ gc_pid = GCPid,
+ pending_gc_completion = Pending,
+ file_summary_ets = FileSummaryEts,
+ file_size_limit = FileSizeLimit })
+ when SumFileSize > 2 * FileSizeLimit andalso
+ (SumFileSize - SumValid) / SumFileSize > ?GARBAGE_FRACTION ->
+ %% TODO: the algorithm here is sub-optimal - it may result in a
+ %% complete traversal of FileSummaryEts.
+ First = ets:first(FileSummaryEts),
+ case First =:= '$end_of_table' orelse
+ maps:size(Pending) >= ?MAXIMUM_SIMULTANEOUS_GC_FILES of
+ true ->
+ State;
+ false ->
+ case find_files_to_combine(FileSummaryEts, FileSizeLimit,
+ ets:lookup(FileSummaryEts, First)) of
+ not_found ->
+ State;
+ {Src, Dst} ->
+ Pending1 = maps_store(Dst, [],
+ maps_store(Src, [], Pending)),
+ State1 = close_handle(Src, close_handle(Dst, State)),
+ true = ets:update_element(FileSummaryEts, Src,
+ {#file_summary.locked, true}),
+ true = ets:update_element(FileSummaryEts, Dst,
+ {#file_summary.locked, true}),
+ ok = rabbit_msg_store_gc:combine(GCPid, Src, Dst),
+ State1 #msstate { pending_gc_completion = Pending1 }
+ end
+ end;
+maybe_compact(State) ->
+ State.
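+
+%% A worked example of the guard above (values hypothetical, and assuming
+%% a ?GARBAGE_FRACTION of, say, 0.5): with FileSizeLimit = 16MB,
+%% SumFileSize = 48MB and SumValid = 20MB, we have 48MB > 32MB and a
+%% garbage fraction of (48 - 20) / 48 =~ 0.58 > 0.5, so we look for files
+%% to combine; at SumValid = 30MB the fraction drops to 0.375 and
+%% compaction is skipped.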
+
+find_files_to_combine(FileSummaryEts, FileSizeLimit,
+ [#file_summary { file = Dst,
+ valid_total_size = DstValid,
+ right = Src,
+ locked = DstLocked }]) ->
+ case Src of
+ undefined ->
+ not_found;
+ _ ->
+ [#file_summary { file = Src,
+ valid_total_size = SrcValid,
+ left = Dst,
+ right = SrcRight,
+ locked = SrcLocked }] = Next =
+ ets:lookup(FileSummaryEts, Src),
+ case SrcRight of
+ undefined -> not_found;
+ _ -> case (DstValid + SrcValid =< FileSizeLimit) andalso
+ (DstValid > 0) andalso (SrcValid > 0) andalso
+ not (DstLocked orelse SrcLocked) of
+ true -> {Src, Dst};
+ false -> find_files_to_combine(
+ FileSummaryEts, FileSizeLimit, Next)
+ end
+ end
+ end.
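+
+%% Illustrative walk (hypothetical sizes, FileSizeLimit = 100): given the
+%% chain 0 <-> 1 <-> 2 where file 0 holds 70 valid bytes, file 1 holds 60
+%% and file 2 is the current file, {0, 1} is rejected because 70 + 60 > 100
+%% and the walk moves right; file 2's right is undefined (it is the
+%% rightmost, current file), so the result is not_found. Had file 1 held
+%% only 20 valid bytes, {Src = 1, Dst = 0} would have been returned.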
+
+delete_file_if_empty(File, State = #msstate { current_file = File }) ->
+ State;
+delete_file_if_empty(File, State = #msstate {
+ gc_pid = GCPid,
+ file_summary_ets = FileSummaryEts,
+ pending_gc_completion = Pending }) ->
+ [#file_summary { valid_total_size = ValidData,
+ locked = false }] =
+ ets:lookup(FileSummaryEts, File),
+ case ValidData of
+ %% don't delete the file_summary_ets entry for File here
+ %% because we could have readers which need to be able to
+ %% decrement the readers count.
+ 0 -> true = ets:update_element(FileSummaryEts, File,
+ {#file_summary.locked, true}),
+ ok = rabbit_msg_store_gc:delete(GCPid, File),
+ Pending1 = maps_store(File, [], Pending),
+ close_handle(File,
+ State #msstate { pending_gc_completion = Pending1 });
+ _ -> State
+ end.
+
+cleanup_after_file_deletion(File,
+ #msstate { file_handles_ets = FileHandlesEts,
+ file_summary_ets = FileSummaryEts,
+ clients = Clients }) ->
+ %% Ensure that any clients with open fhs to the file close them
+ %% before using them again. This has to be done here, in the
+ %% msg_store rather than the gc, and after the GC has completed
+ %% rather than when it starts: if it were done at GC start, a client
+ %% could see the close request and then close and reopen the fh
+ %% while the GC is still waiting for readers to disappear - that is,
+ %% before the GC has actually been done.
+ true = mark_handle_to_close(Clients, FileHandlesEts, File, true),
+ [#file_summary { left = Left,
+ right = Right,
+ locked = true,
+ readers = 0 }] = ets:lookup(FileSummaryEts, File),
+ %% We'll never delete the current file, so right is never undefined
+ true = Right =/= undefined, %% ASSERTION
+ true = ets:update_element(FileSummaryEts, Right,
+ {#file_summary.left, Left}),
+ %% ensure the doubly-linked list is maintained
+ true = case Left of
+ undefined -> true; %% File is the eldest file (left-most)
+ _ -> ets:update_element(FileSummaryEts, Left,
+ {#file_summary.right, Right})
+ end,
+ true = ets:delete(FileSummaryEts, File),
+ ok.
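+
+%% E.g. (hypothetical chain): deleting file 3 from 2 <-> 3 <-> 4 sets
+%% 4.left := 2 and 2.right := 4 before the row for file 3 is removed; when
+%% the deleted file is the eldest (left = undefined), only the right
+%% neighbour's left pointer needs fixing.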
+
+%%----------------------------------------------------------------------------
+%% garbage collection / compaction / aggregation -- external
+%%----------------------------------------------------------------------------
+
+-spec combine_files(non_neg_integer(), non_neg_integer(), gc_state()) ->
+ {ok, deletion_thunk()} | {defer, [non_neg_integer()]}.
+
+combine_files(Source, Destination,
+ State = #gc_state { file_summary_ets = FileSummaryEts }) ->
+ [#file_summary{locked = true} = SourceSummary] =
+ ets:lookup(FileSummaryEts, Source),
+
+ [#file_summary{locked = true} = DestinationSummary] =
+ ets:lookup(FileSummaryEts, Destination),
+
+ case {SourceSummary, DestinationSummary} of
+ {#file_summary{readers = 0}, #file_summary{readers = 0}} ->
+ {ok, do_combine_files(SourceSummary, DestinationSummary,
+ Source, Destination, State)};
+ _ ->
+ rabbit_log:debug("Asked to combine files ~p and ~p but they have active readers. Deferring.",
+ [Source, Destination]),
+ DeferredFiles = [FileSummary#file_summary.file
+ || FileSummary <- [SourceSummary, DestinationSummary],
+ FileSummary#file_summary.readers /= 0],
+ {defer, DeferredFiles}
+ end.
+
+do_combine_files(SourceSummary, DestinationSummary,
+ Source, Destination,
+ State = #gc_state { file_summary_ets = FileSummaryEts,
+ file_handles_ets = FileHandlesEts,
+ dir = Dir,
+ msg_store = Server }) ->
+ #file_summary {
+ readers = 0,
+ left = Destination,
+ valid_total_size = SourceValid,
+ file_size = SourceFileSize,
+ locked = true } = SourceSummary,
+ #file_summary {
+ readers = 0,
+ right = Source,
+ valid_total_size = DestinationValid,
+ file_size = DestinationFileSize,
+ locked = true } = DestinationSummary,
+
+ SourceName = filenum_to_name(Source),
+ DestinationName = filenum_to_name(Destination),
+ {ok, SourceHdl} = open_file(Dir, SourceName,
+ ?READ_AHEAD_MODE),
+ {ok, DestinationHdl} = open_file(Dir, DestinationName,
+ ?READ_AHEAD_MODE ++ ?WRITE_MODE),
+ TotalValidData = SourceValid + DestinationValid,
+ %% If DestinationValid =:= DestinationContiguousTop then we don't
+ %% need a tmp file: we can truncate straight away and copy over from
+ %% Source. If they're not equal, we must first write everything past
+ %% DestinationContiguousTop out to a tmp file, truncate, copy the
+ %% tmp file's contents back in, and then copy over from Source.
+ {DestinationWorkList, DestinationValid} =
+ load_and_vacuum_message_file(Destination, State),
+ {DestinationContiguousTop, DestinationWorkListTail} =
+ drop_contiguous_block_prefix(DestinationWorkList),
+ case DestinationWorkListTail of
+ [] -> ok = truncate_and_extend_file(
+ DestinationHdl, DestinationContiguousTop, TotalValidData);
+ _ -> Tmp = filename:rootname(DestinationName) ++ ?FILE_EXTENSION_TMP,
+ {ok, TmpHdl} = open_file(Dir, Tmp, ?READ_AHEAD_MODE++?WRITE_MODE),
+ ok = copy_messages(
+ DestinationWorkListTail, DestinationContiguousTop,
+ DestinationValid, DestinationHdl, TmpHdl, Destination,
+ State),
+ TmpSize = DestinationValid - DestinationContiguousTop,
+ %% so now Tmp contains everything we need to salvage
+ %% from Destination, and index_state has been updated to
+ %% reflect the compaction of Destination so truncate
+ %% Destination and copy from Tmp back to the end
+ {ok, 0} = file_handle_cache:position(TmpHdl, 0),
+ ok = truncate_and_extend_file(
+ DestinationHdl, DestinationContiguousTop, TotalValidData),
+ {ok, TmpSize} =
+ file_handle_cache:copy(TmpHdl, DestinationHdl, TmpSize),
+ %% position in DestinationHdl should now be DestinationValid
+ ok = file_handle_cache:sync(DestinationHdl),
+ ok = file_handle_cache:delete(TmpHdl)
+ end,
+ {SourceWorkList, SourceValid} = load_and_vacuum_message_file(Source, State),
+ ok = copy_messages(SourceWorkList, DestinationValid, TotalValidData,
+ SourceHdl, DestinationHdl, Destination, State),
+ %% tidy up
+ ok = file_handle_cache:close(DestinationHdl),
+ ok = file_handle_cache:close(SourceHdl),
+
+ %% don't update dest.right, because it could be changing at the
+ %% same time
+ true = ets:update_element(
+ FileSummaryEts, Destination,
+ [{#file_summary.valid_total_size, TotalValidData},
+ {#file_summary.file_size, TotalValidData}]),
+
+ Reclaimed = SourceFileSize + DestinationFileSize - TotalValidData,
+ rabbit_log:debug("Combined segment files number ~p (source) and ~p (destination), reclaimed ~p bytes",
+ [Source, Destination, Reclaimed]),
+ gen_server2:cast(Server, {combine_files, Source, Destination, Reclaimed}),
+ safe_file_delete_fun(Source, Dir, FileHandlesEts).
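+
+%% A sketch of the resulting layout (hypothetical sizes): combining a
+%% Destination holding 30 valid bytes, 10 of which are a contiguous prefix,
+%% with a Source holding 25 valid bytes yields a Destination of exactly
+%% 55 bytes: the 10-byte prefix stays in place, the other 20 valid
+%% Destination bytes are compacted in via the tmp file, and the 25 Source
+%% bytes are appended after them.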
+
+-spec delete_file(non_neg_integer(), gc_state()) -> {ok, deletion_thunk()} | {defer, [non_neg_integer()]}.
+
+delete_file(File, State = #gc_state { file_summary_ets = FileSummaryEts,
+ file_handles_ets = FileHandlesEts,
+ dir = Dir,
+ msg_store = Server }) ->
+ case ets:lookup(FileSummaryEts, File) of
+ [#file_summary { valid_total_size = 0,
+ locked = true,
+ file_size = FileSize,
+ readers = 0 }] ->
+ {[], 0} = load_and_vacuum_message_file(File, State),
+ gen_server2:cast(Server, {delete_file, File, FileSize}),
+ {ok, safe_file_delete_fun(File, Dir, FileHandlesEts)};
+ [#file_summary{readers = Readers}] when Readers > 0 ->
+ rabbit_log:debug("Asked to delete file ~p but it has active readers. Deferring.",
+ [File]),
+ {defer, [File]}
+ end.
+
+load_and_vacuum_message_file(File, State = #gc_state { dir = Dir }) ->
+ %% The scan returns messages youngest first, i.e. end-of-file at
+ %% start-of-list
+ {ok, Messages, _FileSize} =
+ scan_file_for_valid_messages(Dir, filenum_to_name(File)),
+ %% foldl reverses the list, so we end up with msgs in ascending offset order
+ lists:foldl(
+ fun ({MsgId, TotalSize, Offset}, Acc = {List, Size}) ->
+ case index_lookup(MsgId, State) of
+ #msg_location { file = File, total_size = TotalSize,
+ offset = Offset, ref_count = 0 } = Entry ->
+ ok = index_delete_object(Entry, State),
+ Acc;
+ #msg_location { file = File, total_size = TotalSize,
+ offset = Offset } = Entry ->
+ {[ Entry | List ], TotalSize + Size};
+ _ ->
+ Acc
+ end
+ end, {[], 0}, Messages).
+
+copy_messages(WorkList, InitOffset, FinalOffset, SourceHdl, DestinationHdl,
+ Destination, State) ->
+ Copy = fun ({BlockStart, BlockEnd}) ->
+ BSize = BlockEnd - BlockStart,
+ {ok, BlockStart} =
+ file_handle_cache:position(SourceHdl, BlockStart),
+ {ok, BSize} =
+ file_handle_cache:copy(SourceHdl, DestinationHdl, BSize)
+ end,
+ case
+ lists:foldl(
+ fun (#msg_location { msg_id = MsgId, offset = Offset,
+ total_size = TotalSize },
+ {CurOffset, Block = {BlockStart, BlockEnd}}) ->
+ %% CurOffset is in the DestinationFile.
+ %% Offset, BlockStart and BlockEnd are in the SourceFile
+ %% update MsgLocation to reflect change of file and offset
+ ok = index_update_fields(MsgId,
+ [{#msg_location.file, Destination},
+ {#msg_location.offset, CurOffset}],
+ State),
+ {CurOffset + TotalSize,
+ case BlockEnd of
+ undefined ->
+ %% base case, called only for the first list elem
+ {Offset, Offset + TotalSize};
+ Offset ->
+ %% extend the current block because the
+ %% next msg follows straight on
+ {BlockStart, BlockEnd + TotalSize};
+ _ ->
+ %% found a gap, so actually do the work for
+ %% the previous block
+ Copy(Block),
+ {Offset, Offset + TotalSize}
+ end}
+ end, {InitOffset, {undefined, undefined}}, WorkList) of
+ {FinalOffset, Block} ->
+ case WorkList of
+ [] -> ok;
+ _ -> Copy(Block), %% do the last remaining block
+ ok = file_handle_cache:sync(DestinationHdl)
+ end;
+ {FinalOffsetZ, _Block} ->
+ {gc_error, [{expected, FinalOffset},
+ {got, FinalOffsetZ},
+ {destination, Destination}]}
+ end.
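+
+%% Worked example of the block coalescing above (hypothetical
+%% offsets/sizes in the source file): a work list at {0,10}, {10,5} and
+%% {30,8} first grows a single block {0,15} since the second msg follows
+%% the first directly, hits the gap at offset 30 and copies {0,15}, then
+%% opens block {30,38}, which is copied as the last remaining block after
+%% the fold.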
+
+-spec force_recovery(file:filename(), server()) -> 'ok'.
+
+force_recovery(BaseDir, Store) ->
+ Dir = filename:join(BaseDir, atom_to_list(Store)),
+ case file:delete(filename:join(Dir, ?CLEAN_FILENAME)) of
+ ok -> ok;
+ {error, enoent} -> ok
+ end,
+ recover_crashed_compactions(BaseDir),
+ ok.
+
+foreach_file(D, Fun, Files) ->
+ [ok = Fun(filename:join(D, File)) || File <- Files].
+
+foreach_file(D1, D2, Fun, Files) ->
+ [ok = Fun(filename:join(D1, File), filename:join(D2, File)) || File <- Files].
+
+-spec transform_dir(file:filename(), server(),
+ fun ((any()) -> (rabbit_types:ok_or_error2(msg(), any())))) -> 'ok'.
+
+transform_dir(BaseDir, Store, TransformFun) ->
+ Dir = filename:join(BaseDir, atom_to_list(Store)),
+ TmpDir = filename:join(Dir, ?TRANSFORM_TMP),
+ TransformFile = fun (A, B) -> transform_msg_file(A, B, TransformFun) end,
+ CopyFile = fun (Src, Dst) -> {ok, _Bytes} = file:copy(Src, Dst), ok end,
+ case filelib:is_dir(TmpDir) of
+ true -> throw({error, transform_failed_previously});
+ false -> FileList = list_sorted_filenames(Dir, ?FILE_EXTENSION),
+ foreach_file(Dir, TmpDir, TransformFile, FileList),
+ foreach_file(Dir, fun file:delete/1, FileList),
+ foreach_file(TmpDir, Dir, CopyFile, FileList),
+ foreach_file(TmpDir, fun file:delete/1, FileList),
+ ok = file:del_dir(TmpDir)
+ end.
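+
+%% A minimal sketch of a TransformFun (the identity transform, useful only
+%% for illustration): every stored message is rewritten unchanged.
+%%   ok = transform_dir(BaseDir, Store, fun (Msg) -> {ok, Msg} end).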
+
+transform_msg_file(FileOld, FileNew, TransformFun) ->
+ ok = rabbit_file:ensure_parent_dirs_exist(FileNew),
+ {ok, RefOld} = file_handle_cache:open_with_absolute_path(
+ FileOld, [raw, binary, read], []),
+ {ok, RefNew} = file_handle_cache:open_with_absolute_path(
+ FileNew, [raw, binary, write],
+ [{write_buffer, ?HANDLE_CACHE_BUFFER_SIZE}]),
+ {ok, _Acc, _IgnoreSize} =
+ rabbit_msg_file:scan(
+ RefOld, filelib:file_size(FileOld),
+ fun({MsgId, _Size, _Offset, BinMsg}, ok) ->
+ {ok, MsgNew} = case binary_to_term(BinMsg) of
+ <<>> -> {ok, <<>>}; %% dying client marker
+ Msg -> TransformFun(Msg)
+ end,
+ {ok, _} = rabbit_msg_file:append(RefNew, MsgId, MsgNew),
+ ok
+ end, ok),
+ ok = file_handle_cache:close(RefOld),
+ ok = file_handle_cache:close(RefNew),
+ ok.