merging heads of default

author: Matthew Sackman <matthew@rabbitmq.com> 2010-11-24 16:01:57 +0000
committer: Matthew Sackman <matthew@rabbitmq.com> 2010-11-24 16:01:57 +0000
commit: ea3a6f7870b33c5dc5ad9d6adef0de17e5d8ae67 (patch)
tree: 99c661cc192ca43ebcf2fbfb5a58044e89f882a4
parent: 8a9d9551f53e5d5f28805d3b2a1cca9b31f9168c (diff)
parent: 817ea4c665772214eb3acfbcca49be709825649c (diff)
download: rabbitmq-server-ea3a6f7870b33c5dc5ad9d6adef0de17e5d8ae67.tar.gz
6 files changed, 428 insertions, 199 deletions
diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl
index 800cc237..6bed63a3 100644
--- a/src/rabbit_channel.erl
+++ b/src/rabbit_channel.erl
@@ -245,7 +245,9 @@ handle_cast({command, Msg}, State = #ch{writer_pid = WriterPid}) ->
 handle_cast({deliver, ConsumerTag, AckRequired, Msg},
             State = #ch{writer_pid = WriterPid,
                         next_tag = DeliveryTag}) ->
-    State1 = lock_message(AckRequired, {DeliveryTag, ConsumerTag, Msg}, State),
+    State1 = lock_message(AckRequired,
+                          ack_record(DeliveryTag, ConsumerTag, Msg),
+                          State),
     ok = internal_deliver(WriterPid, true, ConsumerTag, DeliveryTag, Msg),
     {_QName, QPid, _MsgId, _Redelivered, _Msg} = Msg,
     maybe_incr_stats([{QPid, 1}],
@@ -506,7 +508,9 @@ handle_method(#'basic.get'{queue = QueueNameBin,
                 #basic_message{exchange_name = ExchangeName,
                                routing_key = RoutingKey,
                                content = Content}}} ->
-            State1 = lock_message(not(NoAck), {DeliveryTag, none, Msg}, State),
+            State1 = lock_message(not(NoAck),
+                                  ack_record(DeliveryTag, none, Msg),
+                                  State),
             maybe_incr_stats([{QPid, 1}],
                              case NoAck of
                                  true  -> get_no_ack;
@@ -642,30 +646,8 @@ handle_method(#'basic.recover_async'{requeue = true},
     %% variant of this method
     {noreply, State#ch{unacked_message_q = queue:new()}};
 
-handle_method(#'basic.recover_async'{requeue = false},
-              _, State = #ch{writer_pid = WriterPid,
-                             unacked_message_q = UAMQ}) ->
-    ok = rabbit_misc:queue_fold(
-           fun ({_DeliveryTag, none, _Msg}, ok) ->
-                   %% Was sent as a basic.get_ok. Don't redeliver
-                   %% it. FIXME: appropriate?
-                   ok;
-               ({DeliveryTag, ConsumerTag,
-                 {QName, QPid, MsgId, _Redelivered, Message}}, ok) ->
-                   %% Was sent as a proper consumer delivery.  Resend
-                   %% it as before.
-                   %%
-                   %% FIXME: What should happen if the consumer's been
-                   %% cancelled since?
-                   %%
-                   %% FIXME: should we allocate a fresh DeliveryTag?
-                   internal_deliver(
-                     WriterPid, false, ConsumerTag, DeliveryTag,
-                     {QName, QPid, MsgId, true, Message})
-           end, ok, UAMQ),
-    %% No answer required - basic.recover is the newer, synchronous
-    %% variant of this method
-    {noreply, State};
+handle_method(#'basic.recover_async'{requeue = false}, _, _State) ->
+    rabbit_misc:protocol_error(not_implemented, "requeue=false", []);
 
 handle_method(#'basic.recover'{requeue = Requeue}, Content, State) ->
     {noreply, State2 = #ch{writer_pid = WriterPid}} =
@@ -987,6 +969,10 @@ basic_return(#basic_message{exchange_name = ExchangeName,
                            routing_key = RoutingKey},
            Content).
 
+%% Record handed to lock_message/3; keeps only QPid and MsgId of Msg.
+ack_record(DTag, CTag, {_QName, QPid, MsgId, _Redelivered, _Msg}) ->
+    {DTag, CTag, {QPid, MsgId}}.
+
 collect_acks(Q, 0, true) ->
     {Q, queue:new()};
 collect_acks(Q, DeliveryTag, Multiple) ->
@@ -1059,8 +1045,7 @@ rollback_and_notify(State) ->
 
 fold_per_queue(F, Acc0, UAQ) ->
     D = rabbit_misc:queue_fold(
-          fun ({_DTag, _CTag,
-                {_QName, QPid, MsgId, _Redelivered, _Message}}, D) ->
+          fun ({_DTag, _CTag, {QPid, MsgId}}, D) ->
                   %% dict:append would avoid the lists:reverse in
                   %% handle_message({recover, true}, ...). However, it
                   %% is significantly slower when going beyond a few
diff --git a/src/rabbit_misc.erl b/src/rabbit_misc.erl
index 1e6ad8d8..0067a410 100644
--- a/src/rabbit_misc.erl
+++ b/src/rabbit_misc.erl
@@ -61,7 +61,7 @@
 -export([sort_field_table/1]).
 -export([pid_to_string/1, string_to_pid/1]).
 -export([version_compare/2, version_compare/3]).
--export([recursive_delete/1, dict_cons/3, orddict_cons/3,
+-export([recursive_delete/1, recursive_copy/2, dict_cons/3, orddict_cons/3,
          unlink_and_capture_exit/1]).
 -export([get_options/2]).
 -export([all_module_attributes/1, build_acyclic_graph/3]).
@@ -183,6 +183,9 @@
 -spec(recursive_delete/1 ::
         ([file:filename()])
         -> rabbit_types:ok_or_error({file:filename(), any()})).
+-spec(recursive_copy/2 ::
+        (file:filename(), file:filename())
+        -> rabbit_types:ok_or_error({file:filename(), file:filename(), any()})).
 -spec(dict_cons/3 :: (any(), any(), dict()) -> dict()).
 -spec(orddict_cons/3 :: (any(), any(), orddict:orddict()) -> orddict:orddict()).
 -spec(unlink_and_capture_exit/1 :: (pid()) -> 'ok').
@@ -687,6 +690,33 @@ recursive_delete1(Path) ->
                  end
     end.
 
+%% Copy Src to Dest, recursing into directories; see file:copy/2.
+recursive_copy(Src, Dest) ->
+    recursive_copy1(filelib:is_dir(Src), Src, Dest).
+
+recursive_copy1(false, Src, Dest) ->
+    case file:copy(Src, Dest) of
+        {ok, _Bytes}    -> ok;
+        {error, enoent} -> ok; %% a missing source is not an error
+        {error, Reason} -> {error, {Src, Dest, Reason}}
+    end;
+recursive_copy1(true, Src, Dest) ->
+    case file:list_dir(Src) of
+        {ok, Names} ->
+            case file:make_dir(Dest) of
+                ok ->
+                    lists:foldl(
+                      fun (Name, ok) ->
+                              recursive_copy(filename:join(Src, Name),
+                                             filename:join(Dest, Name));
+                          (_Name, Failure) ->
+                              Failure
+                      end, ok, Names);
+                {error, Reason} -> {error, {Src, Dest, Reason}}
+            end;
+        {error, Reason} -> {error, {Src, Dest, Reason}}
+    end.
+
 dict_cons(Key, Value, Dict) ->
     dict:update(Key, fun (List) -> [Value | List] end, [Value], Dict).
 
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index cb3251c7..a62e7a6f 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -34,7 +34,7 @@
 
 -export([ensure_mnesia_dir/0, dir/0, status/0, init/0, is_db_empty/0,
          cluster/1, force_cluster/1, reset/0, force_reset/0,
-         is_clustered/0, empty_ram_only_tables/0]).
+         is_clustered/0, empty_ram_only_tables/0, copy_db/1]).
 
 -export([table_names/0]).
 
@@ -65,6 +65,7 @@
 -spec(is_clustered/0 :: () -> boolean()).
 -spec(empty_ram_only_tables/0 :: () -> 'ok').
 -spec(create_tables/0 :: () -> 'ok').
+-spec(copy_db/1 :: (file:filename()) ->  rabbit_types:ok_or_error(any())).
 
 -endif.
 
@@ -375,17 +376,15 @@ init_db(ClusterNodes, Force) ->
                   mnesia:system_info(db_nodes)} of
                 {[], true, [_]} ->
                     %% True single disc node, attempt upgrade
-                    wait_for_tables(),
+                    ok = wait_for_tables(),
                     case rabbit_upgrade:maybe_upgrade() of
-                        ok ->
-                            ensure_schema_ok();
-                        version_not_available ->
-                            schema_ok_or_move()
+                        ok                    -> ensure_schema_ok();
+                        version_not_available -> schema_ok_or_move()
                     end;
                 {[], true, _} ->
                     %% "Master" (i.e. without config) disc node in cluster,
                     %% verify schema
-                    wait_for_tables(),
+                    ok = wait_for_tables(),
                     ensure_version_ok(rabbit_upgrade:read_version()),
                     ensure_schema_ok();
                 {[], false, _} ->
@@ -476,6 +475,16 @@ move_db() ->
     rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
     ok.
 
+%% Stop mnesia and copy its dir to Destination; restart only on success.
+copy_db(Destination) ->
+    mnesia:stop(),
+    case rabbit_misc:recursive_copy(dir(), Destination) of
+        {error, _} = Error -> Error;
+        ok ->
+            rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
+            ok = wait_for_tables()
+    end.
+
 create_tables() ->
     lists:foreach(fun ({Tab, TabDef}) ->
                           TabDef1 = proplists:delete(match, TabDef),
diff --git a/src/rabbit_tests.erl b/src/rabbit_tests.erl
index 71b23e01..27e4d925 100644
--- a/src/rabbit_tests.erl
+++ b/src/rabbit_tests.erl
@@ -1865,9 +1865,39 @@ test_variable_queue() ->
               fun test_variable_queue_partial_segments_delta_thing/1,
               fun test_variable_queue_all_the_bits_not_covered_elsewhere1/1,
               fun test_variable_queue_all_the_bits_not_covered_elsewhere2/1,
-              fun test_dropwhile/1]],
+              fun test_dropwhile/1,
+              fun test_variable_queue_ack_limiting/1]],
     passed.
 
+%% Check that fetched-but-unacked messages count as RAM acks, and that
+%% a RAM duration target of 0 leaves no messages or acks in RAM.
+test_variable_queue_ack_limiting(VQ0) ->
+    %% start by sending in a bunch of messages
+    Len  = 1024,
+    Half = Len div 2,
+    VQ1  = variable_queue_publish(false, Len, VQ0),
+
+    %% squeeze and relax queue
+    VQ2 = publish_fetch_and_ack(Len div 32, Len, VQ1),
+
+    %% update stats for duration
+    {_Duration, VQ3} = rabbit_variable_queue:ram_duration(VQ2),
+
+    %% fetch half the messages
+    {VQ4, _AckTags} = variable_queue_fetch(Half, false, false, Len, VQ3),
+
+    %% half the messages remain queued, half are awaiting acks
+    VQ5 = check_variable_queue_status(VQ4, [{len          , Half},
+                                            {ram_ack_count, Half},
+                                            {ram_msg_count, Half}]),
+
+    %% ensure all acks go to disk on 0 duration target
+    VQ6 = rabbit_variable_queue:set_ram_duration_target(0, VQ5),
+    check_variable_queue_status(VQ6, [{len                  , Half},
+                                      {target_ram_item_count, 0},
+                                      {ram_msg_count        , 0},
+                                      {ram_ack_count        , 0}]).
+
 test_dropwhile(VQ0) ->
     Count = 10,
 
diff --git a/src/rabbit_upgrade.erl b/src/rabbit_upgrade.erl
index 27a94f6f..64d0c251 100644
--- a/src/rabbit_upgrade.erl
+++ b/src/rabbit_upgrade.erl
@@ -126,15 +126,34 @@ heads(G) ->
 %% -------------------------------------------------------------------
 
 apply_upgrades(Upgrades) ->
-    LockFile = lock_filename(),
+    LockFile = lock_filename(dir()),
     case file:open(LockFile, [write, exclusive]) of
         {ok, Lock} ->
             ok = file:close(Lock),
+            BackupDir = dir() ++ "-upgrade-backup",
             info("Upgrades: ~w to apply~n", [length(Upgrades)]),
-            [apply_upgrade(Upgrade) || Upgrade <- Upgrades],
-            info("Upgrades: All applied~n", []),
-            ok = write_version(),
-            ok = file:delete(LockFile);
+            case rabbit_mnesia:copy_db(BackupDir) of
+                ok ->
+                    %% We need to make the backup after creating the
+                    %% lock file so that it protects us from trying to
+                    %% overwrite the backup. Unfortunately this means
+                    %% the lock file exists in the backup too, which
+                    %% is not intuitive. Remove it.
+                    ok = file:delete(lock_filename(BackupDir)),
+                    info("Upgrades: Mnesia dir backed up to ~p~n", [BackupDir]),
+                    [apply_upgrade(Upgrade) || Upgrade <- Upgrades],
+                    info("Upgrades: All upgrades applied successfully~n", []),
+                    ok = write_version(),
+                    ok = rabbit_misc:recursive_delete([BackupDir]),
+                    info("Upgrades: Mnesia backup removed~n", []),
+                    ok = file:delete(LockFile);
+                {error, E} ->
+                    %% If we can't back up, the upgrade hasn't started
+                    %% hence we don't need the lockfile since the real
+                    %% mnesia dir is the good one.
+                    ok = file:delete(LockFile),
+                    throw({could_not_back_up_mnesia_dir, E})
+            end;
         {error, eexist} ->
             throw({error, previous_upgrade_failed});
         {error, _} = Error ->
@@ -151,7 +170,7 @@ dir() -> rabbit_mnesia:dir().
 
 schema_filename() -> filename:join(dir(), ?VERSION_FILENAME).
 
-lock_filename()   -> filename:join(dir(), ?LOCK_FILENAME).
+lock_filename(Dir) -> filename:join(Dir, ?LOCK_FILENAME).
 
 %% NB: we cannot use rabbit_log here since it may not have been
 %% started yet
diff --git a/src/rabbit_variable_queue.erl b/src/rabbit_variable_queue.erl
index 69d62fde..5ac042a2 100644
--- a/src/rabbit_variable_queue.erl
+++ b/src/rabbit_variable_queue.erl
@@ -89,12 +89,14 @@
 %%
 %% The duration indicated to us by the memory_monitor is used to
 %% calculate, given our current ingress and egress rates, how many
-%% messages we should hold in RAM. When we need to push alphas to
-%% betas or betas to gammas, we favour writing out messages that are
-%% further from the head of the queue. This minimises writes to disk,
-%% as the messages closer to the tail of the queue stay in the queue
-%% for longer, thus do not need to be replaced as quickly by sending
-%% other messages to disk.
+%% messages we should hold in RAM. We track the ingress and egress
+%% rates of both messages and pending acks, and both sets of rates
+%% are taken into account when calculating how many messages to hold
+%% in RAM. When we need to push alphas to betas or betas to gammas, we
+%% favour writing out messages that are further from the head of the
+%% queue. This minimises writes to disk, as the messages closer to the
+%% tail of the queue stay in the queue for longer, thus do not need to
+%% be replaced as quickly by sending other messages to disk.
 %%
 %% Whilst messages are pushed to disk and forgotten from RAM as soon
 %% as requested by a new setting of the queue RAM duration, the
@@ -156,7 +158,7 @@
 %% The conversion from alphas to betas is also chunked, but only to
 %% ensure no more than ?IO_BATCH_SIZE alphas are converted to betas at
 %% any one time. This further smooths the effects of changes to the
-%% target_ram_msg_count and ensures the queue remains responsive
+%% target_ram_item_count and ensures the queue remains responsive
 %% even when there is a large amount of IO work to do. The
 %% idle_timeout callback is utilised to ensure that conversions are
 %% done as promptly as possible whilst ensuring the queue remains
@@ -168,6 +170,29 @@
 %% the latter) are both cheap and do require any scanning through qi
 %% segments.
 %%
+%% Pending acks are recorded in memory either as the tuple {SeqId,
+%% Guid, MsgProps} (tuple-form) or as the message itself (message-
+%% form). Acks for persistent messages are always stored in the tuple-
+%% form. Acks for transient messages are also stored in tuple-form if
+%% the message has been sent to disk as part of the memory reduction
+%% process. For transient messages that haven't already been written
+%% to disk, acks are stored in message-form.
+%%
+%% During memory reduction, acks stored in message-form are converted
+%% to tuple-form, and the corresponding messages are pushed out to
+%% disk.
+%%
+%% The order in which alphas are pushed to betas and message-form acks
+%% are pushed to disk is determined dynamically. We always prefer to
+%% push messages for the source (alphas or acks) that is growing the
+%% fastest (with growth measured as avg. ingress - avg. egress). In
+%% each round of memory reduction a chunk of messages at most
+%% ?IO_BATCH_SIZE in size is allocated to be pushed to disk. The
+%% fastest growing source will be reduced by as much of this chunk as
+%% possible. If there is any remaining allocation in the chunk after
+%% the first source has been reduced to zero, the second source will
+%% be reduced by as much of the remaining chunk as possible.
+%%
 %% Notes on Clean Shutdown
 %% (This documents behaviour in variable_queue, queue_index and
 %% msg_store.)
@@ -220,6 +245,8 @@
           q4,
           next_seq_id,
           pending_ack,
+          pending_ack_index,
+          ram_ack_index,
           index_state,
           msg_store_clients,
           on_sync,
@@ -229,13 +256,17 @@
           len,
           persistent_count,
 
-          target_ram_msg_count,
+          target_ram_item_count,
           ram_msg_count,
           ram_msg_count_prev,
+          ram_ack_count_prev,
           ram_index_count,
           out_counter,
           in_counter,
-          rates
+          ack_out_counter,
+          ack_in_counter,
+          rates,
+          ack_rates
          }).
 
 -record(rates, { egress, ingress, avg_egress, avg_ingress, timestamp }).
@@ -299,30 +330,34 @@
                         funs            :: [fun (() -> any())] }).
 
 -type(state() :: #vqstate {
-             q1                   :: queue(),
-             q2                   :: bpqueue:bpqueue(),
-             delta                :: delta(),
-             q3                   :: bpqueue:bpqueue(),
-             q4                   :: queue(),
-             next_seq_id          :: seq_id(),
-             pending_ack          :: dict(),
-             index_state          :: any(),
-             msg_store_clients    :: 'undefined' | {{any(), binary()},
+             q1                    :: queue(),
+             q2                    :: bpqueue:bpqueue(),
+             delta                 :: delta(),
+             q3                    :: bpqueue:bpqueue(),
+             q4                    :: queue(),
+             next_seq_id           :: seq_id(),
+             pending_ack           :: dict(),
+             ram_ack_index         :: gb_tree(),
+             index_state           :: any(),
+             msg_store_clients     :: 'undefined' | {{any(), binary()},
                                                     {any(), binary()}},
-             on_sync              :: sync(),
-             durable              :: boolean(),
-
-             len                  :: non_neg_integer(),
-             persistent_count     :: non_neg_integer(),
-
-             transient_threshold  :: non_neg_integer(),
-             target_ram_msg_count :: non_neg_integer() | 'infinity',
-             ram_msg_count        :: non_neg_integer(),
-             ram_msg_count_prev   :: non_neg_integer(),
-             ram_index_count      :: non_neg_integer(),
-             out_counter          :: non_neg_integer(),
-             in_counter           :: non_neg_integer(),
-             rates                :: rates() }).
+             on_sync               :: sync(),
+             durable               :: boolean(),
+
+             len                   :: non_neg_integer(),
+             persistent_count      :: non_neg_integer(),
+
+             transient_threshold   :: non_neg_integer(),
+             target_ram_item_count :: non_neg_integer() | 'infinity',
+             ram_msg_count         :: non_neg_integer(),
+             ram_msg_count_prev    :: non_neg_integer(),
+             ram_index_count       :: non_neg_integer(),
+             out_counter           :: non_neg_integer(),
+             in_counter            :: non_neg_integer(),
+             ack_out_counter       :: non_neg_integer(),
+             ack_in_counter        :: non_neg_integer(),
+             rates                 :: rates(),
+             ack_rates             :: rates() }).
 
 -include("rabbit_backing_queue_spec.hrl").
 
@@ -479,19 +514,18 @@ publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent },
                                      out_counter      = OutCount,
                                      in_counter       = InCount,
                                      persistent_count = PCount,
-                                     pending_ack      = PA,
                                      durable          = IsDurable }) ->
     IsPersistent1 = IsDurable andalso IsPersistent,
     MsgStatus = (msg_status(IsPersistent1, SeqId, Msg, MsgProps))
         #msg_status { is_delivered = true },
     {MsgStatus1, State1} = maybe_write_to_disk(false, false, MsgStatus, State),
-    PA1 = record_pending_ack(m(MsgStatus1), PA),
+    State2 = record_pending_ack(m(MsgStatus1), State1),
     PCount1 = PCount + one_if(IsPersistent1),
-    {SeqId, a(State1 #vqstate { next_seq_id      = SeqId    + 1,
-                                out_counter      = OutCount + 1,
-                                in_counter       = InCount  + 1,
-                                persistent_count = PCount1,
-                                pending_ack      = PA1 })}.
+    {SeqId, a(reduce_memory_use(
+                State2 #vqstate { next_seq_id      = SeqId    + 1,
+                                  out_counter      = OutCount + 1,
+                                  in_counter       = InCount  + 1,
+                                  persistent_count = PCount1 }))}.
 
 dropwhile(Pred, State) ->
     {_OkOrEmpty, State1} = dropwhile1(Pred, State),
@@ -561,8 +595,7 @@ internal_fetch(AckRequired, MsgStatus = #msg_status {
                                  index_state       = IndexState,
                                  msg_store_clients = MSCState,
                                  len               = Len,
-                                 persistent_count  = PCount,
-                                 pending_ack       = PA }) ->
+                                 persistent_count  = PCount }) ->
     %% 1. Mark it delivered if necessary
     IndexState1 = maybe_write_delivered(
                     IndexOnDisk andalso not IsDelivered,
@@ -582,12 +615,12 @@ internal_fetch(AckRequired, MsgStatus = #msg_status {
         end,
 
     %% 3. If an ack is required, add something sensible to PA
-    {AckTag, PA1} = case AckRequired of
-                        true  -> PA2 = record_pending_ack(
-                                         MsgStatus #msg_status {
-                                           is_delivered = true }, PA),
-                                 {SeqId, PA2};
-                        false -> {blank_ack, PA}
+    {AckTag, State1} = case AckRequired of
+                        true  -> StateN = record_pending_ack(
+                                            MsgStatus #msg_status {
+                                              is_delivered = true }, State),
+                                 {SeqId, StateN};
+                        false -> {blank_ack, State}
                     end,
 
     PCount1 = PCount - one_if(IsPersistent andalso not AckRequired),
@@ -595,12 +628,11 @@ internal_fetch(AckRequired, MsgStatus = #msg_status {
     RamMsgCount1 = RamMsgCount - one_if(Msg =/= undefined),
 
     {{Msg, IsDelivered, AckTag, Len1},
-     a(State #vqstate { ram_msg_count    = RamMsgCount1,
-                        out_counter      = OutCount + 1,
-                        index_state      = IndexState2,
-                        len              = Len1,
-                        persistent_count = PCount1,
-                        pending_ack      = PA1 })}.
+     a(State1 #vqstate { ram_msg_count    = RamMsgCount1,
+                         out_counter      = OutCount + 1,
+                         index_state      = IndexState2,
+                         len              = Len1,
+                         persistent_count = PCount1 })}.
 
 ack(AckTags, State) ->
     a(ack(fun msg_store_remove/3,
@@ -678,40 +710,62 @@ is_empty(State) -> 0 == len(State).
 
 set_ram_duration_target(DurationTarget,
                         State = #vqstate {
-                          rates = #rates { avg_egress  = AvgEgressRate,
-                                           avg_ingress = AvgIngressRate },
-                          target_ram_msg_count = TargetRamMsgCount }) ->
-    Rate = AvgEgressRate + AvgIngressRate,
-    TargetRamMsgCount1 =
+                          rates =
+                              #rates { avg_egress  = AvgEgressRate,
+                                       avg_ingress = AvgIngressRate },
+                          ack_rates =
+                              #rates { avg_egress  = AvgAckEgressRate,
+                                       avg_ingress = AvgAckIngressRate },
+                          target_ram_item_count = TargetRamItemCount }) ->
+    Rate =
+        AvgEgressRate + AvgIngressRate + AvgAckEgressRate + AvgAckIngressRate,
+    TargetRamItemCount1 =
         case DurationTarget of
             infinity  -> infinity;
             _         -> trunc(DurationTarget * Rate) %% msgs = sec * msgs/sec
         end,
-    State1 = State #vqstate { target_ram_msg_count = TargetRamMsgCount1 },
-    a(case TargetRamMsgCount1 == infinity orelse
-          (TargetRamMsgCount =/= infinity andalso
-           TargetRamMsgCount1 >= TargetRamMsgCount) of
+    State1 = State #vqstate { target_ram_item_count = TargetRamItemCount1 },
+    a(case TargetRamItemCount1 == infinity orelse
+          (TargetRamItemCount =/= infinity andalso
+           TargetRamItemCount1 >= TargetRamItemCount) of
           true  -> State1;
           false -> reduce_memory_use(State1)
       end).
 
 ram_duration(State = #vqstate {
-               rates              = #rates { egress    = Egress,
-                                             ingress   = Ingress,
-                                             timestamp = Timestamp } = Rates,
+               rates              = #rates { timestamp = Timestamp,
+                                             egress    = Egress,
+                                             ingress   = Ingress } = Rates,
+               ack_rates          = #rates { timestamp = AckTimestamp,
+                                             egress    = AckEgress,
+                                             ingress   = AckIngress } = ARates,
                in_counter         = InCount,
                out_counter        = OutCount,
+               ack_in_counter     = AckInCount,
+               ack_out_counter    = AckOutCount,
                ram_msg_count      = RamMsgCount,
-               ram_msg_count_prev = RamMsgCountPrev }) ->
+               ram_msg_count_prev = RamMsgCountPrev,
+               ram_ack_index      = RamAckIndex,
+               ram_ack_count_prev = RamAckCountPrev }) ->
     Now = now(),
     {AvgEgressRate,   Egress1} = update_rate(Now, Timestamp, OutCount, Egress),
     {AvgIngressRate, Ingress1} = update_rate(Now, Timestamp, InCount, Ingress),
 
-    Duration = %% msgs / (msgs/sec) == sec
-        case AvgEgressRate == 0 andalso AvgIngressRate == 0 of
+    {AvgAckEgressRate,   AckEgress1} =
+        update_rate(Now, AckTimestamp, AckOutCount, AckEgress),
+    {AvgAckIngressRate, AckIngress1} =
+        update_rate(Now, AckTimestamp, AckInCount, AckIngress),
+
+    RamAckCount = gb_trees:size(RamAckIndex),
+
+    Duration = %% msgs+acks / (msgs+acks/sec) == sec
+        case AvgEgressRate == 0 andalso AvgIngressRate == 0 andalso
+             AvgAckEgressRate == 0 andalso AvgAckIngressRate == 0 of
             true  -> infinity;
-            false -> (RamMsgCountPrev + RamMsgCount) /
-                         (2 * (AvgEgressRate + AvgIngressRate))
+            false -> (RamMsgCountPrev + RamMsgCount +
+                          RamAckCount + RamAckCountPrev) /
+                         (4 * (AvgEgressRate + AvgIngressRate +
+                                   AvgAckEgressRate + AvgAckIngressRate))
         end,
 
     {Duration, State #vqstate {
@@ -721,14 +775,24 @@ ram_duration(State = #vqstate {
                                         avg_egress  = AvgEgressRate,
                                         avg_ingress = AvgIngressRate,
                                         timestamp   = Now },
+                 ack_rates          = ARates #rates {
+                                        egress      = AckEgress1,
+                                        ingress     = AckIngress1,
+                                        avg_egress  = AvgAckEgressRate,
+                                        avg_ingress = AvgAckIngressRate,
+                                        timestamp   = Now },
                  in_counter         = 0,
                  out_counter        = 0,
-                 ram_msg_count_prev = RamMsgCount }}.
+                 ack_in_counter     = 0,
+                 ack_out_counter    = 0,
+                 ram_msg_count_prev = RamMsgCount,
+                 ram_ack_count_prev = RamAckCount }}.
 
 needs_idle_timeout(State = #vqstate { on_sync = ?BLANK_SYNC }) ->
-    {Res, _State} = reduce_memory_use(fun (_Quota, State1) -> State1 end,
+    {Res, _State} = reduce_memory_use(fun (_Quota, State1) -> {0, State1} end,
                                       fun (_Quota, State1) -> State1 end,
                                       fun (State1)         -> State1 end,
+                                      fun (_Quota, State1) -> {0, State1} end,
                                       State),
     Res;
 needs_idle_timeout(_State) ->
@@ -740,32 +804,39 @@ handle_pre_hibernate(State = #vqstate { index_state = IndexState }) ->
     State #vqstate { index_state = rabbit_queue_index:flush(IndexState) }.
 
 status(#vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4,
-                  len                  = Len,
-                  pending_ack          = PA,
-                  on_sync              = #sync { funs = From },
-                  target_ram_msg_count = TargetRamMsgCount,
-                  ram_msg_count        = RamMsgCount,
-                  ram_index_count      = RamIndexCount,
-                  next_seq_id          = NextSeqId,
-                  persistent_count     = PersistentCount,
-                  rates                = #rates {
+                  len                   = Len,
+                  pending_ack           = PA,
+                  ram_ack_index         = RAI,
+                  on_sync               = #sync { funs = From },
+                  target_ram_item_count = TargetRamItemCount,
+                  ram_msg_count         = RamMsgCount,
+                  ram_index_count       = RamIndexCount,
+                  next_seq_id           = NextSeqId,
+                  persistent_count      = PersistentCount,
+                  rates                 = #rates {
                     avg_egress  = AvgEgressRate,
-                    avg_ingress = AvgIngressRate } }) ->
-    [ {q1                   , queue:len(Q1)},
-      {q2                   , bpqueue:len(Q2)},
-      {delta                , Delta},
-      {q3                   , bpqueue:len(Q3)},
-      {q4                   , queue:len(Q4)},
-      {len                  , Len},
-      {pending_acks         , dict:size(PA)},
-      {outstanding_txns     , length(From)},
-      {target_ram_msg_count , TargetRamMsgCount},
-      {ram_msg_count        , RamMsgCount},
-      {ram_index_count      , RamIndexCount},
-      {next_seq_id          , NextSeqId},
-      {persistent_count     , PersistentCount},
-      {avg_egress_rate      , AvgEgressRate},
-      {avg_ingress_rate     , AvgIngressRate} ].
+                    avg_ingress = AvgIngressRate },
+                  ack_rates             = #rates {
+                    avg_egress  = AvgAckEgressRate,
+                    avg_ingress = AvgAckIngressRate } }) ->
+    [ {q1                    , queue:len(Q1)},
+      {q2                    , bpqueue:len(Q2)},
+      {delta                 , Delta},
+      {q3                    , bpqueue:len(Q3)},
+      {q4                    , queue:len(Q4)},
+      {len                   , Len},
+      {pending_acks          , dict:size(PA)},
+      {outstanding_txns      , length(From)},
+      {target_ram_item_count , TargetRamItemCount},
+      {ram_msg_count         , RamMsgCount},
+      {ram_ack_count         , gb_trees:size(RAI)},
+      {ram_index_count       , RamIndexCount},
+      {next_seq_id           , NextSeqId},
+      {persistent_count      , PersistentCount},
+      {avg_ingress_rate      , AvgIngressRate},
+      {avg_egress_rate       , AvgEgressRate},
+      {avg_ack_ingress_rate  , AvgAckIngressRate},
+      {avg_ack_egress_rate   , AvgAckEgressRate} ].
 
 %%----------------------------------------------------------------------------
 %% Minor helpers
@@ -955,35 +1026,43 @@ init(IsDurable, IndexState, DeltaCount, Terms,
             end,
     Now = now(),
     State = #vqstate {
-      q1                   = queue:new(),
-      q2                   = bpqueue:new(),
-      delta                = Delta,
-      q3                   = bpqueue:new(),
-      q4                   = queue:new(),
-      next_seq_id          = NextSeqId,
-      pending_ack          = dict:new(),
-      index_state          = IndexState1,
-      msg_store_clients    = {PersistentClient, TransientClient},
-      on_sync              = ?BLANK_SYNC,
-      durable              = IsDurable,
-      transient_threshold  = NextSeqId,
-
-      len                  = DeltaCount1,
-      persistent_count     = DeltaCount1,
-
-      target_ram_msg_count = infinity,
-      ram_msg_count        = 0,
-      ram_msg_count_prev   = 0,
-      ram_index_count      = 0,
-      out_counter          = 0,
-      in_counter           = 0,
-      rates                = #rates { egress      = {Now, 0},
-                                      ingress     = {Now, DeltaCount1},
-                                      avg_egress  = 0.0,
-                                      avg_ingress = 0.0,
-                                      timestamp   = Now } },
+      q1                    = queue:new(),
+      q2                    = bpqueue:new(),
+      delta                 = Delta,
+      q3                    = bpqueue:new(),
+      q4                    = queue:new(),
+      next_seq_id           = NextSeqId,
+      pending_ack           = dict:new(),
+      ram_ack_index         = gb_trees:empty(),
+      index_state           = IndexState1,
+      msg_store_clients     = {PersistentClient, TransientClient},
+      on_sync               = ?BLANK_SYNC,
+      durable               = IsDurable,
+      transient_threshold   = NextSeqId,
+
+      len                   = DeltaCount1,
+      persistent_count      = DeltaCount1,
+
+      target_ram_item_count = infinity,
+      ram_msg_count         = 0,
+      ram_msg_count_prev    = 0,
+      ram_ack_count_prev    = 0,
+      ram_index_count       = 0,
+      out_counter           = 0,
+      in_counter            = 0,
+      ack_out_counter       = 0,
+      ack_in_counter        = 0,
+      rates                 = blank_rate(Now, DeltaCount1),
+      ack_rates             = blank_rate(Now, 0) },
     a(maybe_deltas_to_betas(State)).
 
+blank_rate(Timestamp, IngressLength) -> % builds a fresh #rates{} record
+    #rates { egress      = {Timestamp, 0},
+             ingress     = {Timestamp, IngressLength}, % seeded by caller
+             avg_egress  = 0.0, % both averages start at 0.0
+             avg_ingress = 0.0,
+             timestamp   = Timestamp }.
+
 msg_store_callback(PersistentGuids, Pubs, AckTags, Fun, MsgPropsFun) ->
     Self = self(),
     F = fun () -> rabbit_amqqueue:maybe_run_queue_via_backing_queue(
@@ -1191,12 +1270,21 @@ record_pending_ack(#msg_status { seq_id        = SeqId,
                                  guid          = Guid,
                                  is_persistent = IsPersistent,
                                  msg_on_disk   = MsgOnDisk,
-                                 msg_props     = MsgProps } = MsgStatus, PA) ->
-    AckEntry = case MsgOnDisk of
-                   true  -> {IsPersistent, Guid, MsgProps};
-                   false -> MsgStatus
-               end,
-    dict:store(SeqId, AckEntry, PA).
+                                 msg_props     = MsgProps } = MsgStatus,
+                   State = #vqstate { pending_ack     = PA,
+                                      ram_ack_index   = RAI,
+                                      ack_in_counter  = AckInCount}) ->
+    {AckEntry, RAI1} =
+        case MsgOnDisk of
+            true ->
+                {{IsPersistent, Guid, MsgProps}, RAI};
+            false ->
+                {MsgStatus, gb_trees:insert(SeqId, Guid, RAI)}
+        end,
+    PA1 = dict:store(SeqId, AckEntry, PA),
+    State #vqstate { pending_ack    = PA1,
+                     ram_ack_index  = RAI1,
+                     ack_in_counter = AckInCount + 1}.
 
 remove_pending_ack(KeepPersistent,
                    State = #vqstate { pending_ack       = PA,
@@ -1204,7 +1292,8 @@ remove_pending_ack(KeepPersistent,
                                       msg_store_clients = MSCState }) ->
     {SeqIds, GuidsByStore} = dict:fold(fun accumulate_ack/3,
                                        {[], orddict:new()}, PA),
-    State1 = State #vqstate { pending_ack = dict:new() },
+    State1 = State #vqstate { pending_ack   = dict:new(),
+                              ram_ack_index = gb_trees:empty() },
     case KeepPersistent of
         true  -> case orddict:find(false, GuidsByStore) of
                      error       -> State1;
@@ -1226,13 +1315,17 @@ ack(MsgStoreFun, Fun, AckTags, State) ->
     {{SeqIds, GuidsByStore},
      State1 = #vqstate { index_state       = IndexState,
                          msg_store_clients = MSCState,
-                         persistent_count  = PCount }} =
+                         persistent_count  = PCount,
+                         ack_out_counter   = AckOutCount }} =
         lists:foldl(
-          fun (SeqId, {Acc, State2 = #vqstate { pending_ack = PA }}) ->
+          fun (SeqId, {Acc, State2 = #vqstate { pending_ack   = PA,
+                                                ram_ack_index = RAI }}) ->
                   AckEntry = dict:fetch(SeqId, PA),
                   {accumulate_ack(SeqId, AckEntry, Acc),
                    Fun(AckEntry, State2 #vqstate {
-                                   pending_ack = dict:erase(SeqId, PA) })}
+                                   pending_ack   = dict:erase(SeqId, PA),
+                                   ram_ack_index =
+                                       gb_trees:delete_any(SeqId, RAI)})}
           end, {{[], orddict:new()}, State}, AckTags),
     IndexState1 = rabbit_queue_index:ack(SeqIds, IndexState),
     ok = orddict:fold(fun (IsPersistent, Guids, ok) ->
@@ -1241,7 +1334,8 @@ ack(MsgStoreFun, Fun, AckTags, State) ->
     PCount1 = PCount - find_persistent_count(sum_guids_by_store_to_len(
                                                orddict:new(), GuidsByStore)),
     State1 #vqstate { index_state      = IndexState1,
-                      persistent_count = PCount1 }.
+                      persistent_count = PCount1,
+                      ack_out_counter  = AckOutCount + length(AckTags) }.
 
 accumulate_ack(_SeqId, #msg_status { is_persistent = false, %% ASSERTIONS
                                      msg_on_disk   = false,
@@ -1270,7 +1364,7 @@ find_persistent_count(LensByStore) ->
 %% though the conversion function for that is called as necessary. The
 %% reason is twofold. Firstly, this is safe because the conversion is
 %% only ever necessary just after a transition to a
-%% target_ram_msg_count of zero or after an incremental alpha->beta
+%% target_ram_item_count of zero or after an incremental alpha->beta
 %% conversion. In the former case the conversion is performed straight
 %% away (i.e. any betas present at the time are converted to deltas),
 %% and in the latter case the need for a conversion is flagged up
@@ -1280,26 +1374,87 @@ find_persistent_count(LensByStore) ->
 %% one segment's worth of messages in q3 - and thus would risk
 %% perpetually reporting the need for a conversion when no such
 %% conversion is needed. That in turn could cause an infinite loop.
-reduce_memory_use(AlphaBetaFun, BetaGammaFun, BetaDeltaFun, State) ->
-    {Reduce, State1} = case chunk_size(State #vqstate.ram_msg_count,
-                                       State #vqstate.target_ram_msg_count) of
-                           0  -> {false, State};
-                           S1 -> {true, AlphaBetaFun(S1, State)}
-                       end,
-    case State1 #vqstate.target_ram_msg_count of
-        infinity -> {Reduce, State1};
-        0        -> {Reduce, BetaDeltaFun(State1)};
-        _        -> case chunk_size(State1 #vqstate.ram_index_count,
-                                   permitted_ram_index_count(State1)) of
-                        ?IO_BATCH_SIZE = S2 -> {true, BetaGammaFun(S2, State1)};
-                        _                   -> {Reduce, State1}
-                    end
+reduce_memory_use(_AlphaBetaFun, _BetaGammaFun, _BetaDeltaFun, _AckFun,
+                  State = #vqstate {target_ram_item_count = infinity}) ->
+    {false, State}; % infinite target: nothing is reduced
+reduce_memory_use(AlphaBetaFun, BetaGammaFun, BetaDeltaFun, AckFun,
+                  State = #vqstate {
+                    ram_ack_index         = RamAckIndex,
+                    ram_msg_count         = RamMsgCount,
+                    target_ram_item_count = TargetRamItemCount,
+                    rates                 = #rates {
+                      avg_ingress = AvgIngress,
+                      avg_egress  = AvgEgress },
+                    ack_rates             = #rates {
+                      avg_ingress = AvgAckIngress,
+                      avg_egress  = AvgAckEgress } }) ->
+
+    {Reduce, State1} = % Reduce is true iff a reduce function was run
+        case chunk_size(RamMsgCount + gb_trees:size(RamAckIndex),
+                        TargetRamItemCount) of
+            0 ->
+                {false, State};
+            S1 -> % S1 is the initial quota shared by both reduce funs
+                ReduceFuns =
+                    case (AvgAckIngress - AvgAckEgress) >
+                        (AvgIngress - AvgEgress) of
+                        true ->
+                            %% Pending acks are growing faster than the
+                            %% queue: spend the quota on acks first.
+                            [AckFun, AlphaBetaFun];
+                        false ->
+                            %% The queue is growing at least as fast as
+                            %% the acks: push queue messages first.
+                            [AlphaBetaFun, AckFun]
+                    end,
+                {_, State2} =
+                    %% Run both reduce functions in the chosen order,
+                    %% threading {Quota, State} through the fold. The
+                    %% second one may be handed a quota of 0 if the
+                    %% first one used all of it.
+                    lists:foldl(
+                      fun (ReduceFun, {QuotaN, StateN}) ->
+                              ReduceFun(QuotaN, StateN)
+                      end, {S1, State}, ReduceFuns),
+                {true, State2}
+        end,
+
+    case State1 #vqstate.target_ram_item_count of
+        0 -> {Reduce, BetaDeltaFun(State1)};
+        _ -> case chunk_size(State1 #vqstate.ram_index_count,
+                             permitted_ram_index_count(State1)) of
+                 ?IO_BATCH_SIZE = S2 -> {true, BetaGammaFun(S2, State1)};
+                 _                   -> {Reduce, State1}
+             end
     end.
 
+limit_ram_acks(0, State) -> % quota used up: stop recursing
+    {0, State};
+limit_ram_acks(Quota, State = #vqstate { pending_ack   = PA,
+                                         ram_ack_index = RAI }) ->
+    case gb_trees:is_empty(RAI) of
+        true -> % ram_ack_index is empty: hand back the unused quota
+            {Quota, State};
+        false ->
+            {SeqId, Guid, RAI1} = gb_trees:take_largest(RAI), % largest SeqId
+            MsgStatus = #msg_status {
+              guid          = Guid, %% ASSERTION
+              is_persistent = false, %% ASSERTION
+              msg_props     = MsgProps } = dict:fetch(SeqId, PA),
+            {_, State1} = maybe_write_to_disk(true, false, MsgStatus, State),
+            limit_ram_acks(Quota - 1,
+                           State1 #vqstate {
+                             pending_ack   = % swap in the compact tuple
+                                 dict:store(SeqId, {false, Guid, MsgProps}, PA),
+                             ram_ack_index = RAI1 })
+    end.
+
+
 reduce_memory_use(State) ->
     {_, State1} = reduce_memory_use(fun push_alphas_to_betas/2,
                                     fun limit_ram_index/2,
                                     fun push_betas_to_deltas/1,
+                                    fun limit_ram_acks/2, % passed as AckFun
                                     State),
     State1.
 
@@ -1432,9 +1587,9 @@ maybe_deltas_to_betas(State = #vqstate {
     end.
 
 push_alphas_to_betas(Quota, State) ->
-    { Quota1, State1} = maybe_push_q1_to_betas(Quota,  State),
-    {_Quota2, State2} = maybe_push_q4_to_betas(Quota1, State1),
-    State2.
+    {Quota1, State1} = maybe_push_q1_to_betas(Quota,  State),
+    {Quota2, State2} = maybe_push_q4_to_betas(Quota1, State1),
+    {Quota2, State2}. % leftover quota is returned along with the state
 
 maybe_push_q1_to_betas(Quota, State = #vqstate { q1 = Q1 }) ->
     maybe_push_alphas_to_betas(
@@ -1460,10 +1615,11 @@ maybe_push_q4_to_betas(Quota, State = #vqstate { q4 = Q4 }) ->
 
 maybe_push_alphas_to_betas(_Generator, _Consumer, Quota, _Q,
                            State = #vqstate {
-                             ram_msg_count        = RamMsgCount,
-                             target_ram_msg_count = TargetRamMsgCount })
+                             ram_msg_count         = RamMsgCount,
+                             target_ram_item_count = TargetRamItemCount })
   when Quota =:= 0 orelse
-       TargetRamMsgCount =:= infinity orelse TargetRamMsgCount >= RamMsgCount ->
+       TargetRamItemCount =:= infinity orelse
+       TargetRamItemCount >= RamMsgCount ->
     {Quota, State};
 maybe_push_alphas_to_betas(Generator, Consumer, Quota, Q, State) ->
     case Generator(Q) of
author	Matthew Sackman <matthew@rabbitmq.com>	2010-11-24 16:01:57 +0000
committer	Matthew Sackman <matthew@rabbitmq.com>	2010-11-24 16:01:57 +0000
commit	ea3a6f7870b33c5dc5ad9d6adef0de17e5d8ae67 (patch)
tree	99c661cc192ca43ebcf2fbfb5a58044e89f882a4
parent	8a9d9551f53e5d5f28805d3b2a1cca9b31f9168c (diff)
parent	817ea4c665772214eb3acfbcca49be709825649c (diff)
download	rabbitmq-server-ea3a6f7870b33c5dc5ad9d6adef0de17e5d8ae67.tar.gz