Added INFO LATENCYSTATS section: latency by percentile distribution/latency by cumulative distribution of latencies (#9462)

# Short description The Redis extended latency stats track per-command latencies and enable: - exporting the per-command percentile distribution via the `INFO LATENCYSTATS` command. **(The percentile distribution is not mergeable between cluster nodes.)** - exporting the per-command cumulative latency distributions via the `LATENCY HISTOGRAM` command. Using the cumulative distribution of latencies we can merge several stats from different cluster nodes to calculate aggregate metrics. By default, the extended latency monitoring is enabled since the overhead of keeping track of the command latency is very small. If you don't want to track extended latency metrics, you can easily disable it at runtime using the command: - `CONFIG SET latency-tracking no` By default, the exported latency percentiles are the p50, p99, and p999. You can alter them at runtime using the command: - `CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"` ## Some details: - The total size per histogram should sit around 40 KiB. We only allocate those 40 KiB when a command is called for the first time. - With regard to the write overhead: as seen below, there is no measurable overhead on the achievable ops/sec or on the full latency spectrum on the client. The measured redis-benchmark results for unstable vs this branch are also included. - We track from 1 nanosecond to 1 second (everything above 1 second is considered +Inf). ## `INFO LATENCYSTATS` exposition format - Format: `latency_percentiles_usec_<CMDNAME>:p0=XX,p50....` ## `LATENCY HISTOGRAM [command ...]` exposition format Return a cumulative distribution of latencies in the format of a histogram for the specified command names. The histogram is composed of a map of time buckets: - Each representing a latency range, between 1 nanosecond and roughly 1 second. - Each bucket covers twice the previous bucket's range. - Empty buckets are not printed. - Everything above 1 sec is considered +Inf. 
- At most there will be about log2(1000000000)≈30 buckets. We reply with a map for each command in the format: `<command name> : { calls: <total command calls>, histogram_usec: { <bucket 1> : <cumulative count>, <bucket 2> : <cumulative count>, ... } }` Co-authored-by: Oran Agra <oran@redislabs.com>
author: filipe oliveira <filipecosta.90@gmail.com> 2022-01-05 12:01:05 +0000
committer: GitHub <noreply@github.com> 2022-01-05 14:01:05 +0200
commit: 5dd15443ac755d9ad2a22aa8c341b25d3de82eb4 (patch)
tree: 96123aeae3e692d6401b8e0138a81b65a206b19f /src/latency.c
parent: 4d3c4cfac70557a28a62266543951d46d0cae584 (diff)
download: redis-5dd15443ac755d9ad2a22aa8c341b25d3de82eb4.tar.gz
1 files changed, 83 insertions, 0 deletions
diff --git a/src/latency.c b/src/latency.c
index 5b3476a81..ff8d66069 100644
--- a/src/latency.c
+++ b/src/latency.c
@@ -490,6 +490,78 @@ sds createLatencyReport(void) {
 
 /* ---------------------- Latency command implementation -------------------- */
 
+/* latencyCommand() helper: replies with a two-entry map, "calls" (the
+ * histogram's total_count) and "histogram_usec", a nested map of
+ * <iterator highest_equivalent_value / 1000> -> <cumulative count>.
+ * Buckets span latencies between 1 nanosecond and roughly 1 second, each one
+ * covering twice the previous bucket's range. Buckets that add no new samples
+ * over the previous one are not printed. Everything above 1 sec is +Inf.
+ * At most there will be about log2(1000000000) ~= 30 buckets. */
+void fillCommandCDF(client *c, struct hdr_histogram* histogram) {
+    addReplyMapLen(c,2);
+    addReplyBulkCString(c,"calls");
+    addReplyLongLong(c,(long long) histogram->total_count);
+    addReplyBulkCString(c,"histogram_usec");
+    void *replylen = addReplyDeferredLen(c); /* size is set once buckets are counted */
+    int samples = 0; /* number of bucket entries actually emitted */
+    struct hdr_iter iter;
+    hdr_iter_log_init(&iter,histogram,1024,2); /* NOTE(review): presumably first bucket=1024, log base=2 -- confirm in hdr_histogram.h */
+    int64_t previous_count = 0;
+    while (hdr_iter_next(&iter)) {
+        const int64_t micros = iter.highest_equivalent_value / 1000; /* map key, reported as usec */
+        const int64_t cumulative_count = iter.cumulative_count;
+        if(cumulative_count > previous_count){ /* emit only buckets that added samples */
+            addReplyLongLong(c,(long long) micros);
+            addReplyLongLong(c,(long long) cumulative_count);
+            samples++;
+        }
+        previous_count = cumulative_count;
+    }
+    setDeferredMapLen(c,replylen,samples);
+}
+
+/* latencyCommand() helper: replies with a map of <command name> -> CDF (see
+ * fillCommandCDF) for every server.commands entry with a latency_histogram. */
+void latencyAllCommandsFillCDF(client *c) {
+    dictIterator *di = dictGetSafeIterator(server.commands);
+    dictEntry *de;
+    struct redisCommand *cmd;
+    void *replylen = addReplyDeferredLen(c); /* map size is set after the scan */
+    int command_with_data = 0; /* number of commands actually emitted */
+    while((de = dictNext(di)) != NULL) {
+        cmd = (struct redisCommand *) dictGetVal(de);
+        if (!cmd->latency_histogram) /* no histogram for this command: omit it */
+            continue;
+        addReplyBulkCString(c,cmd->name);
+        fillCommandCDF(c, cmd->latency_histogram);
+        command_with_data++;
+    }
+    dictReleaseIterator(di);
+    setDeferredMapLen(c,replylen,command_with_data);
+}
+
+/* latencyCommand() helper: replies with a map of <command name> -> CDF (see
+ * fillCommandCDF) for the command names passed in c->argv[2..argc-1]. */
+void latencySpecificCommandsFillCDF(client *c) {
+    void *replylen = addReplyDeferredLen(c); /* map size is set after the loop */
+    int command_with_data = 0; /* number of commands actually emitted */
+    for (int j = 2; j < c->argc; j++){
+        struct redisCommand *cmd = dictFetchValue(server.commands, c->argv[j]->ptr);
+        /* Names not found in server.commands are left out of the reply */
+        if (cmd == NULL) {
+            continue;
+        }
+        /* Commands without a latency histogram are left out of the reply too */
+        if (!cmd->latency_histogram) {
+            continue;
+        }
+        addReplyBulkCString(c,c->argv[j]->ptr); /* key is the name as given in argv, not cmd->name */
+        fillCommandCDF(c, cmd->latency_histogram);
+        command_with_data++;
+    }
+    setDeferredMapLen(c,replylen,command_with_data);
+}
+
 /* latencyCommand() helper to produce a time-delay reply for all the samples
  * in memory for the specified time series. */
 void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
@@ -582,6 +654,7 @@ sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
  * LATENCY DOCTOR: returns a human readable analysis of instance latency.
  * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
  * LATENCY RESET: reset data of a specified event or all the data if no event provided.
+ * LATENCY HISTOGRAM: return a cumulative distribution of latencies in the format of a histogram for the specified command names.
  */
 void latencyCommand(client *c) {
     struct latencyTimeSeries *ts;
@@ -628,6 +701,13 @@ void latencyCommand(client *c) {
                 resets += latencyResetEvent(c->argv[j]->ptr);
             addReplyLongLong(c,resets);
         }
+    } else if (!strcasecmp(c->argv[1]->ptr,"histogram") && c->argc >= 2) {
+        /* LATENCY HISTOGRAM*/
+        if (c->argc == 2) {
+            latencyAllCommandsFillCDF(c);
+        } else {
+            latencySpecificCommandsFillCDF(c);
+        }
     } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
         const char *help[] = {
 "DOCTOR",
@@ -641,6 +721,9 @@ void latencyCommand(client *c) {
 "RESET [<event> ...]",
 "    Reset latency data of one or more <event> classes.",
 "    (default: reset all data for all event classes)",
+"HISTOGRAM [COMMAND ...]",
+"    Return a cumulative distribution of latencies in the format of a histogram for the specified command names.",
+"    If no commands are specified then all histograms are replied.",
 NULL
         };
         addReplyHelp(c, help);
author	filipe oliveira <filipecosta.90@gmail.com>	2022-01-05 12:01:05 +0000
committer	GitHub <noreply@github.com>	2022-01-05 14:01:05 +0200
commit	5dd15443ac755d9ad2a22aa8c341b25d3de82eb4 (patch)
tree	96123aeae3e692d6401b8e0138a81b65a206b19f /src/latency.c
parent	4d3c4cfac70557a28a62266543951d46d0cae584 (diff)
download	redis-5dd15443ac755d9ad2a22aa8c341b25d3de82eb4.tar.gz