-rw-r--r--   src/defrag.c                    15
-rw-r--r--   src/object.c                    91
-rw-r--r--   src/server.c                    61
-rw-r--r--   src/server.h                    19
-rw-r--r--   src/zmalloc.c                   29
-rw-r--r--   src/zmalloc.h                    2
-rw-r--r--   tests/test_helper.tcl            2
-rw-r--r--   tests/unit/memefficiency.tcl   186
8 files changed, 333 insertions(+), 72 deletions(-)
diff --git a/src/defrag.c b/src/defrag.c
index 6d15a4c16..aae72adcb 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -700,18 +700,9 @@ void defragDictBucketCallback(void *privdata, dictEntry **bucketref) {
* or not, a false detection can cause the defragmenter to waste a lot of CPU
* without the possibility of getting any results. */
float getAllocatorFragmentation(size_t *out_frag_bytes) {
- size_t epoch = 1, allocated = 0, resident = 0, active = 0, sz = sizeof(size_t);
- /* Update the statistics cached by mallctl. */
- je_mallctl("epoch", &epoch, &sz, &epoch, sz);
- /* Unlike RSS, this does not include RSS from shared libraries and other non
- * heap mappings. */
- je_mallctl("stats.resident", &resident, &sz, NULL, 0);
- /* Unlike resident, this doesn't not include the pages jemalloc reserves
- * for re-use (purge will clean that). */
- je_mallctl("stats.active", &active, &sz, NULL, 0);
- /* Unlike zmalloc_used_memory, this matches the stats.resident by taking
- * into account all allocations done by this process (not only zmalloc). */
- je_mallctl("stats.allocated", &allocated, &sz, NULL, 0);
+ size_t resident = server.cron_malloc_stats.allocator_resident;
+ size_t active = server.cron_malloc_stats.allocator_active;
+ size_t allocated = server.cron_malloc_stats.allocator_allocated;
float frag_pct = ((float)active / allocated)*100 - 100;
size_t frag_bytes = active - allocated;
float rss_pct = ((float)resident / allocated)*100 - 100;
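For intuition, a worked example with illustrative numbers (not from a real run): if the cron-sampled stats were allocated = 100MB, active = 110MB and resident = 120MB, then frag_pct = (110/100)*100 - 100 = 10% and frag_bytes = active - allocated = 10MB, while rss_pct = (120/100)*100 - 100 = 20%. Only the first pair is fragmentation the defragger can act on; the gap between resident and active is pages jemalloc reserves for re-use, which MEMORY PURGE can release.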
diff --git a/src/object.c b/src/object.c
index 395ec84df..1f2a3ed29 100644
--- a/src/object.c
+++ b/src/object.c
@@ -876,8 +876,23 @@ struct redisMemOverhead *getMemoryOverheadData(void) {
mh->total_allocated = zmalloc_used;
mh->startup_allocated = server.initial_memory_usage;
mh->peak_allocated = server.stat_peak_memory;
- mh->fragmentation =
- zmalloc_get_fragmentation_ratio(server.resident_set_size);
+ mh->total_frag =
+ (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.zmalloc_used;
+ mh->total_frag_bytes =
+ server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used;
+ mh->allocator_frag =
+ (float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated;
+ mh->allocator_frag_bytes =
+ server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated;
+ mh->allocator_rss =
+ (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active;
+ mh->allocator_rss_bytes =
+ server.cron_malloc_stats.allocator_resident - server.cron_malloc_stats.allocator_active;
+ mh->rss_extra =
+ (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.allocator_resident;
+ mh->rss_extra_bytes =
+ server.cron_malloc_stats.process_rss - server.cron_malloc_stats.allocator_resident;
+
mem_total += server.initial_memory_usage;
mem = 0;
@@ -980,6 +995,9 @@ sds getMemoryDoctorReport(void) {
int empty = 0; /* Instance is empty or almost empty. */
int big_peak = 0; /* Memory peak is much larger than used mem. */
int high_frag = 0; /* High fragmentation. */
+ int high_alloc_frag = 0;/* High allocator fragmentation. */
+ int high_proc_rss = 0; /* High process rss overhead. */
+ int high_alloc_rss = 0; /* High rss overhead. */
int big_slave_buf = 0; /* Slave buffers are too big. */
int big_client_buf = 0; /* Client buffers are too big. */
int num_reports = 0;
@@ -995,12 +1013,30 @@ sds getMemoryDoctorReport(void) {
num_reports++;
}
- /* Fragmentation is higher than 1.4? */
- if (mh->fragmentation > 1.4) {
+    /* Fragmentation is higher than 1.4 and 10MB? */
+ if (mh->total_frag > 1.4 && mh->total_frag_bytes > 10<<20) {
high_frag = 1;
num_reports++;
}
+ /* External fragmentation is higher than 1.1 and 10MB? */
+ if (mh->allocator_frag > 1.1 && mh->allocator_frag_bytes > 10<<20) {
+ high_alloc_frag = 1;
+ num_reports++;
+ }
+
+    /* Allocator rss is higher than 1.1 and 10MB? */
+ if (mh->allocator_rss > 1.1 && mh->allocator_rss_bytes > 10<<20) {
+ high_alloc_rss = 1;
+ num_reports++;
+ }
+
+    /* Non-allocator rss is higher than 1.1 and 10MB? */
+ if (mh->rss_extra > 1.1 && mh->rss_extra_bytes > 10<<20) {
+ high_proc_rss = 1;
+ num_reports++;
+ }
+
/* Clients using more than 200k each average? */
long numslaves = listLength(server.slaves);
long numclients = listLength(server.clients)-numslaves;
@@ -1034,7 +1070,16 @@ sds getMemoryDoctorReport(void) {
s = sdscat(s," * Peak memory: In the past this instance used more than 150% the memory that is currently using. The allocator is normally not able to release memory after a peak, so you can expect to see a big fragmentation ratio, however this is actually harmless and is only due to the memory peak, and if the Redis instance Resident Set Size (RSS) is currently bigger than expected, the memory will be used as soon as you fill the Redis instance with more data. If the memory peak was only occasional and you want to try to reclaim memory, please try the MEMORY PURGE command, otherwise the only other option is to shutdown and restart the instance.\n\n");
}
if (high_frag) {
- s = sdscatprintf(s," * High fragmentation: This instance has a memory fragmentation greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
+ s = sdscatprintf(s," * High total RSS: This instance has a memory fragmentation and RSS overhead greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
+ }
+ if (high_alloc_frag) {
+ s = sdscatprintf(s," * High allocator fragmentation: This instance has an allocator external fragmentation greater than 1.1. This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. You can try enabling 'activedefrag' config option.\n\n");
+ }
+ if (high_alloc_rss) {
+        s = sdscatprintf(s," * High allocator RSS overhead: This instance has an allocator RSS overhead greater than 1.1 (this means that the Resident Set Size of the allocator is much larger than what the allocator actually holds). This problem is usually due to a large peak memory (check if there is a peak memory entry above in the report); you can try the MEMORY PURGE command to reclaim it.\n\n");
+ }
+ if (high_proc_rss) {
+        s = sdscatprintf(s," * High process RSS overhead: This instance has a non-allocator RSS overhead greater than 1.1 (this means that the Resident Set Size of the Redis process is much larger than the RSS the allocator holds). This problem may be due to Lua scripts or Modules.\n\n");
}
if (big_slave_buf) {
s = sdscat(s," * Big slave buffers: The slave output buffers in this instance are greater than 10MB for each slave (on average). This likely means that there is some slave instance that is struggling receiving data, either because it is too slow or because of networking issues. As a result, data piles on the master output buffers. Please try to identify what slave is not receiving data correctly and why. You can use the INFO output in order to check the slaves delays and the CLIENT LIST command to check the output buffers of each slave.\n\n");
@@ -1148,7 +1193,7 @@ void memoryCommand(client *c) {
} else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) {
struct redisMemOverhead *mh = getMemoryOverheadData();
- addReplyMultiBulkLen(c,(14+mh->num_dbs)*2);
+ addReplyMultiBulkLen(c,(24+mh->num_dbs)*2);
addReplyBulkCString(c,"peak.allocated");
addReplyLongLong(c,mh->peak_allocated);
@@ -1202,8 +1247,38 @@ void memoryCommand(client *c) {
addReplyBulkCString(c,"peak.percentage");
addReplyDouble(c,mh->peak_perc);
- addReplyBulkCString(c,"fragmentation");
- addReplyDouble(c,mh->fragmentation);
+ addReplyBulkCString(c,"allocator.allocated");
+ addReplyLongLong(c,server.cron_malloc_stats.allocator_allocated);
+
+ addReplyBulkCString(c,"allocator.active");
+ addReplyLongLong(c,server.cron_malloc_stats.allocator_active);
+
+ addReplyBulkCString(c,"allocator.resident");
+ addReplyLongLong(c,server.cron_malloc_stats.allocator_resident);
+
+ addReplyBulkCString(c,"allocator-fragmentation.ratio");
+ addReplyDouble(c,mh->allocator_frag);
+
+ addReplyBulkCString(c,"allocator-fragmentation.bytes");
+ addReplyLongLong(c,mh->allocator_frag_bytes);
+
+ addReplyBulkCString(c,"allocator-rss.ratio");
+ addReplyDouble(c,mh->allocator_rss);
+
+ addReplyBulkCString(c,"allocator-rss.bytes");
+ addReplyLongLong(c,mh->allocator_rss_bytes);
+
+ addReplyBulkCString(c,"rss-overhead.ratio");
+ addReplyDouble(c,mh->rss_extra);
+
+ addReplyBulkCString(c,"rss-overhead.bytes");
+ addReplyLongLong(c,mh->rss_extra_bytes);
+
+ addReplyBulkCString(c,"fragmentation"); /* this is the total RSS overhead, including fragmentation */
+ addReplyDouble(c,mh->total_frag); /* it is kept here for backwards compatibility */
+
+ addReplyBulkCString(c,"fragmentation.bytes");
+ addReplyLongLong(c,mh->total_frag_bytes);
freeMemoryOverheadData(mh);
} else if (!strcasecmp(c->argv[1]->ptr,"malloc-stats") && c->argc == 2) {
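To make the new reply concrete, here is a hypothetical MEMORY STATS excerpt using the same illustrative numbers as above plus an assumed process RSS of 130MB and zmalloc_used of 100MB (the values only demonstrate how the fields relate, they are not real output):

allocator.allocated            104857600
allocator.active               115343360
allocator.resident             125829120
allocator-fragmentation.ratio  1.10
allocator-fragmentation.bytes  10485760
allocator-rss.ratio            1.09
allocator-rss.bytes            10485760
rss-overhead.ratio             1.08
rss-overhead.bytes             10485760
fragmentation                  1.30
fragmentation.bytes            31457280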
diff --git a/src/server.c b/src/server.c
index c6d292297..294da6437 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1007,8 +1007,33 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
if (zmalloc_used_memory() > server.stat_peak_memory)
server.stat_peak_memory = zmalloc_used_memory();
- /* Sample the RSS here since this is a relatively slow call. */
- server.resident_set_size = zmalloc_get_rss();
+ run_with_period(10) {
+        /* Sample the RSS and other metrics here since this is a relatively slow call.
+         * We must sample zmalloc_used at the same time we take the rss, otherwise
+         * the frag ratio calculation may be off (ratio of two samples taken at different times) */
+ server.cron_malloc_stats.process_rss = zmalloc_get_rss();
+ server.cron_malloc_stats.zmalloc_used = zmalloc_used_memory();
+        /* Sampling the allocator info can be slow too.
+         * The fragmentation ratio it'll show is potentially more accurate:
+         * it excludes other RSS pages such as shared libraries, Lua and other
+         * non-zmalloc allocations, and allocator reserved pages that can be purged (all not actual frag) */
+ zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated,
+ &server.cron_malloc_stats.allocator_active,
+ &server.cron_malloc_stats.allocator_resident);
+        /* In case the allocator isn't providing these stats, fake them so that
+         * fragmentation info still shows some (inaccurate) metrics */
+ if (!server.cron_malloc_stats.allocator_resident) {
+            /* Lua memory isn't part of zmalloc_used, but it is part of the process RSS,
+             * so we must deduct it in order to be able to calculate a correct
+             * "allocator fragmentation" ratio */
+ size_t lua_memory = lua_gc(server.lua,LUA_GCCOUNT,0)*1024LL;
+ server.cron_malloc_stats.allocator_resident = server.cron_malloc_stats.process_rss - lua_memory;
+ }
+ if (!server.cron_malloc_stats.allocator_active)
+ server.cron_malloc_stats.allocator_active = server.cron_malloc_stats.allocator_resident;
+ if (!server.cron_malloc_stats.allocator_allocated)
+ server.cron_malloc_stats.allocator_allocated = server.cron_malloc_stats.zmalloc_used;
+ }
/* We received a SIGTERM, shutting down here in a safe way, as it is
* not ok doing so inside the signal handler. */
@@ -1924,7 +1949,11 @@ void initServer(void) {
server.stat_peak_memory = 0;
server.stat_rdb_cow_bytes = 0;
server.stat_aof_cow_bytes = 0;
- server.resident_set_size = 0;
+ server.cron_malloc_stats.zmalloc_used = 0;
+ server.cron_malloc_stats.process_rss = 0;
+ server.cron_malloc_stats.allocator_allocated = 0;
+ server.cron_malloc_stats.allocator_active = 0;
+ server.cron_malloc_stats.allocator_resident = 0;
server.lastbgsave_status = C_OK;
server.aof_last_write_status = C_OK;
server.aof_last_write_errno = 0;
@@ -2974,7 +3003,7 @@ sds genRedisInfoString(char *section) {
bytesToHuman(peak_hmem,server.stat_peak_memory);
bytesToHuman(total_system_hmem,total_system_mem);
bytesToHuman(used_memory_lua_hmem,memory_lua);
- bytesToHuman(used_memory_rss_hmem,server.resident_set_size);
+ bytesToHuman(used_memory_rss_hmem,server.cron_malloc_stats.process_rss);
bytesToHuman(maxmemory_hmem,server.maxmemory);
if (sections++) info = sdscat(info,"\r\n");
@@ -2991,6 +3020,9 @@ sds genRedisInfoString(char *section) {
"used_memory_startup:%zu\r\n"
"used_memory_dataset:%zu\r\n"
"used_memory_dataset_perc:%.2f%%\r\n"
+ "allocator_allocated:%zu\r\n"
+ "allocator_active:%zu\r\n"
+ "allocator_resident:%zu\r\n"
"total_system_memory:%lu\r\n"
"total_system_memory_human:%s\r\n"
"used_memory_lua:%lld\r\n"
@@ -2998,13 +3030,20 @@ sds genRedisInfoString(char *section) {
"maxmemory:%lld\r\n"
"maxmemory_human:%s\r\n"
"maxmemory_policy:%s\r\n"
+ "allocator_frag_ratio:%.2f\r\n"
+ "allocator_frag_bytes:%zu\r\n"
+ "allocator_rss_ratio:%.2f\r\n"
+ "allocator_rss_bytes:%zu\r\n"
+ "rss_overhead_ratio:%.2f\r\n"
+ "rss_overhead_bytes:%zu\r\n"
"mem_fragmentation_ratio:%.2f\r\n"
+ "mem_fragmentation_bytes:%zu\r\n"
"mem_allocator:%s\r\n"
"active_defrag_running:%d\r\n"
"lazyfree_pending_objects:%zu\r\n",
zmalloc_used,
hmem,
- server.resident_set_size,
+ server.cron_malloc_stats.process_rss,
used_memory_rss_hmem,
server.stat_peak_memory,
peak_hmem,
@@ -3013,6 +3052,9 @@ sds genRedisInfoString(char *section) {
mh->startup_allocated,
mh->dataset,
mh->dataset_perc,
+ server.cron_malloc_stats.allocator_allocated,
+ server.cron_malloc_stats.allocator_active,
+ server.cron_malloc_stats.allocator_resident,
(unsigned long)total_system_mem,
total_system_hmem,
memory_lua,
@@ -3020,7 +3062,14 @@ sds genRedisInfoString(char *section) {
server.maxmemory,
maxmemory_hmem,
evict_policy,
- mh->fragmentation,
+ mh->allocator_frag,
+ mh->allocator_frag_bytes,
+ mh->allocator_rss,
+ mh->allocator_rss_bytes,
+ mh->rss_extra,
+ mh->rss_extra_bytes,
+ mh->total_frag, /* this is the total RSS overhead, including fragmentation, */
+ mh->total_frag_bytes, /* named so for backwards compatibility */
ZMALLOC_LIB,
server.active_defrag_running,
lazyfreeGetPendingObjectsCount()
diff --git a/src/server.h b/src/server.h
index 3256278ea..43086eba2 100644
--- a/src/server.h
+++ b/src/server.h
@@ -840,7 +840,14 @@ struct redisMemOverhead {
size_t bytes_per_key;
float dataset_perc;
float peak_perc;
- float fragmentation;
+ float total_frag;
+ size_t total_frag_bytes;
+ float allocator_frag;
+ size_t allocator_frag_bytes;
+ float allocator_rss;
+ size_t allocator_rss_bytes;
+ float rss_extra;
+ size_t rss_extra_bytes;
size_t num_dbs;
struct {
size_t dbid;
@@ -869,6 +876,14 @@ typedef struct rdbSaveInfo {
#define RDB_SAVE_INFO_INIT {-1,0,"000000000000000000000000000000",-1}
+typedef struct malloc_stats {
+ size_t zmalloc_used;
+ size_t process_rss;
+ size_t allocator_allocated;
+ size_t allocator_active;
+ size_t allocator_resident;
+} malloc_stats;
+
/*-----------------------------------------------------------------------------
* Global server state
*----------------------------------------------------------------------------*/
@@ -971,7 +986,7 @@ struct redisServer {
long long slowlog_entry_id; /* SLOWLOG current entry ID */
long long slowlog_log_slower_than; /* SLOWLOG time limit (to get logged) */
unsigned long slowlog_max_len; /* SLOWLOG max number of items logged */
- size_t resident_set_size; /* RSS sampled in serverCron(). */
+ malloc_stats cron_malloc_stats; /* sampled in serverCron(). */
long long stat_net_input_bytes; /* Bytes read from network. */
long long stat_net_output_bytes; /* Bytes written to network. */
size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */
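Read together, the new cron_malloc_stats counters split the old all-in-one fragmentation figure into three narrower ratios (a restatement of the computations in getMemoryOverheadData() above, not new logic):

allocator_frag_ratio    = allocator_active   / allocator_allocated  (fragmentation the active defragger can fix)
allocator_rss_ratio     = allocator_resident / allocator_active     (pages the allocator holds for re-use; MEMORY PURGE can reclaim)
rss_overhead_ratio      = process_rss        / allocator_resident   (non-allocator RSS such as shared libraries, Lua, Modules)
mem_fragmentation_ratio = process_rss        / zmalloc_used         (the historical ratio, kept for backwards compatibility)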
diff --git a/src/zmalloc.c b/src/zmalloc.c
index 094dd80fa..7cb4e3729 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -297,10 +297,33 @@ size_t zmalloc_get_rss(void) {
}
#endif
-/* Fragmentation = RSS / allocated-bytes */
-float zmalloc_get_fragmentation_ratio(size_t rss) {
- return (float)rss/zmalloc_used_memory();
+#if defined(USE_JEMALLOC)
+int zmalloc_get_allocator_info(size_t *allocated,
+ size_t *active,
+ size_t *resident) {
+ size_t epoch = 1, sz = sizeof(size_t);
+ *allocated = *resident = *active = 0;
+ /* Update the statistics cached by mallctl. */
+ je_mallctl("epoch", &epoch, &sz, &epoch, sz);
+ /* Unlike RSS, this does not include RSS from shared libraries and other non
+ * heap mappings. */
+ je_mallctl("stats.resident", resident, &sz, NULL, 0);
+    /* Unlike resident, this doesn't include the pages jemalloc reserves
+     * for re-use (purge will clean that). */
+ je_mallctl("stats.active", active, &sz, NULL, 0);
+ /* Unlike zmalloc_used_memory, this matches the stats.resident by taking
+ * into account all allocations done by this process (not only zmalloc). */
+ je_mallctl("stats.allocated", allocated, &sz, NULL, 0);
+ return 1;
}
+#else
+int zmalloc_get_allocator_info(size_t *allocated,
+ size_t *active,
+ size_t *resident) {
+ *allocated = *resident = *active = 0;
+ return 1;
+}
+#endif
/* Get the sum of the specified field (converted form kb to bytes) in
* /proc/self/smaps. The field must be specified with trailing ":" as it
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 64f2f36aa..3c926bcbe 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -79,8 +79,8 @@ void zfree(void *ptr);
char *zstrdup(const char *s);
size_t zmalloc_used_memory(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
-float zmalloc_get_fragmentation_ratio(size_t rss);
size_t zmalloc_get_rss(void);
+int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident);
size_t zmalloc_get_private_dirty(long pid);
size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
size_t zmalloc_get_memory_size(void);
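A minimal sketch of how a caller outside serverCron() might consume the new API; it assumes compilation inside the Redis source tree, and the zero-stats fallback here is a simplified version of the one serverCron() applies:

#include <stdio.h>
#include "zmalloc.h"

/* Illustrative helper (not part of this patch): print the allocator ratios
 * from a single consistent sample of the allocator counters. */
static void print_allocator_ratios(void) {
    size_t allocated, active, resident;
    zmalloc_get_allocator_info(&allocated, &active, &resident);
    /* Non-jemalloc builds report zeros; fall back so the ratios stay
     * defined (they will simply read 1.0). */
    if (!allocated) allocated = zmalloc_used_memory();
    if (!active) active = allocated;
    if (!resident) resident = active;
    printf("allocator_frag_ratio: %.2f\n", (float)active / allocated);
    printf("allocator_rss_ratio: %.2f\n", (float)resident / active);
}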
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 7def9a7f6..3114f844c 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -452,6 +452,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
puts $t
}
exit 0
+ } elseif {$opt eq {--verbose}} {
+ set ::verbose 1
} elseif {$opt eq {--client}} {
set ::client 1
set ::test_server_port $arg
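The new flag is used by the defrag tests below to print the observed fragmentation ratios and defrag-cycle latencies; a typical invocation would be something along the lines of ./runtest --single unit/memefficiency --verbose, optionally with --accurate to enable the larger 1m-field variant of the big-keys test.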
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index f452f0224..4daf85c7f 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -36,50 +36,156 @@ start_server {tags {"memefficiency"}} {
}
}
-if 0 {
- start_server {tags {"defrag"}} {
- if {[string match {*jemalloc*} [s mem_allocator]]} {
- test "Active defrag" {
- r config set activedefrag no
- r config set active-defrag-threshold-lower 5
- r config set active-defrag-ignore-bytes 2mb
- r config set maxmemory 100mb
- r config set maxmemory-policy allkeys-lru
- r debug populate 700000 asdf 150
- r debug populate 170000 asdf 300
- set frag [s mem_fragmentation_ratio]
- assert {$frag >= 1.7}
- r config set activedefrag yes
- after 1500 ;# active defrag tests the status once a second.
- set hits [s active_defrag_hits]
+start_server {tags {"defrag"}} {
+ if {[string match {*jemalloc*} [s mem_allocator]]} {
+ test "Active defrag" {
+ r config set activedefrag no
+ r config set active-defrag-threshold-lower 5
+ r config set active-defrag-cycle-min 25
+ r config set active-defrag-cycle-max 75
+ r config set active-defrag-ignore-bytes 2mb
+ r config set maxmemory 100mb
+ r config set maxmemory-policy allkeys-lru
+ r debug populate 700000 asdf 150
+ r debug populate 170000 asdf 300
+ after 20 ;# serverCron only updates the info once in 10ms
+ set frag [s allocator_frag_ratio]
+ if {$::verbose} {
+ puts "frag $frag"
+ }
+ assert {$frag >= 1.4}
+ r config set activedefrag yes
+
+ # wait for the active defrag to start working (decision once a second)
+ wait_for_condition 50 100 {
+ [s active_defrag_running] ne 0
+ } else {
+ fail "defrag not started."
+ }
+
+ # wait for the active defrag to stop working
+ wait_for_condition 100 100 {
+ [s active_defrag_running] eq 0
+ } else {
+ puts [r info memory]
+ fail "defrag didn't stop."
+ }
+
+            # test that the fragmentation is lower
+ after 20 ;# serverCron only updates the info once in 10ms
+ set frag [s allocator_frag_ratio]
+ if {$::verbose} {
+ puts "frag $frag"
+ }
+ assert {$frag < 1.1}
+ } {}
- # wait for the active defrag to stop working
- set tries 0
- while { True } {
- incr tries
- after 500
- set prev_hits $hits
- set hits [s active_defrag_hits]
- if {$hits == $prev_hits} {
- break
- }
- assert {$tries < 100}
+ test "Active defrag big keys" {
+ r flushdb
+ r config resetstat
+ r config set activedefrag no
+ r config set active-defrag-max-scan-fields 1000
+ r config set active-defrag-threshold-lower 5
+ r config set active-defrag-cycle-min 65
+ r config set active-defrag-cycle-max 75
+ r config set active-defrag-ignore-bytes 2mb
+ r config set maxmemory 0
+ r config set list-max-ziplist-size 5 ;# list of 10k items will have 2000 quicklist nodes
+ r hmset hash h1 v1 h2 v2 h3 v3
+ r lpush list a b c d
+ r zadd zset 0 a 1 b 2 c 3 d
+ r sadd set a b c d
+
+ # create big keys with 10k items
+ set rd [redis_deferring_client]
+ for {set j 0} {$j < 10000} {incr j} {
+ $rd hset bighash $j [concat "asdfasdfasdf" $j]
+ $rd lpush biglist [concat "asdfasdfasdf" $j]
+ $rd zadd bigzset $j [concat "asdfasdfasdf" $j]
+ $rd sadd bigset [concat "asdfasdfasdf" $j]
+ }
+ for {set j 0} {$j < 40000} {incr j} {
+ $rd read ; # Discard replies
+ }
+
+ set expected_frag 1.7
+ if {$::accurate} {
+                # scale the hash to 1m fields in order to have a measurable latency
+ for {set j 10000} {$j < 1000000} {incr j} {
+ $rd hset bighash $j [concat "asdfasdfasdf" $j]
}
+ for {set j 10000} {$j < 1000000} {incr j} {
+ $rd read ; # Discard replies
+ }
+                # creating that big hash increased used_memory, so the relative frag goes down
+ set expected_frag 1.3
+ }
- # TODO: we need to expose more accurate fragmentation info
- # i.e. the allocator used and active pages
- # instead we currently look at RSS so we need to ask for purge
- r memory purge
+ # add a mass of string keys
+ for {set j 0} {$j < 500000} {incr j} {
+ $rd setrange $j 150 a
+ }
+ for {set j 0} {$j < 500000} {incr j} {
+ $rd read ; # Discard replies
+ }
+ assert {[r dbsize] == 500008}
- # Test the the fragmentation is lower and that the defragger
- # stopped working
- set frag [s mem_fragmentation_ratio]
- assert {$frag < 1.55}
- set misses [s active_defrag_misses]
- after 500
- set misses2 [s active_defrag_misses]
- assert {$misses2 == $misses}
+ # create some fragmentation
+ for {set j 0} {$j < 500000} {incr j 2} {
+ $rd del $j
}
- }
+ for {set j 0} {$j < 500000} {incr j 2} {
+ $rd read ; # Discard replies
+ }
+ assert {[r dbsize] == 250008}
+
+ # start defrag
+ after 20 ;# serverCron only updates the info once in 10ms
+ set frag [s allocator_frag_ratio]
+ if {$::verbose} {
+ puts "frag $frag"
+ }
+ assert {$frag >= $expected_frag}
+ r config set latency-monitor-threshold 5
+ r latency reset
+ r config set activedefrag yes
+
+ # wait for the active defrag to start working (decision once a second)
+ wait_for_condition 50 100 {
+ [s active_defrag_running] ne 0
+ } else {
+ fail "defrag not started."
+ }
+
+ # wait for the active defrag to stop working
+ wait_for_condition 500 100 {
+ [s active_defrag_running] eq 0
+ } else {
+ puts [r info memory]
+ puts [r memory malloc-stats]
+ fail "defrag didn't stop."
+ }
+
+            # test that the fragmentation is lower
+ after 20 ;# serverCron only updates the info once in 10ms
+ set frag [s allocator_frag_ratio]
+ set max_latency 0
+ foreach event [r latency latest] {
+ lassign $event eventname time latency max
+ if {$eventname == "active-defrag-cycle"} {
+ set max_latency $max
+ }
+ }
+ if {$::verbose} {
+ puts "frag $frag"
+ puts "max latency $max_latency"
+ puts [r latency latest]
+ puts [r latency history active-defrag-cycle]
+ }
+ assert {$frag < 1.1}
+ # due to high fragmentation, 10hz, and active-defrag-cycle-max set to 75,
+ # we expect max latency to be not much higher than 75ms
+ assert {$max_latency <= 80}
+ } {}
}
}