Hash function switched to murmurhash2.

The previously used hash function, djbhash, is not secure against collision attacks even when the seed is randomized as there are simple ways to find seed-independent collisions. The new hash function appears to be safe (or much harder to exploit at least) in this case, and has better distribution. Better distribution does not always means that's better. For instance in a fast benchmark with "DEBUG POPULATE 1000000" I obtained the following results: 1.6 seconds with djbhash 2.0 seconds with murmurhash2 This is due to the fact that djbhash will hash objects that follow the pattern `prefix:<id>` and where the id is numerically near, to near buckets. This improves the locality. However in other access patterns with keys that have no relation murmurhash2 has some (apparently minimal) speed advantage. On the other hand a better distribution should significantly improve the quality of the distribution of elements returned with dictGetRandomKey() that is used in SPOP, SRANDMEMBER, RANDOMKEY, and other commands. Everything considered, and under the suspect that this commit fixes a security issue in Redis, we are switching to the new hash function. If some serious speed regression will be found in the future we'll be able to step back easiliy. This commit fixes issue #663.
author: antirez <antirez@gmail.com> 2012-10-03 19:14:46 +0200
committer: antirez <antirez@gmail.com> 2012-10-05 11:16:40 +0200
commit: 99c3338c23ba0d4ca4ecb85e0f23ff3f3c3e81f3 (patch)
tree: 39a9d607a0f85911517245a30ba85b00cf3c7d6a
parent: 05e06e154332cf107d30092dddb94b6b35967ce5 (diff)
download: redis-99c3338c23ba0d4ca4ecb85e0f23ff3f3c3e81f3.tar.gz
2 files changed, 57 insertions, 13 deletions
diff --git a/src/dict.c b/src/dict.c
index 66bb983a8..ec58e8200 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -85,29 +85,73 @@ unsigned int dictIdentityHashFunction(unsigned int key)
     return key;
 }
 
-static int dict_hash_function_seed = 5381;
+static uint32_t dict_hash_function_seed = 5381;
 
-void dictSetHashFunctionSeed(unsigned int seed) {
+void dictSetHashFunctionSeed(uint32_t seed) {
     dict_hash_function_seed = seed;
 }
 
-unsigned int dictGetHashFunctionSeed(void) {
+uint32_t dictGetHashFunctionSeed(void) {
     return dict_hash_function_seed;
 }
 
-/* Generic hash function (a popular one from Bernstein).
- * I tested a few and this was the best. */
-unsigned int dictGenHashFunction(const unsigned char *buf, int len) {
-    unsigned int hash = dict_hash_function_seed;
+/* MurmurHash2, by Austin Appleby
+ * Note - This code makes a few assumptions about how your machine behaves -
+ * 1. We can read a 4-byte value from any address without crashing
+ * 2. sizeof(int) == 4
+ *
+ * And it has a few limitations -
+ *
+ * 1. It will not work incrementally.
+ * 2. It will not produce the same results on little-endian and big-endian
+ *    machines.
+ */
+unsigned int dictGenHashFunction(const void *key, int len) {
+    /* 'm' and 'r' are mixing constants generated offline.
+     They're not really 'magic', they just happen to work well.  */
+    uint32_t seed = dict_hash_function_seed;
+    const uint32_t m = 0x5bd1e995;
+    const int r = 24;
 
-    while (len--)
-        hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */
-    return hash;
+    /* Initialize the hash to a 'random' value */
+    uint32_t h = seed ^ len;
+
+    /* Mix 4 bytes at a time into the hash */
+    const unsigned char *data = (const unsigned char *)key;
+
+    while(len >= 4) {
+        uint32_t k = *(uint32_t*)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    /* Handle the last few bytes of the input array  */
+    switch(len) {
+    case 3: h ^= data[2] << 16;
+    case 2: h ^= data[1] << 8;
+    case 1: h ^= data[0]; h *= m;
+    };
+
+    /* Do a few final mixes of the hash to ensure the last few
+     * bytes are well-incorporated. */
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return (unsigned int)h;
 }
 
-/* And a case insensitive version */
+/* And a case insensitive hash function (based on djb hash) */
 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len) {
-    unsigned int hash = dict_hash_function_seed;
+    unsigned int hash = (unsigned int)dict_hash_function_seed;
 
     while (len--)
         hash = ((hash << 5) + hash) + (tolower(*buf++)); /* hash * 33 + c */
diff --git a/src/dict.h b/src/dict.h
index 5f8569535..f480ae539 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -155,7 +155,7 @@ dictEntry *dictNext(dictIterator *iter);
 void dictReleaseIterator(dictIterator *iter);
 dictEntry *dictGetRandomKey(dict *d);
 void dictPrintStats(dict *d);
-unsigned int dictGenHashFunction(const unsigned char *buf, int len);
+unsigned int dictGenHashFunction(const void *key, int len);
 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len);
 void dictEmpty(dict *d);
 void dictEnableResize(void);
author	antirez <antirez@gmail.com>	2012-10-03 19:14:46 +0200
committer	antirez <antirez@gmail.com>	2012-10-05 11:16:40 +0200
commit	99c3338c23ba0d4ca4ecb85e0f23ff3f3c3e81f3 (patch)
tree	39a9d607a0f85911517245a30ba85b00cf3c7d6a
parent	05e06e154332cf107d30092dddb94b6b35967ce5 (diff)
download	redis-99c3338c23ba0d4ca4ecb85e0f23ff3f3c3e81f3.tar.gz