Tweak our hash bucket splitting rules

Prior to this patch we resized hashes when after inserting a key the load factor of the hash reached 1 (load factor= keys / buckets). This patch makes two subtle changes to this logic: 1. We split only after inserting a key into an utilized bucket, 2. and the maximum load factor exceeds 0.667 The intent and effect of this change is to increase our hash tables efficiency. Reducing the maximum load factor 0.667 means that we should have much less keys in collision overall, at the cost of some unutilized space (2/3rds was chosen as it is easier to calculate than 0.7). On the other hand, only splitting after a collision means in theory that we execute the "final split" less often. Additionally, insertin a key into an unused bucket increases the efficiency of the hash, without changing the worst case.[1] In other words without increasing collisions we use the space in our hashes more efficiently. A side effect of this hash is that the size of a hash is more sensitive to key insert order. A set of keys with some collisions might be one size if those collisions were encountered early, or another if they were encountered later. Assuming random distribution of hash values about 50% of hashes should be smaller than they would be without this rule. The two changes complement each other, as changing the maximum load factor decreases the chance of a collision, but changing to only split after a collision means that we won't waste as much of that space we might. [1] Since I personally didnt find this obvious at first here is my explanation: The old behavior was that we doubled the number of buckets when the number of keys in the hash matched that of buckets. So on inserting the Kth key into a K bucket hash, we would double the number of buckets. Thus the worse case prior to this patch was a hash containing K-1 keys which all hash into a single bucket, and the post split worst case behavior would be having K items in a single bucket of a hash with 2*K buckets total. The new behavior says that we double the size of the hash once inserting an item into an occupied bucket and after doing so we exceeed the maximum load factor (leave aside the change in maximum load factor in this patch). If we insert into an occupied bucket (including the worse case bucket) then we trigger a key split, and we have exactly the same cases as before. If we insert into an empty bucket then we now have a worst case of K-1 items in one bucket, and 1 item in another, in a hash with K buckets, thus the worst case has not changed.
author: Yves Orton <demerphq@gmail.com> 2017-03-22 15:59:31 +0100
committer: Yves Orton <demerphq@gmail.com> 2017-04-23 11:44:17 +0200
commit: 05f97de032fe95cabe8c9f6d6c0a5897b1616194 (patch)
tree: f0c829ea57a9a66e3c253310c7db15ca04958b30 /hv.c
parent: a4283faf7092ec370914ee3e4e7afeddd0115689 (diff)
download: perl-05f97de032fe95cabe8c9f6d6c0a5897b1616194.tar.gz
1 files changed, 31 insertions, 12 deletions
diff --git a/hv.c b/hv.c
index 85e42d13e0..3bd62c6f9d 100644
--- a/hv.c
+++ b/hv.c
@@ -34,7 +34,11 @@ holds the key and hash value.
 #define PERL_HASH_INTERNAL_ACCESS
 #include "perl.h"
 
-#define DO_HSPLIT(xhv) ((xhv)->xhv_keys > (xhv)->xhv_max) /* HvTOTALKEYS(hv) > HvMAX(hv) */
+/* we split when we collide and we have a load factor over 0.667.
+ * NOTE if you change this formula so we split earlier than previously
+ * you MUST change the logic in hv_ksplit()
+ */
+#define DO_HSPLIT(xhv) ( ((xhv)->xhv_keys + ((xhv)->xhv_keys >> 1))  > (xhv)->xhv_max )
 #define HV_FILL_THRESHOLD 31
 
 static const char S_strtab_error[]
@@ -343,6 +347,7 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
     HE **oentry;
     SV *sv;
     bool is_utf8;
+    bool in_collision;
     int masked_flags;
     const int return_svp = action & HV_FETCH_JUST_SV;
     HEK *keysv_hek = NULL;
@@ -835,6 +840,7 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
      * making it harder to see if there is a collision. We also
      * reset the iterator randomizer if there is one.
      */
+    in_collision = *oentry != NULL;
     if ( *oentry && PL_HASH_RAND_BITS_ENABLED) {
         PL_hash_rand_bits++;
         PL_hash_rand_bits= ROTL_UV(PL_hash_rand_bits,1);
@@ -877,7 +883,7 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
 	HvHASKFLAGS_on(hv);
 
     xhv->xhv_keys++; /* HvTOTALKEYS(hv)++ */
-    if ( DO_HSPLIT(xhv) ) {
+    if ( in_collision && DO_HSPLIT(xhv) ) {
         const STRLEN oldsize = xhv->xhv_max + 1;
         const U32 items = (U32)HvPLACEHOLDERS_get(hv);
 
@@ -1450,29 +1456,42 @@ void
 Perl_hv_ksplit(pTHX_ HV *hv, IV newmax)
 {
     XPVHV* xhv = (XPVHV*)SvANY(hv);
-    const I32 oldsize = (I32) xhv->xhv_max+1; /* HvMAX(hv)+1 (sick) */
+    const I32 oldsize = (I32) xhv->xhv_max+1;       /* HvMAX(hv)+1 */
     I32 newsize;
+    I32 wantsize;
+    I32 trysize;
     char *a;
 
     PERL_ARGS_ASSERT_HV_KSPLIT;
 
-    newsize = (I32) newmax;			/* possible truncation here */
-    if (newsize != newmax || newmax <= oldsize)
+    wantsize = (I32) newmax;                            /* possible truncation here */
+    if (wantsize != newmax)
 	return;
-    while ((newsize & (1 + ~newsize)) != newsize) {
-	newsize &= ~(newsize & (1 + ~newsize));	/* get proper power of 2 */
+
+    wantsize= wantsize + (wantsize >> 1);           /* wantsize *= 1.5 */
+    if (wantsize < newmax)                          /* overflow detection */
+        return;
+
+    newsize = oldsize;
+    while (wantsize > newsize) {
+        trysize = newsize << 1;
+        if (trysize > newsize) {
+            newsize = trysize;
+        } else {
+            /* we overflowed */
+            return;
+        }
     }
-    if (newsize < newmax)
-	newsize *= 2;
-    if (newsize < newmax)
-	return;					/* overflow detection */
+
+    if (newsize <= oldsize)
+        return;                                            /* overflow detection */
 
     a = (char *) HvARRAY(hv);
     if (a) {
         hsplit(hv, oldsize, newsize);
     } else {
         Newxz(a, PERL_HV_ARRAY_ALLOC_BYTES(newsize), char);
-        xhv->xhv_max = --newsize;
+        xhv->xhv_max = newsize - 1;
         HvARRAY(hv) = (HE **) a;
     }
 }
author	Yves Orton <demerphq@gmail.com>	2017-03-22 15:59:31 +0100
committer	Yves Orton <demerphq@gmail.com>	2017-04-23 11:44:17 +0200
commit	05f97de032fe95cabe8c9f6d6c0a5897b1616194 (patch)
tree	f0c829ea57a9a66e3c253310c7db15ca04958b30 /hv.c
parent	a4283faf7092ec370914ee3e4e7afeddd0115689 (diff)
download	perl-05f97de032fe95cabe8c9f6d6c0a5897b1616194.tar.gz