summaryrefslogtreecommitdiff
path: root/hv.c
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2012-11-17 14:12:04 +0100
committerYves Orton <demerphq@gmail.com>2012-11-17 19:19:43 +0100
commit7dc8663964c66a698d31bbdc8e8abed69bddeec3 (patch)
tree92f39aa5770c2f401bbe134db921d68cbe4f758a /hv.c
parent867b16b50f413adfdb2addcab33ad1d6e61da9d5 (diff)
downloadperl-7dc8663964c66a698d31bbdc8e8abed69bddeec3.tar.gz
Hash Function Change - Murmur hash and true per process hash seed
This patch does the following: *) Introduces multiple new hash functions to choose from at build time. This includes Murmur-32, SDBM, DJB2, SipHash, SuperFast, and One-at-a-time. Currently this is handled by muning hv.h. Configure support hopefully to follow. *) Changes the default hash to Murmur hash which is faster than the old default One-at-a-time. *) Rips out the old HvREHASH mechanism and replaces it with a per-process random hash seed. *) Changes the old PL_hash_seed from an interpreter value to a global variable. This means it does not have to be copied during interpreter setup or cloning. *) Changes the format of the PERL_HASH_SEED variable to a hex string so that hash seeds longer than fit in an integer are possible. *) Changes the return of Hash::Util::hash_seed() from a number to a string. This is to accomodate hash functions which have more bits than can be fit in an integer. *) Adds new functions to Hash::Util to improve introspection of hashes -) hash_value() - returns an integer hash value for a given string. -) bucket_info() - returns basic hash bucket utilization info -) bucket_stats() - returns more hash bucket utilization info -) bucket_array() - which keys are in which buckets in a hash More details on the new hash functions can be found below: Murmur Hash: (v3) from google, see http://code.google.com/p/smhasher/wiki/MurmurHash3 Superfast Hash: From Paul Hsieh. http://www.azillionmonkeys.com/qed/hash.html DJB2: a hash function from Daniel Bernstein http://www.cse.yorku.ca/~oz/hash.html SDBM: a hash function sdbm. http://www.cse.yorku.ca/~oz/hash.html SipHash: by Jean-Philippe Aumasson and Daniel J. Bernstein. https://www.131002.net/siphash/ They have all be converted into Perl's ugly macro format. I have not done any rigorous testing to make sure this conversion is correct. They seem to function as expected however. All of them use the random hash seed. You can force the use of a given function by defining one of PERL_HASH_FUNC_MURMUR PERL_HASH_FUNC_SUPERFAST PERL_HASH_FUNC_DJB2 PERL_HASH_FUNC_SDBM PERL_HASH_FUNC_ONE_AT_A_TIME Setting the environment variable PERL_HASH_SEED_DEBUG to 1 will make perl output the current seed (changed to hex) and the hash function it has been built with. Setting the environment variable PERL_HASH_SEED to a hex value will cause that value to be used at the seed. Any missing bits of the seed will be set to 0. The bits are filled in from left to right, not the traditional right to left so setting it to FE results in a seed value of "FE000000" not "000000FE". Note that we do the hash seed initialization in perl_construct(). Doing it via perl_alloc() (via init_tls) causes problems under threaded builds as the buffers used for reentrant srand48 functions are not allocated. See also the p5p mail "Hash improvements blocker: portable random code that doesnt depend on a functional interpreter", Message-ID: <CANgJU+X+wNayjsNOpKRqYHnEy_+B9UH_2irRA5O3ZmcYGAAZFQ@mail.gmail.com>
Diffstat (limited to 'hv.c')
-rw-r--r--hv.c113
1 files changed, 13 insertions, 100 deletions
diff --git a/hv.c b/hv.c
index ddefd6585e..3069e67bf4 100644
--- a/hv.c
+++ b/hv.c
@@ -613,18 +613,12 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
}
}
- if (HvREHASH(hv) || (!hash && !(keysv && (SvIsCOW_shared_hash(keysv)))))
- PERL_HASH_INTERNAL_(hash, key, klen, HvREHASH(hv));
- else if (!hash)
- hash = SvSHARED_HASH(keysv);
-
- /* We don't have a pointer to the hv, so we have to replicate the
- flag into every HEK, so that hv_iterkeysv can see it.
- And yes, you do need this even though you are not "storing" because
- you can flip the flags below if doing an lval lookup. (And that
- was put in to give the semantics Andreas was expecting.) */
- if (HvREHASH(hv))
- flags |= HVhek_REHASH;
+ if (!hash) {
+ if (keysv && (SvIsCOW_shared_hash(keysv)))
+ hash = SvSHARED_HASH(keysv);
+ else
+ PERL_HASH(hash, key, klen);
+ }
masked_flags = (flags & HVhek_MASK);
@@ -813,7 +807,7 @@ Perl_hv_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
as we repeatedly double the number of buckets on every
entry. Linear search feels a less worse thing to do. */
hsplit(hv);
- } else if(!HvREHASH(hv)) {
+ } else {
U32 n_links = 1;
while ((counter = HeNEXT(counter)))
@@ -978,10 +972,12 @@ S_hv_delete_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
HvHASKFLAGS_on(MUTABLE_SV(hv));
}
- if (HvREHASH(hv) || (!hash && !(keysv && (SvIsCOW_shared_hash(keysv)))))
- PERL_HASH_INTERNAL_(hash, key, klen, HvREHASH(hv));
- else if (!hash)
- hash = SvSHARED_HASH(keysv);
+ if (!hash) {
+ if (keysv && (SvIsCOW_shared_hash(keysv)))
+ hash = SvSHARED_HASH(keysv);
+ else
+ PERL_HASH(hash, key, klen);
+ }
masked_flags = (k_flags & HVhek_MASK);
@@ -1118,8 +1114,6 @@ S_hsplit(pTHX_ HV *hv)
I32 i;
char *a = (char*) HvARRAY(hv);
HE **aep;
- int longest_chain = 0;
- int was_shared;
PERL_ARGS_ASSERT_HSPLIT;
@@ -1166,8 +1160,6 @@ S_hsplit(pTHX_ HV *hv)
aep = (HE**)a;
for (i=0; i<oldsize; i++,aep++) {
- int left_length = 0;
- int right_length = 0;
HE **oentry = aep;
HE *entry = *aep;
HE **bep;
@@ -1180,91 +1172,16 @@ S_hsplit(pTHX_ HV *hv)
*oentry = HeNEXT(entry);
HeNEXT(entry) = *bep;
*bep = entry;
- right_length++;
}
else {
oentry = &HeNEXT(entry);
- left_length++;
}
entry = *oentry;
} while (entry);
/* I think we don't actually need to keep track of the longest length,
merely flag if anything is too long. But for the moment while
developing this code I'll track it. */
- if (left_length > longest_chain)
- longest_chain = left_length;
- if (right_length > longest_chain)
- longest_chain = right_length;
- }
-
-
- /* Pick your policy for "hashing isn't working" here: */
- if (longest_chain <= HV_MAX_LENGTH_BEFORE_SPLIT /* split worked? */
- || HvREHASH(hv)) {
- return;
- }
-
- if (hv == PL_strtab) {
- /* Urg. Someone is doing something nasty to the string table.
- Can't win. */
- return;
- }
-
- /* Awooga. Awooga. Pathological data. */
- /*PerlIO_printf(PerlIO_stderr(), "%p %d of %d with %d/%d buckets\n", (void*)hv,
- longest_chain, HvTOTALKEYS(hv), HvFILL(hv), 1+HvMAX(hv));*/
-
- ++newsize;
- Newxz(a, PERL_HV_ARRAY_ALLOC_BYTES(newsize)
- + (SvOOK(hv) ? sizeof(struct xpvhv_aux) : 0), char);
- if (SvOOK(hv)) {
- Copy(HvAUX(hv), &a[newsize * sizeof(HE*)], 1, struct xpvhv_aux);
- }
-
- was_shared = HvSHAREKEYS(hv);
-
- HvSHAREKEYS_off(hv);
- HvREHASH_on(hv);
-
- aep = HvARRAY(hv);
-
- for (i=0; i<newsize; i++,aep++) {
- HE *entry = *aep;
- while (entry) {
- /* We're going to trash this HE's next pointer when we chain it
- into the new hash below, so store where we go next. */
- HE * const next = HeNEXT(entry);
- UV hash;
- HE **bep;
-
- /* Rehash it */
- PERL_HASH_INTERNAL(hash, HeKEY(entry), HeKLEN(entry));
-
- if (was_shared) {
- /* Unshare it. */
- HEK * const new_hek
- = save_hek_flags(HeKEY(entry), HeKLEN(entry),
- hash, HeKFLAGS(entry));
- unshare_hek (HeKEY_hek(entry));
- HeKEY_hek(entry) = new_hek;
- } else {
- /* Not shared, so simply write the new hash in. */
- HeHASH(entry) = hash;
- }
- /*PerlIO_printf(PerlIO_stderr(), "%d ", HeKFLAGS(entry));*/
- HEK_REHASH_on(HeKEY_hek(entry));
- /*PerlIO_printf(PerlIO_stderr(), "%d\n", HeKFLAGS(entry));*/
-
- /* Copy oentry to the correct new chain. */
- bep = ((HE**)a) + (hash & (I32) xhv->xhv_max);
- HeNEXT(entry) = *bep;
- *bep = entry;
-
- entry = next;
- }
}
- Safefree (HvARRAY(hv));
- HvARRAY(hv) = (HE **)a;
}
void
@@ -1606,7 +1523,6 @@ Perl_hv_clear(pTHX_ HV *hv)
mg_clear(MUTABLE_SV(hv));
HvHASKFLAGS_off(hv);
- HvREHASH_off(hv);
}
if (SvOOK(hv)) {
if(HvENAME_get(hv))
@@ -2478,9 +2394,6 @@ Perl_hv_iternext_flags(pTHX_ HV *hv, I32 flags)
hv_free_ent(hv, oldentry);
}
- /*if (HvREHASH(hv) && entry && !HeKREHASH(entry))
- PerlIO_printf(PerlIO_stderr(), "Awooga %p %p\n", (void*)hv, (void*)entry);*/
-
iter->xhv_eiter = entry; /* HvEITER(hv) = entry */
return entry;
}