diff options
author | Yves Orton <demerphq@gmail.com> | 2012-11-17 14:12:04 +0100 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2012-11-17 19:19:43 +0100 |
commit | 7dc8663964c66a698d31bbdc8e8abed69bddeec3 (patch) | |
tree | 92f39aa5770c2f401bbe134db921d68cbe4f758a /ext/Hash-Util-FieldHash/t | |
parent | 867b16b50f413adfdb2addcab33ad1d6e61da9d5 (diff) | |
download | perl-7dc8663964c66a698d31bbdc8e8abed69bddeec3.tar.gz |
Hash Function Change - Murmur hash and true per process hash seed
This patch does the following:
*) Introduces multiple new hash functions to choose from at build
time. This includes Murmur-32, SDBM, DJB2, SipHash, SuperFast, and
One-at-a-time. Currently this is handled by muning hv.h. Configure
support hopefully to follow.
*) Changes the default hash to Murmur hash which is faster than the
old default One-at-a-time.
*) Rips out the old HvREHASH mechanism and replaces it with a
per-process random hash seed.
*) Changes the old PL_hash_seed from an interpreter value to a
global variable. This means it does not have to be copied during
interpreter setup or cloning.
*) Changes the format of the PERL_HASH_SEED variable to a hex
string so that hash seeds longer than fit in an integer are possible.
*) Changes the return of Hash::Util::hash_seed() from a number to a
string. This is to accomodate hash functions which have more bits than
can be fit in an integer.
*) Adds new functions to Hash::Util to improve introspection of hashes
-) hash_value() - returns an integer hash value for a given string.
-) bucket_info() - returns basic hash bucket utilization info
-) bucket_stats() - returns more hash bucket utilization info
-) bucket_array() - which keys are in which buckets in a hash
More details on the new hash functions can be found below:
Murmur Hash: (v3) from google, see
http://code.google.com/p/smhasher/wiki/MurmurHash3
Superfast Hash: From Paul Hsieh.
http://www.azillionmonkeys.com/qed/hash.html
DJB2: a hash function from Daniel Bernstein
http://www.cse.yorku.ca/~oz/hash.html
SDBM: a hash function sdbm.
http://www.cse.yorku.ca/~oz/hash.html
SipHash: by Jean-Philippe Aumasson and Daniel J. Bernstein.
https://www.131002.net/siphash/
They have all be converted into Perl's ugly macro format.
I have not done any rigorous testing to make sure this conversion
is correct. They seem to function as expected however.
All of them use the random hash seed.
You can force the use of a given function by defining one of
PERL_HASH_FUNC_MURMUR
PERL_HASH_FUNC_SUPERFAST
PERL_HASH_FUNC_DJB2
PERL_HASH_FUNC_SDBM
PERL_HASH_FUNC_ONE_AT_A_TIME
Setting the environment variable PERL_HASH_SEED_DEBUG to 1 will make
perl output the current seed (changed to hex) and the hash function
it has been built with.
Setting the environment variable PERL_HASH_SEED to a hex value will
cause that value to be used at the seed. Any missing bits of the seed
will be set to 0. The bits are filled in from left to right, not
the traditional right to left so setting it to FE results in a seed
value of "FE000000" not "000000FE".
Note that we do the hash seed initialization in perl_construct().
Doing it via perl_alloc() (via init_tls) causes problems under
threaded builds as the buffers used for reentrant srand48 functions
are not allocated. See also the p5p mail "Hash improvements blocker:
portable random code that doesnt depend on a functional interpreter",
Message-ID:
<CANgJU+X+wNayjsNOpKRqYHnEy_+B9UH_2irRA5O3ZmcYGAAZFQ@mail.gmail.com>
Diffstat (limited to 'ext/Hash-Util-FieldHash/t')
-rw-r--r-- | ext/Hash-Util-FieldHash/t/10_hash.t | 110 |
1 files changed, 0 insertions, 110 deletions
diff --git a/ext/Hash-Util-FieldHash/t/10_hash.t b/ext/Hash-Util-FieldHash/t/10_hash.t deleted file mode 100644 index 2cfb4e81fa..0000000000 --- a/ext/Hash-Util-FieldHash/t/10_hash.t +++ /dev/null @@ -1,110 +0,0 @@ -#!./perl -w -use Test::More; - -use strict; -use Hash::Util::FieldHash qw( :all); - -no warnings 'misc'; - -plan tests => 5; - -fieldhash my %h; - -ok (!Internals::HvREHASH(%h), "hash doesn't start with rehash flag on"); - -foreach (1..10) { - $h{"\0"x$_}++; -} - -ok (!Internals::HvREHASH(%h), "10 entries doesn't trigger rehash"); - -foreach (11..20) { - $h{"\0"x$_}++; -} - -ok (Internals::HvREHASH(%h), "20 entries triggers rehash"); - - - - -# second part using an emulation of the PERL_HASH in perl, mounting an -# attack on a pre-populated hash. This is also useful if you need normal -# keys which don't contain \0 -- suitable for stashes - -use constant MASK_U32 => 2**32; -use constant HASH_SEED => 0; -use constant THRESHOLD => 14; -use constant START => "a"; - -# some initial hash data -fieldhash my %h2; -%h2 = map {$_ => 1} 'a'..'cc'; - -ok (!Internals::HvREHASH(%h2), - "starting with pre-populated non-pathological hash (rehash flag if off)"); - -my @keys = get_keys(\%h2); -$h2{$_}++ for @keys; -ok (Internals::HvREHASH(%h2), - scalar(@keys) . " colliding into the same bucket keys are triggering rehash"); - -sub get_keys { - my $hr = shift; - - # the minimum of bits required to mount the attack on a hash - my $min_bits = log(THRESHOLD)/log(2); - - # if the hash has already been populated with a significant amount - # of entries the number of mask bits can be higher - my $keys = scalar keys %$hr; - my $bits = $keys ? log($keys)/log(2) : 0; - $bits = $min_bits if $min_bits > $bits; - - $bits = int($bits) < $bits ? int($bits) + 1 : int($bits); - # need to add 2 bits to cover the internal split cases - $bits += 2; - my $mask = 2**$bits-1; - print "# using mask: $mask ($bits)\n"; - - my @keys; - my $s = START; - my $c = 0; - # get 2 keys on top of the THRESHOLD - my $hash; - while (@keys < THRESHOLD+2) { - # next if exists $hash->{$s}; - $hash = hash($s); - next unless ($hash & $mask) == 0; - $c++; - printf "# %2d: %5s, %10s\n", $c, $s, $hash; - push @keys, $s; - } continue { - $s++; - } - - return @keys; -} - - -# trying to provide the fastest equivalent of C macro's PERL_HASH in -# Perl - the main complication is that it uses U32 integer, which we -# can't do it perl, without doing some tricks -sub hash { - my $s = shift; - my @c = split //, $s; - my $u = HASH_SEED; - for (@c) { - # (A % M) + (B % M) == (A + B) % M - # This works because '+' produces a NV, which is big enough to hold - # the intermediate result. We only need the % before any "^" and "&" - # to get the result in the range for an I32. - # and << doesn't work on NV, so using 1 << 10 - $u += ord; - $u += $u * (1 << 10); $u %= MASK_U32; - $u ^= $u >> 6; - } - $u += $u << 3; $u %= MASK_U32; - $u ^= $u >> 11; $u %= MASK_U32; - $u += $u << 15; $u %= MASK_U32; - $u; -} |