diff options
author | Yves Orton <demerphq@gmail.com> | 2012-12-11 23:46:37 +0100 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2013-03-19 00:23:10 +0100 |
commit | a740dcb9a42f0314c6f5dc7e1df1f8f8370a8690 (patch) | |
tree | 77392f48cd1c9b92587eaf9938df45aa6362ab9d | |
parent | 2d4cc5ff33ebe4b3884fbc2b801d833c145cd946 (diff) | |
download | perl-a740dcb9a42f0314c6f5dc7e1df1f8f8370a8690.tar.gz |
add a "hash quality score" to Hash::Util::bucket_stats()
-rw-r--r-- | ext/Hash-Util/lib/Hash/Util.pm | 35 |
1 files changed, 28 insertions, 7 deletions
diff --git a/ext/Hash-Util/lib/Hash/Util.pm b/ext/Hash-Util/lib/Hash/Util.pm index 6dcb6ccaef..050f9263b0 100644 --- a/ext/Hash-Util/lib/Hash/Util.pm +++ b/ext/Hash-Util/lib/Hash/Util.pm @@ -32,7 +32,7 @@ our @EXPORT_OK = qw( bucket_stats bucket_info bucket_array lock_hash_recurse unlock_hash_recurse ); -our $VERSION = '0.14'; +our $VERSION = '0.15'; require XSLoader; XSLoader::load(); @@ -525,15 +525,29 @@ Fields are as follows: 0: Number of keys in the hash 1: Number of buckets in the hash 2: Number of used buckets in the hash - 3: Percent of buckets used - 4: Percent of keys which are in collision - 5: Average bucket length - 6: Standard Deviation of bucket lengths. + 3: Hash Quality Score + 4: Percent of buckets used + 5: Percent of keys which are in collision + 6: Average bucket length + 7: Standard Deviation of bucket lengths. rest : list of counts, Kth element is the number of buckets with K keys in it. See also bucket_info() and bucket_array(). +Note that Hash Quality Score would be 1 for an ideal hash, numbers +close to and below 1 indicate good hashing, and number significantly +above indicate a poor score. In practice it should be around 0.95 to 1.05. +It is defined as: + + $score= sum( $count[$length] * ($length * ($length + 1) / 2) ) + / + ( ( $keys / 2 * $buckets ) * + ( $keys + ( 2 * $buckets ) - 1 ) ) + +The formula is from the Red Dragon book (reformulated to use the data available) +and is documented at L<http://www.strchr.com/hash_functions> + =item B<bucket_array> my $array= bucket_array(\%hash); @@ -558,13 +572,20 @@ sub bucket_stats { my ($hash) = @_; my ($keys, $buckets, $used, @length_counts) = bucket_info($hash); my $sum; - $sum += ($length_counts[$_] * $_) for 0 .. $#length_counts; + my $score; + for (0 .. $#length_counts) { + $sum += ($length_counts[$_] * $_); + $score += $length_counts[$_] * ( $_ * ($_ + 1 ) / 2 ); + } + $score = $score / + (( $keys / (2 * $buckets )) * ( $keys + ( 2 * $buckets ) - 1 )) + if $keys; my $mean= $sum/$buckets; $sum= 0; $sum += ($length_counts[$_] * (($_-$mean)**2)) for 0 .. $#length_counts; my $stddev= sqrt($sum/$buckets); - return $keys, $buckets, $used, $keys ? ($used/$buckets, ($keys-$used)/$keys, $mean, $stddev, @length_counts) : (); + return $keys, $buckets, $used, $keys ? ($score, $used/$buckets, ($keys-$used)/$keys, $mean, $stddev, @length_counts) : (); } =item B<hv_store> |