diff options
author | Sanjay Ghemawat <sanjay@google.com> | 2012-04-17 08:36:46 -0700 |
---|---|---|
committer | Sanjay Ghemawat <sanjay@google.com> | 2012-04-17 08:36:46 -0700 |
commit | 85584d497e7b354853b72f450683d59fcf6b9c5c (patch) | |
tree | b6aad4f973f87487c3caa5600e7596219c79f645 /util | |
parent | bc1ee4d25e09b04e074db330a41f54ef4af0e31b (diff) | |
download | leveldb-85584d497e7b354853b72f450683d59fcf6b9c5c.tar.gz |
Added bloom filter support.v1.4
In particular, we add a new FilterPolicy class. An instance
of this class can be supplied in Options when opening a
database. If supplied, the instance is used to generate
summaries of keys (e.g., a bloom filter) which are placed in
sstables. These summaries are consulted by DB::Get() so we
can avoid reading sstable blocks that are guaranteed to not
contain the key we are looking for.
This change provides one implementation of FilterPolicy
based on bloom filters.
Other changes:
- Updated version number to 1.4.
- Some build tweaks.
- C binding for CompactRange.
- A few more benchmarks: deleteseq, deleterandom, readmissing, seekrandom.
- Minor .gitignore update.
Diffstat (limited to 'util')
-rw-r--r-- | util/bloom.cc | 95 | ||||
-rw-r--r-- | util/bloom_test.cc | 159 | ||||
-rw-r--r-- | util/filter_policy.cc | 11 | ||||
-rw-r--r-- | util/options.cc | 3 |
4 files changed, 267 insertions, 1 deletions
diff --git a/util/bloom.cc b/util/bloom.cc new file mode 100644 index 0000000..d7941cd --- /dev/null +++ b/util/bloom.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/filter_policy.h" + +#include "leveldb/slice.h" +#include "util/hash.h" + +namespace leveldb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +class BloomFilterPolicy : public FilterPolicy { + private: + size_t bits_per_key_; + size_t k_; + + public: + explicit BloomFilterPolicy(int bits_per_key) + : bits_per_key_(bits_per_key) { + // We intentionally round down to reduce probing cost a little bit + k_ = static_cast<size_t>(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (k_ < 1) k_ = 1; + if (k_ > 30) k_ = 30; + } + + virtual const char* Name() const { + return "leveldb.BuiltinBloomFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast<char>(k_)); // Remember # of probes in filter + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = BloomHash(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = BloomHash(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } +}; +} + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { + return new BloomFilterPolicy(bits_per_key); +} + +} // namespace leveldb diff --git a/util/bloom_test.cc b/util/bloom_test.cc new file mode 100644 index 0000000..4a6ea1b --- /dev/null +++ b/util/bloom_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/filter_policy.h" + +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class BloomTest { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector<std::string> keys_; + + public: + BloomTest() : policy_(NewBloomFilterPolicy(10)) { } + + ~BloomTest() { + delete policy_; + } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector<Slice> key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast<unsigned int>(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.'); + } + } + fprintf(stderr, ")\n"); + } + + bool Matches(const Slice& s) { + if (!keys_.empty()) { + Build(); + } + return policy_->KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(BloomTest, EmptyFilter) { + ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! Matches("world")); +} + +TEST(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! Matches("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast<int>(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/filter_policy.cc b/util/filter_policy.cc new file mode 100644 index 0000000..7b045c8 --- /dev/null +++ b/util/filter_policy.cc @@ -0,0 +1,11 @@ +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/filter_policy.h" + +namespace leveldb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace leveldb diff --git a/util/options.cc b/util/options.cc index bb97838..76af5b9 100644 --- a/util/options.cc +++ b/util/options.cc @@ -21,7 +21,8 @@ Options::Options() block_cache(NULL), block_size(4096), block_restart_interval(16), - compression(kSnappyCompression) { + compression(kSnappyCompression), + filter_policy(NULL) { } |