diff options
author | Loic Dachary <loic@dachary.org> | 2013-10-03 12:13:41 -0700 |
---|---|---|
committer | Loic Dachary <loic@dachary.org> | 2013-10-03 12:13:41 -0700 |
commit | 739696ef0a8c9cd38de73ff78f06d6e25a71b890 (patch) | |
tree | 2b184e4c76eef9a7a63b68e66cab902b3af82ce5 | |
parent | d3ba8da597384516c85a9af928971a585843aeb4 (diff) | |
parent | 8cfeb8342a08774fb1030830859c3fc30514f0b5 (diff) | |
download | ceph-739696ef0a8c9cd38de73ff78f06d6e25a71b890.tar.gz |
Merge pull request #638 from ceph/wip-bloom
bloom filter cleanups, encodability, and unit tests
-rw-r--r-- | COPYING | 4 | ||||
-rw-r--r-- | debian/copyright | 4 | ||||
-rw-r--r-- | src/common/Makefile.am | 4 | ||||
-rw-r--r-- | src/common/bloom_filter.cc | 76 | ||||
-rw-r--r-- | src/common/bloom_filter.hpp | 627 | ||||
-rw-r--r-- | src/include/Makefile.am | 1 | ||||
-rw-r--r-- | src/include/bloom_filter.hpp | 544 | ||||
-rw-r--r-- | src/mds/CDir.cc | 2 | ||||
-rw-r--r-- | src/test/Makefile.am | 5 | ||||
-rw-r--r-- | src/test/common/test_bloom_filter.cc | 222 | ||||
-rw-r--r-- | src/test/encoding/types.h | 3 |
11 files changed, 945 insertions, 547 deletions
@@ -23,6 +23,10 @@ Files: src/include/ceph_hash.cc Copyright: None License: Public domain +Files: src/common/bloom_filter.hpp +Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net> +License: Boost Software License, Version 1.0 + Files: m4/acx_pthread.m4 Copyright: Steven G. Johnson <stevenj@alum.mit.edu> License: GPLWithACException diff --git a/debian/copyright b/debian/copyright index f18b45ceec0..d3906c44d35 100644 --- a/debian/copyright +++ b/debian/copyright @@ -23,6 +23,10 @@ Files: src/include/ceph_hash.cc Copyright: None License: Public domain +Files: src/common/bloom_filter.hpp +Copyright: Copyright (C) 2000 Arash Partow +License: Boost Software License, Version 1.0 + Files: m4/acx_pthread.m4 Copyright: Steven G. Johnson <stevenj@alum.mit.edu> License: GPLWithACException diff --git a/src/common/Makefile.am b/src/common/Makefile.am index deddc5d831c..9ec6c3e895b 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -65,7 +65,8 @@ libcommon_la_SOURCES = \ common/ceph_strings.cc \ common/ceph_frag.cc \ common/addr_parsing.c \ - common/hobject.cc + common/hobject.cc \ + common/bloom_filter.cc if LINUX libcommon_la_SOURCES += common/secret.c @@ -97,6 +98,7 @@ LIBCOMMON_DEPS += libcommon_crc.la noinst_LTLIBRARIES += libcommon_crc.la noinst_HEADERS += \ + common/bloom_filter.hpp \ common/sctp_crc32.h \ common/crc32c_intel_baseline.h \ common/crc32c_intel_fast.h diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc new file mode 100644 index 00000000000..f602b80149e --- /dev/null +++ b/src/common/bloom_filter.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "common/bloom_filter.hpp" + +void bloom_filter::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode((uint64_t)salt_count_, bl); + ::encode((uint64_t)table_size_, bl); + ::encode((uint64_t)inserted_element_count_, bl); + ::encode((uint64_t)random_seed_, bl); + bufferptr bp((const char*)bit_table_, raw_table_size_); + ::encode(bp, bl); + ENCODE_FINISH(bl); +} + +void bloom_filter::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + uint64_t v; + ::decode(v, p); + salt_count_ = v; + ::decode(v, p); + table_size_ = v; + ::decode(v, p); + inserted_element_count_ = v; + ::decode(v, p); + random_seed_ = v; + bufferlist t; + ::decode(t, p); + + salt_.clear(); + generate_unique_salt(); + raw_table_size_ = t.length(); + assert(raw_table_size_ == table_size_ / bits_per_char); + delete bit_table_; + bit_table_ = new cell_type[raw_table_size_]; + t.copy(0, raw_table_size_, (char *)bit_table_); + + DECODE_FINISH(p); +} + +void bloom_filter::dump(Formatter *f) const +{ + f->dump_unsigned("salt_count", salt_count_); + f->dump_unsigned("table_size", table_size_); + f->dump_unsigned("raw_table_size", raw_table_size_); + f->dump_unsigned("insert_count", inserted_element_count_); + f->dump_unsigned("random_seed", random_seed_); + + f->open_array_section("salt_table"); + for (std::vector<bloom_type>::const_iterator i = salt_.begin(); i != salt_.end(); ++i) + f->dump_unsigned("salt", *i); + f->close_section(); + + f->open_array_section("bit_table"); + for (unsigned i = 0; i < raw_table_size_; ++i) + f->dump_unsigned("byte", (unsigned)bit_table_[i]); + f->close_section(); +} + +void bloom_filter::generate_test_instances(list<bloom_filter*>& ls) +{ + ls.push_back(new bloom_filter(10, .5, 1)); + ls.push_back(new bloom_filter(10, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.push_back(new bloom_filter(50, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.back()->insert("baz"); + ls.back()->insert("boof"); + ls.back()->insert("boogggg"); +} diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp new file mode 100644 index 00000000000..6216c7fb34d --- /dev/null +++ b/src/common/bloom_filter.hpp @@ -0,0 +1,627 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + ******************************************************************* + * * + * Open Bloom Filter * + * * + * Author: Arash Partow - 2000 * + * URL: http://www.partow.net/programming/hashfunctions/index.html * + * * + * Copyright notice: * + * Free use of the Open Bloom Filter Library is permitted under * + * the guidelines and in accordance with the most current version * + * of the Boost Software License, Version 1.0 * + * http://www.opensource.org/licenses/bsl1.0.html * + * * + ******************************************************************* +*/ + + +#ifndef COMMON_BLOOM_FILTER_HPP +#define COMMON_BLOOM_FILTER_HPP + +#include <cstddef> +#include <algorithm> +#include <cmath> +#include <limits> +#include <list> +#include <string> +#include <vector> + +#include "include/encoding.h" +#include "common/Formatter.h" + +static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) +static const unsigned char bit_mask[bits_per_char] = { + 0x01, //00000001 + 0x02, //00000010 + 0x04, //00000100 + 0x08, //00001000 + 0x10, //00010000 + 0x20, //00100000 + 0x40, //01000000 + 0x80 //10000000 +}; + + +class bloom_filter +{ +protected: + + typedef unsigned int bloom_type; + typedef unsigned char cell_type; + +public: + + bloom_filter() + : bit_table_(0), + salt_count_(0), + table_size_(0), + raw_table_size_(0), + inserted_element_count_(0), + random_seed_(0) + {} + + bloom_filter(const std::size_t& predicted_inserted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bit_table_(0), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) + { + find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, + &salt_count_, &table_size_); + init(); + } + + bloom_filter(const std::size_t& salt_count, std::size_t table_size, + const std::size_t& random_seed) + : bit_table_(0), + salt_count_(salt_count), + table_size_(table_size), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) + { + init(); + } + + void init() { + generate_unique_salt(); + raw_table_size_ = table_size_ / bits_per_char; + bit_table_ = new cell_type[raw_table_size_]; + std::fill_n(bit_table_,raw_table_size_,0x00); + } + + bloom_filter(const bloom_filter& filter) + { + this->operator=(filter); + } + + bloom_filter& operator = (const bloom_filter& filter) + { + if (this != &filter) { + salt_count_ = filter.salt_count_; + table_size_ = filter.table_size_; + raw_table_size_ = filter.raw_table_size_; + inserted_element_count_ = filter.inserted_element_count_; + random_seed_ = filter.random_seed_; + delete[] bit_table_; + bit_table_ = new cell_type[raw_table_size_]; + std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + salt_ = filter.salt_; + } + return *this; + } + + virtual ~bloom_filter() + { + delete[] bit_table_; + } + + inline bool operator!() const + { + return (0 == table_size_); + } + + inline void clear() + { + std::fill_n(bit_table_,raw_table_size_,0x00); + inserted_element_count_ = 0; + } + + /** + * insert a u32 into the set + * + * NOTE: the internal hash is weak enough that consecutive inputs do + * not achieve the desired fpp. Well-mixed values should be used + * here (e.g., put rjhash(x) into the filter instead of just x). + * + * @param val integer value to insert + */ + inline void insert(uint32_t val) { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(val,salt_[i]),bit_index,bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + } + ++inserted_element_count_; + } + + inline void insert(const unsigned char* key_begin, const std::size_t& length) + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + } + ++inserted_element_count_; + } + + template<typename T> + inline void insert(const T& t) + { + // Note: T must be a C++ POD type. + insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T)); + } + + inline void insert(const std::string& key) + { + insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); + } + + inline void insert(const char* data, const std::size_t& length) + { + insert(reinterpret_cast<const unsigned char*>(data),length); + } + + template<typename InputIterator> + inline void insert(const InputIterator begin, const InputIterator end) + { + InputIterator itr = begin; + while (end != itr) + { + insert(*(itr++)); + } + } + + /** + * check if a u32 is contained by set + * + * NOTE: the internal hash is weak enough that consecutive inputs do + * not achieve the desired fpp. Well-mixed values should be used + * here (e.g., put rjhash(x) into the filter instead of just x). + * + * @param val integer value to query + * @returns true if value is (probably) in the set, false if it definitely is not + */ + inline virtual bool contains(uint32_t val) const + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(val,salt_[i]),bit_index,bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + { + return false; + } + } + return true; + } + + inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + { + return false; + } + } + return true; + } + + template<typename T> + inline bool contains(const T& t) const + { + return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T))); + } + + inline bool contains(const std::string& key) const + { + return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); + } + + inline bool contains(const char* data, const std::size_t& length) const + { + return contains(reinterpret_cast<const unsigned char*>(data),length); + } + + template<typename InputIterator> + inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (!contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + template<typename InputIterator> + inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + inline virtual std::size_t size() const + { + return table_size_; + } + + inline std::size_t element_count() const + { + return inserted_element_count_; + } + + inline double effective_fpp() const + { + /* + Note: + The effective false positive probability is calculated using the + designated table size and hash function count in conjunction with + the current number of inserted elements - not the user defined + predicated/expected number of inserted elements. + */ + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + } + + inline bloom_filter& operator &= (const bloom_filter& filter) + { + /* intersection */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] &= filter.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator |= (const bloom_filter& filter) + { + /* union */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] |= filter.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator ^= (const bloom_filter& filter) + { + /* difference */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] ^= filter.bit_table_[i]; + } + } + return *this; + } + + inline const cell_type* table() const + { + return bit_table_; + } + +protected: + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash % table_size_; + bit = bit_index % bits_per_char; + } + + void generate_unique_salt() + { + /* + Note: + A distinct hash function need not be implementation-wise + distinct. In the current implementation "seeding" a common + hash function with different values seems to be adequate. + */ + const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = { + 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, + 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, + 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, + 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, + 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, + 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, + 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, + 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, + 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, + 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, + 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, + 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, + 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, + 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, + 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, + 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, + 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, + 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, + 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, + 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, + 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, + 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, + 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, + 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, + 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, + 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, + 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, + 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, + 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, + 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, + 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, + 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 + }; + + if (salt_count_ <= predef_salt_count) + { + std::copy(predef_salt, + predef_salt + salt_count_, + std::back_inserter(salt_)); + for (unsigned int i = 0; i < salt_.size(); ++i) + { + /* + Note: + This is done to integrate the user defined random seed, + so as to allow for the generation of unique bloom filter + instances. + */ + salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; + } + } + else + { + std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + srand(static_cast<unsigned int>(random_seed_)); + while (salt_.size() < salt_count_) + { + bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand()); + if (0 == current_salt) + continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) + { + salt_.push_back(current_salt); + } + } + } + } + + static void find_optimal_parameters(std::size_t target_insert_count, + double target_fpp, + std::size_t *salt_count, + std::size_t *table_size) + { + /* + Note: + The following will attempt to find the number of hash functions + and minimum amount of storage bits required to construct a bloom + filter consistent with the user defined false positive probability + and estimated element insertion count. + */ + + double min_m = std::numeric_limits<double>::infinity(); + double min_k = 0.0; + double curr_m = 0.0; + double k = 1.0; + while (k < 1000.0) + { + double numerator = (- k * target_insert_count); + double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); + curr_m = numerator / denominator; + + if (curr_m < min_m) + { + min_m = curr_m; + min_k = k; + } + k += 1.0; + } + + *salt_count = static_cast<std::size_t>(min_k); + size_t t = static_cast<std::size_t>(min_m); + t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0); + *table_size = t; + } + + inline bloom_type hash_ap(uint32_t val, bloom_type hash) const + { + hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3); + hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5)))); + hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3); + hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5)))); + return hash; + } + + inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const + { + const unsigned char* itr = begin; + + while (remaining_length >= 4) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 4; + } + + while (remaining_length >= 2) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 2; + } + + if (remaining_length) + { + hash ^= (hash << 7) ^ (*itr) * (hash >> 3); + } + + return hash; + } + + std::vector<bloom_type> salt_; + unsigned char* bit_table_; + std::size_t salt_count_; + std::size_t table_size_; + std::size_t raw_table_size_; + std::size_t inserted_element_count_; + std::size_t random_seed_; + +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<bloom_filter*>& ls); +}; +WRITE_CLASS_ENCODER(bloom_filter) + +inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result &= b; + return result; +} + +inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result |= b; + return result; +} + +inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result ^= b; + return result; +} + + +class compressible_bloom_filter : public bloom_filter +{ +public: + + compressible_bloom_filter(const std::size_t& predicted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + { + size_list.push_back(table_size_); + } + + inline virtual std::size_t size() const + { + return size_list.back(); + } + + inline bool compress(const double& percentage) + { + if ((0.0 >= percentage) || (percentage >= 100.0)) + { + return false; + } + + std::size_t original_table_size = size_list.back(); + std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); + new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); + + if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + { + return false; + } + + cell_type* tmp = new cell_type[new_table_size / bits_per_char]; + std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); + cell_type* itr = bit_table_ + (new_table_size / bits_per_char); + cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* itr_tmp = tmp; + + while (end != itr) + { + *(itr_tmp++) |= (*itr++); + } + + delete[] bit_table_; + bit_table_ = tmp; + size_list.push_back(new_table_size); + + return true; + } + +private: + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash; + for (std::size_t i = 0; i < size_list.size(); ++i) + { + bit_index %= size_list[i]; + } + bit = bit_index % bits_per_char; + } + + std::vector<std::size_t> size_list; +}; + +#endif + + +/* + Note 1: + If it can be guaranteed that bits_per_char will be of the form 2^n then + the following optimization can be used: + + hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; + + Note 2: + For performance reasons where possible when allocating memory it should + be aligned (aligned_alloc) according to the architecture being used. +*/ diff --git a/src/include/Makefile.am b/src/include/Makefile.am index d702ebd2795..2d98e777f00 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -18,7 +18,6 @@ rados_include_DATA = \ $(srcdir)/include/crc32c.h noinst_HEADERS += \ - include/bloom_filter.hpp \ include/Context.h \ include/CompatSet.h \ include/Distribution.h \ diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp deleted file mode 100644 index 41aba4bad47..00000000000 --- a/src/include/bloom_filter.hpp +++ /dev/null @@ -1,544 +0,0 @@ -/* - ******************************************************************* - * * - * Open Bloom Filter * - * * - * Author: Arash Partow - 2000 * - * URL: http://www.partow.net/programming/hashfunctions/index.html * - * * - * Copyright notice: * - * Free use of the Open Bloom Filter Library is permitted under * - * the guidelines and in accordance with the most current version * - * of the Boost Software License, Version 1.0 * - * http://www.opensource.org/licenses/bsl1.0.html * - * * - ******************************************************************* -*/ - - -#ifndef INCLUDE_BLOOM_FILTER_HPP -#define INCLUDE_BLOOM_FILTER_HPP - -#include <cstddef> -#include <algorithm> -#include <cmath> -#include <limits> -#include <string> -#include <vector> - - -static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) -static const unsigned char bit_mask[bits_per_char] = { - 0x01, //00000001 - 0x02, //00000010 - 0x04, //00000100 - 0x08, //00001000 - 0x10, //00010000 - 0x20, //00100000 - 0x40, //01000000 - 0x80 //10000000 - }; - - -class bloom_filter -{ -protected: - - typedef unsigned int bloom_type; - typedef unsigned char cell_type; - -public: - - bloom_filter(const std::size_t& predicted_inserted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bit_table_(0), - predicted_inserted_element_count_(predicted_inserted_element_count), - inserted_element_count_(0), - random_seed_((random_seed) ? random_seed : 0xA5A5A5A5), - desired_false_positive_probability_(false_positive_probability) - { - find_optimal_parameters(); - generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); - } - - bloom_filter(const bloom_filter& filter) - { - this->operator=(filter); - } - - bloom_filter& operator = (const bloom_filter& filter) - { - if (this != &filter) { - salt_count_ = filter.salt_count_; - table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - predicted_inserted_element_count_ = filter.predicted_inserted_element_count_; - inserted_element_count_ = filter.inserted_element_count_; - random_seed_ = filter.random_seed_; - desired_false_positive_probability_ = filter.desired_false_positive_probability_; - delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); - salt_ = filter.salt_; - } - return *this; - } - - virtual ~bloom_filter() - { - delete[] bit_table_; - } - - inline bool operator!() const - { - return (0 == table_size_); - } - - inline void clear() - { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; - } - - inline void insert(const unsigned char* key_begin, const std::size_t& length) - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) - { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; - } - ++inserted_element_count_; - } - - template<typename T> - inline void insert(const T& t) - { - // Note: T must be a C++ POD type. - insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T)); - } - - inline void insert(const std::string& key) - { - insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); - } - - inline void insert(const char* data, const std::size_t& length) - { - insert(reinterpret_cast<const unsigned char*>(data),length); - } - - template<typename InputIterator> - inline void insert(const InputIterator begin, const InputIterator end) - { - InputIterator itr = begin; - while (end != itr) - { - insert(*(itr++)); - } - } - - inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) - { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) - { - return false; - } - } - return true; - } - - template<typename T> - inline bool contains(const T& t) const - { - return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T))); - } - - inline bool contains(const std::string& key) const - { - return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size()); - } - - inline bool contains(const char* data, const std::size_t& length) const - { - return contains(reinterpret_cast<const unsigned char*>(data),length); - } - - template<typename InputIterator> - inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) - { - if (!contains(*itr)) - { - return itr; - } - ++itr; - } - return end; - } - - template<typename InputIterator> - inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) - { - if (contains(*itr)) - { - return itr; - } - ++itr; - } - return end; - } - - inline virtual std::size_t size() const - { - return table_size_; - } - - inline std::size_t element_count() const - { - return inserted_element_count_; - } - - inline double effective_fpp() const - { - /* - Note: - The effective false positive probability is calculated using the - designated table size and hash function count in conjunction with - the current number of inserted elements - not the user defined - predicated/expected number of inserted elements. - */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); - } - - inline bloom_filter& operator &= (const bloom_filter& filter) - { - /* intersection */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] &= filter.bit_table_[i]; - } - } - return *this; - } - - inline bloom_filter& operator |= (const bloom_filter& filter) - { - /* union */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] |= filter.bit_table_[i]; - } - } - return *this; - } - - inline bloom_filter& operator ^= (const bloom_filter& filter) - { - /* difference */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] ^= filter.bit_table_[i]; - } - } - return *this; - } - - inline const cell_type* table() const - { - return bit_table_; - } - -protected: - - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; - } - - void generate_unique_salt() - { - /* - Note: - A distinct hash function need not be implementation-wise - distinct. In the current implementation "seeding" a common - hash function with different values seems to be adequate. - */ - const unsigned int predef_salt_count = 128; - static const bloom_type predef_salt[predef_salt_count] = - { - 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, - 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, - 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, - 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, - 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, - 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, - 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, - 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, - 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, - 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, - 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, - 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, - 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, - 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, - 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, - 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, - 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, - 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, - 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, - 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, - 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, - 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, - 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, - 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, - 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, - 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, - 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, - 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, - 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, - 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, - 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, - 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 - }; - - if (salt_count_ <= predef_salt_count) - { - std::copy(predef_salt, - predef_salt + salt_count_, - std::back_inserter(salt_)); - for (unsigned int i = 0; i < salt_.size(); ++i) - { - /* - Note: - This is done to integrate the user defined random seed, - so as to allow for the generation of unique bloom filter - instances. - */ - salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; - } - } - else - { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); - srand(static_cast<unsigned int>(random_seed_)); - while (salt_.size() < salt_count_) - { - bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand()); - if (0 == current_salt) continue; - if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) - { - salt_.push_back(current_salt); - } - } - } - } - - void find_optimal_parameters() - { - /* - Note: - The following will attempt to find the number of hash functions - and minimum amount of storage bits required to construct a bloom - filter consistent with the user defined false positive probability - and estimated element insertion count. - */ - - double min_m = std::numeric_limits<double>::infinity(); - double min_k = 0.0; - double curr_m = 0.0; - double k = 1.0; - while (k < 1000.0) - { - double numerator = (- k * predicted_inserted_element_count_); - double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k)); - curr_m = numerator / denominator; - - if (curr_m < min_m) - { - min_m = curr_m; - min_k = k; - } - k += 1.0; - } - - salt_count_ = static_cast<std::size_t>(min_k); - table_size_ = static_cast<std::size_t>(min_m); - table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0); - } - - inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const - { - const unsigned char* itr = begin; - - while (remaining_length >= 4) - { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 4; - } - - while (remaining_length >= 2) - { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 2; - } - - if (remaining_length) - { - hash ^= (hash << 7) ^ (*itr) * (hash >> 3); - } - - return hash; - } - - std::vector<bloom_type> salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t predicted_inserted_element_count_; - std::size_t inserted_element_count_; - std::size_t random_seed_; - double desired_false_positive_probability_; -}; - -inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result &= b; - return result; -} - -inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result |= b; - return result; -} - -inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result ^= b; - return result; -} - - -class compressible_bloom_filter : public bloom_filter -{ -public: - - compressible_bloom_filter(const std::size_t& predicted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) - { - size_list.push_back(table_size_); - } - - inline virtual std::size_t size() const - { - return size_list.back(); - } - - inline bool compress(const double& percentage) - { - if ((0.0 >= percentage) || (percentage >= 100.0)) - { - return false; - } - - std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); - - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) - { - return false; - } - - desired_false_positive_probability_ = effective_fpp(); - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); - cell_type* itr_tmp = tmp; - - while (end != itr) - { - *(itr_tmp++) |= (*itr++); - } - - delete[] bit_table_; - bit_table_ = tmp; - size_list.push_back(new_table_size); - - return true; - } - -private: - - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash; - for (std::size_t i = 0; i < size_list.size(); ++i) - { - bit_index %= size_list[i]; - } - bit = bit_index % bits_per_char; - } - - std::vector<std::size_t> size_list; -}; - -#endif - - -/* - Note 1: - If it can be guaranteed that bits_per_char will be of the form 2^n then - the following optimization can be used: - - hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; - - Note 2: - For performance reasons where possible when allocating memory it should - be aligned (aligned_alloc) according to the architecture being used. -*/ diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index c77ca180a6f..4a5e636d9a6 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -27,7 +27,7 @@ #include "MDLog.h" #include "LogSegment.h" -#include "include/bloom_filter.hpp" +#include "common/bloom_filter.hpp" #include "include/Context.h" #include "common/Clock.h" diff --git a/src/test/Makefile.am b/src/test/Makefile.am index debe18ff907..59b4d89e930 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -258,6 +258,11 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS) unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) check_PROGRAMS += unittest_addrs +unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc +unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS) +unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) +check_PROGRAMS += unittest_bloom_filter + unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS) unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc new file mode 100644 index 00000000000..8e3661b2cc1 --- /dev/null +++ b/src/test/common/test_bloom_filter.cc @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank <info@inktank.com> + * + * LGPL2.1 (see COPYING-LGPL2.1) or later + */ + +#include <iostream> +#include <gtest/gtest.h> + +#include "include/stringify.h" +#include "common/bloom_filter.hpp" + +TEST(BloomFilter, Basic) { + bloom_filter bf(10, .1, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); +} + +TEST(BloomFilter, Sweep) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + for (int n = 0; n < max; n++) + bf.insert("ok" + stringify(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains("asdf" + stringify(n))) + hit++; + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + + } + } +} + +TEST(BloomFilter, SweepInt) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + for (int n = 0; n < max; n++) + bf.insert(n); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + ASSERT_TRUE(actual > fpp / 10); + } + } +} + + +TEST(BloomFilter, BinSweep) { + int total_max = 16384; + float total_fpp = .01; + std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; + for (int bins = 1; bins < 16; ++bins) { + int max = total_max / bins; + float fpp = total_fpp / bins;//pow(total_fpp, bins); + + std::vector<bloom_filter*> ls; + bufferlist bl; + for (int i=0; i<bins; i++) { + ls.push_back(new bloom_filter(max, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert(10000 * (i+1) + j); + } + ::encode(*ls.front(), bl); + } + + int hit = 0; + int test = max * 100; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!! + hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp + << " actual-fpp " << actual + << " total-size " << bl.length() << std::endl; + } +} + +// disable these tests; doing dual insertions in consecutive filters +// appears to be equivalent to doing a single insertion in a bloom +// filter that is twice as big. +#if 0 + +// test the fpp over a sequence of bloom filters, each with unique +// items inserted into it. +// +// we expect: actual_fpp = num_filters * per_filter_fpp +TEST(BloomFilter, Sequence) { + + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector<bloom_filter*> ls; + for (int i=0; i<seq; i++) { + ls.push_back(new bloom_filter(max*2, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl; + } +} + +// test the ffp over a sequence of bloom filters, where actual values +// are always inserted into a consecutive pair of filters. in order +// to have a false positive, we need to falsely match two consecutive +// filters. +// +// we expect: actual_fpp = num_filters * per_filter_fpp^2 +TEST(BloomFilter, SequenceDouble) { + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector<bloom_filter*> ls; + for (int i=0; i<seq; i++) { + ls.push_back(new bloom_filter(max*2, fpp, i)); + for (int j=0; j<max; j++) { + ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + int run = 0; + for (int i=0; i<test; ++i) { + for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + run++; + if (run >= 2) { + hit++; + break; + } + } else { + run = 0; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual + << " expected " << (fpp*fpp*(double)seq) << std::endl; + } +} + +#endif diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 6dd180bc198..59e55a11b23 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -4,6 +4,9 @@ TYPE(CompatSet) #include "include/filepath.h" TYPE(filepath) +#include "common/bloom_filter.hpp" +TYPE(bloom_filter) + #include "common/snap_types.h" TYPE(SnapContext) TYPE(SnapRealmInfo) |