summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLoic Dachary <loic@dachary.org>2013-10-03 12:13:41 -0700
committerLoic Dachary <loic@dachary.org>2013-10-03 12:13:41 -0700
commit739696ef0a8c9cd38de73ff78f06d6e25a71b890 (patch)
tree2b184e4c76eef9a7a63b68e66cab902b3af82ce5
parentd3ba8da597384516c85a9af928971a585843aeb4 (diff)
parent8cfeb8342a08774fb1030830859c3fc30514f0b5 (diff)
downloadceph-739696ef0a8c9cd38de73ff78f06d6e25a71b890.tar.gz
Merge pull request #638 from ceph/wip-bloom
bloom filter cleanups, encodability, and unit tests
-rw-r--r--COPYING4
-rw-r--r--debian/copyright4
-rw-r--r--src/common/Makefile.am4
-rw-r--r--src/common/bloom_filter.cc76
-rw-r--r--src/common/bloom_filter.hpp627
-rw-r--r--src/include/Makefile.am1
-rw-r--r--src/include/bloom_filter.hpp544
-rw-r--r--src/mds/CDir.cc2
-rw-r--r--src/test/Makefile.am5
-rw-r--r--src/test/common/test_bloom_filter.cc222
-rw-r--r--src/test/encoding/types.h3
11 files changed, 945 insertions, 547 deletions
diff --git a/COPYING b/COPYING
index f18b45ceec0..a0034d58c3b 100644
--- a/COPYING
+++ b/COPYING
@@ -23,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net>
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
diff --git a/debian/copyright b/debian/copyright
index f18b45ceec0..d3906c44d35 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -23,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index deddc5d831c..9ec6c3e895b 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -65,7 +65,8 @@ libcommon_la_SOURCES = \
common/ceph_strings.cc \
common/ceph_frag.cc \
common/addr_parsing.c \
- common/hobject.cc
+ common/hobject.cc \
+ common/bloom_filter.cc
if LINUX
libcommon_la_SOURCES += common/secret.c
@@ -97,6 +98,7 @@ LIBCOMMON_DEPS += libcommon_crc.la
noinst_LTLIBRARIES += libcommon_crc.la
noinst_HEADERS += \
+ common/bloom_filter.hpp \
common/sctp_crc32.h \
common/crc32c_intel_baseline.h \
common/crc32c_intel_fast.h
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000000..f602b80149e
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "common/bloom_filter.hpp"
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode((uint64_t)salt_count_, bl);
+ ::encode((uint64_t)table_size_, bl);
+ ::encode((uint64_t)inserted_element_count_, bl);
+ ::encode((uint64_t)random_seed_, bl);
+ bufferptr bp((const char*)bit_table_, raw_table_size_);
+ ::encode(bp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(1, p);
+ uint64_t v;
+ ::decode(v, p);
+ salt_count_ = v;
+ ::decode(v, p);
+ table_size_ = v;
+ ::decode(v, p);
+ inserted_element_count_ = v;
+ ::decode(v, p);
+ random_seed_ = v;
+ bufferlist t;
+ ::decode(t, p);
+
+ salt_.clear();
+ generate_unique_salt();
+ raw_table_size_ = t.length();
+ assert(raw_table_size_ == table_size_ / bits_per_char);
+ delete bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ t.copy(0, raw_table_size_, (char *)bit_table_);
+
+ DECODE_FINISH(p);
+}
+
+void bloom_filter::dump(Formatter *f) const
+{
+ f->dump_unsigned("salt_count", salt_count_);
+ f->dump_unsigned("table_size", table_size_);
+ f->dump_unsigned("raw_table_size", raw_table_size_);
+ f->dump_unsigned("insert_count", inserted_element_count_);
+ f->dump_unsigned("random_seed", random_seed_);
+
+ f->open_array_section("salt_table");
+ for (std::vector<bloom_type>::const_iterator i = salt_.begin(); i != salt_.end(); ++i)
+ f->dump_unsigned("salt", *i);
+ f->close_section();
+
+ f->open_array_section("bit_table");
+ for (unsigned i = 0; i < raw_table_size_; ++i)
+ f->dump_unsigned("byte", (unsigned)bit_table_[i]);
+ f->close_section();
+}
+
+void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+{
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new bloom_filter(50, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->insert("boogggg");
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000000..6216c7fb34d
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,627 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ * *
+ * Open Bloom Filter *
+ * *
+ * Author: Arash Partow - 2000 *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ * *
+ * Copyright notice: *
+ * Free use of the Open Bloom Filter Library is permitted under *
+ * the guidelines and in accordance with the most current version *
+ * of the Boost Software License, Version 1.0 *
+ * http://www.opensource.org/licenses/bsl1.0.html *
+ * *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cstddef>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
+static const unsigned char bit_mask[bits_per_char] = {
+ 0x01, //00000001
+ 0x02, //00000010
+ 0x04, //00000100
+ 0x08, //00001000
+ 0x10, //00010000
+ 0x20, //00100000
+ 0x40, //01000000
+ 0x80 //10000000
+};
+
+
+class bloom_filter
+{
+protected:
+
+ typedef unsigned int bloom_type;
+ typedef unsigned char cell_type;
+
+public:
+
+ bloom_filter()
+ : bit_table_(0),
+ salt_count_(0),
+ table_size_(0),
+ raw_table_size_(0),
+ inserted_element_count_(0),
+ random_seed_(0)
+ {}
+
+ bloom_filter(const std::size_t& predicted_inserted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+ &salt_count_, &table_size_);
+ init();
+ }
+
+ bloom_filter(const std::size_t& salt_count, std::size_t table_size,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ salt_count_(salt_count),
+ table_size_(table_size),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ init();
+ }
+
+ void init() {
+ generate_unique_salt();
+ raw_table_size_ = table_size_ / bits_per_char;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ }
+
+ bloom_filter(const bloom_filter& filter)
+ {
+ this->operator=(filter);
+ }
+
+ bloom_filter& operator = (const bloom_filter& filter)
+ {
+ if (this != &filter) {
+ salt_count_ = filter.salt_count_;
+ table_size_ = filter.table_size_;
+ raw_table_size_ = filter.raw_table_size_;
+ inserted_element_count_ = filter.inserted_element_count_;
+ random_seed_ = filter.random_seed_;
+ delete[] bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
+ salt_ = filter.salt_;
+ }
+ return *this;
+ }
+
+ virtual ~bloom_filter()
+ {
+ delete[] bit_table_;
+ }
+
+ inline bool operator!() const
+ {
+ return (0 == table_size_);
+ }
+
+ inline void clear()
+ {
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ inserted_element_count_ = 0;
+ }
+
+ /**
+ * insert a u32 into the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to insert
+ */
+ inline void insert(uint32_t val) {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ inline void insert(const unsigned char* key_begin, const std::size_t& length)
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ template<typename T>
+ inline void insert(const T& t)
+ {
+ // Note: T must be a C++ POD type.
+ insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
+ }
+
+ inline void insert(const std::string& key)
+ {
+ insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline void insert(const char* data, const std::size_t& length)
+ {
+ insert(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline void insert(const InputIterator begin, const InputIterator end)
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ insert(*(itr++));
+ }
+ }
+
+ /**
+ * check if a u32 is contained by set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to query
+ * @returns true if value is (probably) in the set, false if it definitely is not
+ */
+ inline virtual bool contains(uint32_t val) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template<typename T>
+ inline bool contains(const T& t) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
+ }
+
+ inline bool contains(const std::string& key) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline bool contains(const char* data, const std::size_t& length) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (!contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return table_size_;
+ }
+
+ inline std::size_t element_count() const
+ {
+ return inserted_element_count_;
+ }
+
+ inline double effective_fpp() const
+ {
+ /*
+ Note:
+ The effective false positive probability is calculated using the
+ designated table size and hash function count in conjunction with
+ the current number of inserted elements - not the user defined
+ predicated/expected number of inserted elements.
+ */
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
+ }
+
+ inline bloom_filter& operator &= (const bloom_filter& filter)
+ {
+ /* intersection */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] &= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator |= (const bloom_filter& filter)
+ {
+ /* union */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] |= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator ^= (const bloom_filter& filter)
+ {
+ /* difference */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] ^= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline const cell_type* table() const
+ {
+ return bit_table_;
+ }
+
+protected:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash % table_size_;
+ bit = bit_index % bits_per_char;
+ }
+
+ void generate_unique_salt()
+ {
+ /*
+ Note:
+ A distinct hash function need not be implementation-wise
+ distinct. In the current implementation "seeding" a common
+ hash function with different values seems to be adequate.
+ */
+ const unsigned int predef_salt_count = 128;
+ static const bloom_type predef_salt[predef_salt_count] = {
+ 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+ 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+ 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+ 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+ 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+ 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+ 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+ 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+ 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+ 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+ 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+ 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+ 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+ 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+ 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+ 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+ 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+ 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+ 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+ 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+ 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+ 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+ 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+ 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+ 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+ 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+ 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+ 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+ 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+ 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+ 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+ 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+ };
+
+ if (salt_count_ <= predef_salt_count)
+ {
+ std::copy(predef_salt,
+ predef_salt + salt_count_,
+ std::back_inserter(salt_));
+ for (unsigned int i = 0; i < salt_.size(); ++i)
+ {
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
+ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+ }
+ }
+ else
+ {
+ std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
+ srand(static_cast<unsigned int>(random_seed_));
+ while (salt_.size() < salt_count_)
+ {
+ bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+ if (0 == current_salt)
+ continue;
+ if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
+ {
+ salt_.push_back(current_salt);
+ }
+ }
+ }
+ }
+
+ static void find_optimal_parameters(std::size_t target_insert_count,
+ double target_fpp,
+ std::size_t *salt_count,
+ std::size_t *table_size)
+ {
+ /*
+ Note:
+ The following will attempt to find the number of hash functions
+ and minimum amount of storage bits required to construct a bloom
+ filter consistent with the user defined false positive probability
+ and estimated element insertion count.
+ */
+
+ double min_m = std::numeric_limits<double>::infinity();
+ double min_k = 0.0;
+ double curr_m = 0.0;
+ double k = 1.0;
+ while (k < 1000.0)
+ {
+ double numerator = (- k * target_insert_count);
+ double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+ curr_m = numerator / denominator;
+
+ if (curr_m < min_m)
+ {
+ min_m = curr_m;
+ min_k = k;
+ }
+ k += 1.0;
+ }
+
+ *salt_count = static_cast<std::size_t>(min_k);
+ size_t t = static_cast<std::size_t>(min_m);
+ t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0);
+ *table_size = t;
+ }
+
+ inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+ {
+ hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+ return hash;
+ }
+
+ inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+ {
+ const unsigned char* itr = begin;
+
+ while (remaining_length >= 4)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 4;
+ }
+
+ while (remaining_length >= 2)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 2;
+ }
+
+ if (remaining_length)
+ {
+ hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
+ }
+
+ return hash;
+ }
+
+ std::vector<bloom_type> salt_;
+ unsigned char* bit_table_;
+ std::size_t salt_count_;
+ std::size_t table_size_;
+ std::size_t raw_table_size_;
+ std::size_t inserted_element_count_;
+ std::size_t random_seed_;
+
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result &= b;
+ return result;
+}
+
+inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result |= b;
+ return result;
+}
+
+inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result ^= b;
+ return result;
+}
+
+
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+ compressible_bloom_filter(const std::size_t& predicted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return size_list.back();
+ }
+
+ inline bool compress(const double& percentage)
+ {
+ if ((0.0 >= percentage) || (percentage >= 100.0))
+ {
+ return false;
+ }
+
+ std::size_t original_table_size = size_list.back();
+ std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
+ new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
+
+ if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
+ {
+ return false;
+ }
+
+ cell_type* tmp = new cell_type[new_table_size / bits_per_char];
+ std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
+ cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
+ cell_type* end = bit_table_ + (original_table_size / bits_per_char);
+ cell_type* itr_tmp = tmp;
+
+ while (end != itr)
+ {
+ *(itr_tmp++) |= (*itr++);
+ }
+
+ delete[] bit_table_;
+ bit_table_ = tmp;
+ size_list.push_back(new_table_size);
+
+ return true;
+ }
+
+private:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash;
+ for (std::size_t i = 0; i < size_list.size(); ++i)
+ {
+ bit_index %= size_list[i];
+ }
+ bit = bit_index % bits_per_char;
+ }
+
+ std::vector<std::size_t> size_list;
+};
+
+#endif
+
+
+/*
+ Note 1:
+ If it can be guaranteed that bits_per_char will be of the form 2^n then
+ the following optimization can be used:
+
+ hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+ Note 2:
+ For performance reasons where possible when allocating memory it should
+ be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index d702ebd2795..2d98e777f00 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -18,7 +18,6 @@ rados_include_DATA = \
$(srcdir)/include/crc32c.h
noinst_HEADERS += \
- include/bloom_filter.hpp \
include/Context.h \
include/CompatSet.h \
include/Distribution.h \
diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp
deleted file mode 100644
index 41aba4bad47..00000000000
--- a/src/include/bloom_filter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *******************************************************************
- * *
- * Open Bloom Filter *
- * *
- * Author: Arash Partow - 2000 *
- * URL: http://www.partow.net/programming/hashfunctions/index.html *
- * *
- * Copyright notice: *
- * Free use of the Open Bloom Filter Library is permitted under *
- * the guidelines and in accordance with the most current version *
- * of the Boost Software License, Version 1.0 *
- * http://www.opensource.org/licenses/bsl1.0.html *
- * *
- *******************************************************************
-*/
-
-
-#ifndef INCLUDE_BLOOM_FILTER_HPP
-#define INCLUDE_BLOOM_FILTER_HPP
-
-#include <cstddef>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-#include <vector>
-
-
-static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
-static const unsigned char bit_mask[bits_per_char] = {
- 0x01, //00000001
- 0x02, //00000010
- 0x04, //00000100
- 0x08, //00001000
- 0x10, //00010000
- 0x20, //00100000
- 0x40, //01000000
- 0x80 //10000000
- };
-
-
-class bloom_filter
-{
-protected:
-
- typedef unsigned int bloom_type;
- typedef unsigned char cell_type;
-
-public:
-
- bloom_filter(const std::size_t& predicted_inserted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bit_table_(0),
- predicted_inserted_element_count_(predicted_inserted_element_count),
- inserted_element_count_(0),
- random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
- desired_false_positive_probability_(false_positive_probability)
- {
- find_optimal_parameters();
- generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
- }
-
- bloom_filter(const bloom_filter& filter)
- {
- this->operator=(filter);
- }
-
- bloom_filter& operator = (const bloom_filter& filter)
- {
- if (this != &filter) {
- salt_count_ = filter.salt_count_;
- table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
- inserted_element_count_ = filter.inserted_element_count_;
- random_seed_ = filter.random_seed_;
- desired_false_positive_probability_ = filter.desired_false_positive_probability_;
- delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
- salt_ = filter.salt_;
- }
- return *this;
- }
-
- virtual ~bloom_filter()
- {
- delete[] bit_table_;
- }
-
- inline bool operator!() const
- {
- return (0 == table_size_);
- }
-
- inline void clear()
- {
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
- }
-
- inline void insert(const unsigned char* key_begin, const std::size_t& length)
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
- }
- ++inserted_element_count_;
- }
-
- template<typename T>
- inline void insert(const T& t)
- {
- // Note: T must be a C++ POD type.
- insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
- }
-
- inline void insert(const std::string& key)
- {
- insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline void insert(const char* data, const std::size_t& length)
- {
- insert(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline void insert(const InputIterator begin, const InputIterator end)
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- insert(*(itr++));
- }
- }
-
- inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
- {
- return false;
- }
- }
- return true;
- }
-
- template<typename T>
- inline bool contains(const T& t) const
- {
- return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
- }
-
- inline bool contains(const std::string& key) const
- {
- return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline bool contains(const char* data, const std::size_t& length) const
- {
- return contains(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (!contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- template<typename InputIterator>
- inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- inline virtual std::size_t size() const
- {
- return table_size_;
- }
-
- inline std::size_t element_count() const
- {
- return inserted_element_count_;
- }
-
- inline double effective_fpp() const
- {
- /*
- Note:
- The effective false positive probability is calculated using the
- designated table size and hash function count in conjunction with
- the current number of inserted elements - not the user defined
- predicated/expected number of inserted elements.
- */
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
- }
-
- inline bloom_filter& operator &= (const bloom_filter& filter)
- {
- /* intersection */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] &= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator |= (const bloom_filter& filter)
- {
- /* union */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] |= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator ^= (const bloom_filter& filter)
- {
- /* difference */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] ^= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline const cell_type* table() const
- {
- return bit_table_;
- }
-
-protected:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
- }
-
- void generate_unique_salt()
- {
- /*
- Note:
- A distinct hash function need not be implementation-wise
- distinct. In the current implementation "seeding" a common
- hash function with different values seems to be adequate.
- */
- const unsigned int predef_salt_count = 128;
- static const bloom_type predef_salt[predef_salt_count] =
- {
- 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
- 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
- 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
- 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
- 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
- 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
- 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
- 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
- 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
- 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
- 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
- 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
- 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
- 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
- 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
- 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
- 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
- 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
- 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
- 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
- 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
- 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
- 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
- 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
- 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
- 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
- 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
- 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
- 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
- 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
- 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
- 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
- };
-
- if (salt_count_ <= predef_salt_count)
- {
- std::copy(predef_salt,
- predef_salt + salt_count_,
- std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
- {
- /*
- Note:
- This is done to integrate the user defined random seed,
- so as to allow for the generation of unique bloom filter
- instances.
- */
- salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
- }
- }
- else
- {
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
- srand(static_cast<unsigned int>(random_seed_));
- while (salt_.size() < salt_count_)
- {
- bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
- if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
- {
- salt_.push_back(current_salt);
- }
- }
- }
- }
-
- void find_optimal_parameters()
- {
- /*
- Note:
- The following will attempt to find the number of hash functions
- and minimum amount of storage bits required to construct a bloom
- filter consistent with the user defined false positive probability
- and estimated element insertion count.
- */
-
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
- while (k < 1000.0)
- {
- double numerator = (- k * predicted_inserted_element_count_);
- double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
- curr_m = numerator / denominator;
-
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
-
- salt_count_ = static_cast<std::size_t>(min_k);
- table_size_ = static_cast<std::size_t>(min_m);
- table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
- }
-
- inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
- {
- const unsigned char* itr = begin;
-
- while (remaining_length >= 4)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 4;
- }
-
- while (remaining_length >= 2)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 2;
- }
-
- if (remaining_length)
- {
- hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
- }
-
- return hash;
- }
-
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t predicted_inserted_element_count_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
- double desired_false_positive_probability_;
-};
-
-inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result &= b;
- return result;
-}
-
-inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result |= b;
- return result;
-}
-
-inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result ^= b;
- return result;
-}
-
-
-class compressible_bloom_filter : public bloom_filter
-{
-public:
-
- compressible_bloom_filter(const std::size_t& predicted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
- {
- size_list.push_back(table_size_);
- }
-
- inline virtual std::size_t size() const
- {
- return size_list.back();
- }
-
- inline bool compress(const double& percentage)
- {
- if ((0.0 >= percentage) || (percentage >= 100.0))
- {
- return false;
- }
-
- std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
-
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
- {
- return false;
- }
-
- desired_false_positive_probability_ = effective_fpp();
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
- cell_type* itr_tmp = tmp;
-
- while (end != itr)
- {
- *(itr_tmp++) |= (*itr++);
- }
-
- delete[] bit_table_;
- bit_table_ = tmp;
- size_list.push_back(new_table_size);
-
- return true;
- }
-
-private:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash;
- for (std::size_t i = 0; i < size_list.size(); ++i)
- {
- bit_index %= size_list[i];
- }
- bit = bit_index % bits_per_char;
- }
-
- std::vector<std::size_t> size_list;
-};
-
-#endif
-
-
-/*
- Note 1:
- If it can be guaranteed that bits_per_char will be of the form 2^n then
- the following optimization can be used:
-
- hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
-
- Note 2:
- For performance reasons where possible when allocating memory it should
- be aligned (aligned_alloc) according to the architecture being used.
-*/
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index c77ca180a6f..4a5e636d9a6 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -27,7 +27,7 @@
#include "MDLog.h"
#include "LogSegment.h"
-#include "include/bloom_filter.hpp"
+#include "common/bloom_filter.hpp"
#include "include/Context.h"
#include "common/Clock.h"
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index debe18ff907..59b4d89e930 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -258,6 +258,11 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_addrs
+unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
+unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_bloom_filter
+
unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
new file mode 100644
index 00000000000..8e3661b2cc1
--- /dev/null
+++ b/src/test/common/test_bloom_filter.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "include/stringify.h"
+#include "common/bloom_filter.hpp"
+
+TEST(BloomFilter, Basic) {
+ bloom_filter bf(10, .1, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+}
+
+TEST(BloomFilter, Sweep) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert("ok" + stringify(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains("asdf" + stringify(n)))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+
+ }
+ }
+}
+
+TEST(BloomFilter, SweepInt) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(123);
+ ASSERT_TRUE(456);
+
+ for (int n = 0; n < max; n++)
+ bf.insert(n);
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ ASSERT_TRUE(123);
+ ASSERT_TRUE(456);
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+ ASSERT_TRUE(actual > fpp / 10);
+ }
+ }
+}
+
+
+TEST(BloomFilter, BinSweep) {
+ int total_max = 16384;
+ float total_fpp = .01;
+ std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
+ for (int bins = 1; bins < 16; ++bins) {
+ int max = total_max / bins;
+ float fpp = total_fpp / bins;//pow(total_fpp, bins);
+
+ std::vector<bloom_filter*> ls;
+ bufferlist bl;
+ for (int i=0; i<bins; i++) {
+ ls.push_back(new bloom_filter(max, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert(10000 * (i+1) + j);
+ }
+ ::encode(*ls.front(), bl);
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!!
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
+ << " actual-fpp " << actual
+ << " total-size " << bl.length() << std::endl;
+ }
+}
+
+// disable these tests; doing dual insertions in consecutive filters
+// appears to be equivalent to doing a single insertion in a bloom
+// filter that is twice as big.
+#if 0
+
+// test the fpp over a sequence of bloom filters, each with unique
+// items inserted into it.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp
+TEST(BloomFilter, Sequence) {
+
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
+ }
+}
+
+// test the ffp over a sequence of bloom filters, where actual values
+// are always inserted into a consecutive pair of filters. in order
+// to have a false positive, we need to falsely match two consecutive
+// filters.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp^2
+TEST(BloomFilter, SequenceDouble) {
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ int run = 0;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ run++;
+ if (run >= 2) {
+ hit++;
+ break;
+ }
+ } else {
+ run = 0;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
+ << " expected " << (fpp*fpp*(double)seq) << std::endl;
+ }
+}
+
+#endif
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 6dd180bc198..59e55a11b23 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -4,6 +4,9 @@ TYPE(CompatSet)
#include "include/filepath.h"
TYPE(filepath)
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+
#include "common/snap_types.h"
TYPE(SnapContext)
TYPE(SnapRealmInfo)