diff options
author | Don Anderson <dda@mongodb.com> | 2016-10-24 23:15:47 -0400 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-25 14:15:47 +1100 |
commit | 3c82245968327036deb064b4ba18823d859d59f8 (patch) | |
tree | c0685ce38016f76f4386e3eee613473864e69f0f | |
parent | 51d4bca24a726307346cdff89712a8e811f586c6 (diff) | |
download | mongo-3c82245968327036deb064b4ba18823d859d59f8.tar.gz |
WT-2415 Add support for joins to return false positives from the Bloom filter. (#3099)
-rw-r--r-- | dist/api_data.py | 4 | ||||
-rw-r--r-- | src/config/config_def.c | 8 | ||||
-rw-r--r-- | src/cursor/cur_join.c | 11 | ||||
-rw-r--r-- | src/include/cursor.h | 8 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 3 | ||||
-rw-r--r-- | src/session/session_api.c | 4 | ||||
-rw-r--r-- | test/suite/test_join09.py | 115 |
7 files changed, 145 insertions, 8 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 7affc58a217..22d06c380ae 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -869,6 +869,10 @@ methods = { Config('bloom_bit_count', '16', r''' the number of bits used per item for the bloom filter''', min='2', max='1000'), + Config('bloom_false_positives', 'false', r''' + return all values that pass the bloom filter, without eliminating + any false positives''', + type='boolean'), Config('bloom_hash_count', '8', r''' the number of hash values per item for the bloom filter''', min='2', max='100'), diff --git a/src/config/config_def.c b/src/config/config_def.c index 018cc7a8ac4..d57bc418c93 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -299,6 +299,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = { static const WT_CONFIG_CHECK confchk_WT_SESSION_join[] = { { "bloom_bit_count", "int", NULL, "min=2,max=1000", NULL, 0 }, + { "bloom_false_positives", "boolean", NULL, NULL, NULL, 0 }, { "bloom_hash_count", "int", NULL, "min=2,max=100", NULL, 0 }, { "compare", "string", NULL, "choices=[\"eq\",\"ge\",\"gt\",\"le\",\"lt\"]", @@ -1090,9 +1091,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_SESSION_drop, 4 }, { "WT_SESSION.join", - "bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=," - "operation=\"and\",strategy=", - confchk_WT_SESSION_join, 6 + "bloom_bit_count=16,bloom_false_positives=false," + "bloom_hash_count=8,compare=\"eq\",count=,operation=\"and\"," + "strategy=", + confchk_WT_SESSION_join, 7 }, { "WT_SESSION.log_flush", "sync=on", diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 087411febda..88eecfddef8 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -613,8 +613,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, if (entry->bloom != NULL) { /* * If the item is not in the Bloom filter, we return - * immediately, otherwise, we still need to check the long - * way, since it may be a false positive. + * immediately, otherwise, we still may need to check the + * long way, since it may be a false positive. * * If we don't own the Bloom filter, we must be sharing one * in a previous entry. So the shared filter has already @@ -623,6 +623,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, */ if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) WT_ERR(__wt_bloom_inmem_get(entry->bloom, key)); + if (F_ISSET(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES)) + return (0); bloom_found = true; } if (entry->subjoin != NULL) { @@ -1442,6 +1444,11 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_RET_MSG(session, EINVAL, "join has incompatible strategy " "values for the same index"); + if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) != + F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES)) + WT_RET_MSG(session, EINVAL, + "join has incompatible bloom_false_positives " + "values for the same index"); /* * Check against other comparisons (we call them endpoints) diff --git a/src/include/cursor.h b/src/include/cursor.h index e322a53a65d..524e65de12f 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -365,9 +365,11 @@ struct __wt_cursor_join_entry { uint32_t bloom_hash_count; /* hash functions in bloom */ uint64_t count; /* approx number of matches */ -#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */ -#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */ -#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x04 /* this entry owns the bloom */ +#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */ +#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */ +#define WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x04 /* after bloom filter do not + * filter false positives */ +#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x08 /* this entry owns the bloom */ uint8_t flags; WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */ diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index b6185b4ead6..b92111a7810 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1284,6 +1284,9 @@ struct __wt_session { * @configstart{WT_SESSION.join, see dist/api_data.py} * @config{bloom_bit_count, the number of bits used per item for the * bloom filter., an integer between 2 and 1000; default \c 16.} + * @config{bloom_false_positives, return all values that pass the bloom + * filter\, without eliminating any false positives., a boolean flag; + * default \c false.} * @config{bloom_hash_count, the number of hash values per item for the * bloom filter., an integer between 2 and 100; default \c 8.} * @config{compare, modifies the set of items to be returned so that the diff --git a/src/session/session_api.c b/src/session/session_api.c index f594450db74..fbf171dcfc1 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -963,6 +963,10 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor, if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && count == 0) WT_ERR_MSG(session, EINVAL, "count must be nonzero when strategy=bloom"); + WT_ERR(__wt_config_gets_def( + session, cfg, "bloom_false_positives", 0, &cval)); + if (cval.val != 0) + LF_SET(WT_CURJOIN_ENTRY_FALSE_POSITIVES); WT_ERR(__wt_config_gets(session, cfg, "operation", &cval)); if (cval.len != 0 && WT_STRING_MATCH("or", cval.str, cval.len)) diff --git a/test/suite/test_join09.py b/test/suite/test_join09.py new file mode 100644 index 00000000000..d48353b1580 --- /dev/null +++ b/test/suite/test_join09.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2016 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +import wiredtiger, wttest, run +from wtscenario import make_scenarios + +# test_join09.py +# Join bloom filters with false positives +class test_join09(wttest.WiredTigerTestCase): + nentries = 1000 + + bloomscen = [ + ('nobloom', dict(false_positives=False, config='')), + ('bloom1000', dict(false_positives=False, + config='strategy=bloom,count=1000')), + ('bloom10fp', dict(false_positives=True, + config='strategy=bloom,count=10,bloom_false_positives=true')) + ] + + scenarios = make_scenarios(bloomscen) + + def gen_values(self, i): + s = str(i) # 345 => "345" + f = s[0:1] + s[0:1] + s[0:1] # 345 => "333" + return [s, f] + + def populate(self, s, gen_values): + c = s.open_cursor('table:join09', None, None) + for i in range(0, self.nentries): + c.set_key(i) + c.set_value(*gen_values(i)) + c.insert() + c.close() + + # Common function for testing the most basic functionality + # of joins + def test_join(self): + self.session.create('table:join09', + 'columns=(k,v0,v1),key_format=i,value_format=SS') + self.session.create('index:join09:index0','columns=(v0)') + self.session.create('index:join09:index1','columns=(v1)') + + self.populate(self.session, self.gen_values) + + jc = self.session.open_cursor('join:table:join09', None, None) + c0 = self.session.open_cursor('index:join09:index0', None, None) + c0.set_key('520') + self.assertEquals(0, c0.search()) + self.session.join(jc, c0, 'compare=ge') + + joinconfig = 'compare=eq,' + self.config + c1 = self.session.open_cursor('index:join09:index1', None, None) + c1.set_key('555') + self.assertEquals(0, c1.search()) + self.session.join(jc, c1, joinconfig) + + mbr = set(range(520,600)) | set(range(53,60)) + + fp_count = 0 + while jc.next() == 0: + [k] = jc.get_keys() + [v0,v1] = jc.get_values() + self.assertEquals(self.gen_values(k), [v0, v1]) + if not k in mbr: + # With false positives, we can see extra values + if self.false_positives: + fp_count += 1 + continue + self.tty('**** ERROR: result ' + str(k) + ' is not in: ' + + str(mbr)) + self.assertTrue(k in mbr) + mbr.remove(k) + + if len(mbr) != 0: + self.tty('**** ERROR: did not see these: ' + str(mbr)) + self.assertEquals(0, len(mbr)) + + # Turning on false positives does not guarantee we'll see extra + # values, but we've configured our test with a low count to + # make sure it happens. + if self.false_positives: + self.assertTrue(fp_count > 0) + jc.close() + c1.close() + c0.close() + self.session.drop('table:join09') + +if __name__ == '__main__': + wttest.run() |