summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDon Anderson <dda@mongodb.com>2016-10-24 23:15:47 -0400
committerAlex Gorrod <alexander.gorrod@mongodb.com>2016-10-25 14:15:47 +1100
commit3c82245968327036deb064b4ba18823d859d59f8 (patch)
treec0685ce38016f76f4386e3eee613473864e69f0f
parent51d4bca24a726307346cdff89712a8e811f586c6 (diff)
downloadmongo-3c82245968327036deb064b4ba18823d859d59f8.tar.gz
WT-2415 Add support for joins to return false positives from the Bloom filter. (#3099)
-rw-r--r--dist/api_data.py4
-rw-r--r--src/config/config_def.c8
-rw-r--r--src/cursor/cur_join.c11
-rw-r--r--src/include/cursor.h8
-rw-r--r--src/include/wiredtiger.in3
-rw-r--r--src/session/session_api.c4
-rw-r--r--test/suite/test_join09.py115
7 files changed, 145 insertions, 8 deletions
diff --git a/dist/api_data.py b/dist/api_data.py
index 7affc58a217..22d06c380ae 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -869,6 +869,10 @@ methods = {
Config('bloom_bit_count', '16', r'''
the number of bits used per item for the bloom filter''',
min='2', max='1000'),
+ Config('bloom_false_positives', 'false', r'''
+ return all values that pass the bloom filter, without eliminating
+ any false positives''',
+ type='boolean'),
Config('bloom_hash_count', '8', r'''
the number of hash values per item for the bloom filter''',
min='2', max='100'),
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 018cc7a8ac4..d57bc418c93 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -299,6 +299,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = {
static const WT_CONFIG_CHECK confchk_WT_SESSION_join[] = {
{ "bloom_bit_count", "int", NULL, "min=2,max=1000", NULL, 0 },
+ { "bloom_false_positives", "boolean", NULL, NULL, NULL, 0 },
{ "bloom_hash_count", "int", NULL, "min=2,max=100", NULL, 0 },
{ "compare", "string",
NULL, "choices=[\"eq\",\"ge\",\"gt\",\"le\",\"lt\"]",
@@ -1090,9 +1091,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_SESSION_drop, 4
},
{ "WT_SESSION.join",
- "bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=,"
- "operation=\"and\",strategy=",
- confchk_WT_SESSION_join, 6
+ "bloom_bit_count=16,bloom_false_positives=false,"
+ "bloom_hash_count=8,compare=\"eq\",count=,operation=\"and\","
+ "strategy=",
+ confchk_WT_SESSION_join, 7
},
{ "WT_SESSION.log_flush",
"sync=on",
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index 087411febda..88eecfddef8 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -613,8 +613,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
if (entry->bloom != NULL) {
/*
* If the item is not in the Bloom filter, we return
- * immediately, otherwise, we still need to check the long
- * way, since it may be a false positive.
+ * immediately, otherwise, we still may need to check the
+ * long way, since it may be a false positive.
*
* If we don't own the Bloom filter, we must be sharing one
* in a previous entry. So the shared filter has already
@@ -623,6 +623,8 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
*/
if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
+ if (F_ISSET(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
+ return (0);
bloom_found = true;
}
if (entry->subjoin != NULL) {
@@ -1442,6 +1444,11 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
WT_RET_MSG(session, EINVAL,
"join has incompatible strategy "
"values for the same index");
+ if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
+ F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
+ WT_RET_MSG(session, EINVAL,
+ "join has incompatible bloom_false_positives "
+ "values for the same index");
/*
* Check against other comparisons (we call them endpoints)
diff --git a/src/include/cursor.h b/src/include/cursor.h
index e322a53a65d..524e65de12f 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -365,9 +365,11 @@ struct __wt_cursor_join_entry {
uint32_t bloom_hash_count; /* hash functions in bloom */
uint64_t count; /* approx number of matches */
-#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */
-#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */
-#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x04 /* this entry owns the bloom */
+#define WT_CURJOIN_ENTRY_BLOOM 0x01 /* use a bloom filter */
+#define WT_CURJOIN_ENTRY_DISJUNCTION 0x02 /* endpoints are or-ed */
+#define WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x04 /* after bloom filter do not
+ * filter false positives */
+#define WT_CURJOIN_ENTRY_OWN_BLOOM 0x08 /* this entry owns the bloom */
uint8_t flags;
WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index b6185b4ead6..b92111a7810 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1284,6 +1284,9 @@ struct __wt_session {
* @configstart{WT_SESSION.join, see dist/api_data.py}
* @config{bloom_bit_count, the number of bits used per item for the
* bloom filter., an integer between 2 and 1000; default \c 16.}
+ * @config{bloom_false_positives, return all values that pass the bloom
+ * filter\, without eliminating any false positives., a boolean flag;
+ * default \c false.}
* @config{bloom_hash_count, the number of hash values per item for the
* bloom filter., an integer between 2 and 100; default \c 8.}
* @config{compare, modifies the set of items to be returned so that the
diff --git a/src/session/session_api.c b/src/session/session_api.c
index f594450db74..fbf171dcfc1 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -963,6 +963,10 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && count == 0)
WT_ERR_MSG(session, EINVAL,
"count must be nonzero when strategy=bloom");
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "bloom_false_positives", 0, &cval));
+ if (cval.val != 0)
+ LF_SET(WT_CURJOIN_ENTRY_FALSE_POSITIVES);
WT_ERR(__wt_config_gets(session, cfg, "operation", &cval));
if (cval.len != 0 && WT_STRING_MATCH("or", cval.str, cval.len))
diff --git a/test/suite/test_join09.py b/test/suite/test_join09.py
new file mode 100644
index 00000000000..d48353b1580
--- /dev/null
+++ b/test/suite/test_join09.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest, run
+from wtscenario import make_scenarios
+
+# test_join09.py
+# Join bloom filters with false positives
+class test_join09(wttest.WiredTigerTestCase):
+ nentries = 1000
+
+ bloomscen = [
+ ('nobloom', dict(false_positives=False, config='')),
+ ('bloom1000', dict(false_positives=False,
+ config='strategy=bloom,count=1000')),
+ ('bloom10fp', dict(false_positives=True,
+ config='strategy=bloom,count=10,bloom_false_positives=true'))
+ ]
+
+ scenarios = make_scenarios(bloomscen)
+
+ def gen_values(self, i):
+ s = str(i) # 345 => "345"
+ f = s[0:1] + s[0:1] + s[0:1] # 345 => "333"
+ return [s, f]
+
+ def populate(self, s, gen_values):
+ c = s.open_cursor('table:join09', None, None)
+ for i in range(0, self.nentries):
+ c.set_key(i)
+ c.set_value(*gen_values(i))
+ c.insert()
+ c.close()
+
+ # Common function for testing the most basic functionality
+ # of joins
+ def test_join(self):
+ self.session.create('table:join09',
+ 'columns=(k,v0,v1),key_format=i,value_format=SS')
+ self.session.create('index:join09:index0','columns=(v0)')
+ self.session.create('index:join09:index1','columns=(v1)')
+
+ self.populate(self.session, self.gen_values)
+
+ jc = self.session.open_cursor('join:table:join09', None, None)
+ c0 = self.session.open_cursor('index:join09:index0', None, None)
+ c0.set_key('520')
+ self.assertEquals(0, c0.search())
+ self.session.join(jc, c0, 'compare=ge')
+
+ joinconfig = 'compare=eq,' + self.config
+ c1 = self.session.open_cursor('index:join09:index1', None, None)
+ c1.set_key('555')
+ self.assertEquals(0, c1.search())
+ self.session.join(jc, c1, joinconfig)
+
+ mbr = set(range(520,600)) | set(range(53,60))
+
+ fp_count = 0
+ while jc.next() == 0:
+ [k] = jc.get_keys()
+ [v0,v1] = jc.get_values()
+ self.assertEquals(self.gen_values(k), [v0, v1])
+ if not k in mbr:
+ # With false positives, we can see extra values
+ if self.false_positives:
+ fp_count += 1
+ continue
+ self.tty('**** ERROR: result ' + str(k) + ' is not in: ' +
+ str(mbr))
+ self.assertTrue(k in mbr)
+ mbr.remove(k)
+
+ if len(mbr) != 0:
+ self.tty('**** ERROR: did not see these: ' + str(mbr))
+ self.assertEquals(0, len(mbr))
+
+ # Turning on false positives does not guarantee we'll see extra
+ # values, but we've configured our test with a low count to
+ # make sure it happens.
+ if self.false_positives:
+ self.assertTrue(fp_count > 0)
+ jc.close()
+ c1.close()
+ c0.close()
+ self.session.drop('table:join09')
+
+if __name__ == '__main__':
+ wttest.run()