diff options
author | Adam Kocoloski <kocolosk@apache.org> | 2018-06-05 21:45:31 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-06-05 21:45:31 -0400 |
commit | 6d44e17fccc44c377476247d9765fc573154097f (patch) | |
tree | dc9021c5443a65be1a258eff0854d6dcbddeceba | |
parent | 5290a32b395cf66d4bc1fa1b417a299b1006151b (diff) | |
download | couchdb-6d44e17fccc44c377476247d9765fc573154097f.tar.gz |
Add _approx_count_distinct as a builtin reduce function (#1346)
This introduces a new builtin reduce function, which uses a HyperLogLog
algorithm to estimate the number of distinct keys in the view index. The
precision is currently fixed to 2^11 observables and therefore uses
approximately 1.5 KB of memory.
It also introduces a finalize step which can be used to improve the
efficiency of other builtin reduce functions going forward.
Closes COUCHDB-2971
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | LICENSE | 24 | ||||
-rw-r--r-- | NOTICE | 4 | ||||
-rw-r--r-- | rebar.config.script | 1 | ||||
-rw-r--r-- | rel/reltool.config | 2 | ||||
-rw-r--r-- | src/couch/src/couch_query_servers.erl | 23 | ||||
-rw-r--r-- | src/couch_mrview/src/couch_mrview.erl | 2 | ||||
-rw-r--r-- | src/fabric/src/fabric_view.erl | 3 | ||||
-rw-r--r-- | test/javascript/tests/reduce_builtin.js | 20 |
9 files changed, 78 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore index faa07f983..a1cba1e0b 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,7 @@ src/oauth/ src/rebar/ src/snappy/ src/triq/ +src/hyper/ tmp/ src/couch/*.o @@ -2274,3 +2274,27 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +For the src/hyper component: + +The MIT License (MIT) + +Copyright (c) 2014 Game Analytics ApS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. @@ -184,3 +184,7 @@ This product also includes the following third-party components: 1997 Niels Provos <provos@physnet.uni-hamburg.de> - The asynchronous queue code (c_src/async_queue.c and c_src/async_queue.h) is from the esnappy project, copyright 2011 Konstantin V. Sorokin. 
+ +* hyper + + Copyright (c) 2014 Game Analytics ApS diff --git a/rebar.config.script b/rebar.config.script index 0cbc21faf..a3462c990 100644 --- a/rebar.config.script +++ b/rebar.config.script @@ -61,6 +61,7 @@ DepDescs = [ {tag, "v1.1.15"}, [raw]}, %% Third party deps {folsom, "folsom", {tag, "CouchDB-0.8.2"}}, +{hyper, "hyper", {tag, "CouchDB-2.2.0-3"}}, {ibrowse, "ibrowse", {tag, "CouchDB-4.0.1"}}, {jiffy, "jiffy", {tag, "CouchDB-0.14.11-2"}}, {mochiweb, "mochiweb", {tag, "v2.17.0"}}, diff --git a/rel/reltool.config b/rel/reltool.config index aa3100647..5e86d9643 100644 --- a/rel/reltool.config +++ b/rel/reltool.config @@ -47,6 +47,7 @@ fabric, folsom, global_changes, + hyper, ibrowse, ioq, jiffy, @@ -101,6 +102,7 @@ {app, fabric, [{incl_cond, include}]}, {app, folsom, [{incl_cond, include}]}, {app, global_changes, [{incl_cond, include}]}, + {app, hyper, [{incl_cond, include}]}, {app, ibrowse, [{incl_cond, include}]}, {app, ioq, [{incl_cond, include}]}, {app, jiffy, [{incl_cond, include}]}, diff --git a/src/couch/src/couch_query_servers.erl b/src/couch/src/couch_query_servers.erl index f31d24c6c..02d90f195 100644 --- a/src/couch/src/couch_query_servers.erl +++ b/src/couch/src/couch_query_servers.erl @@ -17,6 +17,7 @@ -export([reduce/3, rereduce/3,validate_doc_update/5]). -export([filter_docs/5]). -export([filter_view/3]). +-export([finalize/1]). -export([rewrite/3]). -export([with_ddoc_proc/2, proc_prompt/2, ddoc_prompt/3, ddoc_proc_prompt/3, json_doc/1]). @@ -86,6 +87,16 @@ group_reductions_results(List) -> [Heads | group_reductions_results(Tails)] end. +finalize(Reductions) -> + {ok, lists:map(fun(Reduction) -> + case hyper:is_hyper(Reduction) of + true -> + round(hyper:card(Reduction)); + false -> + Reduction + end + end, Reductions)}. 
+ rereduce(_Lang, [], _ReducedValues) -> {ok, []}; rereduce(Lang, RedSrcs, ReducedValues) -> @@ -171,7 +182,10 @@ builtin_reduce(rereduce, [<<"_count",_/binary>>|BuiltinReds], KVs, Acc) -> builtin_reduce(rereduce, BuiltinReds, KVs, [Count|Acc]); builtin_reduce(Re, [<<"_stats",_/binary>>|BuiltinReds], KVs, Acc) -> Stats = builtin_stats(Re, KVs), - builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]). + builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]); +builtin_reduce(Re, [<<"_approx_count_distinct",_/binary>>|BuiltinReds], KVs, Acc) -> + Distinct = approx_count_distinct(Re, KVs), + builtin_reduce(Re, BuiltinReds, KVs, [Distinct|Acc]). builtin_sum_rows([], Acc) -> @@ -303,6 +317,13 @@ get_number(Key, Props) -> throw({invalid_value, iolist_to_binary(Msg)}) end. +% TODO allow customization of precision in the ddoc. +approx_count_distinct(reduce, KVs) -> + lists:foldl(fun([[Key, _Id], _Value], Filter) -> + hyper:insert(term_to_binary(Key), Filter) + end, hyper:new(11), KVs); +approx_count_distinct(rereduce, Reds) -> + hyper:union([Filter || [_, Filter] <- Reds]). % use the function stored in ddoc.validate_doc_update to test an update. 
-spec validate_doc_update(DDoc, EditDoc, DiskDoc, Ctx, SecObj) -> ok when diff --git a/src/couch_mrview/src/couch_mrview.erl b/src/couch_mrview/src/couch_mrview.erl index a099f377e..b417aac52 100644 --- a/src/couch_mrview/src/couch_mrview.erl +++ b/src/couch_mrview/src/couch_mrview.erl @@ -184,6 +184,8 @@ validate(DbName, DDoc) -> ok; ({_RedName, <<"_stats", _/binary>>}) -> ok; + ({_RedName, <<"_approx_count_distinct", _/binary>>}) -> + ok; ({_RedName, <<"_", _/binary>> = Bad}) -> Msg = ["`", Bad, "` is not a supported reduce function."], throw({invalid_design_doc, Msg}); diff --git a/src/fabric/src/fabric_view.erl b/src/fabric/src/fabric_view.erl index dd0fcfd8b..4d8d0e987 100644 --- a/src/fabric/src/fabric_view.erl +++ b/src/fabric/src/fabric_view.erl @@ -230,8 +230,9 @@ get_next_row(#collector{reducer = RedSrc} = St) when RedSrc =/= undefined -> end, Counters0, Records), Wrapped = [[V] || #view_row{value=V} <- Records], {ok, [Reduced]} = couch_query_servers:rereduce(Lang, [RedSrc], Wrapped), + {ok, [Finalized]} = couch_query_servers:finalize([Reduced]), NewSt = St#collector{keys=RestKeys, rows=NewRowDict, counters=Counters}, - {#view_row{key=Key, id=reduced, value=Reduced}, NewSt}; + {#view_row{key=Key, id=reduced, value=Finalized}, NewSt}; error -> get_next_row(St#collector{keys=RestKeys}) end; diff --git a/test/javascript/tests/reduce_builtin.js b/test/javascript/tests/reduce_builtin.js index 9c455e4e6..4686841e3 100644 --- a/test/javascript/tests/reduce_builtin.js +++ b/test/javascript/tests/reduce_builtin.js @@ -37,6 +37,12 @@ couchTests.reduce_builtin = function(debug) { emit(doc.integer, doc.integer); }; + var check_approx_distinct = function(expected, estimated) { + // see https://en.wikipedia.org/wiki/HyperLogLog + var err = 1.04 / Math.sqrt(Math.pow(2, 11 - 1)); + return Math.abs(expected - estimated) < expected * err; + }; + var result = db.query(map, "_sum"); T(result.rows[0].value == 2*summate(numDocs)); result = db.query(map, "_count"); @@ -47,27 
+53,41 @@ couchTests.reduce_builtin = function(debug) { T(result.rows[0].value.min == 1); T(result.rows[0].value.max == 500); T(result.rows[0].value.sumsqr == 2*sumsqr(numDocs)); + result = db.query(map, "_approx_count_distinct"); + T(check_approx_distinct(numDocs, result.rows[0].value)); result = db.query(map, "_sum", {startkey: 4, endkey: 4}); T(result.rows[0].value == 8); result = db.query(map, "_count", {startkey: 4, endkey: 4}); T(result.rows[0].value == 2); + result = db.query(map, "_approx_count_distinct", {startkey:4, endkey:4}); + T(check_approx_distinct(1, result.rows[0].value)); result = db.query(map, "_sum", {startkey: 4, endkey: 5}); T(result.rows[0].value == 18); result = db.query(map, "_count", {startkey: 4, endkey: 5}); T(result.rows[0].value == 4); + result = db.query(map, "_approx_count_distinct", {startkey:4, endkey:5}); + T(check_approx_distinct(2, result.rows[0].value)); + result = db.query(map, "_sum", {startkey: 4, endkey: 6}); T(result.rows[0].value == 30); result = db.query(map, "_count", {startkey: 4, endkey: 6}); T(result.rows[0].value == 6); + result = db.query(map, "_approx_count_distinct", {startkey: 4, endkey: 6}); + T(check_approx_distinct(3, result.rows[0].value)); result = db.query(map, "_sum", {group:true, limit:3}); T(result.rows[0].value == 2); T(result.rows[1].value == 4); T(result.rows[2].value == 6); + result = db.query(map, "_approx_count_distinct", {group:true, limit:3}); + T(check_approx_distinct(1, result.rows[0].value)); + T(check_approx_distinct(1, result.rows[1].value)); + T(check_approx_distinct(1, result.rows[2].value)); + for(var i=1; i<numDocs/2; i+=30) { result = db.query(map, "_sum", {startkey: i, endkey: numDocs - i}); T(result.rows[0].value == 2*(summate(numDocs-i) - summate(i-1))); |