summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Adam Kocoloski <kocolosk@apache.org> 2018-06-05 21:45:31 -0400
committer: GitHub <noreply@github.com> 2018-06-05 21:45:31 -0400
commit: 6d44e17fccc44c377476247d9765fc573154097f (patch)
tree: dc9021c5443a65be1a258eff0854d6dcbddeceba
parent: 5290a32b395cf66d4bc1fa1b417a299b1006151b (diff)
download: couchdb-6d44e17fccc44c377476247d9765fc573154097f.tar.gz
Add _approx_count_distinct as a builtin reduce function (#1346)
This introduces a new builtin reduce function, which uses a HyperLogLog algorithm to estimate the number of distinct keys in the view index. The precision is currently fixed to 2^11 observables and therefore uses approximately 1.5 KB of memory. It also introduces a finalize step which can be used to improve the efficiency of other builtin reduce functions going forward. Closes COUCHDB-2971.
-rw-r--r-- .gitignore 1
-rw-r--r-- LICENSE 24
-rw-r--r-- NOTICE 4
-rw-r--r-- rebar.config.script 1
-rw-r--r-- rel/reltool.config 2
-rw-r--r-- src/couch/src/couch_query_servers.erl 23
-rw-r--r-- src/couch_mrview/src/couch_mrview.erl 2
-rw-r--r-- src/fabric/src/fabric_view.erl 3
-rw-r--r-- test/javascript/tests/reduce_builtin.js 20
9 files changed, 78 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index faa07f983..a1cba1e0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@ src/oauth/
src/rebar/
src/snappy/
src/triq/
+src/hyper/
tmp/
src/couch/*.o
diff --git a/LICENSE b/LICENSE
index a209352a0..6034c717e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2274,3 +2274,27 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
+
+For the src/hyper component:
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Game Analytics ApS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/NOTICE b/NOTICE
index c04033897..481e75513 100644
--- a/NOTICE
+++ b/NOTICE
@@ -184,3 +184,7 @@ This product also includes the following third-party components:
1997 Niels Provos <provos@physnet.uni-hamburg.de>
- The asynchronous queue code (c_src/async_queue.c and c_src/async_queue.h)
is from the esnappy project, copyright 2011 Konstantin V. Sorokin.
+
+* hyper
+
+ Copyright (c) 2014 Game Analytics ApS
diff --git a/rebar.config.script b/rebar.config.script
index 0cbc21faf..a3462c990 100644
--- a/rebar.config.script
+++ b/rebar.config.script
@@ -61,6 +61,7 @@ DepDescs = [
{tag, "v1.1.15"}, [raw]},
%% Third party deps
{folsom, "folsom", {tag, "CouchDB-0.8.2"}},
+{hyper, "hyper", {tag, "CouchDB-2.2.0-3"}},
{ibrowse, "ibrowse", {tag, "CouchDB-4.0.1"}},
{jiffy, "jiffy", {tag, "CouchDB-0.14.11-2"}},
{mochiweb, "mochiweb", {tag, "v2.17.0"}},
diff --git a/rel/reltool.config b/rel/reltool.config
index aa3100647..5e86d9643 100644
--- a/rel/reltool.config
+++ b/rel/reltool.config
@@ -47,6 +47,7 @@
fabric,
folsom,
global_changes,
+ hyper,
ibrowse,
ioq,
jiffy,
@@ -101,6 +102,7 @@
{app, fabric, [{incl_cond, include}]},
{app, folsom, [{incl_cond, include}]},
{app, global_changes, [{incl_cond, include}]},
+ {app, hyper, [{incl_cond, include}]},
{app, ibrowse, [{incl_cond, include}]},
{app, ioq, [{incl_cond, include}]},
{app, jiffy, [{incl_cond, include}]},
diff --git a/src/couch/src/couch_query_servers.erl b/src/couch/src/couch_query_servers.erl
index f31d24c6c..02d90f195 100644
--- a/src/couch/src/couch_query_servers.erl
+++ b/src/couch/src/couch_query_servers.erl
@@ -17,6 +17,7 @@
-export([reduce/3, rereduce/3,validate_doc_update/5]).
-export([filter_docs/5]).
-export([filter_view/3]).
+-export([finalize/1]).
-export([rewrite/3]).
-export([with_ddoc_proc/2, proc_prompt/2, ddoc_prompt/3, ddoc_proc_prompt/3, json_doc/1]).
@@ -86,6 +87,16 @@ group_reductions_results(List) ->
[Heads | group_reductions_results(Tails)]
end.
+finalize(Reductions) ->
+ {ok, lists:map(fun(Reduction) ->
+ case hyper:is_hyper(Reduction) of
+ true ->
+ round(hyper:card(Reduction));
+ false ->
+ Reduction
+ end
+ end, Reductions)}.
+
rereduce(_Lang, [], _ReducedValues) ->
{ok, []};
rereduce(Lang, RedSrcs, ReducedValues) ->
@@ -171,7 +182,10 @@ builtin_reduce(rereduce, [<<"_count",_/binary>>|BuiltinReds], KVs, Acc) ->
builtin_reduce(rereduce, BuiltinReds, KVs, [Count|Acc]);
builtin_reduce(Re, [<<"_stats",_/binary>>|BuiltinReds], KVs, Acc) ->
Stats = builtin_stats(Re, KVs),
- builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]).
+ builtin_reduce(Re, BuiltinReds, KVs, [Stats|Acc]);
+builtin_reduce(Re, [<<"_approx_count_distinct",_/binary>>|BuiltinReds], KVs, Acc) ->
+ Distinct = approx_count_distinct(Re, KVs),
+ builtin_reduce(Re, BuiltinReds, KVs, [Distinct|Acc]).
builtin_sum_rows([], Acc) ->
@@ -303,6 +317,13 @@ get_number(Key, Props) ->
throw({invalid_value, iolist_to_binary(Msg)})
end.
+% TODO allow customization of precision in the ddoc.
+approx_count_distinct(reduce, KVs) ->
+ lists:foldl(fun([[Key, _Id], _Value], Filter) ->
+ hyper:insert(term_to_binary(Key), Filter)
+ end, hyper:new(11), KVs);
+approx_count_distinct(rereduce, Reds) ->
+ hyper:union([Filter || [_, Filter] <- Reds]).
% use the function stored in ddoc.validate_doc_update to test an update.
-spec validate_doc_update(DDoc, EditDoc, DiskDoc, Ctx, SecObj) -> ok when
diff --git a/src/couch_mrview/src/couch_mrview.erl b/src/couch_mrview/src/couch_mrview.erl
index a099f377e..b417aac52 100644
--- a/src/couch_mrview/src/couch_mrview.erl
+++ b/src/couch_mrview/src/couch_mrview.erl
@@ -184,6 +184,8 @@ validate(DbName, DDoc) ->
ok;
({_RedName, <<"_stats", _/binary>>}) ->
ok;
+ ({_RedName, <<"_approx_count_distinct", _/binary>>}) ->
+ ok;
({_RedName, <<"_", _/binary>> = Bad}) ->
Msg = ["`", Bad, "` is not a supported reduce function."],
throw({invalid_design_doc, Msg});
diff --git a/src/fabric/src/fabric_view.erl b/src/fabric/src/fabric_view.erl
index dd0fcfd8b..4d8d0e987 100644
--- a/src/fabric/src/fabric_view.erl
+++ b/src/fabric/src/fabric_view.erl
@@ -230,8 +230,9 @@ get_next_row(#collector{reducer = RedSrc} = St) when RedSrc =/= undefined ->
end, Counters0, Records),
Wrapped = [[V] || #view_row{value=V} <- Records],
{ok, [Reduced]} = couch_query_servers:rereduce(Lang, [RedSrc], Wrapped),
+ {ok, [Finalized]} = couch_query_servers:finalize([Reduced]),
NewSt = St#collector{keys=RestKeys, rows=NewRowDict, counters=Counters},
- {#view_row{key=Key, id=reduced, value=Reduced}, NewSt};
+ {#view_row{key=Key, id=reduced, value=Finalized}, NewSt};
error ->
get_next_row(St#collector{keys=RestKeys})
end;
diff --git a/test/javascript/tests/reduce_builtin.js b/test/javascript/tests/reduce_builtin.js
index 9c455e4e6..4686841e3 100644
--- a/test/javascript/tests/reduce_builtin.js
+++ b/test/javascript/tests/reduce_builtin.js
@@ -37,6 +37,12 @@ couchTests.reduce_builtin = function(debug) {
emit(doc.integer, doc.integer);
};
+ var check_approx_distinct = function(expected, estimated) {
+ // see https://en.wikipedia.org/wiki/HyperLogLog
+ var err = 1.04 / Math.sqrt(Math.pow(2, 11 - 1));
+ return Math.abs(expected - estimated) < expected * err;
+ };
+
var result = db.query(map, "_sum");
T(result.rows[0].value == 2*summate(numDocs));
result = db.query(map, "_count");
@@ -47,27 +53,41 @@ couchTests.reduce_builtin = function(debug) {
T(result.rows[0].value.min == 1);
T(result.rows[0].value.max == 500);
T(result.rows[0].value.sumsqr == 2*sumsqr(numDocs));
+ result = db.query(map, "_approx_count_distinct");
+ T(check_approx_distinct(numDocs, result.rows[0].value));
result = db.query(map, "_sum", {startkey: 4, endkey: 4});
T(result.rows[0].value == 8);
result = db.query(map, "_count", {startkey: 4, endkey: 4});
T(result.rows[0].value == 2);
+ result = db.query(map, "_approx_count_distinct", {startkey:4, endkey:4});
+ T(check_approx_distinct(1, result.rows[0].value));
result = db.query(map, "_sum", {startkey: 4, endkey: 5});
T(result.rows[0].value == 18);
result = db.query(map, "_count", {startkey: 4, endkey: 5});
T(result.rows[0].value == 4);
+ result = db.query(map, "_approx_count_distinct", {startkey:4, endkey:5});
+ T(check_approx_distinct(2, result.rows[0].value));
+
result = db.query(map, "_sum", {startkey: 4, endkey: 6});
T(result.rows[0].value == 30);
result = db.query(map, "_count", {startkey: 4, endkey: 6});
T(result.rows[0].value == 6);
+ result = db.query(map, "_approx_count_distinct", {startkey: 4, endkey: 6});
+ T(check_approx_distinct(3, result.rows[0].value));
result = db.query(map, "_sum", {group:true, limit:3});
T(result.rows[0].value == 2);
T(result.rows[1].value == 4);
T(result.rows[2].value == 6);
+ result = db.query(map, "_approx_count_distinct", {group:true, limit:3});
+ T(check_approx_distinct(1, result.rows[0].value));
+ T(check_approx_distinct(1, result.rows[1].value));
+ T(check_approx_distinct(1, result.rows[2].value));
+
for(var i=1; i<numDocs/2; i+=30) {
result = db.query(map, "_sum", {startkey: i, endkey: numDocs - i});
T(result.rows[0].value == 2*(summate(numDocs-i) - summate(i-1)));