From b1474e5c22fd2106c1e7c6493052e8fbc450e289 Mon Sep 17 00:00:00 2001
From: Anton Korshunov
Date: Mon, 28 Nov 2022 14:19:20 +0000
Subject: SERVER-71051 Make CE module less dependent on statistics module

---
 src/mongo/db/SConscript | 3 +-
 src/mongo/db/commands/SConscript | 2 +-
 src/mongo/db/commands/analyze_cmd.cpp | 4 +-
 src/mongo/db/exec/sbe/SConscript | 3 +-
 src/mongo/db/mongod_main.cpp | 10 +-
 src/mongo/db/pipeline/SConscript | 4 +-
 src/mongo/db/pipeline/accumulator.h | 4 +-
 .../accumulator_internal_construct_stats.cpp | 8 +-
 src/mongo/db/query/SConscript | 1 +
 src/mongo/db/query/ce/SConscript | 184 +---
 src/mongo/db/query/ce/array_histogram.cpp | 214 ----
 src/mongo/db/query/ce/array_histogram.h | 148 ---
 src/mongo/db/query/ce/ce_array_data_test.cpp | 295 -----
 src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp | 227 ----
 src/mongo/db/query/ce/ce_edge_cases_test.cpp | 1002 -----------------
 .../db/query/ce/ce_generated_histograms_test.cpp | 363 ------
 src/mongo/db/query/ce/ce_heuristic.cpp | 611 ----------
 src/mongo/db/query/ce/ce_heuristic.h | 49 -
 src/mongo/db/query/ce/ce_heuristic_test.cpp | 1009 -----------------
 src/mongo/db/query/ce/ce_hinted.cpp | 108 --
 src/mongo/db/query/ce/ce_hinted.h | 58 -
 src/mongo/db/query/ce/ce_histogram.cpp | 289 -----
 src/mongo/db/query/ce/ce_histogram.h | 54 -
 src/mongo/db/query/ce/ce_histogram_test.cpp | 1156 -------------------
 src/mongo/db/query/ce/ce_interpolation_test.cpp | 505 ---------
 src/mongo/db/query/ce/ce_sampling.cpp | 362 ------
 src/mongo/db/query/ce/ce_sampling.h | 56 -
 src/mongo/db/query/ce/ce_test_utils.cpp | 216 ----
 src/mongo/db/query/ce/ce_test_utils.h | 250 -----
 src/mongo/db/query/ce/collection_statistics.h | 60 -
 .../db/query/ce/collection_statistics_impl.cpp | 71 --
 src/mongo/db/query/ce/collection_statistics_impl.h | 67 --
 .../db/query/ce/collection_statistics_mock.cpp | 53 -
 src/mongo/db/query/ce/collection_statistics_mock.h | 64 --
 .../db/query/ce/generated_histograms_test.cpp | 366 ++++++
 .../db/query/ce/heuristic_dataflow_nodes_test.cpp | 221 ++++
 src/mongo/db/query/ce/heuristic_estimator.cpp | 600 ++++++++++
 src/mongo/db/query/ce/heuristic_estimator.h | 49 +
 src/mongo/db/query/ce/heuristic_estimator_test.cpp | 978 +++++++++++++++++
 src/mongo/db/query/ce/hinted_estimator.cpp | 100 ++
 src/mongo/db/query/ce/hinted_estimator.h | 57 +
 .../db/query/ce/histogram_array_data_test.cpp | 298 +++++
 .../db/query/ce/histogram_edge_cases_test.cpp | 1007 +++++++++++++++++
 src/mongo/db/query/ce/histogram_estimation.cpp | 488 --------
 src/mongo/db/query/ce/histogram_estimation.h | 106 --
 src/mongo/db/query/ce/histogram_estimator.cpp | 272 +++++
 src/mongo/db/query/ce/histogram_estimator.h | 54 +
 src/mongo/db/query/ce/histogram_estimator_test.cpp | 1161 ++++++++++++++++++++
 .../db/query/ce/histogram_interpolation_test.cpp | 508 +++++++++
 .../db/query/ce/histogram_predicate_estimation.cpp | 496 +++++++++
 .../db/query/ce/histogram_predicate_estimation.h | 106 ++
 src/mongo/db/query/ce/max_diff.cpp | 376 -------
 src/mongo/db/query/ce/max_diff.h | 82 --
 src/mongo/db/query/ce/maxdiff_histogram_test.cpp | 34 +-
 src/mongo/db/query/ce/maxdiff_test_utils.cpp | 120 --
 src/mongo/db/query/ce/maxdiff_test_utils.h | 76 --
 src/mongo/db/query/ce/rand_utils.cpp | 391 -------
 src/mongo/db/query/ce/rand_utils.h | 191 ----
 src/mongo/db/query/ce/rand_utils_new.cpp | 249 -----
 src/mongo/db/query/ce/rand_utils_new.h | 354 ------
 src/mongo/db/query/ce/sampling_estimator.cpp | 341 ++++++
 src/mongo/db/query/ce/sampling_estimator.h | 56 +
 src/mongo/db/query/ce/scalar_histogram.cpp | 194 ----
 src/mongo/db/query/ce/scalar_histogram.h | 120 --
 src/mongo/db/query/ce/stats.idl | 102 --
 src/mongo/db/query/ce/stats_cache.cpp | 82 --
 src/mongo/db/query/ce/stats_cache.h | 84 --
 src/mongo/db/query/ce/stats_cache_loader.h | 61 -
 src/mongo/db/query/ce/stats_cache_loader_impl.cpp | 86 --
 src/mongo/db/query/ce/stats_cache_loader_impl.h | 47 -
 src/mongo/db/query/ce/stats_cache_loader_mock.cpp | 53 -
 src/mongo/db/query/ce/stats_cache_loader_mock.h | 54 -
 src/mongo/db/query/ce/stats_cache_loader_test.cpp | 116 --
 .../query/ce/stats_cache_loader_test_fixture.cpp | 76 --
 .../db/query/ce/stats_cache_loader_test_fixture.h | 60 -
 src/mongo/db/query/ce/stats_cache_test.cpp | 133 ---
 src/mongo/db/query/ce/stats_catalog.cpp | 115 --
 src/mongo/db/query/ce/stats_catalog.h | 80 --
 src/mongo/db/query/ce/stats_path_test.cpp | 131 ---
 src/mongo/db/query/ce/test_utils.cpp | 214 ++++
 src/mongo/db/query/ce/test_utils.h | 231 ++++
 src/mongo/db/query/ce/value_utils.cpp | 254 -----
 src/mongo/db/query/ce/value_utils.h | 123 ---
 src/mongo/db/query/ce_mode_parameter.cpp | 4 +-
 src/mongo/db/query/ce_mode_parameter.h | 4 +-
 src/mongo/db/query/ce_mode_parameter_test.cpp | 4 +-
 src/mongo/db/query/cost_model/SConscript | 2 +-
 src/mongo/db/query/cost_model/cost_estimator.cpp | 418 -------
 src/mongo/db/query/cost_model/cost_estimator.h | 56 -
 .../db/query/cost_model/cost_estimator_impl.cpp | 418 +++++++
 .../db/query/cost_model/cost_estimator_impl.h | 56 +
 .../db/query/cost_model/cost_estimator_test.cpp | 12 +-
 src/mongo/db/query/cqf_get_executor.cpp | 49 +-
 src/mongo/db/query/optimizer/cascades/interfaces.h | 8 +-
 .../query/optimizer/cascades/logical_rewriter.cpp | 6 +-
 .../db/query/optimizer/cascades/logical_rewriter.h | 4 +-
 src/mongo/db/query/optimizer/cascades/memo.cpp | 13 +-
 src/mongo/db/query/optimizer/cascades/memo.h | 4 +-
 .../query/optimizer/cascades/physical_rewriter.cpp | 6 +-
 .../query/optimizer/cascades/physical_rewriter.h | 4 +-
 src/mongo/db/query/optimizer/opt_phase_manager.cpp | 12 +-
 src/mongo/db/query/optimizer/opt_phase_manager.h | 12 +-
 src/mongo/db/query/optimizer/utils/ce_math.cpp | 5 +-
 src/mongo/db/query/optimizer/utils/ce_math.h | 6 +-
 .../db/query/optimizer/utils/unit_test_utils.cpp | 32 +-
 .../db/query/optimizer/utils/unit_test_utils.h | 10 +-
 src/mongo/db/query/query_knobs.idl | 2 +-
 src/mongo/db/query/stats/SConscript | 123 +++
 src/mongo/db/query/stats/array_histogram.cpp | 209 ++++
 src/mongo/db/query/stats/array_histogram.h | 142 +++
 src/mongo/db/query/stats/collection_statistics.h | 60 +
 .../db/query/stats/collection_statistics_impl.cpp | 72 ++
 .../db/query/stats/collection_statistics_impl.h | 67 ++
 .../db/query/stats/collection_statistics_mock.cpp | 53 +
 .../db/query/stats/collection_statistics_mock.h | 64 ++
 src/mongo/db/query/stats/max_diff.cpp | 378 +++++++
 src/mongo/db/query/stats/max_diff.h | 82 ++
 src/mongo/db/query/stats/maxdiff_test_utils.cpp | 120 ++
 src/mongo/db/query/stats/maxdiff_test_utils.h | 74 ++
 src/mongo/db/query/stats/rand_utils.cpp | 392 +++++++
 src/mongo/db/query/stats/rand_utils.h | 188 ++++
 src/mongo/db/query/stats/rand_utils_new.cpp | 250 +++++
 src/mongo/db/query/stats/rand_utils_new.h | 353 ++++++
 src/mongo/db/query/stats/scalar_histogram.cpp | 192 ++++
 src/mongo/db/query/stats/scalar_histogram.h | 120 ++
 src/mongo/db/query/stats/stats.idl | 102 ++
 src/mongo/db/query/stats/stats_cache.cpp | 74 ++
 src/mongo/db/query/stats/stats_cache.h | 81 ++
 src/mongo/db/query/stats/stats_cache_loader.h | 58 +
 .../db/query/stats/stats_cache_loader_impl.cpp | 82 ++
 src/mongo/db/query/stats/stats_cache_loader_impl.h | 45 +
 .../db/query/stats/stats_cache_loader_mock.cpp | 50 +
 src/mongo/db/query/stats/stats_cache_loader_mock.h | 52 +
 .../db/query/stats/stats_cache_loader_test.cpp | 116 ++
 .../stats/stats_cache_loader_test_fixture.cpp | 74 ++
 .../query/stats/stats_cache_loader_test_fixture.h | 60 +
 src/mongo/db/query/stats/stats_cache_test.cpp | 131 +++
 src/mongo/db/query/stats/stats_catalog.cpp | 108 ++
 src/mongo/db/query/stats/stats_catalog.h | 77 ++
 src/mongo/db/query/stats/stats_path_test.cpp | 129 +++
 src/mongo/db/query/stats/value_utils.cpp | 252 +++++
 src/mongo/db/query/stats/value_utils.h | 120 ++
 142 files changed, 12315 insertions(+), 12445 deletions(-)
 delete mode 100644 src/mongo/db/query/ce/array_histogram.cpp
 delete mode 100644 src/mongo/db/query/ce/array_histogram.h
 delete mode 100644 src/mongo/db/query/ce/ce_array_data_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_edge_cases_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_generated_histograms_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_heuristic.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_heuristic.h
 delete mode 100644 src/mongo/db/query/ce/ce_heuristic_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_hinted.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_hinted.h
 delete mode 100644 src/mongo/db/query/ce/ce_histogram.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_histogram.h
 delete mode 100644 src/mongo/db/query/ce/ce_histogram_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_interpolation_test.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_sampling.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_sampling.h
 delete mode 100644 src/mongo/db/query/ce/ce_test_utils.cpp
 delete mode 100644 src/mongo/db/query/ce/ce_test_utils.h
 delete mode 100644 src/mongo/db/query/ce/collection_statistics.h
 delete mode 100644 src/mongo/db/query/ce/collection_statistics_impl.cpp
 delete mode 100644 src/mongo/db/query/ce/collection_statistics_impl.h
 delete mode 100644 src/mongo/db/query/ce/collection_statistics_mock.cpp
 delete mode 100644 src/mongo/db/query/ce/collection_statistics_mock.h
 create mode 100644 src/mongo/db/query/ce/generated_histograms_test.cpp
 create mode 100644 src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp
 create mode 100644 src/mongo/db/query/ce/heuristic_estimator.cpp
 create mode 100644 src/mongo/db/query/ce/heuristic_estimator.h
 create mode 100644 src/mongo/db/query/ce/heuristic_estimator_test.cpp
 create mode 100644 src/mongo/db/query/ce/hinted_estimator.cpp
 create mode 100644 src/mongo/db/query/ce/hinted_estimator.h
 create mode 100644 src/mongo/db/query/ce/histogram_array_data_test.cpp
 create mode 100644 src/mongo/db/query/ce/histogram_edge_cases_test.cpp
 delete mode 100644 src/mongo/db/query/ce/histogram_estimation.cpp
 delete mode 100644 src/mongo/db/query/ce/histogram_estimation.h
 create mode 100644 src/mongo/db/query/ce/histogram_estimator.cpp
 create mode 100644 src/mongo/db/query/ce/histogram_estimator.h
 create mode 100644 src/mongo/db/query/ce/histogram_estimator_test.cpp
 create mode 100644 src/mongo/db/query/ce/histogram_interpolation_test.cpp
 create mode 100644 src/mongo/db/query/ce/histogram_predicate_estimation.cpp
 create mode 100644 src/mongo/db/query/ce/histogram_predicate_estimation.h
 delete mode 100644 src/mongo/db/query/ce/max_diff.cpp
 delete mode 100644 src/mongo/db/query/ce/max_diff.h
 delete mode 100644 src/mongo/db/query/ce/maxdiff_test_utils.cpp
 delete mode 100644 src/mongo/db/query/ce/maxdiff_test_utils.h
 delete mode 100644 src/mongo/db/query/ce/rand_utils.cpp
 delete mode 100644 src/mongo/db/query/ce/rand_utils.h
 delete mode 100644 src/mongo/db/query/ce/rand_utils_new.cpp
 delete mode 100644 src/mongo/db/query/ce/rand_utils_new.h
 create mode 100644 src/mongo/db/query/ce/sampling_estimator.cpp
 create mode 100644 src/mongo/db/query/ce/sampling_estimator.h
 delete mode 100644 src/mongo/db/query/ce/scalar_histogram.cpp
 delete mode 100644 src/mongo/db/query/ce/scalar_histogram.h
 delete mode 100644 src/mongo/db/query/ce/stats.idl
 delete mode 100644 src/mongo/db/query/ce/stats_cache.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_cache.h
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader.h
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_impl.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_impl.h
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_mock.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_mock.h
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_test.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_test_fixture.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_cache_loader_test_fixture.h
 delete mode 100644 src/mongo/db/query/ce/stats_cache_test.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_catalog.cpp
 delete mode 100644 src/mongo/db/query/ce/stats_catalog.h
 delete mode 100644 src/mongo/db/query/ce/stats_path_test.cpp
 create mode 100644 src/mongo/db/query/ce/test_utils.cpp
 create mode 100644 src/mongo/db/query/ce/test_utils.h
 delete mode 100644 src/mongo/db/query/ce/value_utils.cpp
 delete mode 100644 src/mongo/db/query/ce/value_utils.h
 delete mode 100644 src/mongo/db/query/cost_model/cost_estimator.cpp
 delete mode 100644 src/mongo/db/query/cost_model/cost_estimator.h
 create mode 100644 src/mongo/db/query/cost_model/cost_estimator_impl.cpp
 create mode 100644 src/mongo/db/query/cost_model/cost_estimator_impl.h
 create mode 100644 src/mongo/db/query/stats/SConscript
 create mode 100644 src/mongo/db/query/stats/array_histogram.cpp
 create mode 100644 src/mongo/db/query/stats/array_histogram.h
 create mode 100644 src/mongo/db/query/stats/collection_statistics.h
 create mode 100644 src/mongo/db/query/stats/collection_statistics_impl.cpp
 create mode 100644 src/mongo/db/query/stats/collection_statistics_impl.h
 create mode 100644 src/mongo/db/query/stats/collection_statistics_mock.cpp
 create mode 100644 src/mongo/db/query/stats/collection_statistics_mock.h
 create mode 100644 src/mongo/db/query/stats/max_diff.cpp
 create mode 100644 src/mongo/db/query/stats/max_diff.h
 create mode 100644 src/mongo/db/query/stats/maxdiff_test_utils.cpp
 create mode 100644 src/mongo/db/query/stats/maxdiff_test_utils.h
 create mode 100644 src/mongo/db/query/stats/rand_utils.cpp
 create mode 100644 src/mongo/db/query/stats/rand_utils.h
 create mode 100644 src/mongo/db/query/stats/rand_utils_new.cpp
 create mode 100644 src/mongo/db/query/stats/rand_utils_new.h
 create mode 100644 src/mongo/db/query/stats/scalar_histogram.cpp
 create mode 100644 src/mongo/db/query/stats/scalar_histogram.h
 create mode 100644 src/mongo/db/query/stats/stats.idl
 create mode 100644 src/mongo/db/query/stats/stats_cache.cpp
 create mode 100644 src/mongo/db/query/stats/stats_cache.h
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader.h
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_impl.cpp
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_impl.h
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_mock.cpp
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_mock.h
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_test.cpp
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_test_fixture.cpp
 create mode 100644 src/mongo/db/query/stats/stats_cache_loader_test_fixture.h
 create mode 100644 src/mongo/db/query/stats/stats_cache_test.cpp
 create mode 100644 src/mongo/db/query/stats/stats_catalog.cpp
 create mode 100644 src/mongo/db/query/stats/stats_catalog.h
 create mode 100644 src/mongo/db/query/stats/stats_path_test.cpp
 create mode 100644 src/mongo/db/query/stats/value_utils.cpp
 create mode 100644 src/mongo/db/query/stats/value_utils.h

diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index acfed69144a..a33c66d3cdf 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -1520,6 +1520,7 @@ env.Library(
         '$BUILD_DIR/mongo/db/concurrency/exception_util',
         '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_abt',
         '$BUILD_DIR/mongo/db/internal_transactions_feature_flag',
+        '$BUILD_DIR/mongo/db/query/ce/query_ce_heuristic',
         '$BUILD_DIR/mongo/db/query/ce/query_ce_histogram',
         '$BUILD_DIR/mongo/db/query/ce/query_ce_sampling',
         '$BUILD_DIR/mongo/db/query/optimizer/optimizer',
@@ -2330,7 +2331,7 @@ env.Library(
         '$BUILD_DIR/mongo/db/change_stream_options_manager',
         '$BUILD_DIR/mongo/db/change_streams_cluster_parameter',
         '$BUILD_DIR/mongo/db/pipeline/change_stream_expired_pre_image_remover',
-        '$BUILD_DIR/mongo/db/query/ce/query_ce_histogram',
+        '$BUILD_DIR/mongo/db/query/stats/query_stats',
         '$BUILD_DIR/mongo/db/s/query_analysis_writer',
         '$BUILD_DIR/mongo/db/set_change_stream_state_coordinator',
         '$BUILD_DIR/mongo/idl/cluster_server_parameter',
diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript
index 3dd709d03e7..d18125f0da1 100644
--- a/src/mongo/db/commands/SConscript
+++ b/src/mongo/db/commands/SConscript
@@ -360,9 +360,9 @@ env.Library(
         '$BUILD_DIR/mongo/db/ops/write_ops_exec',
         '$BUILD_DIR/mongo/db/pipeline/aggregation_request_helper',
         '$BUILD_DIR/mongo/db/pipeline/process_interface/mongo_process_interface',
-        '$BUILD_DIR/mongo/db/query/ce/query_ce_histogram',
         '$BUILD_DIR/mongo/db/query/command_request_response',
         '$BUILD_DIR/mongo/db/query/cursor_response_idl',
+        '$BUILD_DIR/mongo/db/query/stats/query_stats',
         '$BUILD_DIR/mongo/db/query/telemetry',
         '$BUILD_DIR/mongo/db/query_exec',
         '$BUILD_DIR/mongo/db/repl/replica_set_messages',
diff --git a/src/mongo/db/commands/analyze_cmd.cpp b/src/mongo/db/commands/analyze_cmd.cpp
index d47a3b55b41..d6a31329f1c 100644
--- a/src/mongo/db/commands/analyze_cmd.cpp
+++ b/src/mongo/db/commands/analyze_cmd.cpp
@@ -38,8 +38,8 @@
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/query/allowed_contexts.h"
 #include "mongo/db/query/analyze_command_gen.h"
-#include "mongo/db/query/ce/stats_catalog.h"
 #include "mongo/db/query/query_feature_flags_gen.h"
+#include "mongo/db/query/stats/stats_catalog.h"
 #include "mongo/rpc/get_status_from_command_result.h"

 namespace mongo {
@@ -195,7 +195,7 @@ public:
             uassertStatusOK(getStatusFromCommandResult(analyzeResult));

             // Invalidate statistics in the cache for the analyzed path
-            StatsCatalog& statsCatalog = StatsCatalog::get(opCtx);
+            stats::StatsCatalog& statsCatalog = stats::StatsCatalog::get(opCtx);
             uassertStatusOK(statsCatalog.invalidatePath(nss, key->toString()));

         } else if (sampleSize || sampleRate) {
diff --git a/src/mongo/db/exec/sbe/SConscript b/src/mongo/db/exec/sbe/SConscript
index fc66fc3519e..bf2d6170cf1 100644
--- a/src/mongo/db/exec/sbe/SConscript
+++ b/src/mongo/db/exec/sbe/SConscript
@@ -137,8 +137,7 @@ env.Library(
         'abt/abt_lower.cpp',
     ],
     LIBDEPS=[
-        '$BUILD_DIR/mongo/db/query/ce/query_ce_heuristic',
-        '$BUILD_DIR/mongo/db/query/optimizer/optimizer',
+        '$BUILD_DIR/mongo/db/query/optimizer/optimizer_base',
         'query_sbe',
         'query_sbe_stages',
         'query_sbe_storage',
diff --git a/src/mongo/db/mongod_main.cpp b/src/mongo/db/mongod_main.cpp
index a23b35f911a..a018c618cbf 100644
--- a/src/mongo/db/mongod_main.cpp
+++ b/src/mongo/db/mongod_main.cpp
@@ -107,9 +107,9 @@
 #include "mongo/db/periodic_runner_job_abort_expired_transactions.h"
 #include "mongo/db/pipeline/change_stream_expired_pre_image_remover.h"
 #include "mongo/db/pipeline/process_interface/replica_set_node_process_interface.h"
-#include "mongo/db/query/ce/stats_cache_loader_impl.h"
-#include "mongo/db/query/ce/stats_catalog.h"
 #include "mongo/db/query/internal_plans.h"
+#include "mongo/db/query/stats/stats_cache_loader_impl.h"
+#include "mongo/db/query/stats/stats_catalog.h"
 #include "mongo/db/read_write_concern_defaults_cache_lookup_mongod.h"
 #include "mongo/db/repl/drop_pending_collection_reaper.h"
 #include "mongo/db/repl/initial_syncer_factory.h"
@@ -855,9 +855,9 @@ ExitCode _initAndListen(ServiceContext* serviceContext, int listenPort) {

     LogicalSessionCache::set(serviceContext, makeLogicalSessionCacheD(kind));

-    auto cacheLoader = std::make_unique<StatsCacheLoaderImpl>();
-    auto catalog = std::make_unique<StatsCatalog>(serviceContext, std::move(cacheLoader));
-    StatsCatalog::set(serviceContext, std::move(catalog));
+    auto cacheLoader = std::make_unique<stats::StatsCacheLoaderImpl>();
+    auto catalog = std::make_unique<stats::StatsCatalog>(serviceContext, std::move(cacheLoader));
+    stats::StatsCatalog::set(serviceContext, std::move(catalog));

     if (analyze_shard_key::supportsPersistingSampledQueriesIgnoreFCV()) {
         analyze_shard_key::QueryAnalysisWriter::get(serviceContext).onStartup();
diff --git a/src/mongo/db/pipeline/SConscript b/src/mongo/db/pipeline/SConscript
index 422eef81f39..b4843172e39 100644
--- a/src/mongo/db/pipeline/SConscript
+++ b/src/mongo/db/pipeline/SConscript
@@ -133,8 +133,8 @@ env.Library(
     ],
     LIBDEPS=[
         '$BUILD_DIR/mongo/db/exec/document_value/document_value',
-        '$BUILD_DIR/mongo/db/query/ce/query_stats',
         '$BUILD_DIR/mongo/db/query/query_knobs',
+        '$BUILD_DIR/mongo/db/query/stats/query_stats',
         '$BUILD_DIR/mongo/db/query_expressions',
         '$BUILD_DIR/mongo/scripting/scripting_common',
         '$BUILD_DIR/mongo/util/summation',
@@ -334,13 +334,13 @@ pipelineEnv.Library(
         '$BUILD_DIR/mongo/db/index/key_generator',
         '$BUILD_DIR/mongo/db/pipeline/change_stream_error_extra_info',
         '$BUILD_DIR/mongo/db/pipeline/lite_parsed_document_source',
-        '$BUILD_DIR/mongo/db/query/ce/query_stats_gen',
         '$BUILD_DIR/mongo/db/query/collation/collator_factory_interface',
         '$BUILD_DIR/mongo/db/query/collation/collator_interface',
         '$BUILD_DIR/mongo/db/query/cursor_response_idl',
         '$BUILD_DIR/mongo/db/query/datetime/date_time_support',
         '$BUILD_DIR/mongo/db/query/query_knobs',
         '$BUILD_DIR/mongo/db/query/sort_pattern',
+        '$BUILD_DIR/mongo/db/query/stats/stats_gen',
         '$BUILD_DIR/mongo/db/query/telemetry',
         '$BUILD_DIR/mongo/db/query_expressions',
         '$BUILD_DIR/mongo/db/repl/apply_ops_command_info',
diff --git a/src/mongo/db/pipeline/accumulator.h b/src/mongo/db/pipeline/accumulator.h
index df070ff6531..e99b9db3ec1 100644
--- a/src/mongo/db/pipeline/accumulator.h
+++ b/src/mongo/db/pipeline/accumulator.h
@@ -43,7 +43,7 @@
 #include "mongo/db/exec/document_value/value_comparator.h"
 #include "mongo/db/pipeline/expression.h"
 #include "mongo/db/pipeline/expression_context.h"
-#include "mongo/db/query/ce/value_utils.h"
+#include "mongo/db/query/stats/value_utils.h"
 #include "mongo/stdx/unordered_set.h"
 #include "mongo/util/summation.h"

@@ -250,7 +250,7 @@ public:

 private:
     double _count;
-    std::vector<ce::SBEValue> _values;
+    std::vector<stats::SBEValue> _values;
 };

 class AccumulatorLast final : public AccumulatorState {
diff --git a/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp b/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp
index e5f2a5b0c70..bdcd67c6c24 100644
--- a/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp
+++ b/src/mongo/db/pipeline/accumulator_internal_construct_stats.cpp
@@ -35,8 +35,8 @@
 #include "mongo/db/pipeline/accumulator.h"
 #include "mongo/db/pipeline/expression_context.h"
 #include "mongo/db/query/allowed_contexts.h"
-#include "mongo/db/query/ce/max_diff.h"
-#include "mongo/db/query/ce/value_utils.h"
+#include "mongo/db/query/stats/max_diff.h"
+#include "mongo/db/query/stats/value_utils.h"
 #include "mongo/logv2/log.h"

@@ -71,7 +71,7 @@ void AccumulatorInternalConstructStats::processInternal(const Value& input, bool
     auto val = doc["val"];
     LOGV2_DEBUG(6735800, 4, "Extracted document", "val"_attr = val);

-    _values.emplace_back(ce::SBEValue(mongo::optimizer::convertFrom(val)));
+    _values.emplace_back(stats::SBEValue(mongo::optimizer::convertFrom(val)));
     _count++;

     _memUsageBytes = sizeof(*this);
@@ -81,7 +81,7 @@ Value AccumulatorInternalConstructStats::getValue(bool toBeMerged) {
     uassert(8423374, "Can not merge analyze pipelines", !toBeMerged);

     // Generate and serialize maxdiff histogram for scalar and array values.
-    auto arrayHistogram = ce::createArrayEstimator(_values, ce::ScalarHistogram::kMaxBuckets);
+    auto arrayHistogram = stats::createArrayEstimator(_values, stats::ScalarHistogram::kMaxBuckets);
     auto stats = stats::makeStatistics(_count, arrayHistogram);

     return Value(stats);
diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript
index 02890313b21..1d3e11ccc13 100644
--- a/src/mongo/db/query/SConscript
+++ b/src/mongo/db/query/SConscript
@@ -11,6 +11,7 @@ env.SConscript(
         'cost_model',
         'datetime',
         'optimizer',
+        'stats',
     ],
     exports=[
         'env',
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript
index d1e33484f03..c027c82f743 100644
--- a/src/mongo/db/query/ce/SConscript
+++ b/src/mongo/db/query/ce/SConscript
@@ -7,7 +7,7 @@ env = env.Clone()
 env.Library(
     target="query_ce_heuristic",
     source=[
-        'ce_heuristic.cpp',
+        'heuristic_estimator.cpp',
     ],
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/query/optimizer/optimizer_memo',
@@ -17,7 +17,7 @@ env.Library(
 env.Library(
     target="query_ce_hinted",
     source=[
-        'ce_hinted.cpp',
+        'hinted_estimator.cpp',
     ],
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/query/optimizer/optimizer_memo',
@@ -28,62 +28,32 @@ env.Library(
 env.Library(
     target="query_ce_histogram",
     source=[
-        'ce_histogram.cpp',
-        'collection_statistics_impl.cpp',
-        'histogram_estimation.cpp',
-        'stats_catalog.cpp',
-        'stats_cache.cpp',
-        'stats_cache_loader_impl.cpp',
+        'histogram_estimator.cpp',
+        'histogram_predicate_estimation.cpp',
     ],
     LIBDEPS_PRIVATE=[
-        '$BUILD_DIR/mongo/db/dbdirectclient',
-        '$BUILD_DIR/mongo/db/pipeline/pipeline',
+        '$BUILD_DIR/mongo/db/pipeline/abt_utils',
         '$BUILD_DIR/mongo/db/query/optimizer/optimizer_memo',
         '$BUILD_DIR/mongo/db/query/optimizer/optimizer_rewrites',
-        '$BUILD_DIR/mongo/util/caching',
-        '$BUILD_DIR/mongo/util/concurrency/thread_pool',
-        'query_stats',
+        '$BUILD_DIR/mongo/db/query/stats/stats_histograms',
     ],
 )

 env.Library(
     target="query_ce_sampling",
     source=[
-        'ce_sampling.cpp',
+        'sampling_estimator.cpp',
     ],
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_abt',
-        '$BUILD_DIR/mongo/db/query/optimizer/optimizer_memo',
-    ],
-)
-
-env.Library(
-    target="query_stats",
-    source=[
-        'array_histogram.cpp',
-        'scalar_histogram.cpp',
-        'stats.idl',
-        'value_utils.cpp',
-    ],
-    LIBDEPS=[
-        '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_values',
-    ],
-)
-
-env.Library(
-    target="query_stats_gen",
-    source=[
-        'max_diff.cpp',
-    ],
-    LIBDEPS=[
-        'query_stats',
+        '$BUILD_DIR/mongo/db/query/optimizer/optimizer',
     ],
 )

 env.Library(
-    target="ce_test_utils",
+    target="test_utils",
     source=[
-        'ce_test_utils.cpp',
+        'test_utils.cpp',
     ],
     LIBDEPS=[
         '$BUILD_DIR/mongo/base',
@@ -94,150 +64,79 @@ env.Library(
         'query_ce_heuristic',
         'query_ce_histogram',
         'query_ce_sampling',
-        'query_stats',
     ],
 )

 env.CppUnitTest(
-    target="ce_histogram_test",
+    target="histogram_estimator_test",
     source=[
-        "ce_histogram_test.cpp",
-        "collection_statistics_mock.cpp",
+        "histogram_estimator_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
+        '$BUILD_DIR/mongo/db/query/stats/stats_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target="ce_interpolation_test",
+    target="histogram_interpolation_test",
     source=[
-        "ce_interpolation_test.cpp",
+        "histogram_interpolation_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target="ce_heuristic_test",
+    target="heuristic_estimator_test",
     source=[
-        "ce_heuristic_test.cpp",
+        "heuristic_estimator_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target="ce_array_data_test",
+    target="histogram_array_data_test",
     source=[
-        "ce_array_data_test.cpp",
+        "histogram_array_data_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
+        '$BUILD_DIR/mongo/db/query/stats/stats_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target="ce_edge_cases_test",
+    target="histogram_edge_cases_test",
     source=[
-        "ce_edge_cases_test.cpp",
+        "histogram_edge_cases_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
-        'query_stats_test_utils',
+        '$BUILD_DIR/mongo/db/query/stats/stats_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target="ce_dataflow_nodes_test",
+    target="heuristic_dataflow_nodes_test",
     source=[
-        "ce_dataflow_nodes_test.cpp",
+        "heuristic_dataflow_nodes_test.cpp",
     ],
     LIBDEPS=[
-        'ce_test_utils',
+        'test_utils',
     ],
 )

 env.CppUnitTest(
-    target='stats_cache_loader_test',
+    target="generated_histograms_test",
     source=[
-        'stats_cache_loader_test.cpp',
-        'stats_cache_loader_test_fixture.cpp',
+        "generated_histograms_test.cpp",
     ],
     LIBDEPS=[
-        '$BUILD_DIR/mongo/db/auth/authmocks',
-        '$BUILD_DIR/mongo/db/catalog/collection_crud',
-        '$BUILD_DIR/mongo/db/commands/test_commands_enabled',
-        '$BUILD_DIR/mongo/db/index_builds_coordinator_mongod',
-        '$BUILD_DIR/mongo/db/multitenancy',
-        '$BUILD_DIR/mongo/db/op_observer/op_observer',
-        '$BUILD_DIR/mongo/db/op_observer/op_observer_impl',
-        '$BUILD_DIR/mongo/db/query/datetime/date_time_support',
-        '$BUILD_DIR/mongo/db/query/query_test_service_context',
-        '$BUILD_DIR/mongo/db/query_expressions',
-        '$BUILD_DIR/mongo/db/repl/drop_pending_collection_reaper',
-        '$BUILD_DIR/mongo/db/repl/oplog',
-        '$BUILD_DIR/mongo/db/repl/optime',
-        '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
-        '$BUILD_DIR/mongo/db/repl/replmocks',
-        '$BUILD_DIR/mongo/db/repl/storage_interface_impl',
-        '$BUILD_DIR/mongo/db/server_base',
-        '$BUILD_DIR/mongo/db/service_context',
-        '$BUILD_DIR/mongo/db/service_context_d_test_fixture',
-        '$BUILD_DIR/mongo/db/service_context_test_fixture',
-        '$BUILD_DIR/mongo/db/shard_role',
-        '$BUILD_DIR/mongo/db/storage/wiredtiger/storage_wiredtiger',
-        '$BUILD_DIR/mongo/db/timeseries/timeseries_options',
-        '$BUILD_DIR/mongo/unittest/unittest',
-        '$BUILD_DIR/mongo/util/clock_source_mock',
-        '$BUILD_DIR/mongo/util/fail_point',
-        '$BUILD_DIR/mongo/util/pcre_wrapper',
-        'query_ce_histogram',
-        'query_stats',
-    ],
-)
-
-env.CppUnitTest(
-    target="stats_cache_test",
-    source=[
-        "stats_cache_test.cpp",
-        "stats_cache_loader_mock.cpp",
-    ],
-    LIBDEPS=[
-        '$BUILD_DIR/mongo/base',
-        '$BUILD_DIR/mongo/db/service_context',
-        'ce_test_utils',
-    ],
-)
-
-env.CppUnitTest(
-    target="stats_path_test",
-    source=[
-        "stats_path_test.cpp",
-    ],
-    LIBDEPS=[
-        '$BUILD_DIR/mongo/base',
-        '$BUILD_DIR/mongo/db/service_context',
-        'ce_test_utils',
-    ],
-)
-
-env.Library(
-    target="query_stats_test_utils",
-    source=[
-        'rand_utils.cpp',
-        'rand_utils_new.cpp',
-        'maxdiff_test_utils.cpp',
-    ],
-    LIBDEPS=[
-        '$BUILD_DIR/mongo/base',
-        '$BUILD_DIR/mongo/db/exec/sbe/sbe_abt_test_util',
-        "$BUILD_DIR/mongo/unittest/unittest",
-        'query_ce_histogram',
-        'query_stats',
-        'query_stats_gen',
+        'test_utils',
     ],
 )

@@ -247,18 +146,7 @@ env.CppUnitTest(
         'maxdiff_histogram_test.cpp',
     ],
     LIBDEPS=[
-        'ce_test_utils',
-        'query_stats_test_utils',
-    ],
-)
-
-env.CppUnitTest(
-    target="ce_generated_histograms_test",
-    source=[
-        "ce_generated_histograms_test.cpp",
-    ],
-    LIBDEPS=[
-        'ce_test_utils',
-        'query_stats_test_utils',
+        '$BUILD_DIR/mongo/db/query/stats/stats_test_utils',
+        'test_utils',
     ],
 )
diff --git a/src/mongo/db/query/ce/array_histogram.cpp b/src/mongo/db/query/ce/array_histogram.cpp
deleted file mode 100644
index 66ab117e60d..00000000000
--- a/src/mongo/db/query/ce/array_histogram.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- *    Copyright (C) 2022-present MongoDB, Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the Server Side Public License, version 1,
- *    as published by MongoDB, Inc.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *    Server Side Public License for more details.
- *
- *    You should have received a copy of the Server Side Public License
- *    along with this program. If not, see
- *    <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library. You
- *    must comply with the Server Side Public License in all respects for
- *    all of the code used other than as permitted herein. If you modify file(s)
- *    with this exception, you may extend this exception to your version of the
- *    file(s), but you are not obligated to do so. If you do not wish to do so,
- *    delete this exception statement from your version. If you delete this
- *    exception statement from all source files in the program, then also delete
- *    it in the license file.
- */
-
-#include "mongo/db/query/ce/array_histogram.h"
-#include "mongo/db/query/ce/value_utils.h"
-
-namespace mongo {
-namespace ce {
-using namespace sbe;
-
-TypeCounts mapStatsTypeCountToTypeCounts(std::vector tc) {
-    TypeCounts out;
-    for (const auto& t : tc) {
-        out.emplace(deserialize(t.getTypeName().toString()), t.getCount());
-    }
-    return out;
-}
-
-ArrayHistogram::ArrayHistogram() : ArrayHistogram(ScalarHistogram(), {}) {}
-
-ArrayHistogram::ArrayHistogram(Statistics stats)
-    : ArrayHistogram(stats.getScalarHistogram(),
-                     mapStatsTypeCountToTypeCounts(stats.getTypeCount()),
-                     stats.getTrueCount(),
-                     stats.getFalseCount()) {
-    // TODO SERVER-71513: initialize non-scalar histogram fields.
-}
-
-ArrayHistogram::ArrayHistogram(ScalarHistogram scalar,
-                               TypeCounts typeCounts,
-                               ScalarHistogram arrayUnique,
-                               ScalarHistogram arrayMin,
-                               ScalarHistogram arrayMax,
-                               TypeCounts arrayTypeCounts,
-                               double emptyArrayCount,
-                               double trueCount,
-                               double falseCount)
-    : _scalar(std::move(scalar)),
-      _typeCounts(std::move(typeCounts)),
-      _emptyArrayCount(emptyArrayCount),
-      _trueCount(trueCount),
-      _falseCount(falseCount),
-      _arrayUnique(std::move(arrayUnique)),
-      _arrayMin(std::move(arrayMin)),
-      _arrayMax(std::move(arrayMax)),
-      _arrayTypeCounts(std::move(arrayTypeCounts)) {
-    invariant(isArray());
-}
-
-ArrayHistogram::ArrayHistogram(ScalarHistogram scalar,
-                               TypeCounts typeCounts,
-                               double trueCount,
-                               double falseCount)
-    : _scalar(std::move(scalar)),
-      _typeCounts(std::move(typeCounts)),
-      _emptyArrayCount(0.0),
-      _trueCount(trueCount),
-      _falseCount(falseCount),
-      _arrayUnique(boost::none),
-      _arrayMin(boost::none),
-      _arrayMax(boost::none),
-      _arrayTypeCounts(boost::none) {
-    invariant(!isArray());
-}
-
-bool ArrayHistogram::isArray() const {
-    return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts;
-}
-
-std::string typeCountsToString(const TypeCounts& typeCounts) {
-    std::ostringstream os;
-    os << "{";
-    bool first = true;
-    for (auto [tag, count] : typeCounts) {
-        if (!first)
-            os << ", ";
-        os << tag << ": " << count;
-        first = false;
-    }
-    os << "}";
-    return os.str();
-}
-
-std::string ArrayHistogram::toString() const {
-    std::ostringstream os;
-    os << "{\n";
-    os << " scalar: " << _scalar.toString();
-    os << ",\n typeCounts: " << typeCountsToString(_typeCounts);
-    if (isArray()) {
-        os << ",\n arrayUnique: " << _arrayUnique->toString();
-        os << ",\n arrayMin: " << _arrayMin->toString();
-        os << ",\n arrayMax: " << _arrayMax->toString();
-        os << ",\n arrayTypeCounts: " << typeCountsToString(*_arrayTypeCounts);
-    }
-    os << "\n}\n";
-    return os.str();
-}
-
-const ScalarHistogram& ArrayHistogram::getScalar() const {
-    return _scalar;
-}
-
-const ScalarHistogram& ArrayHistogram::getArrayUnique() const {
-    invariant(isArray());
-    return *_arrayUnique;
-}
-
-const ScalarHistogram& ArrayHistogram::getArrayMin() const {
-    invariant(isArray());
-    return *_arrayMin;
-}
-
-const ScalarHistogram& ArrayHistogram::getArrayMax() const {
-    invariant(isArray());
-    return *_arrayMax;
-}
-
-const TypeCounts& ArrayHistogram::getTypeCounts() const {
-    return _typeCounts;
-}
-
-const TypeCounts& ArrayHistogram::getArrayTypeCounts() const {
-    invariant(isArray());
-    return *_arrayTypeCounts;
-}
-
-double ArrayHistogram::getArrayCount() const {
-    if (isArray()) {
-        auto findArray = _typeCounts.find(value::TypeTags::Array);
-        uassert(6979504,
-                "Histogram with array data must have a total array count.",
-                findArray != _typeCounts.end());
-        double arrayCount = findArray->second;
-        uassert(6979503, "Histogram with array data must have at least one array.", arrayCount > 0);
-        return arrayCount;
-    }
-    return 0;
-}
-
-BSONObj ArrayHistogram::serialize() const {
-    BSONObjBuilder histogramBuilder;
-
-    // Serialize boolean type counters.
-    histogramBuilder.append("trueCount", getTrueCount());
-    histogramBuilder.append("falseCount", getFalseCount());
-
-    // Serialize empty array counts.
-    histogramBuilder.appendNumber("emptyArrayCount", getEmptyArrayCount());
-
-    // Serialize type counts.
-    BSONArrayBuilder typeCountBuilder(histogramBuilder.subarrayStart("typeCount"));
-    const auto& typeCounts = getTypeCounts();
-    for (const auto& [sbeType, count] : typeCounts) {
-        auto typeCount = BSON("typeName" << ce::serialize(sbeType) << "count" << count);
-        typeCountBuilder.append(typeCount);
-    }
-    typeCountBuilder.doneFast();
-
-    // Serialize scalar histogram.
-    histogramBuilder.append("scalarHistogram", getScalar().serialize());
-
-    // TODO SERVER-71513: serialize array histograms.
-
-    histogramBuilder.doneFast();
-    return histogramBuilder.obj();
-}
-}  // namespace ce
-
-// TODO: update this once SERVER-71051 is done.
-namespace stats {
-BSONObj makeStatistics(double documents, const ce::ArrayHistogram& arrayHistogram) {
-    BSONObjBuilder builder;
-    builder.appendNumber("documents", documents);
-    builder.appendElements(arrayHistogram.serialize());
-    builder.doneFast();
-    return builder.obj();
-}
-
-BSONObj makeStatsPath(StringData path, double documents, const ce::ArrayHistogram& arrayHistogram) {
-    BSONObjBuilder builder;
-    builder.append("_id", path);
-    builder.append("statistics", makeStatistics(documents, arrayHistogram));
-    builder.doneFast();
-    return builder.obj();
-}
-}  // namespace stats
-
-}  // namespace mongo
diff --git a/src/mongo/db/query/ce/array_histogram.h b/src/mongo/db/query/ce/array_histogram.h
deleted file mode 100644
index 2ce33d330b5..00000000000
--- a/src/mongo/db/query/ce/array_histogram.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- *    Copyright (C) 2022-present MongoDB, Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the Server Side Public License, version 1,
- *    as published by MongoDB, Inc.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *    Server Side Public License for more details.
- *
- *    You should have received a copy of the Server Side Public License
- *    along with this program. If not, see
- *    <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library. You
- *    must comply with the Server Side Public License in all respects for
- *    all of the code used other than as permitted herein. If you modify file(s)
- *    with this exception, you may extend this exception to your version of the
- *    file(s), but you are not obligated to do so. If you do not wish to do so,
- *    delete this exception statement from your version. If you delete this
- *    exception statement from all source files in the program, then also delete
- *    it in the license file.
- */
-
-#pragma once
-
-#include <map>
-
-#include "mongo/db/exec/sbe/values/value.h"
-#include "mongo/db/query/ce/scalar_histogram.h"
-#include "mongo/db/query/ce/stats_gen.h"
-
-namespace mongo {
-namespace ce {
-
-using TypeCounts = std::map<sbe::value::TypeTags, double>;
-
-class ArrayHistogram {
-public:
-    // Constructs an empty scalar histogram.
-    ArrayHistogram();
-
-    // Constructor using StatsPath IDL as input.
-    ArrayHistogram(Statistics stats);
-
-    // Constructor for scalar field histograms.
-    ArrayHistogram(ScalarHistogram scalar,
-                   TypeCounts typeCounts,
-                   double trueCount = 0.0,
-                   double falseCount = 0.0);
-
-    // Constructor for array field histograms. We have to initialize all array fields in this case.
-    ArrayHistogram(ScalarHistogram scalar,
-                   TypeCounts typeCounts,
-                   ScalarHistogram arrayUnique,
-                   ScalarHistogram arrayMin,
-                   ScalarHistogram arrayMax,
-                   TypeCounts arrayTypeCounts,
-                   double emptyArrayCount = 0.0,
-                   double trueCount = 0.0,
-                   double falseCount = 0.0);
-
-    // ArrayHistogram is neither copy-constructible nor copy-assignable.
-    ArrayHistogram(const ArrayHistogram&) = delete;
-    ArrayHistogram& operator=(const ArrayHistogram&) = delete;
-
-    // However, it is move-constructible and move-assignable.
-    ArrayHistogram(ArrayHistogram&&) = default;
-    ArrayHistogram& operator=(ArrayHistogram&&) = default;
-    ~ArrayHistogram() = default;
-
-    std::string toString() const;
-
-    // Serialize to BSON for storage in stats collection.
-    BSONObj serialize() const;
-
-    const ScalarHistogram& getScalar() const;
-    const ScalarHistogram& getArrayUnique() const;
-    const ScalarHistogram& getArrayMin() const;
-    const ScalarHistogram& getArrayMax() const;
-    const TypeCounts& getTypeCounts() const;
-    const TypeCounts& getArrayTypeCounts() const;
-
-    // Returns whether or not this histogram includes array data points.
-    bool isArray() const;
-
-    // Get the total number of arrays in the histogram's path including empty arrays.
-    double getArrayCount() const;
-
-    // Get the total number of empty arrays ( [] ) in the histogram's path.
-    double getEmptyArrayCount() const {
-        return _emptyArrayCount;
-    }
-
-    // Get the count of true booleans.
-    double getTrueCount() const {
-        return _trueCount;
-    }
-
-    // Get the count of false booleans.
-    double getFalseCount() const {
-        return _falseCount;
-    }
-
-private:
-    /* Fields for all paths. */
-
-    // Contains values which appeared originally as scalars on the path.
-    ScalarHistogram _scalar;
-    // The number of values of each type.
-    TypeCounts _typeCounts;
-    // The number of empty arrays - they are not accounted for in the histograms.
-    double _emptyArrayCount;
-    // The counts of true & false booleans.
-    double _trueCount;
-    double _falseCount;
-
-    /* Fields for array paths (only initialized if arrays are present). */
-
-    // Contains unique scalar values originating from arrays.
-    boost::optional<ScalarHistogram> _arrayUnique;
-    // Contains minimum values originating from arrays **per class**.
-    boost::optional<ScalarHistogram> _arrayMin;
-    // Contains maximum values originating from arrays **per class**.
-    boost::optional<ScalarHistogram> _arrayMax;
-    // The number of values of each type inside all arrays.
-    boost::optional<TypeCounts> _arrayTypeCounts;
-};
-}  // namespace ce
-
-// TODO: update this once SERVER-71051 is done.
-namespace stats {
-/**
- * Returns an owned BSON Object representing data matching mongo::Statistics IDL.
- */
-BSONObj makeStatistics(double documents, const ce::ArrayHistogram& arrayHistogram);
-
-/**
- * Returns an owned BSON Object representing data matching mongo::StatsPath IDL.
- */
-BSONObj makeStatsPath(StringData path, double documents, const ce::ArrayHistogram& arrayHistogram);
-}  // namespace stats
-
-}  // namespace mongo
diff --git a/src/mongo/db/query/ce/ce_array_data_test.cpp b/src/mongo/db/query/ce/ce_array_data_test.cpp
deleted file mode 100644
index 587ab4b7364..00000000000
--- a/src/mongo/db/query/ce/ce_array_data_test.cpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/**
- *    Copyright (C) 2022-present MongoDB, Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the Server Side Public License, version 1,
- *    as published by MongoDB, Inc.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *    Server Side Public License for more details.
- *
- *    You should have received a copy of the Server Side Public License
- *    along with this program. If not, see
- *    <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library. You
- *    must comply with the Server Side Public License in all respects for
- *    all of the code used other than as permitted herein. If you modify file(s)
- *    with this exception, you may extend this exception to your version of the
- *    file(s), but you are not obligated to do so. If you do not wish to do so,
- *    delete this exception statement from your version. If you delete this
- *    exception statement from all source files in the program, then also delete
- *    it in the license file.
- */
-
-#include
-
-#include "mongo/db/exec/sbe/values/value.h"
-#include "mongo/db/query/ce/array_histogram.h"
-#include "mongo/db/query/ce/ce_test_utils.h"
-#include "mongo/db/query/ce/histogram_estimation.h"
-#include "mongo/db/query/query_test_service_context.h"
-#include "mongo/unittest/unittest.h"
-
-namespace mongo::ce {
-namespace {
-
-using namespace sbe;
-
-/**
- * Structure representing a range query and its estimated and actual cardinalities.
- * Used to record hand-crafted queries over a pre-generated dataset.
- */
-struct QuerySpec {
-    // Low bound of the query range.
-    int32_t low;
-    // Upper bound of the query range.
-    int32_t high;
-    // Estimated cardinality of $match query.
-    double estMatch;
-    // Actual cardinality of $match query.
-    double actMatch;
-    // Estimated cardinality of $elemMatch query.
-    double estElemMatch;
-    // Actual cardinality of $elemMatch query.
-    double actElemMatch;
-};
-
-static std::pair<double, double> computeErrors(size_t actualCard, double estimatedCard) {
-    double error = estimatedCard - actualCard;
-    double relError = (actualCard == 0) ? (estimatedCard == 0 ? 0.0 : -1.0) : error / actualCard;
-    return std::make_pair(error, relError);
-}
-
-static std::string serializeQuery(QuerySpec& q, bool isElemMatch) {
-    std::ostringstream os;
-    os << "{$match: {a: {";
-    if (isElemMatch) {
-        os << "$elemMatch: {";
-    }
-    os << "$gt: " << q.low;
-    os << ", $lt: " << q.high;
-    if (isElemMatch) {
-        os << "}";
-    }
-    os << "}}}\n";
-    return os.str();
-}
-
-static std::string computeRMSE(std::vector<QuerySpec>& querySet, bool isElemMatch) {
-    double rms = 0.0, relRms = 0.0, meanAbsSelErr = 0.0;
-    size_t trialSize = querySet.size();
-    const size_t dataSize = 1000;
-
-    std::ostringstream os;
-    os << "\nQueries:\n";
-    for (auto& q : querySet) {
-        double estimatedCard = isElemMatch ? q.estElemMatch : q.estMatch;
-        double actualCard = isElemMatch ? q.actElemMatch : q.actMatch;
-
-        auto [error, relError] = computeErrors(actualCard, estimatedCard);
-        rms += error * error;
-        relRms += relError * relError;
-        meanAbsSelErr += std::abs(error);
-        os << serializeQuery(q, isElemMatch);
-        os << "Estimated: " << estimatedCard << " Actual " << actualCard << " (Error: " << error
-           << " RelError: " << relError << ")\n\n";
-    }
-    rms = std::sqrt(rms / trialSize);
-    relRms = std::sqrt(relRms / trialSize);
-    meanAbsSelErr /= (trialSize * dataSize);
-
-    os << "=====" << (isElemMatch ? " ElemMatch errors: " : "Match errors:") << "=====\n";
-    os << "RMSE : " << rms << " RelRMSE : " << relRms
-       << " MeanAbsSelectivityError: " << meanAbsSelErr << std::endl;
-    return os.str();
-}
-
-TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) {
-    std::vector<BucketData> scalarData{{}};
-    const ScalarHistogram scalarHist = createHistogram(scalarData);
-
-    std::vector<BucketData> minData{{0, 5.0, 0.0, 0.0},
-                                    {553, 2.0, 935.0, 303.0},
-                                    {591, 4.0, 2.0, 1.0},
-                                    {656, 2.0, 21.0, 12.0},
-                                    {678, 3.0, 6.0, 3.0},
-                                    {693, 2.0, 1.0, 1.0},
-                                    {730, 1.0, 6.0, 3.0},
-                                    {788, 1.0, 2.0, 2.0},
-                                    {847, 2.0, 4.0, 1.0},
-                                    {867, 1.0, 0.0, 0.0}};
-
-    const ScalarHistogram aMinHist = createHistogram(minData);
-
-    std::vector<BucketData> maxData{{117, 1.0, 0.0, 0.0},
-                                    {210, 1.0, 1.0, 1.0},
-                                    {591, 1.0, 8.0, 4.0},
-                                    {656, 1.0, 0.0, 0.0},
-                                    {353, 2.0, 18.0, 9.0},
-                                    {610, 5.0, 125.0, 65.0},
-                                    {733, 8.0, 134.0, 53.0},
-                                    {768, 6.0, 50.0, 16.0},
-                                    {957, 8.0, 448.0, 137.0},
-                                    {1000, 7.0, 176.0, 40.0}};
-
-    const ScalarHistogram aMaxHist = createHistogram(maxData);
-
-    std::vector<BucketData> uniqueData{{0, 5.0, 0.0, 0.0},
-                                       {16, 11.0, 74.0, 13.0},
-                                       {192, 13.0, 698.0, 148.0},
-                                       {271, 9.0, 312.0, 70.0},
-                                       {670, 7.0, 1545.0, 355.0},
-                                       {712, 9.0, 159.0, 32.0},
-                                       {776, 11.0, 247.0, 54.0},
-                                       {869, 9.0, 361.0, 85.0},
-                                       {957, 8.0, 323.0, 76.0},
-                                       {1000, 7.0, 188.0, 40.0}};
-
-    const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
-
-    TypeCounts typeCounts;
-    TypeCounts arrayTypeCounts;
-    // Dataset generated as 1000 arrays of size between 3 to 5.
-    typeCounts.insert({value::TypeTags::Array, 1000});
-    arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996});
-
-    const ArrayHistogram arrHist(scalarHist,
-                                 typeCounts,
-                                 aUniqueHist,
-                                 aMinHist,
-                                 aMaxHist,
-                                 arrayTypeCounts,
-                                 0 /* emptyArrayCount */);
-
-    std::vector<QuerySpec> querySet{{10, 20, 35.7, 93.0, 37.8, 39.0},
-                                    {10, 60, 103.3, 240.0, 158.0, 196.0},
-                                    {320, 330, 554.5, 746.0, 26.0, 30.0},
-                                    {320, 400, 672.9, 832.0, 231.5, 298.0},
-                                    {980, 990, 88.8, 101.0, 36.5, 41.0},
-                                    {970, 1050, 129.7, 141.0, 129.7, 141.0}};
-
-    for (const auto q : querySet) {
-        // $match query, includeScalar = true.
-        double estCard = estimateCardRange(arrHist,
-                                           false /* lowInclusive */,
-                                           value::TypeTags::NumberInt32,
-                                           sbe::value::bitcastFrom<int32_t>(q.low),
-                                           false /* highInclusive */,
-                                           value::TypeTags::NumberInt32,
-                                           sbe::value::bitcastFrom<int32_t>(q.high),
-                                           true /* includeScalar */);
-        ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
-
-        // $elemMatch query, includeScalar = false.
-        estCard = estimateCardRange(arrHist,
-                                    false /* lowInclusive */,
-                                    value::TypeTags::NumberInt32,
-                                    sbe::value::bitcastFrom<int32_t>(q.low),
-                                    false /* highInclusive */,
-                                    value::TypeTags::NumberInt32,
-                                    sbe::value::bitcastFrom<int32_t>(q.high),
-                                    false /* includeScalar */);
-        ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
-    }
-    std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
-    std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
-}
-
-TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) {
-    std::vector<BucketData> scalarData{{}};
-    const ScalarHistogram scalarHist = createHistogram(scalarData);
-
-    std::vector<BucketData> minData{{0, 2.0, 0.0, 0.0},
-                                    {1324, 4.0, 925.0, 408.0},
-                                    {1389, 5.0, 7.0, 5.0},
-                                    {1521, 2.0, 16.0, 10.0},
-                                    {1621, 2.0, 13.0, 7.0},
-                                    {1852, 5.0, 10.0, 9.0},
-                                    {1864, 2.0, 0.0, 0.0},
-                                    {1971, 1.0, 3.0, 3.0},
-                                    {2062, 2.0, 0.0, 0.0},
-                                    {2873, 1.0, 0.0, 0.0}};
-
-    const ScalarHistogram aMinHist = createHistogram(minData);
-
-    std::vector<BucketData> maxData{{2261, 1.0, 0.0, 0.0},
-                                    {2673, 1.0, 0.0, 0.0},
-                                    {2930, 1.0, 1.0, 1.0},
-                                    {3048, 2.0, 2.0, 2.0},
-                                    {3128, 3.0, 1.0, 1.0},
-                                    {3281, 2.0, 0.0, 0.0},
-                                    {3378, 2.0, 7.0, 5.0},
-                                    {3453, 4.0, 2.0, 2.0},
-                                    {3763, 6.0, 44.0, 23.0},
-                                    {5000, 1.0, 920.0, 416.0}};
-
-    const ScalarHistogram aMaxHist = createHistogram(maxData);
-
-    std::vector<BucketData> uniqueData{{0, 2.0, 0.0, 0.0},
-                                       {1106, 9.0, 1970.0, 704.0},
-                                       {1542, 11.0, 736.0, 280.0},
-                                       {3267, 6.0, 3141.0, 1097.0},
-                                       {3531, 6.0, 461.0, 175.0},
-                                       {3570, 7.0, 48.0, 20.0},
-                                       {4573, 8.0, 1851.0, 656.0},
-                                       {4619, 6.0, 65.0, 30.0},
-                                       {4782, 5.0, 265.0, 99.0},
-                                       {5000, 1.0, 342.0, 135.0}};
-
-    const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
-
-    TypeCounts typeCounts;
-    TypeCounts arrayTypeCounts;
-    // Dataset generated as 1000 arrays of size between 8 to 10.
-    typeCounts.insert({value::TypeTags::Array, 1000});
-    arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940});
-
-    const ArrayHistogram arrHist(scalarHist,
-                                 typeCounts,
-                                 aUniqueHist,
-                                 aMinHist,
-                                 aMaxHist,
-                                 arrayTypeCounts,
-                                 0 /* emptyArrayCount */);
-
-    std::vector<QuerySpec> querySet{{10, 20, 13.7, 39.0, 9.7, 26.0},
-                                    {10, 60, 41.6, 108.0, 55.7, 101.0},
-                                    {1000, 1010, 705.4, 861.0, 9.7, 7.0},
-                                    {1000, 1050, 733.3, 884.0, 55.7, 87.0},
-                                    {3250, 3300, 988.0, 988.0, 59.3, 86.0},
-                                    {4970, 4980, 23.3, 53.0, 8.5, 16.0}};
-
-    for (const auto q : querySet) {
-        // $match query, includeScalar = true.
-        double estCard = estimateCardRange(arrHist,
-                                           false /* lowInclusive */,
-                                           value::TypeTags::NumberInt32,
-                                           sbe::value::bitcastFrom<int32_t>(q.low),
-                                           false /* highInclusive */,
-                                           value::TypeTags::NumberInt32,
-                                           sbe::value::bitcastFrom<int32_t>(q.high),
-                                           true /* includeScalar */);
-        ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
-
-        // $elemMatch query, includeScalar = false.
-        estCard = estimateCardRange(arrHist,
-                                    false /* lowInclusive */,
-                                    value::TypeTags::NumberInt32,
-                                    sbe::value::bitcastFrom<int32_t>(q.low),
-                                    false /* highInclusive */,
-                                    value::TypeTags::NumberInt32,
-                                    sbe::value::bitcastFrom<int32_t>(q.high),
-                                    false /* includeScalar */);
-        ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
-    }
-    std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
-    std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
-}
-}  // namespace
-}  // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp b/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp
deleted file mode 100644
index 1f11472c811..00000000000
--- a/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/**
- *    Copyright (C) 2022-present MongoDB, Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the Server Side Public License, version 1,
- *    as published by MongoDB, Inc.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *    Server Side Public License for more details.
- *
- *    You should have received a copy of the Server Side Public License
- *    along with this program. If not, see
- *    <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library. You
- *    must comply with the Server Side Public License in all respects for
- *    all of the code used other than as permitted herein. If you modify file(s)
- *    with this exception, you may extend this exception to your version of the
- *    file(s), but you are not obligated to do so. If you do not wish to do so,
- *    delete this exception statement from your version. If you delete this
- *    exception statement from all source files in the program, then also delete
- *    it in the license file.
- */
-
-#include "mongo/db/query/ce/ce_heuristic.h"
-#include "mongo/db/query/ce/ce_test_utils.h"
-#include "mongo/db/query/optimizer/props.h"
-#include "mongo/db/query/optimizer/utils/unit_test_utils.h"
-#include "mongo/db/query/optimizer/utils/utils.h"
-#include "mongo/unittest/unittest.h"
-
-namespace mongo::ce {
-namespace {
-
-using namespace optimizer;
-using namespace optimizer::cascades;
-
-constexpr double kCollCard = 1000.0;
-const std::string kCollName = "test";
-
-constexpr double kOtherCollCard = 200.0;
-const std::string kOtherCollName = "otherTest";
-
-constexpr double kThirdCollCard = 50.0;
-const std::string kThirdCollName = "thirdTest";
-
-class DataflowCETester : public CETester {
-public:
-    DataflowCETester() : CETester(kCollName, kCollCard, kDefaultCETestPhaseSet) {}
-
-protected:
-    std::unique_ptr<CEInterface> getCETransport() const override {
-        return std::make_unique<HeuristicCE>();
-    }
-};
-
-namespace {
-bool isRootNodeFn(const ABT& node) {
-    return node.is<RootNode>();
-}
-}  // namespace
-
-TEST(CEDataflowTest, EstimateTrivialNodes) {
-    DataflowCETester t;
-    const auto matchCard = t.getMatchCE("{a: 1}", isRootNodeFn);
-
-    // Verify 'CollationNode' estimate returns the input cardinality.
-    ASSERT_CE(t, "[{$sort: {a: 1}}]", kCollCard);
-    ASSERT_CE(t, "[{$sort: {a: -1, b: 1}}]", kCollCard);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$sort: {a: 1, b: 1}}]", matchCard);
-
-    // Verify 'EvaluationNode' estimate.
-    ASSERT_CE(t, "[{$project: {a: {$add: [\"$a\", 1]}}}]", kCollCard);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$project: {a: {$add: [\"$a\", 1]}}}]", matchCard);
-}
-
-TEST(CEDataflowTest, EstimateUnionNode) {
-    auto makeUnionBranch = [](const std::string& collName) {
-        ProjectionName scanVar{"scan_" + collName};
-        auto scanNode = make<ScanNode>(scanVar, collName);
-        auto evalPath =
-            make<EvalPath>(make<PathGet>("a", make<PathIdentity>()), make<Variable>(scanVar));
-        return make<EvaluationNode>("a", std::move(evalPath), std::move(scanNode));
-    };
-
-    // Verify that the estimate of 'UnionNode' always returns the sum of estimates of its children.
-    // In the following tests we force a simple plan to be generated by passing in a 'manually'
-    // constructed ABT.
-    {
-        DataflowCETester t;
-        t.addCollection(kOtherCollName, kOtherCollCard, {});
-        t.addCollection(kThirdCollName, kThirdCollCard, {});
-        {
-            auto unionNode = make<UnionNode>(
-                ProjectionNameVector{"a"},
-                makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName)));
-            auto rootNode = make<RootNode>(
-                properties::ProjectionRequirement{ProjectionNameVector{"a"}}, std::move(unionNode));
-            ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard);
-        }
-        {
-            auto unionNode = make<UnionNode>(
-                ProjectionNameVector{"a"},
-                makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName)));
-            auto parentUnionNode =
-                make<UnionNode>(ProjectionNameVector{"a"},
-                                makeSeq(std::move(unionNode), makeUnionBranch(kThirdCollName)));
-            auto rootNode =
-                make<RootNode>(properties::ProjectionRequirement{ProjectionNameVector{"a"}},
-                               std::move(parentUnionNode));
-            ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard + kThirdCollCard);
-        }
-    }
-
-    // The following plans include a UnionNode.
-    {
-        DataflowCETester t;
-        t.setCollCard(2000);
-        t.setIndexes(
-            {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}});
-        t.setDisableScan(true);
-        ASSERT_MATCH_CE(t, {"{a: [12]}"}, 1);
-    }
-    {
-        DataflowCETester t;
-        t.setIndexes(
-            {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ false)},
-             {"indexB", makeIndexDefinition("b", CollationOp::Ascending, /* isMultiKey */ false)}});
-        t.setDisableScan(true);
-        ASSERT_MATCH_CE(t, {"{a: 1, b: 2}"}, 5.62341);
-    }
-}
-
-TEST(CEDataflowTest, EstimateLimitSkipNode) {
-    DataflowCETester t;
-    const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn);
-
-    // Verify that 'LimitSkipNode' estimate with only a limit set is min(limit, inputCE).
-    ASSERT_CE(t, "[{$limit: 1}]", 1.0);
-    ASSERT_CE(t, "[{$limit: 50}]", 50.0);
-    ASSERT_CE(t, "[{$limit: 1000}]", kCollCard);
-    ASSERT_CE(t, "[{$limit: 10000}]", kCollCard);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1}]", 1.0);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 5}]", 5.0);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 50}]", matchCard);
-    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1000}]", matchCard);
-
-    // Verify that 'LimitSkipNode' estimate with only a skip set is max(inputCE - skip, 0).
- ASSERT_CE(t, "[{$skip: 0}]", kCollCard); - ASSERT_CE(t, "[{$skip: 1}]", kCollCard - 1.0); - ASSERT_CE(t, "[{$skip: 50}]", kCollCard - 50.0); - ASSERT_CE(t, "[{$skip: 1000}]", 0.0); - ASSERT_CE(t, "[{$skip: 10000}]", 0.0); - ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1}]", matchCard - 1.0); - ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 5}]", matchCard - 5.0); - ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 50}]", 0.0); - ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1000}]", 0.0); - - // Test estimates for combinations of $limit & $skip. - ASSERT_CE(t, "[{$limit: 1}, {$skip: 1}]", 0.0); - ASSERT_CE(t, "[{$skip: 1}, {$limit: 1}]", 1.0); - ASSERT_CE(t, "[{$limit: 1}, {$skip: 50}]", 0.0); - ASSERT_CE(t, "[{$skip: 50}, {$limit: 1}]", 1.0); - ASSERT_CE(t, "[{$limit: 50}, {$skip: 1}]", 49.0); - ASSERT_CE(t, "[{$skip: 1}, {$limit: 50}]", 50.0); - ASSERT_CE(t, "[{$limit: 50}, {$skip: 50}]", 0.0); - ASSERT_CE(t, "[{$skip: 50}, {$limit: 50}]", 50.0); - ASSERT_CE(t, "[{$limit: 1000}, {$skip: 50}]", kCollCard - 50.0); - ASSERT_CE(t, "[{$skip: 50}, {$limit: 1000}]", kCollCard - 50.0); - ASSERT_CE(t, "[{$limit: 50}, {$skip: 1000}]", 0.0); - ASSERT_CE(t, "[{$skip: 1000}, {$limit: 50}]", 0.0); - ASSERT_CE(t, "[{$limit: 1000}, {$skip: 1000}]", 0.0); - ASSERT_CE(t, "[{$skip: 1000}, {$limit: 1000}]", 0.0); - - // Test estimates for combinations of $limit & $skip separated by a $match. - ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 1}]", 0.0); - ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 50}]", 0.0); - - // Input card to $match: 50. $match selectivity here is sqrt(50)/50. - ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1}]", 6.07107); - ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 50}]", 0.0); - ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1000}]", 0.0); - - // Input card to $match is kCollCard. However, our estimate is larger than matchCard because we - // have a FilterNode that does not get converted to a SargableNode in this case. The $match - // selectivity here is sqrt(1000)/1000. - ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1}]", 30.6228); - ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 20}]", 11.6228); - ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1000}]", 0.0); - - // Input card to $match: 999. $match selectivity here is sqrt(999)/999. - ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1}]", 1.0); - ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 20}]", 20.0); - ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1000}]", 31.607); - - // Input card to $match: 950. $match selectivity here is sqrt(950)/950. - ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1}]", 1.0); - ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 20}]", 20.0); - ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1000}]", 30.8221); - - // Input card to $match is 0.0. - ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 50}]", 0.0); - ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 1000}]", 0.0); -} - -TEST(CEDataflowTest, EstimateUnwindNode) { - DataflowCETester t; - const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn); - - // We assume that arrays on average have ~10 elements, so we estimate this as inputCard*10. 
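- // Worked example (illustrative): kCollCard = 1000 unwinds to 10 * 1000 = 10000,
- // and matchCard ~ 31.62 unwinds to ~316.2.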
- ASSERT_CE(t, "[{$unwind: '$a'}]", 10 * kCollCard); - ASSERT_CE(t, "[{$match: {a: 1}}, {$unwind: '$a'}]", 10 * matchCard); - ASSERT_CE(t, "[{$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]", 10 * kCollCard); - ASSERT_CE(t, - "[{$match: {a: 1}}, {$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]", - 10 * matchCard); - - // TODO SERVER-70035: implement histogram estimation of $unwind. -} - -} // namespace -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_edge_cases_test.cpp b/src/mongo/db/query/ce/ce_edge_cases_test.cpp deleted file mode 100644 index 4d8d84f831a..00000000000 --- a/src/mongo/db/query/ce/ce_edge_cases_test.cpp +++ /dev/null @@ -1,1002 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/pipeline/abt/utils.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/ce/maxdiff_test_utils.h" -#include "mongo/db/query/ce/value_utils.h" -#include "mongo/db/query/optimizer/utils/ce_math.h" -#include "mongo/db/query/sbe_stage_builder_helpers.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { -namespace { - -using namespace sbe; - -constexpr double kErrorBound = 0.01; - -TEST(EstimatorTest, OneBucketIntHistogram) { - // Data set of 10 values, each with frequency 3, in the range (-inf, 100]. - // Example: { -100, -20, 0, 20, 50, 60, 70, 80, 90, 100}. - std::vector data{{100, 3.0, 27.0, 9.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(30.0, getTotals(hist).card); - - // Estimates with the bucket bound. - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); - ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess)); - ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual)); - - // Estimates with a value inside the bucket. 
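- // Worked example (illustrative): equality inside the bucket is estimated as
- // rangeFreq / NDV = 27 / 9 = 3; $lte defaults to half the range frequency (13.5);
- // $lt subtracts the equality estimate (13.5 - 3 = 10.5); $gt and $gte are the
- // complements out of the total cardinality of 30.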
- ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
- // No interpolation is possible for estimates of inequalities in a single bucket. The
- // estimates are based on the default cardinality of half the bucket, +/- the estimate of
- // equality inside the bucket.
- ASSERT_EQ(10.5, estimateIntValCard(hist, 10, EstimationType::kLess));
- ASSERT_EQ(13.5, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual));
- ASSERT_EQ(16.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
- ASSERT_EQ(19.5, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual));
-
- // Estimates for a value larger than the last bucket bound.
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
- ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
- ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLessOrEqual));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreaterOrEqual));
-}
-
-TEST(EstimatorTest, OneExclusiveBucketIntHistogram) {
- // Data set of a single value.
- // By exclusive bucket we mean a bucket with only a boundary value, that is, the range
- // frequency and NDV are zero.
- std::vector<BucketData> data{{100, 2.0, 0.0, 0.0}};
- const ScalarHistogram hist = createHistogram(data);
-
- ASSERT_EQ(2.0, getTotals(hist).card);
-
- // Estimates with the bucket boundary.
- ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kLess));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
-
- ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kEqual));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kLess));
- ASSERT_EQ(2.0, estimateIntValCard(hist, 0, EstimationType::kGreater));
-
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
- ASSERT_EQ(2.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
-}
-
-TEST(EstimatorTest, OneBucketTwoIntValuesHistogram) {
- // Data set of two values, for example {5, 100, 100}.
- std::vector<BucketData> data{{100, 2.0, 1.0, 1.0}};
- const ScalarHistogram hist = createHistogram(data);
-
- ASSERT_EQ(3.0, getTotals(hist).card);
-
- // Estimates with the bucket boundary.
- ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
- ASSERT_EQ(1.0, estimateIntValCard(hist, 100, EstimationType::kLess));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
-
- ASSERT_EQ(1.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
- // Default estimate of half of the bucket's range frequency = 0.5.
- ASSERT_EQ(0.5, estimateIntValCard(hist, 10, EstimationType::kLess));
- ASSERT_EQ(2.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
-
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
- ASSERT_EQ(3.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
- ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
-}
-
-TEST(EstimatorTest, OneBucketTwoIntValuesHistogram2) {
- // Similar to the test above, but with a higher frequency for the second value.
- // Example: {5, 5, 5, 100, 100}.
- std::vector<BucketData> data{{100, 2.0, 3.0, 1.0}};
- const ScalarHistogram hist = createHistogram(data);
-
- ASSERT_EQ(5.0, getTotals(hist).card);
-
- // Estimates with the bucket boundary.
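- // Illustrative note (annotation): estimates at the boundary itself are exact.
- // Equality returns the bound frequency (2.0) and $lt returns the range
- // frequency (3.0).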
- ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kLess)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); - - ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual)); - // Default estimate of half of the bucket's range frequency = 1.5. - ASSERT_EQ(1.5, estimateIntValCard(hist, 10, EstimationType::kLess)); - ASSERT_EQ(3.5, estimateIntValCard(hist, 10, EstimationType::kGreater)); - - ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual)); - ASSERT_EQ(5.0, estimateIntValCard(hist, 1000, EstimationType::kLess)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater)); -} - -TEST(EstimatorTest, TwoBucketsIntHistogram) { - // Data set of 10 values in the range [1, 100]. - std::vector data{{1, 1.0, 0.0, 0.0}, {100, 3.0, 26.0, 8.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(30.0, getTotals(hist).card); - - // Estimates for a value smaller than the first bucket. - ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kEqual)); - ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLess)); - ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLessOrEqual)); - ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreater)); - ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreaterOrEqual)); - - // Estimates with bucket bounds. - ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kEqual)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 1, EstimationType::kLess)); - ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kLessOrEqual)); - ASSERT_EQ(29.0, estimateIntValCard(hist, 1, EstimationType::kGreater)); - ASSERT_EQ(30.0, estimateIntValCard(hist, 1, EstimationType::kGreaterOrEqual)); - - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); - ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess)); - ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual)); - - // Estimates with a value inside the bucket. The estimates use interpolation. - // The bucket ratio for the value of 10 is smaller than the estimate for equality - // and the estimates for Less and LessOrEqual are the same. - ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 10, EstimationType::kEqual), kErrorBound); - ASSERT_APPROX_EQUAL(3.36, estimateIntValCard(hist, 10, EstimationType::kLess), kErrorBound); - ASSERT_APPROX_EQUAL( - 3.36, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), kErrorBound); - - ASSERT_APPROX_EQUAL(26.64, estimateIntValCard(hist, 10, EstimationType::kGreater), kErrorBound); - ASSERT_APPROX_EQUAL( - 26.64, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), kErrorBound); - - // Different estimates for Less and LessOrEqual for the value of 50. 
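- // Worked example (illustrative, assuming linear interpolation for numeric
- // values): the bucket ratio for 50 is (50 - 1) / (100 - 1) ~ 0.495, so $lte is
- // 1.0 + 0.495 * 26 ~ 13.87; equality is 26 / 8 = 3.25; $lt is 13.87 - 3.25 ~
- // 10.61; $gt is 30 - 13.87 = 16.13.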
- ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound);
- ASSERT_APPROX_EQUAL(10.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound);
- ASSERT_APPROX_EQUAL(
- 13.87, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound);
- ASSERT_APPROX_EQUAL(16.13, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound);
- ASSERT_APPROX_EQUAL(
- 19.38, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound);
-}
-
-TEST(EstimatorTest, ThreeExclusiveBucketsIntHistogram) {
- std::vector<BucketData> data{{1, 1.0, 0.0, 0.0}, {10, 8.0, 0.0, 0.0}, {100, 1.0, 0.0, 0.0}};
- const ScalarHistogram hist = createHistogram(data);
-
- ASSERT_EQ(10.0, getTotals(hist).card);
-
- ASSERT_EQ(0.0, estimateIntValCard(hist, 5, EstimationType::kEqual));
- ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLess));
- ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLessOrEqual));
- ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreater));
- ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreaterOrEqual));
-}
-
-TEST(EstimatorTest, OneBucketStrHistogram) {
- std::vector<BucketData> data{{"xyz", 3.0, 27.0, 9.0}};
- const ScalarHistogram hist = createHistogram(data);
-
- ASSERT_EQ(30.0, getTotals(hist).card);
-
- // Estimates with the bucket bound.
- auto [tag, value] = value::makeNewString("xyz"_sd);
- value::ValueGuard vg(tag, value);
- double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
- ASSERT_EQ(3.0, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
- ASSERT_EQ(27.0, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
- ASSERT_EQ(30.0, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
- ASSERT_EQ(0.0, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
- ASSERT_EQ(3.0, expectedCard);
-
- // Estimates for a value inside the bucket. Since there is no low bound value in the
- // histogram, all values smaller than the upper bound are estimated the same way, using
- // half of the bucket cardinality.
- std::tie(tag, value) = value::makeNewString("a"_sd);
- expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
- ASSERT_EQ(3.0, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
- ASSERT_EQ(10.5, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
- ASSERT_EQ(13.5, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
- ASSERT_EQ(16.5, expectedCard);
- expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
- ASSERT_EQ(19.5, expectedCard);
-
- std::tie(tag, value) = value::makeNewString(""_sd);
- // In the special case of a single string bucket, we estimate equality to the empty string as
- // for any other string value. In practice, if there are at least 2 buckets for the string
- // data and an empty string in the data set, the empty string will be chosen as the bound of
- // the first bucket and will produce precise estimates.
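- // Illustrative note (annotation): "" is the minimum string value, so $lt yields 0
- // here, while equality still falls back to rangeFreq / NDV = 27 / 9 = 3.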
- expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(3.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_EQ(30.0, expectedCard); - - // Estimates for a value larger than the upper bound. - std::tie(tag, value) = value::makeNewString("z"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(30.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); -} - -TEST(EstimatorTest, TwoBucketsStrHistogram) { - // Data set of 100 strings in the range ["abc", "xyz"], with average frequency of 2. - std::vector data{{"abc", 2.0, 0.0, 0.0}, {"xyz", 3.0, 95.0, 48.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(100.0, getTotals(hist).card); - - // Estimates for a value smaller than the first bucket bound. - auto [tag, value] = value::makeNewString("a"_sd); - value::ValueGuard vg(tag, value); - - double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(100.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_EQ(100.0, expectedCard); - - // Estimates with bucket bounds. - std::tie(tag, value) = value::makeNewString("abc"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(2.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_EQ(2.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(98.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_EQ(100.0, expectedCard); - - std::tie(tag, value) = value::makeNewString("xyz"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(3.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(97.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_EQ(100.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_EQ(3.0, expectedCard); - - // Estimates for a value inside the bucket. 
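- // Worked example (illustrative, assuming strings are mapped to numeric values for
- // interpolation): "sun" lands roughly 78% of the way through the ["abc", "xyz"]
- // bucket, so $lte is about 2.0 + 0.78 * 95 ~ 76.4, and equality is
- // rangeFreq / NDV = 95 / 48 ~ 1.98.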
- std::tie(tag, value) = value::makeNewString("sun"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(74.39, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(76.37, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(23.64, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_APPROX_EQUAL(25.62, expectedCard, kErrorBound); - - // Estimate for a value very close to the bucket bound. - std::tie(tag, value) = value::makeNewString("xyw"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(95.02, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(96.99, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_APPROX_EQUAL(4.98, expectedCard, kErrorBound); -} - -TEST(EstimatorTest, TwoBucketsDateHistogram) { - // June 6, 2017 -- June 7, 2017. - const int64_t startInstant = 1496777923000LL; - const int64_t endInstant = 1496864323000LL; - const auto startDate = Date_t::fromMillisSinceEpoch(startInstant); - const auto endDate = Date_t::fromMillisSinceEpoch(endInstant); - - std::vector data{{Value(startDate), 3.0, 0.0, 0.0}, - {Value(endDate), 1.0, 96.0, 48.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(100.0, getTotals(hist).card); - - const auto valueBefore = value::bitcastFrom(startInstant - 1); - double expectedCard = - estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kGreater).card; - ASSERT_EQ(100.0, expectedCard); - - const auto valueStart = value::bitcastFrom(startInstant); - expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kEqual).card; - ASSERT_EQ(3.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kGreater).card; - ASSERT_EQ(97.0, expectedCard); - - const auto valueEnd = value::bitcastFrom(endInstant); - expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kLess).card; - ASSERT_EQ(99.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); - - const auto valueIn = value::bitcastFrom(startInstant + 43000000); - expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kEqual).card; - 
ASSERT_EQ(2.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(48.77, expectedCard, kErrorBound); - expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(49.22, expectedCard, kErrorBound); - - const auto valueAfter = value::bitcastFrom(endInstant + 100); - expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kLess).card; - ASSERT_EQ(100.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); -} - -TEST(EstimatorTest, TwoBucketsTimestampHistogram) { - // June 6, 2017 -- June 7, 2017 in seconds. - const int64_t startInstant = 1496777923LL; - const int64_t endInstant = 1496864323LL; - const Timestamp startTs{Seconds(startInstant), 0}; - const Timestamp endTs{Seconds(endInstant), 0}; - - std::vector data{{Value(startTs), 3.0, 0.0, 0.0}, {Value(endTs), 1.0, 96.0, 48.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(100.0, getTotals(hist).card); - - const auto valueBefore = value::bitcastFrom(startTs.asULL() - 1); - double expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kGreater).card; - ASSERT_EQ(100.0, expectedCard); - - const auto valueStart = value::bitcastFrom( - startTs.asULL()); // NB: startTs.asInt64() produces different value. 
- expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kEqual).card; - ASSERT_EQ(3.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kGreater).card; - ASSERT_EQ(97.0, expectedCard); - - const auto valueEnd = value::bitcastFrom(endTs.asULL()); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kLess).card; - ASSERT_EQ(99.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); - - const auto valueIn = value::bitcastFrom((startTs.asULL() + endTs.asULL()) / 2); - expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kEqual).card; - ASSERT_EQ(2.0, expectedCard); - expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound); - - const auto valueAfter = value::bitcastFrom(endTs.asULL() + 100); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kLess).card; - ASSERT_EQ(100.0, expectedCard); - expectedCard = - estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); -} - -TEST(EstimatorTest, TwoBucketsObjectIdHistogram) { - const auto startOid = OID("63340d8d27afef2de7357e8d"); - const auto endOid = OID("63340dbed6cd8af737d4139a"); - ASSERT_TRUE(startOid < endOid); - - std::vector data{{Value(startOid), 2.0, 0.0, 0.0}, - {Value(endOid), 1.0, 97.0, 77.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(100.0, getTotals(hist).card); - - auto [tag, value] = value::makeNewObjectId(); - value::ValueGuard vg(tag, value); - const auto oidBefore = OID("63340d8d27afef2de7357e8c"); - oidBefore.view().readInto(value::getObjectIdView(value)); - - double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(100.0, expectedCard); - - // Bucket bounds. 
- startOid.view().readInto(value::getObjectIdView(value)); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(2.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(98.0, expectedCard); - - endOid.view().readInto(value::getObjectIdView(value)); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(99.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); - - // ObjectId value inside the bucket. - const auto oidInside = OID("63340db2cd4d46ff39178e9d"); - oidInside.view().readInto(value::getObjectIdView(value)); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.25, expectedCard, kErrorBound); - - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(83.95, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(14.78, expectedCard, kErrorBound); - - const auto oidAfter = OID("63340dbed6cd8af737d4139b"); - oidAfter.view().readInto(value::getObjectIdView(value)); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(0.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(100.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); -} - -TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) { - // Data set of mixed data types: 3 integers and 5 strings. - std::vector data{{1, 3.0, 0.0, 0.0}, {"abc", 5.0, 0.0, 0.0}}; - const ScalarHistogram hist = createHistogram(data); - const ArrayHistogram arrHist( - hist, TypeCounts{{value::TypeTags::NumberInt64, 3}, {value::TypeTags::StringSmall, 5}}); - - const auto [tagLowDbl, valLowDbl] = - std::make_pair(value::TypeTags::NumberDouble, - value::bitcastFrom(std::numeric_limits::quiet_NaN())); - - // (NaN, 1). - double expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagLowDbl, - valLowDbl, - false /* highInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(1), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound); - - // (NaN, 5). - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagLowDbl, - valLowDbl, - false /* highInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(5), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound); - - const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); - value::ValueGuard vgLowStr(tagLowStr, valLowStr); - auto [tag, value] = value::makeNewString("a"_sd); - value::ValueGuard vg(tag, value); - - // [0, ""). - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(0), - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound); - - // ["", "a"]. 
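- // Illustrative note (annotation): the only strings in this data set are the five
- // "abc" values, which sort after "a", so this range is expected to be empty.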
- expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tag, - value, - true /* includeScalar */); - - ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound); - - std::tie(tag, value) = value::makeNewString("xyz"_sd); - // ["", "xyz"]. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tag, - value, - true /* includeScalar */); - - ASSERT_APPROX_EQUAL(5.0, expectedCard, kErrorBound); -} - -TEST(EstimatorTest, TwoBucketsMixedHistogram) { - // Data set of mixed data types: 20 integers and 80 strings. - // Histogram with one bucket per data type. - std::vector data{{100, 3.0, 17.0, 9.0}, {"pqr", 5.0, 75.0, 25.0}}; - const ScalarHistogram hist = createHistogram(data); - const ArrayHistogram arrHist( - hist, TypeCounts{{value::TypeTags::NumberInt64, 20}, {value::TypeTags::StringSmall, 80}}); - - ASSERT_EQ(100.0, getTotals(hist).card); - - // Estimates with the bucket bounds. - ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); - ASSERT_EQ(17.0, estimateIntValCard(hist, 100, EstimationType::kLess)); - ASSERT_EQ(80.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); - - auto [tag, value] = value::makeNewString("pqr"_sd); - value::ValueGuard vg(tag, value); - double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_EQ(5.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_EQ(95.0, expectedCard); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_EQ(0.0, expectedCard); - - // Estimates for a value smaller than the first bucket bound. - ASSERT_APPROX_EQUAL(1.88, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound); - ASSERT_APPROX_EQUAL(6.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound); - ASSERT_APPROX_EQUAL( - 8.49, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound); - ASSERT_APPROX_EQUAL(91.5, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound); - ASSERT_APPROX_EQUAL( - 93.39, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound); - - // Estimates for a value between bucket bounds. - ASSERT_EQ(0.0, estimateIntValCard(hist, 105, EstimationType::kEqual)); - - std::tie(tag, value) = value::makeNewString("a"_sd); - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(54.5, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(57.5, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; - ASSERT_APPROX_EQUAL(42.5, expectedCard, kErrorBound); - expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; - ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound); - - // Range estimates, including min/max values per data type. - const auto [tagLowDbl, valLowDbl] = - std::make_pair(value::TypeTags::NumberDouble, - value::bitcastFrom(std::numeric_limits::quiet_NaN())); - const auto [tagHighInt, valHighInt] = - std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(1000000)); - - // [NaN, 25]. 
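- // Illustrative note (annotation): NaN is the minimum numeric value in the sort
- // order, so this range covers all integers <= 25. With no bucket below 100, the
- // estimate falls back to roughly half of the bucket's range frequency:
- // 17 / 2 ~ 8.5.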
- expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowDbl, - valLowDbl, - true /* highInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(25), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(8.49, expectedCard, kErrorBound); - - // [25, 1000000]. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(25), - true /* highInclusive */, - tagHighInt, - valHighInt, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(13.38, expectedCard, kErrorBound); - - // [NaN, 1000000]. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowDbl, - valLowDbl, - true /* highInclusive */, - tagHighInt, - valHighInt, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound); - - const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); - value::ValueGuard vgLowStr(tagLowStr, valLowStr); - - // [NaN, ""). - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowDbl, - valLowDbl, - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound); - - // [25, ""). - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - value::TypeTags::NumberInt32, - value::bitcastFrom(25), - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(13.39, expectedCard, kErrorBound); - - // ["", "a"]. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tag, - value, - true /* includeScalar */); - - ASSERT_APPROX_EQUAL(37.49, expectedCard, kErrorBound); - - // ["", {}). - auto [tagObj, valObj] = value::makeNewObject(); - value::ValueGuard vgObj(tagObj, valObj); - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - false /* highInclusive */, - tagObj, - valObj, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(80.0, expectedCard, kErrorBound); - - // ["a", {}). - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tag, - value, - false /* highInclusive */, - tagObj, - valObj, - true /* includeScalar */); - - ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound); -} - -// TODO: enable the following test after SERVER-71376 Fix histogram generation on MacOs -#if 0 -/** - * Tests for cardinality estimates for queries over minimum values of date, timestamp, and objectId - * types. When the histogram has at least 2 buckets per data type, the minimum value, if present in - * the data, is picked as a bound for the first bucket for the corresponding data type. In this case - * the cardinality estimates are precise. To test the approximate estimation, we force the histogram - * generation to use one bucket per type (except the first numeric type). 
- */ -TEST(EstimatorTest, MinValueMixedHistogramFromData) { - const int64_t startInstant = 1506777923000LL; - const int64_t endInstant = 1516864323000LL; - const Timestamp startTs{Seconds(1516864323LL), 0}; - const Timestamp endTs{Seconds(1526864323LL), 0}; - const auto startOid = OID("63340d8d27afef2de7357e8d"); - // const auto endOid = OID("63340dbed6cd8af737d4139a"); - - std::vector data; - data.emplace_back(value::TypeTags::Date, value::bitcastFrom(startInstant)); - data.emplace_back(value::TypeTags::Date, value::bitcastFrom(endInstant)); - - data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom(startTs.asULL())); - data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom(endTs.asULL())); - - auto [tag, val] = makeInt64Value(100); - data.emplace_back(tag, val); - std::tie(tag, val) = makeInt64Value(1000); - data.emplace_back(tag, val); - - auto [strTag, strVal] = value::makeNewString("abc"_sd); - value::ValueGuard strVG(strTag, strVal); - auto [copyTag, copyVal] = value::copyValue(strTag, strVal); - data.emplace_back(copyTag, copyVal); - std::tie(strTag, strVal) = value::makeNewString("xyz"_sd); - std::tie(copyTag, copyVal) = value::copyValue(strTag, strVal); - data.emplace_back(copyTag, copyVal); - - auto [objTag, objVal] = value::makeNewObjectId(); - value::ValueGuard objVG(objTag, objVal); - startOid.view().readInto(value::getObjectIdView(objVal)); - std::tie(tag, val) = copyValue(objTag, objVal); - data.emplace_back(tag, val); - /* TODO: add another objectId value when mapping to double is fixed by SERVER-71205. - endOid.view().readInto(value::getObjectIdView(objVal)); - std::tie(tag, val) = copyValue(objTag, objVal); - data.emplace_back(tag, val); - */ - - sortValueVector(data); - - // Force each type except numbers to use a single bucket. This way there is no bucket for the - // min value if present in the data and it needs to be estimated. - const ScalarHistogram& hist = makeHistogram(data, 6); - // Mixed data are sorted in the histogram according to the BSON order as defined in bsontypes.h - // the canonicalizeBSONTypeUnsafeLookup function. - if constexpr (kCETestLogOnly) { - std::cout << printValueArray(data) << "\n"; - std::cout << "Mixed types " << hist.dump(); - } - - // Minimum ObjectId. - auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId); - auto [minOidTag, minOidVal] = minOid->cast()->get(); - double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - - // Minimum date. - const auto&& [minDate, inclDate] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date); - const auto [minDateTag, minDateVal] = minDate->cast()->get(); - expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - - // Minimum timestamp. - auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp); - auto [minTsTag, minTsVal] = minTs->cast()->get(); - expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - - // Add minimum values to the data set and create another histogram. 
- const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); - value::ValueGuard vgLowStr(tagLowStr, valLowStr); - std::tie(copyTag, copyVal) = value::copyValue(tagLowStr, valLowStr); - data.emplace_back(copyTag, copyVal); - data.emplace_back(minDateTag, minDateVal); - data.emplace_back(minTsTag, minTsVal); - - sortValueVector(data); - const ScalarHistogram& hist2 = makeHistogram(data, 6); - if constexpr (kCETestLogOnly) { - std::cout << printValueArray(data) << "\n"; - std::cout << "Mixed types " << hist2.dump(); - } - - // Precise estimate for equality to empty string, it is a bucket boundary. - expectedCard = estimate(hist2, tagLowStr, valLowStr, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - // Equality to the minimum date/ts value is estimated by range_frequency/NDV. - expectedCard = estimate(hist2, minDateTag, minDateVal, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - expectedCard = estimate(hist2, minTsTag, minTsVal, EstimationType::kEqual).card; - ASSERT_EQ(1.0, expectedCard); - - // Inequality predicates using min values. - const ArrayHistogram arrHist(hist2, - TypeCounts{ - {value::TypeTags::NumberInt64, 2}, - {value::TypeTags::StringSmall, 3}, - {value::TypeTags::ObjectId, 1}, - {value::TypeTags::Date, 3}, - {value::TypeTags::Timestamp, 3}, - }); - // [minDate, startInstant], estimated by the half of the date bucket. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minDateTag, - minDateVal, - true /* highInclusive */, - value::TypeTags::Date, - value::bitcastFrom(startInstant), - true /* includeScalar */); - ASSERT_EQ(1.0, expectedCard); - - // [minDate, endInstant], estimated by the entire date bucket. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minDateTag, - minDateVal, - true /* highInclusive */, - value::TypeTags::Date, - value::bitcastFrom(endInstant), - true /* includeScalar */); - ASSERT_EQ(3.0, expectedCard); - - // [minDate, minTs), estimated by the entire date bucket. - // (is this interval possible or is it better to have maxDate upper bound?). - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minDateTag, - minDateVal, - false /* highInclusive */, - minTsTag, - minTsVal, - true /* includeScalar */); - ASSERT_EQ(3.0, expectedCard); - - // [minTs, startTs], estimated by the half of the timestamp bucket. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minTsTag, - minTsVal, - true /* highInclusive */, - value::TypeTags::Timestamp, - value::bitcastFrom(startTs.asULL()), - true /* includeScalar */); - ASSERT_EQ(1.0, expectedCard); - - // [minTs, endTs], estimated by the entire timestamp bucket. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minTsTag, - minTsVal, - true /* highInclusive */, - value::TypeTags::Timestamp, - value::bitcastFrom(endTs.asULL()), - true /* includeScalar */); - ASSERT_EQ(3.0, expectedCard); - - // [minTs, maxTs], estimated by the entire timestamp bucket. 
- auto&& [maxTs, inclMaxTs] = getMinMaxBoundForType(false /*isMin*/, value::TypeTags::Timestamp); - const auto [maxTsTag, maxTsVal] = maxTs->cast()->get(); - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minTsTag, - minTsVal, - true /* highInclusive */, - maxTsTag, - maxTsVal, - true /* includeScalar */); - ASSERT_EQ(3.0, expectedCard); -} -#endif - -TEST(EstimatorTest, MinValueMixedHistogramFromBuckets) { - const auto endOid = OID("63340dbed6cd8af737d4139a"); - const auto endDate = Date_t::fromMillisSinceEpoch(1526864323000LL); - const Timestamp endTs{Seconds(1526864323LL), 0}; - - std::vector data{ - {0, 1.0, 0.0, 0.0}, - {100, 4.0, 95.0, 30.0}, - {"xyz", 5.0, 95.0, 25.0}, - {Value(endOid), 5.0, 95.0, 50.0}, - {Value(endDate), 4.0, 96.0, 24.0}, - {Value(endTs), 5.0, 95.0, 50.0}, - }; - const ScalarHistogram hist = createHistogram(data); - if constexpr (kCETestLogOnly) { - std::cout << "Mixed types " << hist.dump(); - } - ASSERT_EQ(500.0, getTotals(hist).card); - - // Minimum ObjectId. - auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId); - auto [minOidTag, minOidVal] = minOid->cast()->get(); - double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound); - - // Minimum date. - const auto&& [minDate, inclDate] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date); - const auto [minDateTag, minDateVal] = minDate->cast()->get(); - expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card; - ASSERT_EQ(4.0, expectedCard); - - // Minimum timestamp. - auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp); - auto [minTsTag, minTsVal] = minTs->cast()->get(); - expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound); - - // Inequality predicates using min values. - const ArrayHistogram arrHist(hist, - TypeCounts{ - {value::TypeTags::NumberInt64, 100}, - {value::TypeTags::StringSmall, 100}, - {value::TypeTags::ObjectId, 100}, - {value::TypeTags::Date, 100}, - {value::TypeTags::Timestamp, 100}, - }); - // [minDate, innerDate], estimated by the half of the date bucket. - const int64_t innerDate = 1516864323000LL; - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minDateTag, - minDateVal, - true /* highInclusive */, - value::TypeTags::Date, - value::bitcastFrom(innerDate), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(48.0, expectedCard, kErrorBound); - - // [minTs, innerTs], estimated by the half of the timestamp bucket. - const Timestamp innerTs{Seconds(1516864323LL), 0}; - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - minTsTag, - minTsVal, - true /* highInclusive */, - value::TypeTags::Timestamp, - value::bitcastFrom(innerTs.asULL()), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(47.5, expectedCard, kErrorBound); -} -} // namespace -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_generated_histograms_test.cpp b/src/mongo/db/query/ce/ce_generated_histograms_test.cpp deleted file mode 100644 index 93696346447..00000000000 --- a/src/mongo/db/query/ce/ce_generated_histograms_test.cpp +++ /dev/null @@ -1,363 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include -#include - -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { -namespace { - -using namespace sbe; - -constexpr double kErrorBound = 0.1; - -TEST(EstimatorTest, UniformIntStrEstimate) { - /* The code in this comment generates a dataset and creates the histogram used in this test. To - recreate the data set and the histogram, place this code in a unit test which uses the utilities - from rand_utils_new.cpp. - - constexpr int minLen = 3, maxLen = 5; - constexpr int minVal = 0, maxVal = 1000; - constexpr size_t dataSize = 1000; - constexpr size_t nBuckets = std::min(20UL, dataSize); - - MixedDistributionDescriptor dd{{DistrType::kUniform, 1.0}}; - TypeDistrVector td; - td.emplace_back(std::make_unique(dd, 0.5, 250, minVal, maxVal)); - td.emplace_back(std::make_unique(dd, 0.5, 250, minLen, maxLen)); - - std::mt19937_64 gen(0); - DatasetDescriptorNew desc{std::move(td), gen}; - - std::vector dataset; - dataset = desc.genRandomDataset(dataSize); - - const ScalarHistogram& hist = makeHistogram(dataset, nBuckets); - */ - - std::vector data{ - {2, 5, 0, 0}, {57, 4, 21, 12}, {159, 4, 59, 24}, {172, 5, 0, 0}, - {184, 4, 2, 2}, {344, 4, 73, 32}, {363, 4, 1, 1}, {420, 3, 16, 10}, - {516, 2, 49, 23}, {758, 4, 113, 54}, {931, 5, 104, 41}, {998, 4, 29, 12}, - {"3vL", 6, 30, 11}, {"9WUk", 1, 59, 24}, {"HraK", 4, 56, 26}, {"Zujbu", 1, 130, 64}, - {"kEr", 5, 80, 40}, {"rupc", 6, 44, 21}, {"up1O", 5, 16, 7}, {"ztf", 5, 37, 17}}; - - const ScalarHistogram hist = createHistogram(data); - const ArrayHistogram arrHist( - hist, TypeCounts{{value::TypeTags::NumberInt64, 515}, {value::TypeTags::StringSmall, 485}}); - - const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); - value::ValueGuard vgLowStr(tagLowStr, valLowStr); - const auto [tagAbc, valAbc] = value::makeNewString("abc"_sd); - value::ValueGuard vg(tagAbc, valAbc); - auto [tagObj, valObj] = value::makeNewObject(); - value::ValueGuard vgObj(tagObj, valObj); - - // Predicates over bucket bound. 
- // Actual cardinality {$eq: 804} = 2. - double expectedCard = estimateIntValCard(hist, 804, EstimationType::kEqual); - ASSERT_APPROX_EQUAL(2.5, expectedCard, kErrorBound); - - // Actual cardinality {$lt: 100} = 40. - expectedCard = estimateIntValCard(hist, 100, EstimationType::kLess); - ASSERT_APPROX_EQUAL(52.4, expectedCard, kErrorBound); - - // Range query crossing the type brackets. - // Actual cardinality {$gt: 100} = 475. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - value::TypeTags::NumberInt64, - value::bitcastFrom(100), - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(460.1, expectedCard, kErrorBound); - - // Actual cardinality {$lt: 'abc'} = 291. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tagAbc, - valAbc, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(319.9, expectedCard, kErrorBound); - - // Actual cardinality {$gte: 'abc'} = 194. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagAbc, - valAbc, - false /* highInclusive */, - tagObj, - valObj, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(167.0, expectedCard, kErrorBound); - - // Queries over the low string bound. - // Actual cardinality {$eq: ''} = 0. - expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true); - ASSERT_APPROX_EQUAL(2.727, expectedCard, 0.001); - - // Actual cardinality {$gt: ''} = 485. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagLowStr, - valLowStr, - false /* highInclusive */, - tagObj, - valObj, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(485, expectedCard, 0.001); -} - -TEST(EstimatorTest, IntStrArrayEstimate) { - /* The code in this comment generates a dataset of 1000 integers, strings and arrays of integers - and strings and creates the histogram used in this test. To recreate the data set and the - histogram, place this code in a unit test which uses the utilities from rand_utils_new.cpp. 
- - constexpr int minLen = 2, maxLen = 5; - constexpr int minVal = 0, maxVal = 1000; - constexpr size_t dataSize = 1000; - constexpr size_t nBuckets = std::min(20UL, dataSize); - - MixedDistributionDescriptor dd{{DistrType::kUniform, 1.0}}; - TypeDistrVector td1; - td1.emplace_back(std::make_unique(dd, 0.7, 200, minVal, maxVal)); - td1.emplace_back(std::make_unique(dd, 0.3, 100, minLen, maxLen)); - - std::mt19937_64 gen(5); - auto desc1 = std::make_unique(std::move(td1), gen); - - TypeDistrVector td2; - td2.emplace_back(std::make_unique(dd, 0.4, 200, minVal, maxVal)); - td2.emplace_back(std::make_unique(dd, 0.3, 200, minLen, maxLen)); - td2.emplace_back(std::make_unique(dd, 0.3, 200, 2, 6, std::move(desc1), - 0.0)); - - DatasetDescriptorNew desc{std::move(td2), gen}; - std::vector dataset; - dataset = desc.genRandomDataset(dataSize); - - const ScalarHistogram& hist = makeHistogram(dataset, nBuckets); - */ - - std::vector scalarData{ - {10, 1, 0, 0}, {11, 4, 0, 0}, {44, 2, 5, 2}, {213, 3, 40, 20}, - {256, 5, 13, 6}, {270, 3, 9, 2}, {407, 3, 56, 28}, {510, 3, 32, 16}, - {524, 3, 0, 0}, {561, 5, 16, 8}, {583, 3, 4, 3}, {599, 3, 1, 1}, - {663, 5, 19, 9}, {681, 5, 6, 2}, {873, 5, 75, 37}, {909, 4, 16, 7}, - {994, 3, 36, 14}, {"9TcY", 4, 44, 23}, {"Zow00", 5, 134, 67}, {"zsS", 2, 130, 66}, - }; - - const ScalarHistogram scalarHist = createHistogram(scalarData); - - std::vector minData{ - {12, 5, 0, 0}, {17, 8, 0, 0}, {28, 7, 7, 1}, {55, 5, 22, 5}, - {110, 5, 45, 11}, {225, 4, 43, 15}, {563, 3, 98, 36}, {643, 4, 3, 2}, - {701, 4, 9, 5}, {845, 1, 6, 4}, {921, 2, 0, 0}, {980, 1, 0, 0}, - {"1l", 9, 16, 4}, {"8YN", 4, 19, 5}, {"PE2OO", 2, 41, 15}, {"WdJ", 8, 25, 7}, - {"dKb7", 9, 17, 6}, {"msdP", 12, 25, 10}, {"t7wmp", 5, 15, 6}, {"yx", 2, 13, 4}, - }; - - const ScalarHistogram minHist = createHistogram(minData); - - std::vector maxData{ - {26, 2, 0, 0}, {79, 3, 0, 0}, {147, 1, 0, 0}, {207, 2, 0, 0}, - {362, 6, 7, 5}, {563, 3, 47, 19}, {603, 9, 2, 1}, {676, 6, 21, 10}, - {702, 6, 9, 4}, {712, 6, 0, 0}, {759, 8, 4, 1}, {774, 6, 3, 1}, - {831, 9, 28, 9}, {948, 7, 51, 15}, {981, 3, 33, 8}, {"9Iey", 4, 20, 8}, - {"Ji", 3, 21, 8}, {"WdJ", 9, 26, 10}, {"msdP", 9, 59, 20}, {"zbI", 3, 68, 16}, - }; - - const ScalarHistogram maxHist = createHistogram(maxData); - - std::vector uniqueData{ - {12, 5, 0, 0}, {28, 8, 15, 2}, {55, 8, 23, 5}, {110, 5, 59, 12}, - {225, 8, 79, 18}, {362, 8, 88, 20}, {507, 10, 165, 36}, {572, 5, 25, 6}, - {603, 12, 25, 3}, {712, 6, 106, 19}, {759, 11, 17, 4}, {774, 6, 3, 1}, - {831, 14, 50, 13}, {981, 3, 105, 25}, {"547DP", 4, 43, 9}, {"9Iey", 4, 8, 1}, - {"WdJ", 9, 85, 26}, {"ZGYcw", 2, 14, 4}, {"msdP", 14, 80, 21}, {"zbI", 3, 74, 17}, - }; - - const ScalarHistogram uniqueHist = createHistogram(uniqueData); - - TypeCounts typeCounts{{value::TypeTags::NumberInt64, 388}, - {value::TypeTags::StringSmall, 319}, - {value::TypeTags::Array, 293}}; - TypeCounts arrayTypeCounts{{value::TypeTags::NumberInt64, 874}, - {value::TypeTags::StringSmall, 340}}; - const ArrayHistogram arrHist(scalarHist, - typeCounts, - uniqueHist, - minHist, - maxHist, - arrayTypeCounts, - 0 /* No empty arrays */); - - const auto [tagLowDbl, valLowDbl] = - std::make_pair(value::TypeTags::NumberDouble, - value::bitcastFrom(std::numeric_limits::quiet_NaN())); - const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); - value::ValueGuard vgLowStr(tagLowStr, valLowStr); - - // Actual cardinality {$lt: 100} = 115. 
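- // Illustrative note (annotation): with includeScalar = true the estimate accounts
- // for both scalar values and array elements; the estimate of ~109.9 against an
- // actual cardinality of 115 is a relative error of roughly 4%.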
- double expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagLowDbl, - valLowDbl, - false /* highInclusive */, - value::TypeTags::NumberInt64, - value::bitcastFrom(100), - true /* includeScalar */); - ASSERT_APPROX_EQUAL(109.9, expectedCard, kErrorBound); - - // Actual cardinality {$gt: 502} = 434. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - value::TypeTags::NumberInt64, - value::bitcastFrom(500), - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(443.8, expectedCard, kErrorBound); - - // Actual cardinality {$gte: 502} = 437. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - value::TypeTags::NumberInt64, - value::bitcastFrom(500), - false /* highInclusive */, - tagLowStr, - valLowStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(448.3, expectedCard, kErrorBound); - - // Actual cardinality {$eq: ''} = 0. - expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true /* includeScalar */); - ASSERT_APPROX_EQUAL(6.69, expectedCard, 0.001); - - // Actual cardinality {$eq: 'DD2'} = 2. - auto [tagStr, valStr] = value::makeNewString("DD2"_sd); - value::ValueGuard vg(tagStr, valStr); - expectedCard = estimateCardEq(arrHist, tagStr, valStr, true /* includeScalar */); - ASSERT_APPROX_EQUAL(5.27, expectedCard, kErrorBound); - - // Actual cardinality {$lte: 'DD2'} = 120. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tagStr, - valStr, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(160.6, expectedCard, kErrorBound); - - // Actual cardinality {$gt: 'DD2'} = 450. - auto [tagObj, valObj] = value::makeNewObject(); - value::ValueGuard vgObj(tagObj, valObj); - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagStr, - valStr, - false /* highInclusive */, - tagObj, - valObj, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(411.2, expectedCard, kErrorBound); - - // Queries with $elemMatch. - const auto [tagInt, valInt] = - std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(603)); - - // Actual cardinality {$match: {a: {$elemMatch: {$eq: 603}}}} = 12. - expectedCard = estimateCardEq(arrHist, tagInt, valInt, false /* includeScalar */); - ASSERT_APPROX_EQUAL(12.0, expectedCard, kErrorBound); - - // Actual cardinality {$match: {a: {$elemMatch: {$lte: 603}}}} = 252. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - tagLowDbl, - valLowDbl, - true /* highInclusive */, - tagInt, - valInt, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(293.0, expectedCard, kErrorBound); - - // Actual cardinality {$match: {a: {$elemMatch: {$gte: 603}}}} = 200. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagInt, - valInt, - false /* highInclusive */, - tagLowStr, - valLowStr, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(250.8, expectedCard, kErrorBound); - - // Actual cardinality {$match: {a: {$elemMatch: {$eq: 'cu'}}}} = 7. - std::tie(tagStr, valStr) = value::makeNewString("cu"_sd); - expectedCard = estimateCardEq(arrHist, tagStr, valStr, false /* includeScalar */); - ASSERT_APPROX_EQUAL(3.8, expectedCard, kErrorBound); - - // Actual cardinality {$match: {a: {$elemMatch: {$gte: 'cu'}}}} = 125. 
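- // Illustrative note (annotation): with includeScalar = false ($elemMatch
- // semantics) the scalar histogram is excluded and the bounds are estimated
- // against the array-based histograms instead.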
- expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagStr, - valStr, - false /* highInclusive */, - tagObj, - valObj, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(109.7, expectedCard, kErrorBound); - - // Actual cardinality {$match: {a: {$elemMatch: {$lte: 'cu'}}}} = 141. - expectedCard = estimateCardRange(arrHist, - true /* lowInclusive */, - tagLowStr, - valLowStr, - true /* highInclusive */, - tagStr, - valStr, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(156.1, expectedCard, kErrorBound); -} -} // namespace -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_heuristic.cpp b/src/mongo/db/query/ce/ce_heuristic.cpp deleted file mode 100644 index dcdc6e698e4..00000000000 --- a/src/mongo/db/query/ce/ce_heuristic.cpp +++ /dev/null @@ -1,611 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/ce_heuristic.h" - -#include "mongo/db/query/optimizer/cascades/memo.h" -#include "mongo/db/query/optimizer/utils/ce_math.h" -#include "mongo/util/assert_util.h" - -namespace mongo::ce { -namespace { -namespace cascades = optimizer::cascades; -namespace properties = optimizer::properties; - -using ABT = optimizer::ABT; -using CEType = optimizer::CEType; -using LogicalProps = properties::LogicalProps; -using Memo = cascades::Memo; -using Metadata = optimizer::Metadata; - -// Invalid estimate - an arbitrary negative value used for initialization. -constexpr SelectivityType kInvalidSel = -1.0; - -constexpr SelectivityType kDefaultFilterSel = 0.1; -constexpr SelectivityType kDefaultExistsSel = 0.70; - -// The selectivities used in the piece-wise function for open-range intervals. -// Note that we assume a smaller input cardinality will result in a less selective range. -constexpr SelectivityType kSmallCardOpenRangeSel = 0.70; -constexpr SelectivityType kMediumCardOpenRangeSel = 0.45; -constexpr SelectivityType kLargeCardOpenRangeSel = 0.33; - -// The selectivities used in the piece-wise function for closed-range intervals. -// Note that we assume a smaller input cardinality will result in a less selective range. 
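// (Illustration, using the constants below: a closed range keeps 50% of a 9-document
// input but only 20% of a 10,000-document input.)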
-constexpr SelectivityType kSmallCardClosedRangeSel = 0.50;
-constexpr SelectivityType kMediumCardClosedRangeSel = 0.33;
-constexpr SelectivityType kLargeCardClosedRangeSel = 0.20;
-
-// Global and Local selectivity should multiply to the Complete selectivity.
-constexpr SelectivityType kDefaultCompleteGroupSel = 0.01;
-constexpr SelectivityType kDefaultLocalGroupSel = 0.02;
-constexpr SelectivityType kDefaultGlobalGroupSel = 0.5;
-
-// The following constants are the steps used in the piece-wise functions that select selectivities
-// based on input cardinality.
-constexpr CEType kSmallLimit = 20.0;
-constexpr CEType kMediumLimit = 100.0;
-
-// Assumed average number of elements in an array.
-constexpr CEType kDefaultAverageArraySize = 10.0;
-
-/**
- * Default selectivity of equalities. To avoid extremely small selectivities for small
- * cardinalities, which would result in zero cardinality for many small inputs, the
- * estimate is scaled as inputCard grows. The bigger inputCard, the smaller the
- * selectivity.
- */
-SelectivityType equalitySel(const CEType inputCard) {
-    uassert(6716604, "Zero cardinality must be handled by the caller.", inputCard > 0.0);
-    if (inputCard <= 1.0) {
-        // If the input has at most one value, a condition cannot reduce it any further.
-        return 1.0;
-    }
-    return std::sqrt(inputCard) / inputCard;
-}
-
-/**
- * Default selectivity of intervals with bounds on both ends. These intervals are
- * considered less selective than equalities.
- * Examples: (a > 'abc' AND a < 'hta'), (0 < b <= 13)
- */
-SelectivityType closedRangeSel(const CEType inputCard) {
-    SelectivityType sel = kInvalidSel;
-    if (inputCard < kSmallLimit) {
-        sel = kSmallCardClosedRangeSel;
-    } else if (inputCard < kMediumLimit) {
-        sel = kMediumCardClosedRangeSel;
-    } else {
-        sel = kLargeCardClosedRangeSel;
-    }
-    return sel;
-}
-
-/**
- * Default selectivity of intervals open on one end. These intervals are
- * considered less selective than those with both ends specified by the user query.
- * Examples: (a > 'xyz'), (b <= 13)
- */
-SelectivityType openRangeSel(const CEType inputCard) {
-    SelectivityType sel = kInvalidSel;
-    if (inputCard < kSmallLimit) {
-        sel = kSmallCardOpenRangeSel;
-    } else if (inputCard < kMediumLimit) {
-        sel = kMediumCardOpenRangeSel;
-    } else {
-        sel = kLargeCardOpenRangeSel;
-    }
-    return sel;
-}
-
-mongo::sbe::value::TypeTags constType(const Constant* constBoundPtr) {
-    if (constBoundPtr == nullptr) {
-        return mongo::sbe::value::TypeTags::Nothing;
-    }
-    const auto [tag, val] = constBoundPtr->get();
-    return tag;
-}
-
-mongo::sbe::value::TypeTags boundType(const BoundRequirement& bound) {
-    return constType(bound.getBound().cast<Constant>());
-}
-
-SelectivityType intervalSel(const IntervalRequirement& interval, const CEType inputCard) {
-    SelectivityType sel = kInvalidSel;
-    if (interval.isFullyOpen()) {
-        sel = 1.0;
-    } else if (interval.isEquality()) {
-        sel = equalitySel(inputCard);
-    } else if (interval.getHighBound().isPlusInf() || interval.getLowBound().isMinusInf() ||
-               boundType(interval.getLowBound()) != boundType(interval.getHighBound())) {
-        // The interval has an actual bound only on one of its ends if:
-        // - one of the bounds is infinite, or
-        // - the bounds are of different types - this is the case when, due to type bracketing,
-        //   one of the bounds is the lowest/highest value of the previous/next type.
-        // TODO: Notice that sometimes type bracketing uses a min/max value from the same type,
-        // so sometimes we may not detect an open-ended interval.
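        // Illustration (not normative), assuming the defaults above with inputCard = 1000:
        //   equality                    -> sqrt(1000)/1000 ~= 0.0316
        //   one-sided or type-bracketed -> kLargeCardOpenRangeSel   = 0.33
        //   closed, same-type bounds    -> kLargeCardClosedRangeSel = 0.20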
-        sel = openRangeSel(inputCard);
-    } else {
-        sel = closedRangeSel(inputCard);
-    }
-    uassert(6716603, "Invalid selectivity.", validSelectivity(sel));
-    return sel;
-}
-
-SelectivityType negationSel(SelectivityType sel) {
-    return 1.0 - sel;
-}
-
-SelectivityType operationSel(const Operations op, const CEType inputCard) {
-    switch (op) {
-        case Operations::Eq:
-            return equalitySel(inputCard);
-        case Operations::Neq:
-            return negationSel(equalitySel(inputCard));
-        case Operations::EqMember:
-            // Reached when the query has $in. We don't handle it yet.
-            return kDefaultFilterSel;
-        case Operations::Gt:
-        case Operations::Gte:
-        case Operations::Lt:
-        case Operations::Lte:
-            return openRangeSel(inputCard);
-        default:
-            MONGO_UNREACHABLE;
-    }
-}
-
-SelectivityType intervalSel(const PathCompare& left,
-                            const PathCompare& right,
-                            const CEType inputCard) {
-    if (left.op() == Operations::EqMember || right.op() == Operations::EqMember) {
-        // Reached when the query has $in. We don't handle it yet.
-        return kDefaultFilterSel;
-    }
-
-    bool lowBoundUnknown = false;
-    bool highBoundUnknown = false;
-    boost::optional<mongo::sbe::value::TypeTags> lowBoundType;
-    boost::optional<mongo::sbe::value::TypeTags> highBoundType;
-
-    for (const auto& compare : {left, right}) {
-        switch (compare.op()) {
-            case Operations::Eq: {
-                // This branch is reached when we have a conjunction of equalities on the same path.
-                uassert(6777601,
-                        "Expected conjunction of equalities.",
-                        left.op() == Operations::Eq && right.op() == Operations::Eq);
-
-                const auto leftConst = left.getVal().cast<Constant>();
-                const auto rightConst = right.getVal().cast<Constant>();
-                if (leftConst && rightConst && !(*leftConst == *rightConst)) {
-                    // Equality comparison on different constants is a contradiction.
-                    return 0.0;
-                }
-                // We can't tell if the equalities result in a contradiction or not, so we use the
-                // default equality selectivity.
-                return equalitySel(inputCard);
-            }
-            case Operations::Gt:
-            case Operations::Gte:
-                lowBoundUnknown = lowBoundUnknown || compare.getVal().is<Variable>();
-                lowBoundType = constType(compare.getVal().cast<Constant>());
-                break;
-            case Operations::Lt:
-            case Operations::Lte:
-                highBoundUnknown = highBoundUnknown || compare.getVal().is<Variable>();
-                highBoundType = constType(compare.getVal().cast<Constant>());
-                break;
-            default:
-                MONGO_UNREACHABLE;
-        }
-    }
-
-    if (lowBoundType && highBoundType &&
-        (lowBoundType == highBoundType || lowBoundUnknown || highBoundUnknown)) {
-        // The interval is closed only if:
-        // - it has both a low and a high bound, and
-        // - the bounds are of the same type.
-        //
-        // If the bounds are of different types, it implies that one bound is the
-        // lowest/highest value of the previous/next type and has been added for type bracketing
-        // purposes. We treat such bounds as infinity.
-        //
-        // If there are unknown boundaries (Variables), we assume that they are of the same type
-        // as the other bound.
-        //
-        // TODO: Notice that sometimes type bracketing uses a min/max value from the same type,
-        // so sometimes we may not detect an open-ended interval.
-        return closedRangeSel(inputCard);
-    }
-
-    if (lowBoundType || highBoundType) {
-        return openRangeSel(inputCard);
-    }
-
-    MONGO_UNREACHABLE;
-}
-
-/**
- * Heuristic selectivity estimation for EvalFilter nodes. Used for estimating cardinalities of
- * FilterNodes. The estimate is computed by traversing the tree bottom-up, applying default
- * selectivity functions to atomic predicates (comparisons), and combining child selectivities:
- * multiplication for conjunctions and inclusion-exclusion addition for disjunctions.
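 *
 * Worked example (illustrative, using the defaults above): over 10,000 input documents
 * each open-range comparison gets selectivity 0.33, so a conjunction of two such
 * comparisons estimates 0.33 * 0.33 * 10000 ~= 1089 documents, while the corresponding
 * disjunction estimates (0.33 + 0.33 - 0.33 * 0.33) * 10000 ~= 5511 documents.
 * A hypothetical caller, using only names defined in this file, would be:
 *
 *   const SelectivityType sel = EvalFilterSelectivityTransport::derive(inputCard, filter.ref());
 *   const CEType cardinality = sel * inputCard;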
- */
-class EvalFilterSelectivityTransport {
-public:
-    /**
-     * Helper class for holding values passed from child to parent nodes when traversing the tree.
-     */
-    struct EvalFilterSelectivityResult {
-        // Each item represents a field in a dotted path.
-        // Collected while traversing a path expression.
-        // Used for deciding whether a conjunction of comparisons is an interval or not.
-        FieldPathType path;
-        // When handling a PathComposeM, we need to access its child comparisons, which might be
-        // hidden under path expressions.
-        const PathCompare* compare;
-        // The selectivity estimate.
-        SelectivityType selectivity;
-    };
-
-    EvalFilterSelectivityResult transport(const EvalFilter& /*node*/,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult pathResult,
-                                          EvalFilterSelectivityResult /*inputResult*/) {
-        return pathResult;
-    }
-
-    EvalFilterSelectivityResult transport(const PathGet& node,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult childResult) {
-        childResult.path.push_back(node.name());
-        return childResult;
-    }
-
-    EvalFilterSelectivityResult transport(const PathTraverse& node,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult childResult) {
-        return childResult;
-    }
-
-    EvalFilterSelectivityResult transport(const PathCompare& node,
-                                          CEType inputCard,
-                                          EvalFilterSelectivityResult /*childResult*/) {
-        // Note that the result will be ignored if this operation is part of an interval.
-        const SelectivityType sel = operationSel(node.op(), inputCard);
-        return {{}, &node, sel};
-    }
-
-    EvalFilterSelectivityResult transport(const PathComposeM& node,
-                                          CEType inputCard,
-                                          EvalFilterSelectivityResult leftChildResult,
-                                          EvalFilterSelectivityResult rightChildResult) {
-        const bool isInterval = leftChildResult.compare && rightChildResult.compare &&
-            leftChildResult.path == rightChildResult.path;
-
-        const SelectivityType sel = isInterval
-            ? intervalSel(*leftChildResult.compare, *rightChildResult.compare, inputCard)
-            : conjunctionSel(leftChildResult.selectivity, rightChildResult.selectivity);
-
-        return {{}, nullptr, sel};
-    }
-
-    EvalFilterSelectivityResult transport(const PathComposeA& node,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult leftChildResult,
-                                          EvalFilterSelectivityResult rightChildResult) {
-        const SelectivityType sel =
-            disjunctionSel(leftChildResult.selectivity, rightChildResult.selectivity);
-
-        return {{}, nullptr, sel};
-    }
-
-    EvalFilterSelectivityResult transport(const UnaryOp& node,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult childResult) {
-        switch (node.op()) {
-            case Operations::Not:
-                childResult.selectivity = negationSel(childResult.selectivity);
-                return childResult;
-            case Operations::Neg:
-                // If we see negation (-) in a UnaryOp, we ignore it for CE purposes.
-                return childResult;
-            default:
-                MONGO_UNREACHABLE;
-        }
-    }
-
-    EvalFilterSelectivityResult transport(const PathConstant& /*node*/,
-                                          CEType /*inputCard*/,
-                                          EvalFilterSelectivityResult childResult) {
-        return childResult;
-    }
-
-    EvalFilterSelectivityResult transport(const PathDefault& node,
-                                          CEType inputCard,
-                                          EvalFilterSelectivityResult childResult) {
-        if (node.getDefault() == Constant::boolean(false)) {
-            // We have a {$exists: true} predicate on this path if we have a Constant[false] child
-            // here. Note that {$exists: false} is handled by the presence of a negation expression
-            // higher in the ABT.
-            childResult.selectivity = kDefaultExistsSel;
-        }
-        return childResult;
-    }
-
-    template <typename T, typename... Ts>
-    EvalFilterSelectivityResult transport(const T& /*node*/, Ts&&...)
{ - return {{}, nullptr, kDefaultFilterSel}; - } - - static SelectivityType derive(const CEType inputCard, const ABT::reference_type ref) { - EvalFilterSelectivityTransport instance; - const auto result = algebra::transport(ref, instance, inputCard); - return result.selectivity; - } - -private: - SelectivityType negationSel(const SelectivityType in) { - return 1.0 - in; - } - - SelectivityType conjunctionSel(const SelectivityType left, const SelectivityType right) { - return left * right; - } - - SelectivityType disjunctionSel(const SelectivityType left, const SelectivityType right) { - // We sum the selectivities and subtract the overlapping part so that it's only counted - // once. - return left + right - left * right; - } -}; - -class CEHeuristicTransport { -public: - CEType transport(const ScanNode& node, CEType /*bindResult*/) { - // Default cardinality estimate. - const CEType metadataCE = _metadata._scanDefs.at(node.getScanDefName()).getCE(); - return (metadataCE < 0.0) ? kDefaultCard : metadataCE; - } - - CEType transport(const ValueScanNode& node, CEType /*bindResult*/) { - return node.getArraySize(); - } - - CEType transport(const MemoLogicalDelegatorNode& node) { - return properties::getPropertyConst( - _memo.getLogicalProps(node.getGroupId())) - .getEstimate(); - } - - CEType transport(const FilterNode& node, CEType childResult, CEType /*exprResult*/) { - if (childResult == 0.0) { - // Early out and return 0 since we don't expect to get more results. - return 0.0; - } - if (node.getFilter() == Constant::boolean(true)) { - // Trivially true filter. - return childResult; - } - if (node.getFilter() == Constant::boolean(false)) { - // Trivially false filter. - return 0.0; - } - - const SelectivityType sel = - EvalFilterSelectivityTransport::derive(childResult, node.getFilter().ref()); - - return std::max(sel * childResult, kMinCard); - } - - CEType transport(const EvaluationNode& node, CEType childResult, CEType /*exprResult*/) { - // Evaluations do not change cardinality. - return childResult; - } - - CEType transport(const SargableNode& node, - CEType childResult, - CEType /*bindsResult*/, - CEType /*refsResult*/) { - // Early out and return 0 since we don't expect to get more results. - if (childResult == 0.0) { - return 0.0; - } - - SelectivityType topLevelSel = 1.0; - std::vector topLevelSelectivities; - for (const auto& [key, req] : node.getReqMap()) { - if (req.getIsPerfOnly()) { - // Ignore perf-only requirements. - continue; - } - - SelectivityType disjSel = 1.0; - std::vector disjSelectivities; - // Intervals are in DNF. - const auto intervalDNF = req.getIntervals(); - const auto disjuncts = intervalDNF.cast()->nodes(); - for (const auto& disjunct : disjuncts) { - const auto& conjuncts = disjunct.cast()->nodes(); - SelectivityType conjSel = 1.0; - std::vector conjSelectivities; - for (const auto& conjunct : conjuncts) { - const auto& interval = conjunct.cast()->getExpr(); - const SelectivityType sel = intervalSel(interval, childResult); - conjSelectivities.push_back(sel); - } - conjSel = ce::conjExponentialBackoff(std::move(conjSelectivities)); - disjSelectivities.push_back(conjSel); - } - disjSel = ce::disjExponentialBackoff(std::move(disjSelectivities)); - topLevelSelectivities.push_back(disjSel); - } - - if (topLevelSelectivities.empty()) { - return 1.0; - } - // The elements of the PartialSchemaRequirements map represent an implicit conjunction. 
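        // As exercised by the unit tests in this patch, conjExponentialBackoff combines
        // the (ascending) selectivities as s0 * s1^(1/2) * s2^(1/4) * s3^(1/8), using at
        // most the four most selective terms; disjExponentialBackoff is assumed to apply
        // the same decay to the negated selectivities (see optimizer/utils/ce_math.h).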
- topLevelSel = ce::conjExponentialBackoff(std::move(topLevelSelectivities)); - CEType card = std::max(topLevelSel * childResult, kMinCard); - uassert(6716602, "Invalid cardinality.", mongo::ce::validCardinality(card)); - return card; - } - - CEType transport(const RIDIntersectNode& node, - CEType /*leftChildResult*/, - CEType /*rightChildResult*/) { - // CE for the group should already be derived via the underlying Filter or Evaluation - // logical nodes. - uasserted(6624038, "Should not be necessary to derive CE for RIDIntersectNode"); - } - - CEType transport(const RIDUnionNode& node, - CEType /*leftChildResult*/, - CEType /*rightChildResult*/) { - // CE for the group should already be derived via the underlying Filter or Evaluation - // logical nodes. - uasserted(7016301, "Should not be necessary to derive CE for RIDUnionNode"); - } - - CEType transport(const BinaryJoinNode& node, - CEType leftChildResult, - CEType rightChildResult, - CEType /*exprResult*/) { - const auto& filter = node.getFilter(); - - SelectivityType selectivity = kDefaultFilterSel; - if (filter == Constant::boolean(false)) { - selectivity = 0.0; - } else if (filter == Constant::boolean(true)) { - selectivity = 1.0; - } - return leftChildResult * rightChildResult * selectivity; - } - - CEType transport(const UnionNode& node, - std::vector childResults, - CEType /*bindResult*/, - CEType /*refsResult*/) { - // Combine the CE of each child. - CEType result = 0; - for (auto&& child : childResults) { - result += child; - } - return result; - } - - CEType transport(const GroupByNode& node, - CEType childResult, - CEType /*bindAggResult*/, - CEType /*refsAggResult*/, - CEType /*bindGbResult*/, - CEType /*refsGbResult*/) { - // TODO: estimate number of groups. - switch (node.getType()) { - case GroupNodeType::Complete: - return kDefaultCompleteGroupSel * childResult; - - // Global and Local selectivity should multiply to Complete selectivity. - case GroupNodeType::Global: - return kDefaultGlobalGroupSel * childResult; - case GroupNodeType::Local: - return kDefaultLocalGroupSel * childResult; - - default: - MONGO_UNREACHABLE; - } - } - - CEType transport(const UnwindNode& node, - CEType childResult, - CEType /*bindResult*/, - CEType /*refsResult*/) { - return kDefaultAverageArraySize * childResult; - } - - CEType transport(const CollationNode& node, CEType childResult, CEType /*refsResult*/) { - // Collations do not change cardinality. - return childResult; - } - - CEType transport(const LimitSkipNode& node, CEType childResult) { - const auto limit = node.getProperty().getLimit(); - const auto skip = node.getProperty().getSkip(); - const auto cardAfterSkip = std::max(childResult - skip, 0.0); - if (limit < cardAfterSkip) { - return limit; - } - return cardAfterSkip; - } - - CEType transport(const ExchangeNode& node, CEType childResult, CEType /*refsResult*/) { - // Exchanges do not change cardinality. - return childResult; - } - - CEType transport(const RootNode& node, CEType childResult, CEType /*refsResult*/) { - // Root node does not change cardinality. - return childResult; - } - - /** - * Other ABT types. - */ - template - CEType transport(const T& /*node*/, Ts&&...) 
{ - static_assert(!canBeLogicalNode(), "Logical node must implement its CE derivation."); - return 0.0; - } - - static CEType derive(const Metadata& metadata, - const Memo& memo, - const ABT::reference_type logicalNodeRef) { - CEHeuristicTransport instance(metadata, memo); - return algebra::transport(logicalNodeRef, instance); - } - -private: - CEHeuristicTransport(const Metadata& metadata, const Memo& memo) - : _metadata(metadata), _memo(memo) {} - - // We don't own this. - const Metadata& _metadata; - const Memo& _memo; -}; -} // namespace - -CEType HeuristicCE::deriveCE(const Metadata& metadata, - const Memo& memo, - const LogicalProps& /*logicalProps*/, - const ABT::reference_type logicalNodeRef) const { - CEType card = CEHeuristicTransport::derive(metadata, memo, logicalNodeRef); - return card; -} -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_heuristic.h b/src/mongo/db/query/ce/ce_heuristic.h deleted file mode 100644 index fd6e0672c54..00000000000 --- a/src/mongo/db/query/ce/ce_heuristic.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/query/optimizer/cascades/interfaces.h" - -namespace mongo::ce { - -/** - * Default cardinality estimation in the absence of statistics. - * Relies purely on heuristics. - * We currently do not use logical properties for heuristic ce. - */ -class HeuristicCE : public optimizer::cascades::CEInterface { -public: - optimizer::CEType deriveCE(const optimizer::Metadata& metadata, - const optimizer::cascades::Memo& memo, - const optimizer::properties::LogicalProps& /*logicalProps*/, - optimizer::ABT::reference_type logicalNodeRef) const override final; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_heuristic_test.cpp b/src/mongo/db/query/ce/ce_heuristic_test.cpp deleted file mode 100644 index c59361fb03c..00000000000 --- a/src/mongo/db/query/ce/ce_heuristic_test.cpp +++ /dev/null @@ -1,1009 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include - -#include "mongo/db/query/ce/ce_heuristic.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/optimizer/cascades/logical_props_derivation.h" -#include "mongo/db/query/optimizer/cascades/memo.h" -#include "mongo/db/query/optimizer/defs.h" -#include "mongo/db/query/optimizer/explain.h" -#include "mongo/db/query/optimizer/metadata.h" -#include "mongo/db/query/optimizer/opt_phase_manager.h" -#include "mongo/db/query/optimizer/props.h" -#include "mongo/db/query/optimizer/utils/unit_test_utils.h" -#include "mongo/db/query/optimizer/utils/utils.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { -namespace { - -using namespace optimizer; -using namespace optimizer::cascades; - -constexpr double kCollCard = 10000.0; -const std::string collName = "test"; - -class HeuristicCETester : public CETester { -public: - HeuristicCETester( - std::string collName, - const optimizer::OptPhaseManager::PhaseSet& optPhases = kDefaultCETestPhaseSet) - : CETester(collName, kCollCard, optPhases) {} - -protected: - std::unique_ptr getCETransport() const override { - return std::make_unique(); - } -}; - -TEST(CEHeuristicTest, CEWithoutOptimizationGtLtNum) { - std::string query = "{a0 : {$gt : 14, $lt : 21}}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE(ht, query, 1089.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationEqNum) { - std::string query = "{a: 123}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0); - ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0); - ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0); - ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationEqStr) { - std::string query = "{a: 'foo'}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0); - ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0); - ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0); - ASSERT_MATCH_CE_CARD(ht, query, 100.0, 
10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationGtNum) { - std::string query = "{a: {$gt: 44}}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationGtStr) { - std::string query = "{a: {$gt: 'foo'}}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationLtNum) { - std::string query = "{a: {$lt: 44}}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationDNF1pathSimple) { - std::string query = - "{$or: [" - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 44}}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 6.6591, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 36.0354, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 205.941, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj1) { - std::string query = - "{$or: [" - "{a: {$lt: 3}}," - "{$and: [{b: {$gt:5}}, {c: {$lt: 10}}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 7.623, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 55.5761, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 402.963, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj2) { - std::string query = - "{$and: [" - "{a: {$lt: 3}}," - "{$or: [{b: {$gt:5}}, {b: {$lt: 10}}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 5.733, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 31.0736, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 181.863, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj3) { - std::string query = - "{$and: [" - "{$and: [{a: {$gt: 5}}, {a: {$lt: 10}}]}," - "{$and: [" - " {b: {$gt: 15}}," - " {c: {$lt: 110}}," - " {$or: [{a1: 1}, {b1: 2}, {c1: 3}]}" - "]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.52063, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 4.15975, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 9.11877, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj4) { - std::string query = - "{$or: [" - "{$or: [{a: {$gt: 5}}, {a: {$lt: 10}}]}," - "{$or: [" - " {b: {$gt: 15}}," - " {c: {$lt: 110}}," - " {$and: [{a1: 1}, {b1: 2}, {c1: 3}]}" - "]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 8.9298, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 89.9501, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 798.495, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationTraverseSelectivityDoesNotAccumulate) { - std::string query = - "{$or: [" - "{a0: 1}," - "{a0: {$lt: -4}}," - "{b0: {$gt: 10}}" - "]}"; - std::string queryWithLongPaths = - "{$or: [" - "{'a0.a1.a2.a3.a4.a5.a6.a7.a8.a9': 1}," - 
"{'a0.a1.a2.a3.a4.a5.a6.a7.a8.a9': {$lt: -4}}," - "{'b0.b1.b3': {$gt: 10}}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - auto ce1 = ht.getMatchCE(query); - auto ce2 = ht.getMatchCE(queryWithLongPaths); - ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithEqOnSameValue) { - std::string query = - "{$or: [" - "{a: 1}," - "{$and: [{a: 2}, {a: 2}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 5.0, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 18.8997, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 62.2456, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithEqOnDifferentValues) { - std::string query = - "{$or: [" - "{a: 1}," - "{$and: [{a: 2}, {a: 3}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.0, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 9.94987, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 31.6228, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationConjunctionWithIn) { - std::string query = - "{$or: [" - "{a: 1}," - "{$and: [{a: 2}, {a: {$in: [2, 3, 4]}}]}" - "]}"; - HeuristicCETester ht(collName, kNoOptPhaseSet); - // Estimation for $in is not implemented yet, so we assume it has the default filter selectivity - // of 0.1. - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.6, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 18.8549, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 128.46, 1000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationOneLowBoundWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make(make("a", make(Operations::Gt, Constant::int64(42))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationOneHighBoundWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make(make("a", make(Operations::Lt, Constant::int64(42))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationTwoLowBoundsWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make(make("a", - make( - make(Operations::Gt, Constant::int64(5)), - make(Operations::Gt, Constant::int64(10)))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 
2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationTwoHighBoundsWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make(make("a", - make( - make(Operations::Lt, Constant::int64(5)), - make(Operations::Lt, Constant::int64(10)))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make(make("a", - make( - make(Operations::Gt, Constant::int64(7)), - make(Operations::Lt, Constant::int64(13)))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); - ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); - ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithDifferentTypes) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - "a", - make(make(Operations::Gt, Constant::int64(5)), - make(Operations::Lt, Constant::str("foo")))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWithPathExpr) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make( - make("a1", - make( - make(Operations::Gt, Constant::int64(5)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel)), - make( - "a0", - make( - make("a1", - make( - make(Operations::Lt, Constant::int64(10)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); - ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); - ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWith1Variable) { - using namespace properties; - - ABT scanNode = make("test", 
"test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make( - make("a1", - make( - make(Operations::Gt, Constant::int64(5)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel)), - make( - "a0", - make( - make("a1", - make( - make(Operations::Lt, make("test")), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); - ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); - ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationOpenRangeWith1Variable) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make( - make("a1", - make( - make(Operations::Lt, Constant::int64(5)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel)), - make( - "a0", - make( - make("a1", - make( - make(Operations::Lt, make("test")), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); - ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationConjunctionOfBoundsWithDifferentPaths) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make( - make("a1", - make( - make(Operations::Gt, Constant::int64(5)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel)), - make( - "b0", - make( - make("b1", - make( - make(Operations::Lt, Constant::int64(10)), - PathTraverse::kSingleLevel)), - PathTraverse::kSingleLevel))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 1.47, 3.0); - ASSERT_CE_CARD(ht, rootNode, 3.43, 7.0); - ASSERT_CE_CARD(ht, rootNode, 4.9, 10.0); - ASSERT_CE_CARD(ht, rootNode, 10.89, 100.0); - ASSERT_CE_CARD(ht, rootNode, 1089.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationDisjunctionOnSamePathWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make("a1", make(Operations::Gt, Constant::int64(5)))), - make( - "a0", - make("a1", make(Operations::Eq, Constant::int64(100))))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.61962, 3.0); - ASSERT_CE_CARD(ht, rootNode, 5.69373, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.94868, 10.0); - ASSERT_CE_CARD(ht, rootNode, 39.7, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3367.0, 
10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationDisjunctionOnDifferentPathsWithoutTraverse) { - using namespace properties; - - ABT scanNode = make("test", "test"); - - ABT filterNode = make( - make( - make( - make( - "a0", - make("a1", make(Operations::Gt, Constant::int64(5)))), - make( - "b0", - make("b1", make(Operations::Eq, Constant::int64(100))))), - make("test")), - std::move(scanNode)); - - ABT rootNode = - make(ProjectionRequirement{ProjectionNameVector{"test"}}, std::move(filterNode)); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); - ASSERT_CE_CARD(ht, rootNode, 2.61962, 3.0); - ASSERT_CE_CARD(ht, rootNode, 5.69373, 7.0); - ASSERT_CE_CARD(ht, rootNode, 7.94868, 10.0); - ASSERT_CE_CARD(ht, rootNode, 39.7, 100.0); - ASSERT_CE_CARD(ht, rootNode, 3367.0, 10000.0); -} - -TEST(CEHeuristicTest, CEWithoutOptimizationEquivalentConjunctions) { - using namespace properties; - - ABT rootNode1 = make( - ProjectionRequirement{ProjectionNameVector{"test"}}, - make( - make( - make( - make( - "a0", - make(make(Operations::Gt, Constant::int64(5)), - PathTraverse::kSingleLevel)), - make( - "b0", - make(make(Operations::Gt, Constant::int64(10)), - PathTraverse::kSingleLevel))), - make("test")), - make("test", "test"))); - - ABT rootNode2 = make( - ProjectionRequirement{ProjectionNameVector{"test"}}, - make( - make(make("a0", - make(make(Operations::Gt, - Constant::int64(5)), - PathTraverse::kSingleLevel)), - make("test")), - make( - make( - make( - "b0", - make(make(Operations::Gt, Constant::int64(10)), - PathTraverse::kSingleLevel)), - make("test")), - make("test", "test")))); - - HeuristicCETester ht(collName, kNoOptPhaseSet); - ht.setCollCard(kCollCard); - auto ce1 = ht.getCE(rootNode1); - auto ce2 = ht.getCE(rootNode2); - ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Eq) { - std::string query = "{a : 123}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 0.1, 0.1); - ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0); - ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0); - ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0); - ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt) { - std::string query = "{a: {$gt: 44}}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 0.01, 0.0); - ASSERT_MATCH_CE_CARD(ht, query, 0.7, 1.0); - ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 330, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt_Lt) { - std::string query = "{a: {$gt: 44, $lt: 99}}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 0.585662, 1.0); - ASSERT_MATCH_CE_CARD(ht, query, 5.27096, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 29.885, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 189.571, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND2Eq) { - std::string query = "{a : 13, b : 42}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 1.31607, 3.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.62658, 7.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 10.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 100.0); - ASSERT_MATCH_CE_CARD(ht, query, 10.0, 10000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND3Eq) { - std::string query = "{a : 13, b : 42, c : 69}"; - 
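    // Illustration: at card 10000 each equality has selectivity 0.01, and exponential
    // backoff combines the three as 0.01 * 0.01^(1/2) * 0.01^(1/4) ~= 3.16e-4, which
    // yields the ~3.16 estimate asserted below.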
HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 1.1472, 3.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.27537, 7.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.33352, 10.0); - ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 100.0); - ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_OR1path) { - std::string query = "{$or: [{a0: {$gt: 44}}, {a0: {$lt: 9}}]}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 7.52115, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 58.6188, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 451.581, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_OR2paths) { - std::string query = "{$or: [{a0: {$gt:44}}, {b0: {$lt: 9}}]}"; - HeuristicCETester ht(collName, kOnlySubPhaseSet); - // Disjunctions on different paths are not SARGable. - ASSERT_MATCH_CE_CARD(ht, query, 8.19, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 69.0525, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 551.1, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathSimple) { - std::string query = - "{$or: [" - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 44}}]}" - "]}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 6.42792, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 37.0586, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 225.232, 1000.0); -} - - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathComplex) { - HeuristicCETester ht(collName, kOnlySubPhaseSet); - // Each disjunct has different number of conjuncts, - // so that its selectivity is different. We need 5 disjuncts to test exponential backoff which - // cuts off at the first 4. The conjuncts are in selectivity order. - std::string query1 = - "{$or: [" - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " - "81}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " - "81}}, {a0: {$lt: 77}}]}" - "]}"; - auto ce1 = ht.getMatchCE(query1); - // The conjuncts are in inverse selectivity order. - std::string query2 = - "{$or: [" - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " - "81}}, {a0: {$lt: 77}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " - "81}}]}," - "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}" - "]}"; - auto ce2 = ht.getMatchCE(query2); - ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF2paths) { - std::string query = - "{$or: [" - "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," - "{$and: [{b0: {$gt:40}}, {b0: {$lt: 44}}]}" - "]}"; - HeuristicCETester ht(collName, kOnlySubPhaseSet); - // Disjunctions on different paths are not SARGable. 
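    // Illustration: at card 9 each comparison is estimated as an open range (0.7), each
    // $and multiplies to 0.7 * 0.7 = 0.49, and the $or combines by inclusion-exclusion:
    // (0.49 + 0.49 - 0.49 * 0.49) * 9 ~= 6.6591, as asserted below.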
- ASSERT_MATCH_CE_CARD(ht, query, 6.6591, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 36.0354, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 205.941, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF1path) { - std::string query = - "{$and : [" - "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]}," - "{$or : [ {a0 : {$gt : 77}}, {a0 : {$eq : 51}} ]}" - "]}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 6.21212, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 36.4418, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 228.935, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF2paths) { - std::string query = - "{$and : [" - "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]}," - "{$or : [ {b0 : {$gt : 77}}, {b0 : {$eq : 51}} ]}" - "]}"; - HeuristicCETester ht(collName); - ASSERT_MATCH_CE_CARD(ht, query, 6.21212, 9.0); - ASSERT_MATCH_CE_CARD(ht, query, 36.4418, 99.0); - ASSERT_MATCH_CE_CARD(ht, query, 228.935, 1000.0); -} - -TEST(CEHeuristicTest, CEAfterMemoSubstitutionExplorationPhases) { - HeuristicCETester ht(collName); - ASSERT_MATCH_CE(ht, "{a : 13, b : 42}", 10.0); -} - -TEST(CEHeuristicTest, CENotEquality) { - double collCard = kCollCard; - HeuristicCETester opt(collName); - - // We avoid optimizing in order to verify heuristic estimate of FilterNode subtree. Note that we - // do not generate SargableNodes for $not predicates, but we do generate SargableNodes without - // it; for the purposes of this test, we want to demonstrate that $not returns the inverse of - // the FilterNode estimate. - HeuristicCETester noOpt(collName, kNoOptPhaseSet); - - // Equality selectivity is sqrt(kCollCard)/kCollCard = 0.01. When we see a UnaryOp [Not] above - // this subtree, we invert the selectivity 1.0 - 0.01 = 0.99. - double ce = 100.0; - double inverseCE = collCard - ce; - ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE); - - // Update cardinality to 25. - collCard = 25; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - // Selectivity is sqrt(25)/25. - ce = 5.0; - inverseCE = collCard - ce; - ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE); - - // Update cardinality to 9. - collCard = 9; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - // Selectivity is sqrt(3)/9. - ce = 3.0; - inverseCE = collCard - ce; - ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE); -} - -TEST(CEHeuristicTest, CENotOpenRange) { - // Repeat the above test for open ranges; the $not cardinality estimate should add up with the - // non-$not estimate to the collection cardinality. - double collCard = kCollCard; - HeuristicCETester opt(collName); - HeuristicCETester noOpt(collName, kNoOptPhaseSet); - - // Expect open-range selectivity for input card > 100 (0.33). 
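    // i.e. 0.33 * 10000 = 3300 matching documents, leaving 10000 - 3300 = 6700 for $not.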
- double ce = 3300; - double inverseCE = collCard - ce; - - ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); - - // Update cardinality to 25. - collCard = 25; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - // Expect open-range selectivity for input card in range (20, 100) (0.45). - ce = 11.25; - inverseCE = collCard - ce; - - ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); - - // Update cardinality to 10. - collCard = 10.0; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - // Expect open-range selectivity for input card < 20 (0.70). - ce = 7.0; - inverseCE = collCard - ce; - - ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); -} - -TEST(CEHeuristicTest, CENotClosedRange) { - // Repeat the above test for closed ranges; the $not cardinality estimate should add up with the - // non-$not estimate to the collection cardinality. - double collCard = kCollCard; - double ce = 1089.0; - double inverseCE = collCard - ce; - HeuristicCETester opt(collName); - HeuristicCETester noOpt(collName, kNoOptPhaseSet); - - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); - - /* - * Update cardinality to 25. Here we observe an interesting edge case where the estimated - * cardinality is not the inverse of the actual cardinality. 
- * - * Consider the predicate {a: {$gt: 10, $lt: 20}}. This generates two FilterNodes stacked on top - * of each other. However, the predicate {a: {$not: {$gt: 10, $lt: 20}}} generates just one - * FilterNode. - * - * We always use input cardinality to determine which interval selectivity we're going to use. - * However, we have a different input cardinality for the one FilterNode case (collCard) than - * for the two FilterNodes case: the first node gets collCard, and the second node gets a - * smaller value after the selectivity of the first filter is applied. - * - * Because we use a piecewise function to pick the selectivity, and because we go from inputCard - * < 100 to inputCard < 20, we choose different selectivities for the intervals in the second - * FilterNode (0.50) than in the first (0.33). - */ - collCard = 25; - ce = 7.875; - inverseCE = 19.9375; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); - - // Update cardinality to 10. - collCard = 10.0; - ce = 4.9; - inverseCE = collCard - ce; - opt.setCollCard(collCard); - noOpt.setCollCard(collCard); - - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); - ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); - ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); - ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); -} - -TEST(CEHeuristicTest, CEExists) { - HeuristicCETester noOpt(collName); - - // Test basic case + $not. - ASSERT_MATCH_CE(noOpt, "{a: {$exists: true}}", 7000); - ASSERT_MATCH_CE(noOpt, "{a: {$exists: false}}", 3000); - ASSERT_MATCH_CE(noOpt, "{a: {$not: {$exists: false}}}", 7000); - ASSERT_MATCH_CE(noOpt, "{a: {$not: {$exists: true}}}", 3000); - - // Test combinations of predicates. - ASSERT_MATCH_CE(noOpt, "{a: {$exists: true, $eq: 123}}", 70); - ASSERT_MATCH_CE(noOpt, "{a: {$exists: false, $eq: null}}", 30); - ASSERT_MATCH_CE(noOpt, "{a: {$exists: false}, b: {$eq: 123}}", 30); - ASSERT_MATCH_CE(noOpt, "{a: {$exists: true, $gt: 123}}", 2310); -} - -} // namespace -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_hinted.cpp b/src/mongo/db/query/ce/ce_hinted.cpp deleted file mode 100644 index 0ce71a69fe7..00000000000 --- a/src/mongo/db/query/ce/ce_hinted.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/ce_hinted.h" -#include "mongo/db/query/ce/ce_heuristic.h" - -namespace mongo::ce { -namespace cascades = optimizer::cascades; -namespace properties = optimizer::properties; - -using ABT = optimizer::ABT; -using CEType = optimizer::CEType; -using LogicalProps = properties::LogicalProps; -using Memo = cascades::Memo; -using Metadata = optimizer::Metadata; - -class CEHintedTransport { -public: - CEType transport(const ABT& n, - const optimizer::SargableNode& node, - CEType childResult, - CEType /*bindsResult*/, - CEType /*refsResult*/) { - CEType result = childResult; - for (const auto& [key, req] : node.getReqMap()) { - if (!isIntervalReqFullyOpenDNF(req.getIntervals())) { - auto it = _hints.find(key); - if (it != _hints.cend()) { - // Assume independence. - result *= it->second; - } - } - } - - return result; - } - - template - CEType transport(const ABT& n, const T& /*node*/, Ts&&...) { - if (optimizer::canBeLogicalNode()) { - return _heuristicCE.deriveCE(_metadata, _memo, _logicalProps, n.ref()); - } - return 0.0; - } - - static CEType derive(const Metadata& metadata, - const Memo& memo, - const PartialSchemaSelHints& hints, - const LogicalProps& logicalProps, - const ABT::reference_type logicalNodeRef) { - CEHintedTransport instance(metadata, memo, logicalProps, hints); - return optimizer::algebra::transport(logicalNodeRef, instance); - } - -private: - CEHintedTransport(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const PartialSchemaSelHints& hints) - : _heuristicCE(), - _metadata(metadata), - _memo(memo), - _logicalProps(logicalProps), - _hints(hints) {} - - HeuristicCE _heuristicCE; - - // We don't own this. 
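// [Editorial note, not part of the original patch.] Summary of the transport
// above: for a SargableNode, the selectivity hint of each PartialSchemaKey
// whose interval is not fully open is multiplied into the child estimate under
// an independence assumption; every other logical node defers to the heuristic
// estimator.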
- const Metadata& _metadata; - const Memo& _memo; - const LogicalProps& _logicalProps; - const PartialSchemaSelHints& _hints; -}; - -CEType HintedCE::deriveCE(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const ABT::reference_type logicalNodeRef) const { - return CEHintedTransport::derive(metadata, memo, _hints, logicalProps, logicalNodeRef); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_hinted.h b/src/mongo/db/query/ce/ce_hinted.h deleted file mode 100644 index eacadc0ccfb..00000000000 --- a/src/mongo/db/query/ce/ce_hinted.h +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/query/optimizer/cascades/interfaces.h" - -namespace mongo::ce { - -using PartialSchemaSelHints = std::map; - -/** - * Estimation based on hints. The hints are organized in a PartialSchemaSelHints structure. - * SargableNodes are estimated based on the matching PartialSchemaKeys. - */ -class HintedCE : public optimizer::cascades::CEInterface { -public: - HintedCE(PartialSchemaSelHints hints) : _hints(std::move(hints)) {} - - optimizer::CEType deriveCE(const optimizer::Metadata& metadata, - const optimizer::cascades::Memo& memo, - const optimizer::properties::LogicalProps& logicalProps, - optimizer::ABT::reference_type logicalNodeRef) const override final; - -private: - // Selectivity hints per PartialSchemaKey. - PartialSchemaSelHints _hints; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram.cpp b/src/mongo/db/query/ce/ce_histogram.cpp deleted file mode 100644 index c456d9227b6..00000000000 --- a/src/mongo/db/query/ce/ce_histogram.cpp +++ /dev/null @@ -1,289 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/exec/sbe/abt/abt_lower.h" - -#include "mongo/db/query/ce/ce_histogram.h" -#include "mongo/db/query/ce/collection_statistics_impl.h" -#include "mongo/db/query/ce/histogram_estimation.h" - -#include "mongo/db/query/optimizer/utils/abt_hash.h" -#include "mongo/db/query/optimizer/utils/ce_math.h" -#include "mongo/db/query/optimizer/utils/memo_utils.h" - -#include "mongo/db/pipeline/abt/utils.h" - -namespace mongo::ce { -namespace cascades = optimizer::cascades; -namespace properties = optimizer::properties; - -using ABT = optimizer::ABT; -using CEType = optimizer::CEType; -using LogicalProps = properties::LogicalProps; -using Memo = cascades::Memo; -using Metadata = optimizer::Metadata; - -namespace { - -/** - * This transport combines chains of PathGets and PathTraverses into an MQL-like string path. - */ -class PathDescribeTransport { -public: - std::string transport(const optimizer::PathTraverse& /*node*/, std::string childResult) { - return childResult; - } - - std::string transport(const optimizer::PathGet& node, std::string childResult) { - return str::stream() << node.name() << (childResult.length() > 0 ? "." : "") << childResult; - } - - std::string transport(const optimizer::EvalFilter& node, - std::string pathResult, - std::string inputResult) { - return pathResult; - } - - std::string transport(const optimizer::PathIdentity& node) { - return ""; - } - - template - std::string transport(const T& node, Ts&&... /* args */) { - uasserted(6903900, "Unexpected node in path serialization."); - } -}; - -std::string serializePath(const ABT& path) { - PathDescribeTransport pdt; - auto str = optimizer::algebra::transport(path, pdt); - return str; -} - -} // namespace - -class CEHistogramTransportImpl { -public: - CEHistogramTransportImpl(std::shared_ptr stats, - std::unique_ptr fallbackCE) - : _stats(stats), - _fallbackCE(std::move(fallbackCE)), - _arrayOnlyInterval(*defaultConvertPathToInterval(make())) {} - - ~CEHistogramTransportImpl() {} - - CEType transport(const ABT& n, - const optimizer::ScanNode& node, - const Memo& memo, - const LogicalProps& logicalProps, - CEType /*bindResult*/) { - return _stats->getCardinality(); - } - - /** - * This struct is used to track an intermediate representation of the intervals in the - * requirements map. In particular, grouping intervals along each path in the map allows us to - * determine which paths should be estimated as $elemMatches without relying on a particular - * order of entries in the requirements map. 
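 * [Editorial note, not part of the original patch.] Concretely, in the first
 * pass over the requirements map below: a PathArr-only entry for a path flips
 * that path's conjunct to array-only ($elemMatch) estimation by clearing
 * 'includeScalar', rather than contributing an interval of its own, while
 * every other entry for the same path appends its interval to 'intervals'.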
- */ - struct SargableConjunct { - bool includeScalar; - const ce::ArrayHistogram& histogram; - std::vector> intervals; - }; - - CEType transport(const ABT& n, - const SargableNode& node, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - CEType childResult, - CEType /*bindsResult*/, - CEType /*refsResult*/) { - // Early out and return 0 since we don't expect to get more results. - if (childResult == 0.0) { - return 0.0; - } - - // Initial first pass through the requirements map to extract information about each path. - std::map conjunctRequirements; - for (const auto& [key, req] : node.getReqMap()) { - if (req.getIsPerfOnly()) { - // Ignore perf-only requirements. - continue; - } - - const auto serializedPath = serializePath(key._path.ref()); - const auto& interval = req.getIntervals(); - const bool isPathArrInterval = - (_arrayOnlyInterval == interval) && !pathEndsInTraverse(key._path.ref()); - - // Check if we have already seen this path. - if (auto conjunctIt = conjunctRequirements.find({serializedPath}); - conjunctIt != conjunctRequirements.end()) { - auto& conjunctReq = conjunctIt->second; - if (isPathArrInterval) { - // We should estimate this path's intervals using $elemMatch semantics. - // Don't push back the interval for estimation; instead, we use it to change how - // we estimate other intervals along this path. - conjunctReq.includeScalar = false; - } else { - // We will need to estimate this interval. - conjunctReq.intervals.push_back(interval); - } - continue; - } - - // Fallback if there is no histogram. - auto histogram = _stats->getHistogram(serializedPath); - if (!histogram) { - // For now, because of the structure of SargableNode and the implementation of - // the fallback (currently HeuristicCE), we can't combine heuristic & histogram - // estimates. In this case, default to Heuristic if we don't have a histogram for - // any of the predicates. - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - - // Add this path to the map. If this is not a 'PathArr' interval, add it to the vector - // of intervals we will be estimating. - SargableConjunct sc{!isPathArrInterval, *histogram, {}}; - if (sc.includeScalar) { - sc.intervals.push_back(interval); - } - conjunctRequirements.emplace(serializedPath, std::move(sc)); - } - - std::vector topLevelSelectivities; - for (const auto& [_, conjunctReq] : conjunctRequirements) { - const CEType totalCard = _stats->getCardinality(); - - if (conjunctReq.intervals.empty() && !conjunctReq.includeScalar) { - // In this case there is a single 'PathArr' interval for this field. - // The selectivity of this interval is: (count of all arrays) / totalCard - double pathArrSel = conjunctReq.histogram.getArrayCount() / totalCard; - topLevelSelectivities.push_back(pathArrSel); - } - - // Intervals are in DNF. - for (const IntervalReqExpr::Node& intervalDNF : conjunctReq.intervals) { - std::vector disjSelectivities; - - const auto disjuncts = intervalDNF.cast()->nodes(); - for (const auto& disjunct : disjuncts) { - const auto& conjuncts = disjunct.cast()->nodes(); - - std::vector conjSelectivities; - for (const auto& conjunct : conjuncts) { - const auto& interval = conjunct.cast()->getExpr(); - auto cardinality = - ce::estimateIntervalCardinality(conjunctReq.histogram, - interval, - childResult, - conjunctReq.includeScalar); - - // We may still not have been able to estimate the interval using - // histograms, for instance if the interval bounds were non-Constant. 
In - // this case, we should fallback to heuristics. - if (cardinality < 0) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - - // We have to convert the cardinality to a selectivity. The histogram - // returns the cardinality for the entire collection; however, fewer records - // may be expected at the SargableNode. - conjSelectivities.push_back(cardinality / totalCard); - } - - auto backoff = ce::conjExponentialBackoff(std::move(conjSelectivities)); - disjSelectivities.push_back(backoff); - } - - auto backoff = ce::disjExponentialBackoff(std::move(disjSelectivities)); - topLevelSelectivities.push_back(backoff); - } - } - - // The elements of the PartialSchemaRequirements map represent an implicit conjunction. - if (!topLevelSelectivities.empty()) { - auto backoff = ce::conjExponentialBackoff(std::move(topLevelSelectivities)); - childResult *= backoff; - } - return childResult; - } - - CEType transport(const ABT& n, - const RootNode& node, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - CEType childResult, - CEType /*refsResult*/) { - // Root node does not change cardinality. - return childResult; - } - - /** - * Use fallback for other ABT types. - */ - template - CEType transport(const ABT& n, - const T& /*node*/, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - Ts&&...) { - if (canBeLogicalNode()) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - return 0.0; - } - -private: - std::shared_ptr _stats; - std::unique_ptr _fallbackCE; - - // This is a special interval indicating that we expect to use $elemMatch semantics when - // estimating the current path. - const IntervalReqExpr::Node _arrayOnlyInterval; -}; - -CEHistogramTransport::CEHistogramTransport(std::shared_ptr stats, - std::unique_ptr fallbackCE) - : _impl(std::make_unique(stats, std::move(fallbackCE))) {} - -CEHistogramTransport::~CEHistogramTransport() {} - -CEType CEHistogramTransport::deriveCE(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const ABT::reference_type logicalNodeRef) const { - return algebra::transport(logicalNodeRef, *this->_impl, metadata, memo, logicalProps); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram.h b/src/mongo/db/query/ce/ce_histogram.h deleted file mode 100644 index 1823bc211a5..00000000000 --- a/src/mongo/db/query/ce/ce_histogram.h +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. 
If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/query/ce/collection_statistics_impl.h" -#include "mongo/db/query/optimizer/cascades/interfaces.h" - -namespace mongo::ce { - -class CEHistogramTransportImpl; - -class CEHistogramTransport : public optimizer::cascades::CEInterface { -public: - CEHistogramTransport(std::shared_ptr stats, - std::unique_ptr fallbackCE); - ~CEHistogramTransport(); - - optimizer::CEType deriveCE(const optimizer::Metadata& metadata, - const optimizer::cascades::Memo& memo, - const optimizer::properties::LogicalProps& logicalProps, - optimizer::ABT::reference_type logicalNodeRef) const final; - -private: - std::unique_ptr _impl; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp deleted file mode 100644 index 3267ce4d89f..00000000000 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ /dev/null @@ -1,1156 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/query/ce/ce_histogram.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/ce/collection_statistics_mock.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/optimizer/utils/unit_test_utils.h" -#include "mongo/db/query/sbe_stage_builder_helpers.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { -namespace { - -using namespace optimizer; -using namespace cascades; - -std::string collName("test"); - -class CEHistogramTester : public CETester { -public: - CEHistogramTester(std::string collName, double numRecords) - : CETester(collName, numRecords), _stats{new CollectionStatisticsMock(numRecords)} {} - - void addHistogram(const std::string& path, std::shared_ptr histogram) { - _stats->addHistogram(path, histogram); - } - -protected: - std::unique_ptr getCETransport() const override { - // making a copy of CollecitonStatistics to override - return std::make_unique(_stats, makeHeuristicCE()); - } - -private: - std::shared_ptr _stats; -}; - -struct TestBucket { - Value val; - double equalFreq; - double rangeFreq = 0.0; - double ndv = 1.0; /* ndv including bucket boundary*/ -}; -using TestBuckets = std::vector; - -ScalarHistogram getHistogramFromData(TestBuckets testBuckets) { - sbe::value::Array bounds; - std::vector buckets; - - double cumulativeFreq = 0.0; - double cumulativeNDV = 0.0; - for (const auto& b : testBuckets) { - // Add bucket boundary value to bounds. - auto [tag, val] = stage_builder::makeValue(b.val); - bounds.push_back(tag, val); - - cumulativeFreq += b.equalFreq + b.rangeFreq; - cumulativeNDV += b.ndv; - - // Create a histogram bucket. - buckets.emplace_back(b.equalFreq, - b.rangeFreq, - cumulativeFreq, - b.ndv - 1, /* ndv excluding bucket boundary*/ - cumulativeNDV); - } - - return ScalarHistogram(std::move(bounds), std::move(buckets)); -} - -TypeCounts getTypeCountsFromData(TestBuckets testBuckets) { - TypeCounts typeCounts; - for (const auto& b : testBuckets) { - // Add bucket boundary value to bounds. - auto sbeVal = stage_builder::makeValue(b.val); - auto [tag, val] = sbeVal; - - // Increment count of values for each type tag. - if (auto it = typeCounts.find(tag); it != typeCounts.end()) { - it->second += b.equalFreq + b.rangeFreq; - } else { - typeCounts[tag] = b.equalFreq + b.rangeFreq; - } - } - return typeCounts; -} - -std::unique_ptr getArrayHistogramFromData(TestBuckets testBuckets, - TypeCounts additionalScalarData = {}) { - TypeCounts dataTypeCounts = getTypeCountsFromData(testBuckets); - dataTypeCounts.merge(additionalScalarData); - return std::make_unique(getHistogramFromData(testBuckets), - std::move(dataTypeCounts)); -} - -std::unique_ptr getArrayHistogramFromData(TestBuckets scalarBuckets, - TestBuckets arrayUniqueBuckets, - TestBuckets arrayMinBuckets, - TestBuckets arrayMaxBuckets, - TypeCounts arrayTypeCounts, - double totalArrayCount, - double emptyArrayCount = 0, - TypeCounts additionalScalarData = {}) { - - // Set up scalar type counts. - TypeCounts dataTypeCounts = getTypeCountsFromData(scalarBuckets); - dataTypeCounts[value::TypeTags::Array] = totalArrayCount; - dataTypeCounts.merge(additionalScalarData); - - // Set up histograms. 
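// [Editorial note, not part of the original patch.] For reference, the helper
// above tracks cumulative statistics per bucket: e.g. the TestBuckets
// {10: eq 5} followed by {20: eq 1, rangeFreq 45, ndv 10} used in
// TestOneBoundIntRangeHistogram below produce a second bucket with cumulative
// frequency 5 + 1 + 45 = 51 and a stored range NDV of 10 - 1 = 9 (the boundary
// value is excluded), numbers the interpolation tests rely on.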
- auto arrayMinHist = getHistogramFromData(arrayMinBuckets); - auto arrayMaxHist = getHistogramFromData(arrayMaxBuckets); - return std::make_unique(getHistogramFromData(scalarBuckets), - std::move(dataTypeCounts), - getHistogramFromData(arrayUniqueBuckets), - std::move(arrayMinHist), - std::move(arrayMaxHist), - std::move(arrayTypeCounts), - emptyArrayCount); -} - -TEST(CEHistogramTest, AssertSmallMaxDiffHistogramEstimatesAtomicPredicates) { - constexpr auto kCollCard = 8; - CEHistogramTester t(collName, kCollCard); - - // Construct a histogram with two buckets: one for 3 ints equal to 1, another for 5 strings - // equal to "ing". - const std::string& str = "ing"; - t.addHistogram("a", - getArrayHistogramFromData({ - {Value(1), 3 /* frequency */}, - {Value(str), 5 /* frequency */}, - })); - - // Test $eq. - ASSERT_MATCH_CE(t, "{a: {$eq: 1}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$eq: 2}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$eq: \"ing\"}}", 5.0); - ASSERT_MATCH_CE(t, "{a: {$eq: \"foo\"}}", 0.0); - - // Test case when field doesn't match fieldpath of histogram. This falls back to heuristics. - ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 2.82843); - - // Test $gt. - ASSERT_MATCH_CE(t, "{a: {$gt: 3}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 1}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 0}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$gt: \"bar\"}}", 5.0); - ASSERT_MATCH_CE(t, "{a: {$gt: \"ing\"}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: \"zap\"}}", 0.0); - - // Test $lt. - ASSERT_MATCH_CE(t, "{a: {$lt: 3}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$lt: 1}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lt: 0}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lt: \"bar\"}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lt: \"ing\"}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lt: \"zap\"}}", 5.0); - - // Test $gte. - ASSERT_MATCH_CE(t, "{a: {$gte: 3}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gte: 1}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$gte: 0}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$gte: \"bar\"}}", 5.0); - ASSERT_MATCH_CE(t, "{a: {$gte: \"ing\"}}", 5.0); - ASSERT_MATCH_CE(t, "{a: {$gte: \"zap\"}}", 0.0); - - // Test $lte. - ASSERT_MATCH_CE(t, "{a: {$lte: 3}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$lte: 1}}", 3.0); - ASSERT_MATCH_CE(t, "{a: {$lte: 0}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lte: \"bar\"}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lte: \"ing\"}}", 5.0); - ASSERT_MATCH_CE(t, "{a: {$lte: \"zap\"}}", 5.0); -} - -TEST(CEHistogramTest, AssertSmallHistogramEstimatesComplexPredicates) { - constexpr auto kCollCard = 9; - CEHistogramTester t(collName, kCollCard); - - // Construct a histogram with three int buckets for field 'a'. - t.addHistogram("a", - getArrayHistogramFromData({ - {Value(1), 3 /* frequency */}, - {Value(2), 5 /* frequency */}, - {Value(3), 1 /* frequency */}, - })); - - // Construct a histogram with two int buckets for field 'b'. - t.addHistogram("b", - getArrayHistogramFromData({ - {Value(22), 3 /* frequency */}, - {Value(33), 6 /* frequency */}, - })); - - - // Test simple conjunctions on one field. Note the first example: the range we expect to see - // here is (1, 3); however, the structure in the SargableNode gives us a conjunction of two - // intervals instead: (1, "") ^ (nan, 3) This is then estimated using exponential backoff to - // give us a less accurate result. The correct cardinality here would be 5. 
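    // [Editorial sketch, not part of the original patch.] Assuming conjunctive
    // exponential backoff multiplies the selectivities, sorted ascending, with
    // halving exponents (s1 * s2^(1/2) * s3^(1/4) * ...), the first estimate
    // below follows from the histogram: sel($gt: 1) = 6/9 and sel($lt: 3) = 8/9,
    // so CE = 9 * (6/9) * sqrt(8/9) ~= 5.66 rather than the exact 5.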
- ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lt: 3}}", 5.66); - ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lte: 3}}", 6.0); - ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lt: 3}}", 8.0); - ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lte: 3}}", 9.0); - - // Test ranges which exclude each other. - ASSERT_MATCH_CE(t, "{a: {$lt: 1}, a: {$gt: 3}}", 0.0); - - // Test overlapping ranges. This is a similar case to {a: {$gt: 1}, a: {$lt: 3}} above: we - // expect to see the range [2, 2]; instead, we see the range [nan, 2] ^ [2, ""). - ASSERT_MATCH_CE(t, "{a: {$lte: 2}, a: {$gte: 2}}", 5.66); - - // Test conjunctions over multiple fields for which we have histograms. Here we expect a - // cardinality estimated by exponential backoff. - ASSERT_MATCH_CE(t, "{a: {$eq: 2}, b: {$eq: 22}}", 2.24); - ASSERT_MATCH_CE(t, "{a: {$eq: 11}, b: {$eq: 22}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 11}, a: {$lte: 100}, b: {$eq: 22}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$lt: 3}, a: {$gte: 1}, b: {$lt: 100}, b: {$gt: 30}}", 5.66); - - // Test conjunctions over multiple fields for which we may not have histograms. This falls back - // to heuristic estimation. - ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 1.73205); - ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 1.73205); -} - -TEST(CEHistogramTest, SanityTestEmptyHistogram) { - constexpr auto kCollCard = 0; - CEHistogramTester t(collName, kCollCard); - t.addHistogram("empty", std::make_unique()); - - ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}}", 0.0); - ASSERT_MATCH_CE(t, "{empty: {$lt: 1.0}, empty: {$gt: 0.0}}", 0.0); - ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}, other: {$eq: \"anything\"}}", 0.0); - ASSERT_MATCH_CE(t, "{other: {$eq: \"anything\"}, empty: {$eq: 1.0}}", 0.0); -} - -TEST(CEHistogramTest, TestOneBucketOneIntHistogram) { - constexpr auto kCollCard = 50; - CEHistogramTester t(collName, kCollCard); - - // Create a histogram with a single bucket that contains exactly one int (42) with a frequency - // of 50 (equal to the collection cardinality). - t.addHistogram("soloInt", - getArrayHistogramFromData({ - {Value(42), kCollCard /* frequency */}, - })); - - // Check against a variety of intervals that include 42 as a bound. - ASSERT_MATCH_CE(t, "{soloInt: {$eq: 42}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$lte: 42}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 42}}", kCollCard); - - // Check against a variety of intervals that include 42 only as one bound. - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 43}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 43}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 42}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 42}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 42}}", kCollCard); - - // Check against a variety of intervals close to 42 using a lower bound of 41 and a higher bound - // of 43. 
- ASSERT_MATCH_CE(t, "{soloInt: {$eq: 41}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$eq: 43}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$lt: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$lte: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 43}}", kCollCard); - ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 43}}", kCollCard); - - // Check against different types. - ASSERT_MATCH_CE(t, "{soloInt: {$eq: \"42\"}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$lt: \"42\"}}", 0.0); - ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42.1}}", kCollCard); -} - -TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { - constexpr auto kCollCard = 51; - CEHistogramTester t(collName, kCollCard); - t.addHistogram("intRange", - getArrayHistogramFromData({ - {Value(10), 5 /* frequency */}, - {Value(20), 1 /* frequency */, 45 /* range frequency */, 10 /* ndv */}, - })); - - // Test ranges that overlap only with the lower bound. - // Note: 5 values equal 10. - ASSERT_MATCH_CE(t, "{intRange: {$eq: 10}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}, intRange: {$gte: 10}}", 5.0); - - // Test ranges that overlap only with the upper bound. - ASSERT_MATCH_CE(t, "{intRange: {$eq: 11}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 15}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 15.5}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 41.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); - - // Test ranges that partially overlap with the entire histogram. - ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 9.5); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); - ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 22.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); - - // Test ranges that include all values in the histogram. - ASSERT_MATCH_CE(t, "{intRange: {$gte: 10}, intRange: {$lte: 20}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lte: 30}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lt: 30}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lte: 30}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lt: 30}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 100}}", kCollCard); - ASSERT_MATCH_CE(t, "{intRange: {$lte: 100}}", kCollCard); - - // Test ranges that are fully included in the histogram. 
- ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); - - // Test ranges that don't overlap with the histogram. - ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$lte: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 20.1}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$eq: 21}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 21}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 20}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 100}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lte: 50}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lt: 50}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lt: 50}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lte: 50}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lte: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); - - // Because we don't specify any indexes here, these intervals do not go through simplification. - // This means that instead of having one key in the requirements map of the generated sargable - // node corresponding to the path "intRange", we have two keys and two ranges, both - // corresponding to the same path. As a consequence, we combine the estimates for the intervals - // using exponential backoff, which results in an overestimate. - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 41.09); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 23.42); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 24.96); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 36.53); - - // When we specify that there is a non-multikey index on 'intRange', we expect to see interval - // simplification occurring, which should provide a better estimate for the following ranges. - t.setIndexes( - {{"intRangeIndex", - makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 40.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 13.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 18.5); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 31.0); -} - -TEST(CEHistogramTest, TestHistogramOnNestedPaths) { - constexpr auto kCollCard = 50; - CEHistogramTester t(collName, kCollCard); - - // Create a histogram with a single bucket that contains exactly one int (42) with a frequency - // of 50 (equal to the collection cardinality). 
- t.addHistogram("path", - getArrayHistogramFromData({ - {Value(42), kCollCard /* frequency */}, - })); - t.addHistogram("a.histogram.path", - getArrayHistogramFromData({ - {Value(42), kCollCard /* frequency */}, - })); - - ASSERT_MATCH_CE(t, "{\"not.a.histogram.path\": {$eq: 42}}", 7.071 /* heuristic */); - ASSERT_MATCH_CE(t, "{\"a.histogram.path\": {$eq: 42}}", kCollCard); - ASSERT_MATCH_CE( - t, "{\"a.histogram.path.with.no.histogram\": {$eq: 42}}", 7.071 /* heuristic */); - - // When a predicate can't be precisely translated to a SargableNode (such as $elemMatch on a - // dotted path), we may still be able to translate an over-approximation. We generate a - // SargableNode with all predicates marked perfOnly, and keep the original Filter. The Filter - // ensures the results are correct, while the SargableNode hopefully will be answerable by an - // index. - // - // On the logical level, perfOnly predicates don't do anything, so we don't consider them in - // cardinality estimates. But when we split a SargableNode into an indexed part and a fetch - // part, we remove the perfOnly flag from the indexed part, and we should consider them to - // estimate how many index keys are returned. - // - // In this test, we want to exercise the histogram estimate for the SargableNode generated by - // $elemMatch on a dotted path. So we create an index on this field to ensure the SargableNode - // is split, and the predicates marked non-perfOnly. - // - // We also mark the index multikey, to prevent non-CE rewrites from removing the predicate - // entirely. (This scenario could happen if you remove all the arrays, and refresh the - // statistics.) - IndexDefinition ix{ - IndexCollationSpec{ - IndexCollationEntry{ - makeIndexPath({"a", "histogram", "path"}), - CollationOp::Ascending, - }, - }, - true /* isMultiKey */, - }; - t.setIndexes({{"a_histogram_path_1", std::move(ix)}}); - ASSERT_MATCH_CE_NODE(t, "{\"a.histogram.path\": {$elemMatch: {$eq: 42}}}", 0.0, isSargable2); -} - -TEST(CEHistogramTest, TestArrayHistogramOnAtomicPredicates) { - constexpr auto kCollCard = 6; - CEHistogramTester t(collName, kCollCard); - t.addHistogram( - "a", - // Generate a histogram for this data: - // {a: 1}, {a: 2}, {a: [1, 2, 3, 2, 2]}, {a: [10]}, {a: [2, 3, 3, 4, 5, 5, 6]}, {a: []} - // - scalars: [1, 2] - // - unique values: [1, 2, 3], [10], [2, 3, 4, 5, 6] - // -> [1, 2, 2, 3, 3, 4, 5, 6, 10] - // - min values: [1], [10], [2] -> [1, 1, 2, 2, 10] - // - max values: [3], [10], [6] -> [1, 2, 3, 6, 10] - getArrayHistogramFromData( - {// Scalar buckets. - {Value(1), 1 /* frequency */}, - {Value(2), 1 /* frequency */}}, - { - // Array unique buckets. - {Value(1), 1 /* frequency */}, - {Value(2), 2 /* frequency */}, - {Value(3), 2 /* frequency */}, - {Value(4), 1 /* frequency */}, - {Value(5), 1 /* frequency */}, - {Value(6), 1 /* frequency */}, - {Value(10), 1 /* frequency */}, - }, - { - // Array min buckets. - {Value(1), 1 /* frequency */}, - {Value(2), 1 /* frequency */}, - {Value(10), 1 /* frequency */}, - }, - { - // Array max buckets. - {Value(3), 1 /* frequency */}, - {Value(6), 1 /* frequency */}, - {Value(10), 1 /* frequency */}, - }, - {{sbe::value::TypeTags::NumberInt32, 13}}, // Array type counts. - 3, // 3 arrays total. - 1 // 1 empty array. - )); - - // Test simple predicates against 'a'. Note: in the $elemMatch case, we exclude scalar - // estimates. Without $elemMatch, we add the array histogram and scalar histogram estimates - // together. - - // Test equality predicates. 
- ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 0}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 1}"); - ASSERT_EQ_ELEMMATCH_CE(t, 3.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 2}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 3}"); - ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 4}"); - ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 6}"); - ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 10}"); - ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 11}"); - - // Test histogram boundary values. - ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$lt: 1}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$lte: 1}"); - ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$gt: 10}"); - ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$gte: 10}"); - - ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 10}"); - ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 10}"); - ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 1}"); - ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 1}"); - - ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$gt: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.40822 /* $elemMatch CE */, "a", "{$gte: 5}"); - - ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 2.40822 /* $elemMatch CE */, "a", "{$gt: 2, $lt: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lt: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 2, $lte: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lte: 5}"); -} - -TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { - constexpr auto kCollCard = 175; - CEHistogramTester t(collName, kCollCard); - - // A scalar histogram with values in the range [1,10], most of which are in the middle bucket. - t.addHistogram("scalar", - getArrayHistogramFromData({ - {Value(1), 10 /* frequency */}, - {Value(2), 10 /* frequency */}, - {Value(3), 20 /* frequency */, 120 /* range frequency */, 5 /* ndv */}, - {Value(8), 5 /* frequency */, 10 /* range frequency */, 3 /* ndv */}, - })); - - // An array histogram built on the following arrays with 35 occurrences of each: - // [{[1, 2, 3]: 35}, {[5, 5, 5, 5, 5]: 35}, {[6]: 35}, {[]: 35}, {[8, 9, 10]: 35}] - t.addHistogram( - "array", - getArrayHistogramFromData( - {/* No scalar buckets. */}, - { - // Array unique buckets. - {Value(2), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, - {Value(5), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, - {Value(6), 35 /* frequency */}, - {Value(10), 35 /* frequency */, 105 /* range frequency */, 3 /* ndv */}, - }, - { - // Array min buckets. - {Value(1), 35 /* frequency */}, - {Value(5), 35 /* frequency */}, - {Value(6), 35 /* frequency */}, - {Value(8), 35 /* frequency */}, - }, - { - // Array max buckets. 
- {Value(3), 35 /* frequency */}, - {Value(5), 35 /* frequency */}, - {Value(6), 35 /* frequency */}, - {Value(10), 35 /* frequency */}, - }, - {{sbe::value::TypeTags::NumberInt32, 420}}, // Array type count = 3*35+5*35+1*35+3*35. - kCollCard, // kCollCard arrays total. - 35 // 35 empty arrays - )); - - t.addHistogram( - "mixed", - // The mixed histogram has 87 scalars that follow approximately the same distribution as - // in the pure scalar case, and 88 arrays with the following distribution: - // [{[1, 2, 3]: 17}, {[5, 5, 5, 5, 5]: 17}, {[6]: 17}, {[]: 20}, {[8, 9, 10]: 17}] - getArrayHistogramFromData( - { - // Scalar buckets. These are half the number of values from the "scalar" histogram. - {Value(1), 5 /* frequency */}, - {Value(2), 5 /* frequency */}, - {Value(3), 10 /* frequency */, 60 /* range frequency */, 5 /* ndv */}, - {Value(8), 2 /* frequency */, 5 /* range frequency */, 3 /* ndv */}, - }, - { - // Array unique buckets. - {Value(2), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, - {Value(5), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, - {Value(6), 17 /* frequency */}, - {Value(10), 17 /* frequency */, 34 /* range frequency */, 3 /* ndv */}, - }, - { - // Array min buckets. - {Value(1), 17 /* frequency */}, - {Value(5), 17 /* frequency */}, - {Value(6), 17 /* frequency */}, - {Value(8), 17 /* frequency */}, - }, - { - // Array max buckets. - {Value(3), 17 /* frequency */}, - {Value(5), 17 /* frequency */}, - {Value(6), 17 /* frequency */}, - {Value(10), 17 /* frequency */}, - }, - {{sbe::value::TypeTags::NumberInt32, 289}}, // Array type count = 3*17+5*17+6*17+3*17 - 88, // kCollCard arrays total. - 20 // 20 empty arrays. - )); - - // Test cardinality of individual predicates. - ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 0.0 /* $elemMatch CE */, "scalar", "{$eq: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 35.0 /* CE */, 35.0 /* $elemMatch CE */, "array", "{$eq: 5}"); - ASSERT_EQ_ELEMMATCH_CE(t, 19.5 /* CE */, 17.0 /* $elemMatch CE */, "mixed", "{$eq: 5}"); - - // Test cardinality of predicate combinations; the following tests make sure we correctly track - // which paths have $elemMatches and which don't. Some notes: - // - Whenever we use 'scalar' + $elemMatch, we expect an estimate of 0 because $elemMatch never - // returns documents on non-array paths. - // - Whenever we use 'mixed' + $elemMatch, we expect the estimate to decrease because we omit - // scalar values in 'mixed' from our estimate. - // - We do not expect the estimate on 'array' to be affected by the presence of $elemMatch, - // since we only have array values for this field. - - // Composite predicate on 'scalar' and 'array' fields. - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$eq: 5}}", 2.236); - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 2.236); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); - - // Composite predicate on 'mixed' and 'array' fields. - ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$eq: 5}}", 8.721); - ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 8.721); - ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 7.603); - - // Composite predicate on 'scalar' and 'mixed' fields. 
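    // [Editorial sketch, not part of the original patch.] These combine the
    // single-field estimates above with conjunctive exponential backoff, e.g.
    // for {scalar: {$eq: 5}, mixed: {$eq: 5}}: sel(scalar) = 5/175 and
    // sel(mixed) = 19.5/175, so CE = 175 * (5/175) * sqrt(19.5/175) ~= 1.669.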
- ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}}", 1.669); - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}}", 1.559); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 0.0); - - // Composite predicate on all three fields without '$elemMatch' on 'array'. - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$eq: 5}}", 1.116); - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 1.042); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$eq: 5}}", 0.0); - - // Composite predicate on all three fields with '$elemMatch' on 'array' (same expected results - // as above). - ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 1.116); - - // Test case where the same path has both $match and $elemMatch (same as $elemMatch case). - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, scalar: {$eq: 5}}", 0.0); - ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 17.0); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0); - - // Test case with multiple predicates and ranges. - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 70.2156); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.4848); - - // Test multiple $elemMatches. - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 0.0); - ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 7.603); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}}", 0.0); - ASSERT_MATCH_CE( - t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); - ASSERT_MATCH_CE( - t, - "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", - 1.042); - ASSERT_MATCH_CE( - t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 0.0); - ASSERT_MATCH_CE(t, - "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: " - "{$elemMatch: {$eq: 5}}}", - 0.0); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$lt: 5}}}", 34.1434); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$gt: 5}}}", 45.5246); - - // Verify that we still return an estimate of 0.0 for any $elemMatch predicate on a scalar - // field when we have a non-multikey index. - t.setIndexes({{"aScalarIndex", - makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}}); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}}", 0.0); - ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$gt: 1, $lt: 10}}}", 0.0); - - // Test how we estimate singular PathArr sargable predicate. - ASSERT_MATCH_CE_NODE(t, "{array: {$elemMatch: {}}}", 175.0, isSargable); - ASSERT_MATCH_CE_NODE(t, "{mixed: {$elemMatch: {}}}", 88.0, isSargable); - - // Take into account both empty and non-empty arrays. - auto makePathArrABT = [&](const FieldNameType& fieldName) { - const ProjectionName scanProjection{"scan_0"}; - auto scanNode = make(scanProjection, collName); - auto filterNode = - make(make(make(std::move(fieldName), make()), - make(scanProjection)), - std::move(scanNode)); - return make( - properties::ProjectionRequirement{ProjectionNameVector{scanProjection}}, - std::move(filterNode)); - }; - - // There are no arrays in the 'scalar' field. 
- ABT scalarABT = makePathArrABT("scalar"); - ASSERT_CE(t, scalarABT, 0.0); - - // About half the values of this field are arrays. - ABT mixedABT = makePathArrABT("mixed"); - ASSERT_CE(t, mixedABT, 88.0); - - // This field is always an array. - ABT arrayABT = makePathArrABT("array"); - ASSERT_CE(t, arrayABT, kCollCard); -} - -TEST(CEHistogramTest, TestMixedElemMatchAndNonElemMatch) { - constexpr auto kCollCard = 1; - CEHistogramTester t(collName, kCollCard); - - // A very simple histogram encoding a collection with one document {a: [3, 10]}. - t.addHistogram("a", - getArrayHistogramFromData({/* No scalar buckets. */}, - { - // Array unique buckets. - {Value(3), 1 /* frequency */}, - {Value(10), 1 /* frequency */}, - }, - { - // Array min buckets. - {Value(3), 1 /* frequency */}, - }, - { - // Array max buckets. - {Value(10), 1 /* frequency */}, - }, - {{sbe::value::TypeTags::NumberInt32, 2}}, - // Array type counts. - 1, - 0)); - - // Tests without indexes. - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); - - // Tests with multikey index (note that the index on "a" must be multikey due to arrays). - t.setIndexes( - {{"anIndex", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}}); - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); - ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); - ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); -} - -TEST(CEHistogramTest, TestTypeCounters) { - constexpr double kCollCard = 1000.0; - CEHistogramTester t(collName, kCollCard); - - // This test is designed such that for each document, we have the following fields: - // 1. scalar: Scalar histogram with no buckets, only type-counted data. - // 2. array: Array histogram with no buckets, only type-counted data inside of arrays. - // 3. mixed: Mixed histogram with no buckets, only type-counted data, both scalars and arrays. - constexpr double kNumObj = 200.0; - constexpr double kNumNull = 300.0; - t.addHistogram("scalar", - getArrayHistogramFromData({/* No histogram data. */}, - {{sbe::value::TypeTags::Object, kNumObj}, - {sbe::value::TypeTags::Null, kNumNull}})); - t.addHistogram("array", - getArrayHistogramFromData({/* No scalar buckets. */}, - {/* No array unique buckets. */}, - {/* No array min buckets. */}, - {/* No array max buckets. */}, - {{sbe::value::TypeTags::Object, kNumObj}, - {sbe::value::TypeTags::Null, kNumNull}}, - kCollCard)); - - // Count of each type in array type counters for field "mixed". - constexpr double kNumObjMA = 50.0; - constexpr double kNumNullMA = 100.0; - // For the purposes of this test, we have one array of each value of a non-histogrammable type. - constexpr double kNumArr = kNumObjMA + kNumNullMA; - const TypeCounts mixedArrayTC{{sbe::value::TypeTags::Object, kNumObjMA}, - {sbe::value::TypeTags::Null, kNumNullMA}}; - - // Count of each type in scalar type counters for field "mixed". 
- constexpr double kNumObjMS = 150.0;
- constexpr double kNumNullMS = 200.0;
- const TypeCounts mixedScalarTC{{sbe::value::TypeTags::Object, kNumObjMS},
- {sbe::value::TypeTags::Null, kNumNullMS}};
-
- // Quick sanity check of test setup for the "mixed" histogram. The idea is that we want a
- // portion of objects inside arrays, and the rest as scalars, but we want the total count of
- // objects to be the same as for the other two fields.
- ASSERT_EQ(kNumObjMA + kNumObjMS, kNumObj);
- ASSERT_EQ(kNumNullMA + kNumNullMS, kNumNull);
-
- t.addHistogram("mixed",
- getArrayHistogramFromData({/* No scalar buckets. */},
- {/* No array unique buckets. */},
- {/* No array min buckets. */},
- {/* No array max buckets. */},
- mixedArrayTC,
- kNumArr,
- 0 /* Empty array count. */,
- mixedScalarTC));
-
- // Set up indexes.
- t.setIndexes({{"scalarIndex",
- makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}});
- t.setIndexes({{"arrayIndex",
- makeIndexDefinition("array", CollationOp::Ascending, /* isMultiKey */ true)}});
- t.setIndexes({{"mixedIndex",
- makeIndexDefinition("mixed", CollationOp::Ascending, /* isMultiKey */ true)}});
-
- // Tests for scalar type counts only.
- // For object-only intervals in a scalar histogram, we always return object count, no matter
- // what the bounds are. Since we have a scalar histogram for "scalar", we expect all $elemMatch
- // queries to have a cardinality of 0.
-
- // Test object equality.
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {a: 1}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lt: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gt: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gte: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lte: {b: 2, c: 3}}");
-
- // Test intervals including the empty object. Note that range queries on objects do not generate
- // point equalities, so these fall back onto logic in interval estimation that identifies that
- // the generated intervals are subsets of the object type interval. Note: we don't even generate
- // a SargableNode for the first case. The generated bounds are:
- // [{}, {}) because {} is the "minimum" value for the object type.
- ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "scalar", "{$lt: {}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gt: {}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gte: {}}");
- ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lte: {}}");
-
- // Rather than combining the intervals together, in the following cases we generate two
- // object-only intervals in the requirements map with the following bounds. Each individual
- // interval is estimated as having a cardinality of 'kNumObj', before we apply conjunctive
- // exponential backoff to combine them.
- constexpr double k2ObjCard = 89.4427; // == 200/1000 * sqrt(200/1000) * 1000
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {}, $lte: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {}, $lte: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {}, $lt: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {}, $lt: {b: 2, c: 3}}");
-
- // Test intervals including {a: 1}. Similar to the above case, we have two intervals in the
- // requirements map.
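    // [Editorial note, not part of the original patch.] The bounds never
    // narrow these estimates: an object-to-object interval is recognized only
    // as a subset of the Object type bracket, so each of the two intervals is
    // estimated at the full object count before backoff combines them.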
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lte: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lte: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lt: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lt: {a: 3}}"); - - // Test that for null, we always return null count. - // Note that for ranges including null (e.g. {$lt: null}) we don't generate any SargableNodes. - ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, 0.0, "scalar", "{$eq: null}"); - - // TODO SERVER-70936: Add tests for booleans. - // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, 0.0, "scalar", "{$eq: true}"); - // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, 0.0, "scalar", "{$eq: false}"); - - // Tests for array type counts only. - // For object-only intervals in an array histogram, if we're using $elemMatch on an object-only - // interval, we always return object count. While we have no scalar type counts for "array", - // non-$elemMatch queries should also match objects embedded in arrays, so we still return - // object count in that case. - - // Test object equality. - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {a: 1}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lte: {b: 2, c: 3}}"); - - // Test intervals including the empty object. - // Note: we don't even generate a SargableNode for the first case. The generated bounds are: - // [{}, {}) because {} is the "minimum" value for the object type. - ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "array", "{$lt: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gt: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gte: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lte: {}}"); - - // Similar to above, here we have two object intervals for non-$elemMatch queries. However, for - // $elemMatch queries, we have the following intervals in the requirements map: - // 1. [[], BinData(0, )) with CE 1000 - // 2. The actual object interval, e.g. ({}, {b: 2, c: 3}] with CE 200 - constexpr double kArrEMCard = kNumObj; // == 200/1000 * sqrt(1000/1000) * 1000 - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {}, $lt: {b: 2, c: 3}}"); - - // Test intervals including {a: 1}; similar to above, we have two object intervals. 
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lte: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lte: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lt: {a: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lt: {a: 3}}"); - - // Test that for null, we always return null count. - // Note that for ranges including null (e.g. {$lt: null}) we don't generate any SargableNodes. - ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, kNumNull, "array", "{$eq: null}"); - - // TODO SERVER-70936: Add tests for booleans. - // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBool, "array", "{$eq: true}"); - // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBool, "array", "{$eq: false}"); - - // Tests for mixed type counts only. Regular match predicates should be estimated as the sum of - // the scalar and array counts (e.g. for objects, 'kNumObj'), while elemMatch predicates - // should be estimated without scalars, returning the array type count (for objects this is - // 'kNumObjMA'). - - // Test object equality. - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {a: 1}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lte: {b: 2, c: 3}}"); - - // Test intervals including the empty object. - // Note: we don't even generate a SargableNode for the first case. The generated bounds are: - // [{}, {}) because {} is the "minimum" value for the object type. - ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "mixed", "{$lt: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gt: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gte: {}}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lte: {}}"); - - // Similar to above, here we have two object intervals for non-$elemMatch queries. However, for - // $elemMatch queries, we have the following intervals in the requirements map: - // 1. [[], BinData(0, )) with CE 1000 - // 2. The actual object interval, e.g. ({}, {b: 2, c: 3}] with CE 50 - constexpr double kMixEMCard = kNumObjMA; // == 50/1000 * sqrt(1000/1000) * 1000 - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {}, $lte: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {}, $lt: {b: 2, c: 3}}"); - ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {}, $lt: {b: 2, c: 3}}"); - - // Test intervals including {a: 1}; similar to above, we have two object intervals. 
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lte: {a: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lte: {a: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lt: {a: 3}}");
- ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lt: {a: 3}}");
-
- // Test that for null, we always return null count.
- // Note that for ranges including null (e.g. {$lt: null}) we don't generate any SargableNodes.
- ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, kNumNullMA, "mixed", "{$eq: null}");
-
- // TODO SERVER-70936: Add tests for booleans.
- // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBoolMA, "mixed", "{$eq: true}");
- // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBoolMA, "mixed", "{$eq: false}");
-
- // Test combinations of the three fields/type counters.
- constexpr double k3ObjCard =
- 59.814; // == 200/1000 * sqrt(200/1000) * sqrt(sqrt(200/1000)) * 1000
- constexpr double k4ObjCard =
- 48.914; // == 200/1000 * sqrt(200/1000) * sqrt(sqrt(200/1000)) * sqrt(sqrt(sqrt(200/1000))) * 1000
- ASSERT_MATCH_CE_NODE(t,
- "{scalar: {$eq: {a: 1}}, mixed: {$eq: {b: 1}}, array: {$eq: {c: 1}}}",
- k3ObjCard,
- isSargable3);
- ASSERT_MATCH_CE_NODE(
- t,
- "{scalar: {$eq: {}}, mixed: {$lt: {b: 1}}, array: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}",
- k4ObjCard,
- isSargable4);
-
- // Should always get a 0.0 cardinality for an $elemMatch on a scalar predicate.
- ASSERT_MATCH_CE(t,
- "{scalar: {$elemMatch: {$eq: {a: 1}}}, mixed: {$elemMatch: {$eq: {b: 1}}},"
- " array: {$elemMatch: {$eq: {c: 1}}}}",
- 0.0);
- ASSERT_MATCH_CE(t,
- "{scalar: {$elemMatch: {$eq: {}}}, mixed: {$elemMatch: {$lt: {b: 1}}},"
- " array: {$elemMatch: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}}",
- 0.0);
-
- // The 'mixed' interval estimate is 50, while the 'array' interval estimate is 200.
- constexpr double kArrMixObjEMCard = 22.3607; // == 50/1000 * sqrt(200/1000) * 1000
- ASSERT_MATCH_CE_NODE(t,
- "{mixed: {$elemMatch: {$eq: {b: 1}}}, array: {$elemMatch: {$eq: {c: 1}}}}",
- kArrMixObjEMCard,
- isSargable4);
- ASSERT_MATCH_CE_NODE(t,
- "{mixed: {$elemMatch: {$lt: {b: 1}}},"
- " array: {$elemMatch: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}}",
- kArrMixObjEMCard,
- isSargable4);
-}
-
-TEST(CEHistogramTest, TestNestedArrayTypeCounterPredicates) {
- // This test validates the correct behaviour of the nested-array type counter, as well as
- // combinations of type counters and histogram estimates.
- constexpr double kCollCard = 1000.0;
- constexpr double kNumArr = 600.0; // Total number of arrays.
- constexpr double kNumNestArr = 500.0; // Frequency of nested arrays, e.g. [[1, 2, 3]].
- constexpr double kNumNonNestArr = 100.0;
- constexpr double kNum1 = 2.0; // Frequency of 1.
- constexpr double kNum2 = 3.0; // Frequency of 2.
- constexpr double kNum3 = 5.0; // Frequency of 3.
- constexpr double kNumArr1 = 20.0; // Frequency of [1].
- constexpr double kNumArr2 = 30.0; // Frequency of [2].
- constexpr double kNumArr3 = 50.0; // Frequency of [3].
- constexpr double kNumObj = 390.0; // Total number of scalar objects.
-
- // Sanity test numbers.
- ASSERT_EQ(kNumArr1 + kNumArr2, kNumArr3); - ASSERT_EQ(kNumNonNestArr + kNumNestArr, kNumArr); - ASSERT_EQ(kNumObj + kNumArr + kNum1 + kNum2 + kNum3, kCollCard); - - // Define histogram buckets. - TestBuckets scalarBuckets{{Value(1), kNum1}, {Value(2), kNum2}, {Value(3), kNum3}}; - TestBuckets arrUniqueBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}}; - TestBuckets arrMinBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}}; - TestBuckets arrMaxBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}}; - - // Define type counts. - TypeCounts arrayTypeCounts{{sbe::value::TypeTags::Array, kNumNestArr}, - {sbe::value::TypeTags::NumberInt32, kNumNonNestArr}}; - TypeCounts scalarTypeCounts{{sbe::value::TypeTags::Object, kNumObj}}; - - CEHistogramTester t(collName, kCollCard); - t.addHistogram("na", - getArrayHistogramFromData(std::move(scalarBuckets), - std::move(arrUniqueBuckets), - std::move(arrMinBuckets), - std::move(arrMaxBuckets), - std::move(arrayTypeCounts), - kNumArr, - 0 /* Empty array count. */, - std::move(scalarTypeCounts))); - t.setIndexes( - {{"index", makeIndexDefinition("na", CollationOp::Ascending, /* isMultiKey */ true)}}); - - // Some equality tests on types that are not present in the type counters should return 0.0. - // TODO SERVER-70936: Add tests for booleans. - // ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: false}"); - // ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: true}"); - ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: null}"); - // We don't have any objects in arrays, so don't count them. - ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "na", "{$eq: {a: 1}}"); - - // Quick equality test to see if regular array histogram estimation still works as expected. - ASSERT_EQ_ELEMMATCH_CE(t, kNumArr1 + kNum1, kNumArr1, "na", "{$eq: 1}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumArr2 + kNum2, kNumArr2, "na", "{$eq: 2}"); - ASSERT_EQ_ELEMMATCH_CE(t, kNumArr3 + kNum3, kNumArr3, "na", "{$eq: 3}"); - - // Test a range predicate. - // - For simple $lt, we correctly return both scalar and array counts that could match. - // - For $elemMatch + $lt, we have two entries in the requirements map. - // - The PathArr interval, estimated correctly as 'kNumArr'. - // - The interval {$lt: 3}, estimated as an array histogram range interval. - // We then combine the estimates for the two using conjunctive exponential backoff. - constexpr double elemMatchRange = 71.5485; - ASSERT_EQ_ELEMMATCH_CE( - t, kNumArr1 + kNum1 + kNumArr2 + kNum2, elemMatchRange, "na", "{$lt: 3}"); - ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$lt: 1}"); - - // Test equality to arrays. - // - $elemMatch, estimation, as expected, will return the count of nested arrays. - // - For the case where we see equality to the array, we have a disjunction of intervals in the - // same entry of the SargableNode requirements map. For the case of {$eq: [1]}, for example, we - // have: [[1], [1]] U [1, 1]. As a result, we estimate each point interval separately: - // - [[1], [1]]: We estimate the nested array interval as 'kNumNestArr'. - // - [1, 1]: We estimate the regular point interval as 'kNumArr1' + 'kNum1'. - // We then combine the results by exponential backoff. Note that we will NOT match {na: 1}; - // however, because of the way the interval is defined, our estimate suggests that we would. - // TODO: is there a way to know this on the CE side? 
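-
- // The union of intervals above is combined with the disjunctive flavor of
- // exponential backoff. A minimal sketch of that arithmetic follows; the
- // 'disjExpBackoff2' helper is hypothetical (not part of the estimator API) and
- // assumes <cmath> is available. It reproduces the formulas quoted next to the
- // constants below.
- const auto disjExpBackoff2 = [](double s1, double s2, double card) {
- return (1.0 - (1.0 - s1) * std::sqrt(1.0 - s2)) * card;
- };
- ASSERT_APPROX_EQUAL(505.531, disjExpBackoff2(500.0 / 1000, 22.0 / 1000, 1000.0), kMaxCEError);
-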
- constexpr double kArr1EqCard = 505.531; // (1 - (1 - 500.0/1000) * sqrt(1 - 22.0/1000)) * 1000
- constexpr double kArr2EqCard = 508.319; // (1 - (1 - 500.0/1000) * sqrt(1 - 33.0/1000)) * 1000
- constexpr double kArr3EqCard = 513.944; // (1 - (1 - 500.0/1000) * sqrt(1 - 55.0/1000)) * 1000
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr1EqCard, kNumNestArr, "na", "{$eq: [1]}", isSargable);
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr2EqCard, kNumNestArr, "na", "{$eq: [2]}", isSargable);
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr3EqCard, kNumNestArr, "na", "{$eq: [3]}", isSargable);
- // For the last case, we have the interval [[1, 2, 3], [1, 2, 3]] U [1, 1].
- // TODO: is this interval semantically correct?
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr1EqCard, kNumNestArr, "na", "{$eq: [1, 2, 3]}", isSargable);
-
- // Now, we test the case of nested arrays.
- // - $elemMatch, once again, returns the number of nested arrays.
- // - Simple equality generates two intervals. We estimate both intervals using the nested array
- // type count. For {$eq: [[1, 2, 3]]}, we get:
- // - [[1, 2, 3], [1, 2, 3]] U [[[1, 2, 3]], [[1, 2, 3]]]
- constexpr double kNestedEqCard =
- 646.447; // (1 - (1 - 500.0/1000) * sqrt(1 - 500.0/1000)) * 1000
- ASSERT_EQ_ELEMMATCH_CE_NODE(
- t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[1, 2, 3]]}", isSargable);
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[1]]}", isSargable);
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[2]]}", isSargable);
- ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[3]]}", isSargable);
-
- // Note: we can't convert range queries on arrays to SargableNodes yet. If we ever can, we
- // should add some more tests here.
-}
-
-TEST(CEHistogramTest, TestFallbackForNonConstIntervals) {
- // This is a sanity test to validate fallback for an interval with non-const bounds.
- IntervalRequirement intervalLowNonConst{
- BoundRequirement(true /*inclusive*/, make<Variable>("v1")),
- BoundRequirement::makePlusInf()};
-
- IntervalRequirement intervalHighNonConst{
- BoundRequirement::makeMinusInf(),
- BoundRequirement(true /*inclusive*/, make<Variable>("v2"))};
-
- IntervalRequirement intervalEqNonConst{
- BoundRequirement(true /*inclusive*/, make<Variable>("v3")),
- BoundRequirement(true /*inclusive*/, make<Variable>("v3"))};
-
- const auto estInterval = [](const auto& interval) {
- ArrayHistogram ah;
- return estimateIntervalCardinality(
- ah, interval, 100 /* inputCardinality */, true /* includeScalar */);
- };
-
- ASSERT_EQ(estInterval(intervalLowNonConst), -1.0);
- ASSERT_EQ(estInterval(intervalHighNonConst), -1.0);
- ASSERT_EQ(estInterval(intervalEqNonConst), -1.0);
-}
-} // namespace
-} // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp
deleted file mode 100644
index 6d9d52b347d..00000000000
--- a/src/mongo/db/query/ce/ce_interpolation_test.cpp
+++ /dev/null
@@ -1,505 +0,0 @@
-/**
- * Copyright (C) 2022-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program.
If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/sbe_stage_builder_helpers.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { -namespace { - -using namespace sbe; - -TEST(EstimatorTest, ManualHistogram) { - std::vector data{{0, 1.0, 1.0, 1.0}, - {10, 1.0, 10.0, 5.0}, - {20, 3.0, 15.0, 3.0}, - {30, 1.0, 10.0, 4.0}, - {40, 2.0, 0.0, 0.0}, - {50, 1.0, 10.0, 5.0}}; - const ScalarHistogram hist = createHistogram(data); - - ASSERT_EQ(55.0, getTotals(hist).card); - - ASSERT_EQ(1.0, estimateIntValCard(hist, 0, EstimationType::kEqual)); - ASSERT_EQ(2.0, estimateIntValCard(hist, 5, EstimationType::kEqual)); - ASSERT_EQ(0.0, estimateIntValCard(hist, 35, EstimationType::kEqual)); - - ASSERT_EQ(15.5, estimateIntValCard(hist, 15, EstimationType::kLess)); - ASSERT_EQ(20.5, estimateIntValCard(hist, 15, EstimationType::kLessOrEqual)); - ASSERT_EQ(28, estimateIntValCard(hist, 20, EstimationType::kLess)); - ASSERT_EQ(31.0, estimateIntValCard(hist, 20, EstimationType::kLessOrEqual)); - - ASSERT_EQ(42, estimateIntValCard(hist, 10, EstimationType::kGreater)); - ASSERT_EQ(43, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual)); - ASSERT_EQ(19, estimateIntValCard(hist, 25, EstimationType::kGreater)); - ASSERT_EQ(21.5, estimateIntValCard(hist, 25, EstimationType::kGreaterOrEqual)); -} - -TEST(EstimatorTest, UniformIntEstimate) { - // This hard-codes a maxdiff histogram with 10 buckets built off a uniform int distribution with - // a minimum of 0, a maximum of 1000, and 70 distinct values. - std::vector data{{2, 1, 0, 0}, - {57, 3, 2, 1}, - {179, 5, 10, 6}, - {317, 5, 9, 6}, - {344, 3, 0, 0}, - {558, 4, 19, 12}, - {656, 2, 4, 3}, - {798, 3, 7, 4}, - {951, 5, 17, 7}, - {986, 1, 0, 0}}; - const ScalarHistogram hist = createHistogram(data); - - // Predicates over bucket bound. - double expectedCard = estimateIntValCard(hist, 558, EstimationType::kEqual); - ASSERT_EQ(4.0, expectedCard); - expectedCard = estimateIntValCard(hist, 558, EstimationType::kLess); - ASSERT_EQ(57.0, expectedCard); - expectedCard = estimateIntValCard(hist, 558, EstimationType::kLessOrEqual); - ASSERT_EQ(61.0, expectedCard); - - // Predicates over value inside of a bucket. - - // Query: [{$match: {a: {$eq: 530}}}]. - expectedCard = estimateIntValCard(hist, 530, EstimationType::kEqual); - ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. - - // Query: [{$match: {a: {$lt: 530}}}]. - expectedCard = estimateIntValCard(hist, 530, EstimationType::kLess); - ASSERT_APPROX_EQUAL(52.9, expectedCard, 0.1); // Actual: 50. - - // Query: [{$match: {a: {$lte: 530}}}]. 
- expectedCard = estimateIntValCard(hist, 530, EstimationType::kLessOrEqual); - ASSERT_APPROX_EQUAL(54.5, expectedCard, 0.1); // Actual: 51. - - // Query: [{$match: {a: {$eq: 400}}}]. - expectedCard = estimateIntValCard(hist, 400, EstimationType::kEqual); - ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. - - // Query: [{$match: {a: {$lt: 400}}}]. - expectedCard = estimateIntValCard(hist, 400, EstimationType::kLess); - ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 39. - - // Query: [{$match: {a: {$lte: 400}}}]. - expectedCard = estimateIntValCard(hist, 400, EstimationType::kLessOrEqual); - ASSERT_APPROX_EQUAL(43.0, expectedCard, 0.1); // Actual: 40. -} - -TEST(EstimatorTest, NormalIntEstimate) { - // This hard-codes a maxdiff histogram with 10 buckets built off a normal int distribution with - // a minimum of 0, a maximum of 1000, and 70 distinct values. - std::vector data{{2, 1, 0, 0}, - {317, 8, 20, 15}, - {344, 2, 0, 0}, - {388, 3, 0, 0}, - {423, 4, 2, 2}, - {579, 4, 12, 8}, - {632, 3, 2, 1}, - {696, 3, 5, 3}, - {790, 5, 4, 2}, - {993, 1, 21, 9}}; - const ScalarHistogram hist = createHistogram(data); - - // Predicates over bucket bound. - double expectedCard = estimateIntValCard(hist, 696, EstimationType::kEqual); - ASSERT_EQ(3.0, expectedCard); - expectedCard = estimateIntValCard(hist, 696, EstimationType::kLess); - ASSERT_EQ(66.0, expectedCard); - expectedCard = estimateIntValCard(hist, 696, EstimationType::kLessOrEqual); - ASSERT_EQ(69.0, expectedCard); - - // Predicates over value inside of a bucket. - - // Query: [{$match: {a: {$eq: 150}}}]. - expectedCard = estimateIntValCard(hist, 150, EstimationType::kEqual); - ASSERT_APPROX_EQUAL(1.3, expectedCard, 0.1); // Actual: 1. - - // Query: [{$match: {a: {$lt: 150}}}]. - expectedCard = estimateIntValCard(hist, 150, EstimationType::kLess); - ASSERT_APPROX_EQUAL(9.1, expectedCard, 0.1); // Actual: 9. - - // Query: [{$match: {a: {$lte: 150}}}]. - expectedCard = estimateIntValCard(hist, 150, EstimationType::kLessOrEqual); - ASSERT_APPROX_EQUAL(10.4, expectedCard, 0.1); // Actual: 10. -} - -TEST(EstimatorTest, UniformStrEstimate) { - // This hard-codes a maxdiff histogram with 10 buckets built off a uniform string distribution - // with a minimum length of 3, a maximum length of 5, and 80 distinct values. - std::vector data{{{"0ejz", 2, 0, 0}, - {"8DCaq", 3, 4, 4}, - {"Cy5Kw", 3, 3, 3}, - {"WXX7w", 3, 31, 20}, - {"YtzS", 2, 0, 0}, - {"fuK", 5, 13, 7}, - {"gLkp", 3, 0, 0}, - {"ixmVx", 2, 6, 2}, - {"qou", 1, 9, 6}, - {"z2b", 1, 9, 6}}}; - const ScalarHistogram hist = createHistogram(data); - - // Predicates over value inside of a bucket. - const auto [tag, value] = value::makeNewString("TTV"_sd); - value::ValueGuard vg(tag, value); - - // Query: [{$match: {a: {$eq: 'TTV'}}}]. - double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.55, expectedCard, 0.1); // Actual: 2. - - // Query: [{$match: {a: {$lt: 'TTV'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(39.8, expectedCard, 0.1); // Actual: 39. - - // Query: [{$match: {a: {$lte: 'TTV'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 41. -} - -TEST(EstimatorTest, NormalStrEstimate) { - // This hard-codes a maxdiff histogram with 10 buckets built off a normal string distribution - // with a minimum length of 3, a maximum length of 5, and 80 distinct values. 
- std::vector data{{ - {"0ejz", 1, 0, 0}, - {"4FGjc", 3, 5, 3}, - {"9bU3", 2, 3, 2}, - {"Cy5Kw", 3, 3, 3}, - {"Lm4U", 2, 11, 5}, - {"TTV", 5, 14, 8}, - {"YtzS", 2, 3, 2}, - {"o9cD4", 6, 26, 16}, - {"qfmnP", 1, 4, 2}, - {"xqbi", 2, 4, 4}, - }}; - const ScalarHistogram hist = createHistogram(data); - - // Predicates over bucket bound. - auto [tag, value] = value::makeNewString("TTV"_sd); - value::ValueGuard vg(tag, value); - - // Query: [{$match: {a: {$eq: 'TTV'}}}]. - double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1); // Actual: 5. - - // Query: [{$match: {a: {$lt: 'TTV'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(47.0, expectedCard, 0.1); // Actual: 47. - - // Query: [{$match: {a: {$lte: 'TTV'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(52.0, expectedCard, 0.1); // Actual: 52. - - // Predicates over value inside of a bucket. - std::tie(tag, value) = value::makeNewString("Pfa"_sd); - - // Query: [{$match: {a: {$eq: 'Pfa'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(1.75, expectedCard, 0.1); // Actual: 2. - - // Query: [{$match: {a: {$lt: 'Pfa'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; - ASSERT_APPROX_EQUAL(38.3, expectedCard, 0.1); // Actual: 35. - - // Query: [{$match: {a: {$lte: 'Pfa'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; - ASSERT_APPROX_EQUAL(40.0, expectedCard, 0.1); // Actual: 37. -} - -TEST(EstimatorTest, UniformIntStrEstimate) { - // This hard-codes a maxdiff histogram with 20 buckets built off of a uniform distribution with - // two types occurring with equal probability: - // - 100 distinct ints between 0 and 1000, and - // - 100 distinct strings of length between 2 and 5. - std::vector data{{ - {2, 3, 0, 0}, {19, 4, 1, 1}, {226, 2, 49, 20}, {301, 5, 12, 4}, - {317, 3, 0, 0}, {344, 2, 3, 1}, {423, 5, 18, 6}, {445, 3, 0, 0}, - {495, 3, 4, 2}, {542, 5, 9, 3}, {696, 3, 44, 19}, {773, 4, 11, 5}, - {805, 2, 8, 4}, {931, 5, 21, 8}, {998, 4, 21, 3}, {"8N4", 5, 31, 14}, - {"MIb", 5, 45, 17}, {"Zgi", 3, 55, 22}, {"pZ", 6, 62, 25}, {"yUwxz", 5, 29, 12}, - }}; - const ScalarHistogram hist = createHistogram(data); - const ArrayHistogram arrHist( - hist, TypeCounts{{value::TypeTags::NumberInt64, 254}, {value::TypeTags::StringSmall, 246}}); - - // Predicates over value inside of the last numeric bucket. - - // Query: [{$match: {a: {$eq: 993}}}]. - double expectedCard = estimateIntValCard(hist, 993, EstimationType::kEqual); - ASSERT_APPROX_EQUAL(7.0, expectedCard, 0.1); // Actual: 9. - - // Query: [{$match: {a: {$lt: 993}}}]. - expectedCard = estimateIntValCard(hist, 993, EstimationType::kLess); - ASSERT_APPROX_EQUAL(241.4, expectedCard, 0.1); // Actual: 241. - - // Query: [{$match: {a: {$lte: 993}}}]. - expectedCard = estimateIntValCard(hist, 993, EstimationType::kLessOrEqual); - ASSERT_APPROX_EQUAL(248.4, expectedCard, 0.1); // Actual: 250. - - // Predicates over value inside of the first string bucket. - auto [tag, value] = value::makeNewString("04e"_sd); - value::ValueGuard vg(tag, value); - - // Query: [{$match: {a: {$eq: '04e'}}}]. - expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; - ASSERT_APPROX_EQUAL(2.2, expectedCard, 0.1); // Actual: 3. 
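-
- // The two in-bucket equality estimates above are consistent with interpolating as
- // rangeFreq / ndv inside a bucket: 21 / 3 == 7.0 for 993 in the {998, 4, 21, 3}
- // bucket, and 31 / 14 =~ 2.2 for '04e' in the {"8N4", 5, 31, 14} bucket. A minimal
- // sketch with a hypothetical helper (not part of the estimator API):
- const auto inBucketEqEstimate = [](double rangeFreq, double ndv) { return rangeFreq / ndv; };
- ASSERT_APPROX_EQUAL(7.0, inBucketEqEstimate(21.0, 3.0), 0.1);
- ASSERT_APPROX_EQUAL(2.2, inBucketEqEstimate(31.0, 14.0), 0.1);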
- - value::TypeTags lowTag = value::TypeTags::NumberInt64; - value::Value lowVal = 100000000; - - // Type bracketing: low value of different type than the bucket bound. - // Query: [{$match: {a: {$eq: 100000000}}}]. - expectedCard = estimateCardEq(arrHist, lowTag, lowVal, true /* includeScalar */); - ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1); // Actual: 0. - - // No interpolation for inequality to values inside the first string bucket, fallback to half of - // the bucket frequency. - - // Query: [{$match: {a: {$lt: '04e'}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - tag, - value, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 0. - - // Query: [{$match: {a: {$lte: '04e'}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - true /* highInclusive */, - tag, - value, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 3. - - // Value towards the end of the bucket gets the same half bucket estimate. - std::tie(tag, value) = value::makeNewString("8B5"_sd); - - // Query: [{$match: {a: {$lt: '8B5'}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - tag, - value, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 24. - - // Query: [{$match: {a: {$lte: '8B5'}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - true /* highInclusive */, - tag, - value, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 29. -} - -TEST(EstimatorTest, UniformIntArrayOnlyEstimate) { - // This hard-codes a maxdiff histogram with 10 buckets built off of an array distribution with - // arrays between 3 and 5 elements long, each containing 100 distinct ints uniformly distributed - // between 0 and 1000. There are no scalar elements. 
- std::vector scalarData{{}}; - const ScalarHistogram scalarHist = createHistogram(scalarData); - - std::vector minData{{ - {5, 3, 0, 0}, {19, 5, 2, 1}, {57, 4, 4, 3}, {116, 7, 13, 7}, {198, 3, 15, 6}, - {228, 2, 3, 2}, {254, 4, 0, 0}, {280, 2, 2, 1}, {335, 3, 5, 3}, {344, 2, 0, 0}, - {388, 3, 0, 0}, {420, 2, 0, 0}, {454, 1, 6, 3}, {488, 2, 1, 1}, {530, 1, 0, 0}, - {561, 1, 0, 0}, {609, 1, 0, 0}, {685, 1, 0, 0}, {713, 1, 0, 0}, {758, 1, 0, 0}, - }}; - const ScalarHistogram minHist = createHistogram(minData); - - std::vector maxData{{ - {301, 1, 0, 0}, {408, 2, 0, 0}, {445, 1, 0, 0}, {605, 2, 0, 0}, {620, 1, 0, 0}, - {665, 1, 1, 1}, {687, 3, 0, 0}, {704, 2, 6, 2}, {718, 2, 2, 1}, {741, 2, 1, 1}, - {752, 2, 0, 0}, {823, 7, 3, 3}, {827, 1, 0, 0}, {852, 3, 0, 0}, {864, 5, 0, 0}, - {909, 7, 12, 5}, {931, 2, 3, 1}, {939, 3, 0, 0}, {970, 2, 12, 4}, {998, 1, 10, 4}, - }}; - const ScalarHistogram maxHist = createHistogram(maxData); - - std::vector uniqueData{{ - {5, 3, 0, 0}, {19, 6, 2, 1}, {57, 4, 4, 3}, {116, 7, 15, 8}, {228, 2, 38, 13}, - {254, 7, 0, 0}, {269, 10, 0, 0}, {280, 7, 3, 1}, {306, 4, 1, 1}, {317, 4, 0, 0}, - {344, 2, 19, 5}, {423, 2, 27, 8}, {507, 2, 22, 13}, {704, 8, 72, 34}, {718, 6, 3, 1}, - {758, 3, 13, 4}, {864, 7, 35, 14}, {883, 4, 0, 0}, {939, 5, 32, 10}, {998, 1, 24, 9}, - }}; - const ScalarHistogram uniqueHist = createHistogram(uniqueData); - - const ArrayHistogram arrHist(scalarHist, - TypeCounts{{value::TypeTags::Array, 100}}, - uniqueHist, - minHist, - maxHist, - TypeCounts{}, - 0); - - // Query in the middle of the domain: estimate from ArrayUnique histogram. - value::TypeTags lowTag = value::TypeTags::NumberInt64; - value::Value lowVal = 500; - value::TypeTags highTag = value::TypeTags::NumberInt64; - value::Value highVal = 600; - - // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. - double expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(27.0, expectedCard, 0.1); // actual 21. - - // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. - // Note: although there are no scalars, the estimate is different than the - // above since we use different formulas. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(92.0, expectedCard, 0.1); // actual 92. - - // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. - lowVal = 10; - highVal = 110; - - // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 10, $lt: 110}}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); // actual 29. - - // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(27.8, expectedCard, 0.1); // actual 31. 
-} - -TEST(EstimatorTest, UniformIntMixedArrayEstimate) { - // This hard-codes a maxdiff histogram with 20 buckets built off of a mixed distribution split - // with equal probability between: - // - an array distribution between 3 and 5 elements long, each containing 80 distinct ints - // uniformly distributed between 0 and 1000, and - // - a uniform int distribution with 80 distinct ints between 0 and 1000. - std::vector scalarData{{ - {25, 1, 0, 0}, {41, 2, 0, 0}, {142, 2, 3, 3}, {209, 3, 3, 1}, {243, 1, 2, 1}, - {296, 3, 4, 3}, {321, 5, 4, 2}, {480, 3, 9, 8}, {513, 3, 3, 2}, {554, 1, 0, 0}, - {637, 3, 3, 2}, {666, 2, 1, 1}, {697, 2, 2, 1}, {750, 3, 3, 2}, {768, 4, 0, 0}, - {791, 4, 3, 3}, {851, 2, 2, 2}, {927, 2, 10, 6}, {958, 3, 2, 1}, {980, 3, 0, 0}, - }}; - const ScalarHistogram scalarHist = createHistogram(scalarData); - - std::vector minData{{ - {3, 3, 0, 0}, {5, 8, 0, 0}, {9, 3, 0, 0}, {19, 2, 0, 0}, {49, 7, 4, 2}, - {69, 6, 0, 0}, {115, 3, 5, 3}, {125, 2, 0, 0}, {146, 1, 2, 1}, {198, 2, 4, 3}, - {214, 2, 0, 0}, {228, 3, 0, 0}, {260, 3, 4, 1}, {280, 1, 2, 2}, {330, 2, 2, 1}, - {344, 6, 0, 0}, {388, 2, 0, 0}, {420, 2, 0, 0}, {461, 2, 8, 4}, {696, 1, 2, 1}, - }}; - const ScalarHistogram minHist = createHistogram(minData); - - std::vector maxData{{ - {301, 1, 0, 0}, {445, 1, 0, 0}, {491, 1, 0, 0}, {533, 3, 0, 0}, {605, 3, 0, 0}, - {620, 2, 0, 0}, {647, 3, 0, 0}, {665, 4, 0, 0}, {713, 3, 10, 4}, {741, 3, 0, 0}, - {814, 3, 2, 2}, {839, 2, 1, 1}, {864, 1, 2, 2}, {883, 3, 0, 0}, {893, 7, 0, 0}, - {898, 5, 0, 0}, {909, 1, 12, 3}, {931, 2, 2, 1}, {953, 6, 3, 2}, {993, 1, 7, 5}, - }}; - const ScalarHistogram maxHist = createHistogram(maxData); - - std::vector uniqueData{{ - {3, 3, 0, 0}, {19, 5, 11, 2}, {49, 7, 5, 3}, {69, 8, 0, 0}, {75, 3, 0, 0}, - {125, 2, 10, 5}, {228, 3, 27, 14}, {260, 4, 5, 1}, {344, 6, 36, 13}, {423, 4, 20, 8}, - {605, 4, 61, 28}, {665, 8, 12, 6}, {758, 4, 41, 16}, {768, 5, 0, 0}, {776, 3, 0, 0}, - {864, 3, 15, 10}, {883, 8, 0, 0}, {911, 2, 28, 6}, {953, 6, 8, 4}, {993, 1, 7, 5}, - }}; - const ScalarHistogram uniqueHist = createHistogram(uniqueData); - - TypeCounts typeCounts{{value::TypeTags::NumberInt64, 106}, {value::TypeTags::Array, 94}}; - const ArrayHistogram arrHist(scalarHist, - typeCounts, - uniqueHist, - minHist, - maxHist, - TypeCounts{{value::TypeTags::NumberInt64, 375}}, - 0); - - value::TypeTags lowTag = value::TypeTags::NumberInt64; - value::Value lowVal = 500; - value::TypeTags highTag = value::TypeTags::NumberInt64; - value::Value highVal = 550; - - // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}]. - double expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - true /* includeScalar */); - ASSERT_APPROX_EQUAL(92.9, expectedCard, 0.1); // Actual: 94. - - // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. - expectedCard = estimateCardRange(arrHist, - false /* lowInclusive */, - lowTag, - lowVal, - false /* highInclusive */, - highTag, - highVal, - false /* includeScalar */); - ASSERT_APPROX_EQUAL(11.0, expectedCard, 0.1); // Actual: 8. -} - -} // namespace -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_sampling.cpp b/src/mongo/db/query/ce/ce_sampling.cpp deleted file mode 100644 index ce31ae842e2..00000000000 --- a/src/mongo/db/query/ce/ce_sampling.cpp +++ /dev/null @@ -1,362 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/ce_sampling.h" - -#include "mongo/db/exec/sbe/abt/abt_lower.h" -#include "mongo/db/query/cqf_command_utils.h" -#include "mongo/db/query/optimizer/explain.h" -#include "mongo/db/query/optimizer/index_bounds.h" -#include "mongo/db/query/optimizer/props.h" -#include "mongo/db/query/optimizer/utils/abt_hash.h" -#include "mongo/db/query/optimizer/utils/memo_utils.h" -#include "mongo/logv2/log.h" - -#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery - -namespace mongo::ce { -namespace cascades = optimizer::cascades; -namespace properties = optimizer::properties; - -using ABT = optimizer::ABT; -using CEType = optimizer::CEType; -using LogicalProps = properties::LogicalProps; -using OptPhaseManager = optimizer::OptPhaseManager; -using Memo = cascades::Memo; -using Metadata = optimizer::Metadata; - -class SamplingPlanExtractor { -public: - SamplingPlanExtractor(const Memo& memo, - const OptPhaseManager& phaseManager, - const size_t sampleSize) - : _memo(memo), _sampleSize(sampleSize), _phaseManager(phaseManager) {} - - void transport(ABT& n, const optimizer::MemoLogicalDelegatorNode& node) { - n = extract(_memo.getLogicalNodes(node.getGroupId()).front()); - } - - void transport(ABT& n, const optimizer::ScanNode& /*node*/, ABT& /*binder*/) { - // We will lower the scan node in a sampling context here. - // TODO: for now just return the documents in random order. - n = optimizer::make( - properties::LimitSkipRequirement(_sampleSize, 0), std::move(n)); - } - - void transport(ABT& n, - const optimizer::FilterNode& /*node*/, - ABT& childResult, - ABT& /*exprResult*/) { - // Skip over filters. - n = childResult; - } - - void transport(ABT& /*n*/, - const optimizer::EvaluationNode& /*node*/, - ABT& /*childResult*/, - ABT& /*exprResult*/) { - // Keep Eval nodes. - } - - void transport( - ABT& n, const optimizer::SargableNode& node, ABT& childResult, ABT& refs, ABT& binds) { - ABT result = childResult; - // Retain only output bindings without applying filters. 
- for (const auto& [key, req] : node.getReqMap()) { - if (const auto& boundProjName = req.getBoundProjectionName()) { - optimizer::lowerPartialSchemaRequirement( - key, - optimizer::PartialSchemaRequirement{ - boundProjName, - optimizer::IntervalReqExpr::makeSingularDNF(), - req.getIsPerfOnly()}, - result, - _phaseManager.getPathToInterval()); - } - } - std::swap(n, result); - } - - void transport(ABT& n, const optimizer::CollationNode& /*node*/, ABT& childResult, ABT& refs) { - // Skip over collation nodes. - n = childResult; - } - - template - void transport(ABT& /*n*/, const T& /*node*/, Ts&&...) { - if constexpr (std::is_base_of_v) { - uasserted(6624242, "Should not be seeing other types of nodes here."); - } - } - - ABT extract(ABT node) { - optimizer::algebra::transport(node, *this); - return node; - } - -private: - const Memo& _memo; - const size_t _sampleSize; - const OptPhaseManager& _phaseManager; -}; - -class CESamplingTransportImpl { - static constexpr size_t kMaxSampleSize = 1000; - -public: - CESamplingTransportImpl(OperationContext* opCtx, - OptPhaseManager phaseManager, - const int64_t numRecords, - std::unique_ptr fallbackCE) - : _phaseManager(std::move(phaseManager)), - _opCtx(opCtx), - _sampleSize(std::min(numRecords, kMaxSampleSize)), - _fallbackCE(std::move(fallbackCE)) {} - - CEType transport(const ABT& n, - const optimizer::FilterNode& node, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - CEType childResult, - CEType /*exprResult*/) { - if (!properties::hasProperty(logicalProps)) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - - SamplingPlanExtractor planExtractor(memo, _phaseManager, _sampleSize); - // Create a plan with all eval nodes so far and the filter last. - ABT abtTree = - optimizer::make(node.getFilter(), planExtractor.extract(n)); - - return estimateFilterCE(metadata, memo, logicalProps, n, std::move(abtTree), childResult); - } - - CEType transport(const ABT& n, - const optimizer::SargableNode& node, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - CEType childResult, - CEType /*bindResult*/, - CEType /*refsResult*/) { - if (!properties::hasProperty(logicalProps)) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - - SamplingPlanExtractor planExtractor(memo, _phaseManager, _sampleSize); - ABT extracted = planExtractor.extract(n); - - // Estimate individual requirements separately by potentially re-using cached results. - // Here we assume that each requirement is independent. - // TODO: consider estimating together the entire set of requirements (but caching!) - CEType result = childResult; - for (const auto& [key, req] : node.getReqMap()) { - if (req.getIsPerfOnly()) { - // Ignore perf-only requirements. - continue; - } - - if (!isIntervalReqFullyOpenDNF(req.getIntervals())) { - ABT lowered = extracted; - // Lower requirement without an output binding. - lowerPartialSchemaRequirement( - key, - optimizer::PartialSchemaRequirement{boost::none /*boundProjectionName*/, - req.getIntervals(), - req.getIsPerfOnly()}, - lowered, - _phaseManager.getPathToInterval()); - uassert(6624243, "Expected a filter node", lowered.is()); - result = - estimateFilterCE(metadata, memo, logicalProps, n, std::move(lowered), result); - } - } - - return result; - } - - /** - * Other ABT types. 
- */ - template - CEType transport(const ABT& n, - const T& /*node*/, - const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - Ts&&...) { - if (optimizer::canBeLogicalNode()) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - return 0.0; - } - - CEType derive(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const ABT::reference_type logicalNodeRef) { - return optimizer::algebra::transport( - logicalNodeRef, *this, metadata, memo, logicalProps); - } - -private: - CEType estimateFilterCE(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const ABT& n, - ABT abtTree, - CEType childResult) { - auto it = _selectivityCacheMap.find(abtTree); - if (it != _selectivityCacheMap.cend()) { - // Cache hit. - return it->second * childResult; - } - - const auto [success, selectivity] = estimateSelectivity(abtTree); - if (!success) { - return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); - } - - _selectivityCacheMap.emplace(std::move(abtTree), selectivity); - - OPTIMIZER_DEBUG_LOG(6264805, - 5, - "CE sampling estimated filter selectivity", - "selectivity"_attr = selectivity); - return selectivity * childResult; - } - - std::pair estimateSelectivity(ABT abtTree) { - // Add a group by to count number of documents. - const optimizer::ProjectionName sampleSumProjection = "sum"; - abtTree = optimizer::make( - optimizer::ProjectionNameVector{}, - optimizer::ProjectionNameVector{sampleSumProjection}, - optimizer::makeSeq(optimizer::make( - "$sum", makeSeq(optimizer::Constant::int64(1)))), - std::move(abtTree)); - abtTree = optimizer::make( - properties::ProjectionRequirement{optimizer::ProjectionNameVector{sampleSumProjection}}, - std::move(abtTree)); - - - OPTIMIZER_DEBUG_LOG(6264806, - 5, - "Estimate selectivity ABT", - "explain"_attr = optimizer::ExplainGenerator::explainV2(abtTree)); - - _phaseManager.optimize(abtTree); - - auto env = optimizer::VariableEnvironment::build(abtTree); - optimizer::SlotVarMap slotMap; - boost::optional ridSlot; - sbe::value::SlotIdGenerator ids; - optimizer::SBENodeLowering g{env, - slotMap, - ridSlot, - ids, - _phaseManager.getMetadata(), - _phaseManager.getNodeToGroupPropsMap(), - _phaseManager.getRIDProjections(), - true /*randomScan*/}; - auto sbePlan = g.optimize(abtTree); - tassert(6624261, "Unexpected rid slot", !ridSlot); - - // TODO: return errors instead of exceptions? - uassert(6624244, "Lowering failed", sbePlan != nullptr); - uassert(6624245, "Invalid slot map size", slotMap.size() == 1); - - sbePlan->attachToOperationContext(_opCtx); - sbe::CompileCtx ctx(std::make_unique()); - sbePlan->prepare(ctx); - - std::vector accessors; - for (auto& [name, slot] : slotMap) { - accessors.emplace_back(sbePlan->getAccessor(ctx, slot)); - } - - sbePlan->open(false); - ON_BLOCK_EXIT([&] { sbePlan->close(); }); - - while (sbePlan->getNext() != sbe::PlanState::IS_EOF) { - const auto [tag, value] = accessors.at(0)->getViewOfValue(); - if (tag == sbe::value::TypeTags::NumberInt64) { - // TODO: check if we get exactly one result from the groupby? - return {true, static_cast(value) / _sampleSize}; - } - return {false, {}}; - }; - - // If nothing passes the filter, estimate 0.0 selectivity. HashGroup will return 0 results. 
- return {true, 0.0}; - } - - struct NodeRefHash { - size_t operator()(const ABT& node) const { - return optimizer::ABTHashGenerator::generate(node); - } - }; - - struct NodeRefCompare { - bool operator()(const ABT& left, const ABT& right) const { - return left == right; - } - }; - - // Cache a logical node reference to computed selectivity. Used for Filter and Sargable nodes. - optimizer::opt::unordered_map - _selectivityCacheMap; - - OptPhaseManager _phaseManager; - - // We don't own this. - OperationContext* _opCtx; - - const int64_t _sampleSize; - std::unique_ptr _fallbackCE; -}; - -CESamplingTransport::CESamplingTransport(OperationContext* opCtx, - OptPhaseManager phaseManager, - const int64_t numRecords, - std::unique_ptr fallbackCE) - : _impl(std::make_unique( - opCtx, std::move(phaseManager), numRecords, std::move(fallbackCE))) {} - -CESamplingTransport::~CESamplingTransport() {} - -CEType CESamplingTransport::deriveCE(const Metadata& metadata, - const Memo& memo, - const LogicalProps& logicalProps, - const ABT::reference_type logicalNodeRef) const { - return _impl->derive(metadata, memo, logicalProps, logicalNodeRef); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_sampling.h b/src/mongo/db/query/ce/ce_sampling.h deleted file mode 100644 index 9e13abb5d13..00000000000 --- a/src/mongo/db/query/ce/ce_sampling.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#pragma once - -#include "mongo/db/query/optimizer/cascades/interfaces.h" -#include "mongo/db/query/optimizer/opt_phase_manager.h" - -namespace mongo::ce { - -class CESamplingTransportImpl; - -class CESamplingTransport : public optimizer::cascades::CEInterface { -public: - CESamplingTransport(OperationContext* opCtx, - optimizer::OptPhaseManager phaseManager, - int64_t numRecords, - std::unique_ptr fallbackCE); - ~CESamplingTransport(); - - optimizer::CEType deriveCE(const optimizer::Metadata& metadata, - const optimizer::cascades::Memo& memo, - const optimizer::properties::LogicalProps& logicalProps, - optimizer::ABT::reference_type logicalNodeRef) const final; - -private: - std::unique_ptr _impl; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp deleted file mode 100644 index 5212c48ab00..00000000000 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include - -#include "mongo/db/query/ce/ce_test_utils.h" - -#include "mongo/db/pipeline/abt/utils.h" -#include "mongo/db/query/optimizer/explain.h" -#include "mongo/db/query/optimizer/metadata_factory.h" -#include "mongo/db/query/optimizer/opt_phase_manager.h" -#include "mongo/db/query/optimizer/rewrites/const_eval.h" -#include "mongo/db/query/optimizer/utils/unit_test_pipeline_utils.h" -#include "mongo/db/query/optimizer/utils/unit_test_utils.h" -#include "mongo/db/query/sbe_stage_builder_helpers.h" -#include "mongo/unittest/unittest.h" - -namespace mongo::ce { - -using namespace optimizer; -using namespace cascades; - -CETester::CETester(std::string collName, - double collCard, - const optimizer::OptPhaseManager::PhaseSet& optPhases) - : _optPhases(optPhases), _hints(), _metadata({}), _collName(collName) { - addCollection(collName, collCard); -} - -optimizer::CEType CETester::getMatchCE(const std::string& queryPredicate, - std::function nodePredicate) const { - return getCE("[{$match: " + queryPredicate + "}]", nodePredicate); -} - -optimizer::CEType CETester::getCE(const std::string& pipeline, - std::function nodePredicate) const { - if constexpr (kCETestLogOnly) { - std::cout << "\n\nQuery: " << pipeline << "\n"; - } - - // Construct ABT from pipeline and optimize. - ABT abt = translatePipeline(pipeline, _collName); - - // Get cardinality estimate. - return getCE(abt, nodePredicate); -} - -optimizer::CEType CETester::getCE(ABT& abt, std::function nodePredicate) const { - if constexpr (kCETestLogOnly) { - std::cout << ExplainGenerator::explainV2(abt) << std::endl; - } - - OptPhaseManager phaseManager{_optPhases, - _prefixId, - false /*requireRID*/, - _metadata, - getCETransport(), - makeHeuristicCE(), - makeCosting(), - defaultConvertPathToInterval, - ConstEval::constFold, - DebugInfo::kDefaultForTests, - _hints}; - phaseManager.optimize(abt); - - const auto& memo = phaseManager.getMemo(); - if constexpr (kCETestLogOnly) { - std::cout << ExplainGenerator::explainMemo(memo) << std::endl; - } - - auto cht = getCETransport(); - - // If we are running no optimization phases, we are ensuring that we get the correct estimate on - // the original ABT (usually testing the CE for FilterNodes). The memo won't have any groups for - // us to estimate directly yet. - if (_optPhases.empty()) { - auto card = cht->deriveCE(_metadata, memo, {}, abt.ref()); - - if constexpr (kCETestLogOnly) { - std::cout << "CE: " << card << std::endl; - } - - return card; - } - - CEType outCard = kInvalidCardinality; - for (size_t groupId = 0; groupId < memo.getGroupCount(); groupId++) { - // Note that we always verify CE for MemoLogicalDelegatorNodes when calling getCE(). - - // If the 'optPhases' either ends with the MemoSubstitutionPhase or the - // MemoImplementationPhase, we should have exactly one logical node per group. However, if - // we have indexes, or a $group, we may have multiple logical nodes. In this case, we still - // want to pick the first node. - const auto& node = memo.getLogicalNodes(groupId).front(); - - // This gets the cardinality estimate actually produced during optimization. - const auto& logicalProps = memo.getLogicalProps(groupId); - auto memoCE = properties::getPropertyConst(logicalProps) - .getEstimate(); - - // Conversely, here we call deriveCE() on the ABT produced by the optimization phases, which - // has all its delegators dereferenced. 
- auto card = cht->deriveCE(_metadata, memo, logicalProps, node.ref()); - - if constexpr (!kCETestLogOnly) { - // Ensure that the CE stored for the logical nodes of each group is what we would expect - // when estimating that node directly. Note that this check will fail if we are testing - // histogram estimation and only using the MemoSubstitutionPhase because the memo always - // uses heuristic estimation in this case. - ASSERT_APPROX_EQUAL(card, memoCE, kMaxCEError); - } else { - if (std::abs(memoCE - card) > kMaxCEError) { - std::cout << "ERROR: CE Group(" << groupId << ") " << card << " vs. " << memoCE - << std::endl; - std::cout << ExplainGenerator::explainV2(node) << std::endl; - } - } - - if (nodePredicate(node)) { - // We want to return the cardinality for the memo group matching the 'nodePredicate'. - outCard = memoCE; - } - } - - ASSERT_NOT_EQUALS(outCard, kInvalidCardinality); - - if constexpr (kCETestLogOnly) { - std::cout << "CE: " << outCard << std::endl; - } - - return outCard; -} - -ScanDefinition& CETester::getCollScanDefinition() { - auto it = _metadata._scanDefs.find(_collName); - invariant(it != _metadata._scanDefs.end()); - return it->second; -} - - -void CETester::setCollCard(double card) { - auto& scanDef = getCollScanDefinition(); - addCollection(_collName, card, scanDef.getIndexDefs()); -} - -void CETester::setIndexes(opt::unordered_map indexes) { - auto& scanDef = getCollScanDefinition(); - addCollection(_collName, scanDef.getCE(), indexes); -} - -void CETester::addCollection(std::string collName, - double numRecords, - opt::unordered_map indexes) { - _metadata._scanDefs.insert_or_assign(collName, - createScanDef({}, - indexes, - ConstEval::constFold, - {DistributionType::Centralized}, - true /*exists*/, - numRecords)); -} - -ScalarHistogram createHistogram(const std::vector& data) { - value::Array bounds; - std::vector buckets; - - double cumulativeFreq = 0.0; - double cumulativeNDV = 0.0; - - for (size_t i = 0; i < data.size(); i++) { - const auto& item = data.at(i); - const auto [tag, val] = stage_builder::makeValue(item._v); - bounds.push_back(tag, val); - - cumulativeFreq += item._equalFreq + item._rangeFreq; - cumulativeNDV += item._ndv + 1.0; - buckets.emplace_back( - item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); - } - - return {std::move(bounds), std::move(buckets)}; -} - -double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) { - const auto [tag, val] = - std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(v)); - return estimate(hist, tag, val, type).card; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h deleted file mode 100644 index 0894501bc2a..00000000000 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ /dev/null @@ -1,250 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include - -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/query/optimizer/cascades/interfaces.h" -#include "mongo/db/query/optimizer/opt_phase_manager.h" - -namespace mongo { - -namespace optimizer { -namespace cascades { - -// Forward declaration. -class CEInterface; - -} // namespace cascades -} // namespace optimizer - -namespace ce { - -using namespace optimizer; -using namespace sbe; - -// Enable this flag to log all estimates, and let all tests pass. -constexpr bool kCETestLogOnly = false; - -const double kMaxCEError = 0.01; -const CEType kInvalidCardinality = -1.0; - -const OptPhaseManager::PhaseSet kDefaultCETestPhaseSet{OptPhase::MemoSubstitutionPhase, - OptPhase::MemoExplorationPhase, - OptPhase::MemoImplementationPhase}; - -const OptPhaseManager::PhaseSet kOnlySubPhaseSet{OptPhase::MemoSubstitutionPhase}; - -const OptPhaseManager::PhaseSet kNoOptPhaseSet{}; - -/** - * Helpful macros for asserting that the CE of a $match predicate is approximately what we were - * expecting. - */ - -#define _ASSERT_CE(estimatedCE, expectedCE) \ - if constexpr (kCETestLogOnly) { \ - if (std::abs(estimatedCE - expectedCE) > kMaxCEError) { \ - std::cout << "ERROR: expected " << expectedCE << std::endl; \ - } \ - ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \ - } else { \ - ASSERT_APPROX_EQUAL(estimatedCE, expectedCE, kMaxCEError); \ - } -#define _PREDICATE(field, predicate) (str::stream() << "{" << field << ": " << predicate "}") -#define _ELEMMATCH_PREDICATE(field, predicate) \ - (str::stream() << "{" << field << ": {$elemMatch: " << predicate << "}}") - -// This macro verifies the cardinality of a pipeline or an input ABT. -#define ASSERT_CE(ce, pipeline, expectedCE) _ASSERT_CE(ce.getCE(pipeline), (expectedCE)) - -// This macro does the same as above but also sets the collection cardinality. -#define ASSERT_CE_CARD(ce, pipeline, expectedCE, collCard) \ - ce.setCollCard(collCard); \ - ASSERT_CE(ce, pipeline, expectedCE) - -// This macro verifies the cardinality of a pipeline with a single $match predicate. -#define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ - _ASSERT_CE(ce.getMatchCE(predicate), (expectedCE)) - -#define ASSERT_MATCH_CE_NODE(ce, queryPredicate, expectedCE, nodePredicate) \ - _ASSERT_CE(ce.getMatchCE(queryPredicate, nodePredicate), (expectedCE)) - -// This macro does the same as above but also sets the collection cardinality. -#define ASSERT_MATCH_CE_CARD(ce, predicate, expectedCE, collCard) \ - ce.setCollCard(collCard); \ - ASSERT_MATCH_CE(ce, predicate, expectedCE) - -// This macro tests cardinality of two versions of the predicate; with and without $elemMatch. 
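-// For example, a call such as the following (the numbers are illustrative, 't' is a CETester):
-//
-//     ASSERT_EQ_ELEMMATCH_CE(t, 10.0, 5.0, "a", "{$eq: 1}");
-//
-// expands into one ASSERT_MATCH_CE against {a: {$eq: 1}} expecting CE 10.0 and one against
-// {a: {$elemMatch: {$eq: 1}}} expecting CE 5.0.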
-#define ASSERT_EQ_ELEMMATCH_CE(tester, expectedCE, elemMatchExpectedCE, field, predicate) \ - ASSERT_MATCH_CE(tester, _PREDICATE(field, predicate), expectedCE); \ - ASSERT_MATCH_CE(tester, _ELEMMATCH_PREDICATE(field, predicate), elemMatchExpectedCE) - -#define ASSERT_EQ_ELEMMATCH_CE_NODE(tester, expectedCE, elemMatchExpectedCE, field, predicate, n) \ - ASSERT_MATCH_CE_NODE(tester, _PREDICATE(field, predicate), expectedCE, n); \ - ASSERT_MATCH_CE_NODE(tester, _ELEMMATCH_PREDICATE(field, predicate), elemMatchExpectedCE, n) - -// Some commonly used functions for picking nodes in the memo for testing estimation. -template -bool isSargableNode(const ABT& n) { - if constexpr (NumReq == 0) { - return n.is(); - } - - // Sometimes SargableNodes get split and placed into different memo groups, but we are looking - // for a SargableNode with a specific number of predicates. For tests, we only care about - // verifying the cardinality of that one. - if (auto* sargable = n.cast()) { - return sargable->getReqMap().size() == NumReq; - } - return false; -} -const auto isSargable = isSargableNode<0>; -const auto isSargable1 = isSargableNode<1>; -const auto isSargable2 = isSargableNode<2>; -const auto isSargable3 = isSargableNode<3>; -const auto isSargable4 = isSargableNode<4>; -const auto isRoot = [](const ABT& n) -> bool { return n.is(); }; - -/** - * A test utility class for helping verify the cardinality of CE transports on a given $match - * predicate. - */ -class CETester { -public: - /** - * The tester initializes at least one collection with the name 'collName' and the cardinality - * 'numRecords' in the metadata. - */ - CETester(std::string collName, - double numRecords, - const OptPhaseManager::PhaseSet& optPhases = kDefaultCETestPhaseSet); - - /** - * Returns the estimated cardinality of a given 'matchPredicate'. - * - * 'nodePredicate' identifies the node in the memo we want to estimate. - */ - CEType getMatchCE(const std::string& matchPredicate, - std::function nodePredicate = isRoot) const; - - /** - * Returns the estimated cardinality of a given 'pipeline'. - * - * 'nodePredicate' identifies the node in the memo we want to estimate. - */ - CEType getCE(const std::string& pipeline, - std::function nodePredicate = isRoot) const; - - /** - * Returns the estimated cardinality of a given 'abt'. - * - * 'nodePredicate' identifies the node in the memo we want to estimate. - */ - CEType getCE(ABT& abt, std::function nodePredicate = isRoot) const; - - /** - * Updates the cardinality of the collection '_collName'. - */ - void setCollCard(double card); - - /** - * Updates the indexes used by the collection '_collName'. - */ - void setIndexes(opt::unordered_map indexes); - - /** - * Adds a ScanDefinition for an additional collection for the test. - */ - void addCollection(std::string collName, - double numRecords, - opt::unordered_map indexes = {}); - - /** - * Prevents the optimizer from generating collection scan plans. - */ - void setDisableScan(bool disableScan) { - _hints._disableScan = disableScan; - } - -protected: - /** - * Subclasses need to override this method to initialize the transports they are testing. - */ - virtual std::unique_ptr getCETransport() const = 0; - -private: - /** - * Helper to find the ScanDefinition of '_collName' in _metadata. - */ - ScanDefinition& getCollScanDefinition(); - - // Phases to use when optimizing an input query. - const OptPhaseManager::PhaseSet& _optPhases; - - // Used to initialize the OptPhaseManager. 
- mutable PrefixId _prefixId; - - // Allows us to pass hints to the optimizer. - QueryHints _hints; - - // Stores the ScanDefinitions for all collections defined in the test. - Metadata _metadata; - - // Name of the collection tests will be executed against. - std::string _collName; -}; - -/** - * Test utility for helping with creation of manual histograms in the unit tests. - */ -struct BucketData { - Value _v; - double _equalFreq; - double _rangeFreq; - double _ndv; - - BucketData(Value v, double equalFreq, double rangeFreq, double ndv) - : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} - BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) - : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} - BucketData(int v, double equalFreq, double rangeFreq, double ndv) - : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} -}; - -ScalarHistogram createHistogram(const std::vector& data); - -double estimateIntValCard(const ScalarHistogram& hist, int v, EstimationType type); - -} // namespace ce -} // namespace mongo diff --git a/src/mongo/db/query/ce/collection_statistics.h b/src/mongo/db/query/ce/collection_statistics.h deleted file mode 100644 index 5949215b448..00000000000 --- a/src/mongo/db/query/ce/collection_statistics.h +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/array_histogram.h" - -namespace mongo::ce { - -using Histograms = std::map>; - -class CollectionStatistics { -public: - /** - * Returns the cardinality of the given collection. - */ - virtual double getCardinality() const = 0; - - /** - * Returns the histogram for the given field path, or nullptr if none exists. - */ - virtual const ArrayHistogram* getHistogram(const std::string& path) const = 0; - - /** - * Adds a histogram along the given path. 
- */ - virtual void addHistogram(const std::string& path, - std::shared_ptr histogram) const = 0; - - virtual ~CollectionStatistics() = default; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/collection_statistics_impl.cpp b/src/mongo/db/query/ce/collection_statistics_impl.cpp deleted file mode 100644 index 7bf6b4e7a11..00000000000 --- a/src/mongo/db/query/ce/collection_statistics_impl.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/query/ce/collection_statistics_impl.h" -#include "mongo/db/client.h" -#include "mongo/db/query/ce/stats_catalog.h" - -namespace mongo::ce { - -CollectionStatisticsImpl::CollectionStatisticsImpl(double cardinality, const NamespaceString& nss) - : _cardinality{cardinality}, _histograms{}, _nss{nss} {}; - -double CollectionStatisticsImpl::getCardinality() const { - return _cardinality; -} - -void CollectionStatisticsImpl::addHistogram(const std::string& path, - std::shared_ptr histogram) const { - _histograms[path] = histogram; -} - -const ArrayHistogram* CollectionStatisticsImpl::getHistogram(const std::string& path) const { - if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { - return mapIt->second.get(); - } else { - uassert(8423368, "no current client", Client::getCurrent()); - auto opCtx = Client::getCurrent()->getOperationContext(); - uassert(8423367, "no operation context", opCtx); - StatsCatalog& statsCatalog = StatsCatalog::get(opCtx); - const auto swHistogram = statsCatalog.getHistogram(opCtx, _nss, path); - if (!swHistogram.isOK()) { - if (swHistogram != ErrorCodes::NamespaceNotFound) { - uasserted(swHistogram.getStatus().code(), - str::stream() << "Error getting histograms for path " << _nss << " : " - << path << swHistogram.getStatus().reason()); - } - return nullptr; - } - const auto histogram = std::move(swHistogram.getValue()); - addHistogram(path, histogram); - return histogram.get(); - } -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/collection_statistics_impl.h b/src/mongo/db/query/ce/collection_statistics_impl.h deleted file mode 100644 index 11b2c9630ce..00000000000 --- a/src/mongo/db/query/ce/collection_statistics_impl.h +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#pragma once - -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/collection_statistics.h" - -namespace mongo::ce { - -using Histograms = std::map>; - -class CollectionStatisticsImpl : public CollectionStatistics { -public: - CollectionStatisticsImpl(double cardinality, const NamespaceString& nss); - - /** - * Returns the cardinality of the given collection. - */ - double getCardinality() const override; - - /** - * Returns the histogram for the given field path, or nullptr if none exists. - */ - const ArrayHistogram* getHistogram(const std::string& path) const override; - - /** - * Adds a histogram along the given path. - */ - void addHistogram(const std::string& path, - std::shared_ptr histogram) const override; - - ~CollectionStatisticsImpl() = default; - -private: - double _cardinality; - mutable Histograms _histograms; - const NamespaceString _nss; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/collection_statistics_mock.cpp b/src/mongo/db/query/ce/collection_statistics_mock.cpp deleted file mode 100644 index d8faa285e20..00000000000 --- a/src/mongo/db/query/ce/collection_statistics_mock.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/query/ce/collection_statistics_mock.h" - -namespace mongo::ce { - -CollectionStatisticsMock::CollectionStatisticsMock(double cardinality) - : _cardinality{cardinality}, _histograms{} {}; - -double CollectionStatisticsMock::getCardinality() const { - return _cardinality; -} - -void CollectionStatisticsMock::addHistogram(const std::string& path, - std::shared_ptr histogram) const { - _histograms[path] = histogram; -} - -const ArrayHistogram* CollectionStatisticsMock::getHistogram(const std::string& path) const { - if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { - return mapIt->second.get(); - } - return nullptr; -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/collection_statistics_mock.h b/src/mongo/db/query/ce/collection_statistics_mock.h deleted file mode 100644 index a93964cd701..00000000000 --- a/src/mongo/db/query/ce/collection_statistics_mock.h +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" - -namespace mongo::ce { - -class CollectionStatisticsMock : public CollectionStatistics { -public: - CollectionStatisticsMock(double cardinality); - - /** - * Returns the cardinality of the given collection. - */ - double getCardinality() const override; - - /** - * Adds a histogram along the given path. - */ - void addHistogram(const std::string& path, - std::shared_ptr histogram) const override; - - /** - * Returns the histogram for the given field path, or nullptr if none exists. - */ - const ArrayHistogram* getHistogram(const std::string& path) const override; - - ~CollectionStatisticsMock() = default; - -private: - double _cardinality; - mutable Histograms _histograms; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/generated_histograms_test.cpp b/src/mongo/db/query/ce/generated_histograms_test.cpp new file mode 100644 index 00000000000..3f5ce361584 --- /dev/null +++ b/src/mongo/db/query/ce/generated_histograms_test.cpp @@ -0,0 +1,366 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include +#include + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/ce/test_utils.h" +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::optimizer::ce { +namespace { +namespace value = sbe::value; + +using stats::ArrayHistogram; +using stats::ScalarHistogram; +using stats::TypeCounts; + +constexpr double kErrorBound = 0.1; + +TEST(EstimatorTest, UniformIntStrEstimate) { + /* The code in this comment generates a dataset and creates the histogram used in this test. To + recreate the data set and the histogram, place this code in a unit test which uses the utilities + from rand_utils_new.cpp. 
+ + constexpr int minLen = 3, maxLen = 5; + constexpr int minVal = 0, maxVal = 1000; + constexpr size_t dataSize = 1000; + constexpr size_t nBuckets = std::min(20UL, dataSize); + + MixedDistributionDescriptor dd{{DistrType::kUniform, 1.0}}; + TypeDistrVector td; + td.emplace_back(std::make_unique(dd, 0.5, 250, minVal, maxVal)); + td.emplace_back(std::make_unique(dd, 0.5, 250, minLen, maxLen)); + + std::mt19937_64 gen(0); + DatasetDescriptorNew desc{std::move(td), gen}; + + std::vector dataset; + dataset = desc.genRandomDataset(dataSize); + + const ScalarHistogram& hist = makeHistogram(dataset, nBuckets); + */ + + std::vector data{ + {2, 5, 0, 0}, {57, 4, 21, 12}, {159, 4, 59, 24}, {172, 5, 0, 0}, + {184, 4, 2, 2}, {344, 4, 73, 32}, {363, 4, 1, 1}, {420, 3, 16, 10}, + {516, 2, 49, 23}, {758, 4, 113, 54}, {931, 5, 104, 41}, {998, 4, 29, 12}, + {"3vL", 6, 30, 11}, {"9WUk", 1, 59, 24}, {"HraK", 4, 56, 26}, {"Zujbu", 1, 130, 64}, + {"kEr", 5, 80, 40}, {"rupc", 6, 44, 21}, {"up1O", 5, 16, 7}, {"ztf", 5, 37, 17}}; + + const ScalarHistogram hist = createHistogram(data); + const ArrayHistogram arrHist( + hist, TypeCounts{{value::TypeTags::NumberInt64, 515}, {value::TypeTags::StringSmall, 485}}); + + const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); + value::ValueGuard vgLowStr(tagLowStr, valLowStr); + const auto [tagAbc, valAbc] = value::makeNewString("abc"_sd); + value::ValueGuard vg(tagAbc, valAbc); + auto [tagObj, valObj] = value::makeNewObject(); + value::ValueGuard vgObj(tagObj, valObj); + + // Predicates over bucket bound. + // Actual cardinality {$eq: 804} = 2. + double expectedCard = estimateIntValCard(hist, 804, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(2.5, expectedCard, kErrorBound); + + // Actual cardinality {$lt: 100} = 40. + expectedCard = estimateIntValCard(hist, 100, EstimationType::kLess); + ASSERT_APPROX_EQUAL(52.4, expectedCard, kErrorBound); + + // Range query crossing the type brackets. + // Actual cardinality {$gt: 100} = 475. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + value::TypeTags::NumberInt64, + value::bitcastFrom(100), + false /* highInclusive */, + tagLowStr, + valLowStr, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(460.1, expectedCard, kErrorBound); + + // Actual cardinality {$lt: 'abc'} = 291. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagLowStr, + valLowStr, + true /* highInclusive */, + tagAbc, + valAbc, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(319.9, expectedCard, kErrorBound); + + // Actual cardinality {$gte: 'abc'} = 194. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagAbc, + valAbc, + false /* highInclusive */, + tagObj, + valObj, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(167.0, expectedCard, kErrorBound); + + // Queries over the low string bound. + // Actual cardinality {$eq: ''} = 0. + expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true); + ASSERT_APPROX_EQUAL(2.727, expectedCard, 0.001); + + // Actual cardinality {$gt: ''} = 485. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + tagLowStr, + valLowStr, + false /* highInclusive */, + tagObj, + valObj, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(485, expectedCard, 0.001); +} + +TEST(EstimatorTest, IntStrArrayEstimate) { + /* The code in this comment generates a dataset of 1000 integers, strings and arrays of integers + and strings and creates the histogram used in this test. 
To recreate the data set and the + histogram, place this code in a unit test which uses the utilities from rand_utils_new.cpp. + + constexpr int minLen = 2, maxLen = 5; + constexpr int minVal = 0, maxVal = 1000; + constexpr size_t dataSize = 1000; + constexpr size_t nBuckets = std::min(20UL, dataSize); + + MixedDistributionDescriptor dd{{DistrType::kUniform, 1.0}}; + TypeDistrVector td1; + td1.emplace_back(std::make_unique(dd, 0.7, 200, minVal, maxVal)); + td1.emplace_back(std::make_unique(dd, 0.3, 100, minLen, maxLen)); + + std::mt19937_64 gen(5); + auto desc1 = std::make_unique(std::move(td1), gen); + + TypeDistrVector td2; + td2.emplace_back(std::make_unique(dd, 0.4, 200, minVal, maxVal)); + td2.emplace_back(std::make_unique(dd, 0.3, 200, minLen, maxLen)); + td2.emplace_back(std::make_unique(dd, 0.3, 200, 2, 6, std::move(desc1), + 0.0)); + + DatasetDescriptorNew desc{std::move(td2), gen}; + std::vector dataset; + dataset = desc.genRandomDataset(dataSize); + + const ScalarHistogram& hist = makeHistogram(dataset, nBuckets); + */ + + std::vector scalarData{ + {10, 1, 0, 0}, {11, 4, 0, 0}, {44, 2, 5, 2}, {213, 3, 40, 20}, + {256, 5, 13, 6}, {270, 3, 9, 2}, {407, 3, 56, 28}, {510, 3, 32, 16}, + {524, 3, 0, 0}, {561, 5, 16, 8}, {583, 3, 4, 3}, {599, 3, 1, 1}, + {663, 5, 19, 9}, {681, 5, 6, 2}, {873, 5, 75, 37}, {909, 4, 16, 7}, + {994, 3, 36, 14}, {"9TcY", 4, 44, 23}, {"Zow00", 5, 134, 67}, {"zsS", 2, 130, 66}, + }; + + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector minData{ + {12, 5, 0, 0}, {17, 8, 0, 0}, {28, 7, 7, 1}, {55, 5, 22, 5}, + {110, 5, 45, 11}, {225, 4, 43, 15}, {563, 3, 98, 36}, {643, 4, 3, 2}, + {701, 4, 9, 5}, {845, 1, 6, 4}, {921, 2, 0, 0}, {980, 1, 0, 0}, + {"1l", 9, 16, 4}, {"8YN", 4, 19, 5}, {"PE2OO", 2, 41, 15}, {"WdJ", 8, 25, 7}, + {"dKb7", 9, 17, 6}, {"msdP", 12, 25, 10}, {"t7wmp", 5, 15, 6}, {"yx", 2, 13, 4}, + }; + + const ScalarHistogram minHist = createHistogram(minData); + + std::vector maxData{ + {26, 2, 0, 0}, {79, 3, 0, 0}, {147, 1, 0, 0}, {207, 2, 0, 0}, + {362, 6, 7, 5}, {563, 3, 47, 19}, {603, 9, 2, 1}, {676, 6, 21, 10}, + {702, 6, 9, 4}, {712, 6, 0, 0}, {759, 8, 4, 1}, {774, 6, 3, 1}, + {831, 9, 28, 9}, {948, 7, 51, 15}, {981, 3, 33, 8}, {"9Iey", 4, 20, 8}, + {"Ji", 3, 21, 8}, {"WdJ", 9, 26, 10}, {"msdP", 9, 59, 20}, {"zbI", 3, 68, 16}, + }; + + const ScalarHistogram maxHist = createHistogram(maxData); + + std::vector uniqueData{ + {12, 5, 0, 0}, {28, 8, 15, 2}, {55, 8, 23, 5}, {110, 5, 59, 12}, + {225, 8, 79, 18}, {362, 8, 88, 20}, {507, 10, 165, 36}, {572, 5, 25, 6}, + {603, 12, 25, 3}, {712, 6, 106, 19}, {759, 11, 17, 4}, {774, 6, 3, 1}, + {831, 14, 50, 13}, {981, 3, 105, 25}, {"547DP", 4, 43, 9}, {"9Iey", 4, 8, 1}, + {"WdJ", 9, 85, 26}, {"ZGYcw", 2, 14, 4}, {"msdP", 14, 80, 21}, {"zbI", 3, 74, 17}, + }; + + const ScalarHistogram uniqueHist = createHistogram(uniqueData); + + TypeCounts typeCounts{{value::TypeTags::NumberInt64, 388}, + {value::TypeTags::StringSmall, 319}, + {value::TypeTags::Array, 293}}; + TypeCounts arrayTypeCounts{{value::TypeTags::NumberInt64, 874}, + {value::TypeTags::StringSmall, 340}}; + const ArrayHistogram arrHist(scalarHist, + typeCounts, + uniqueHist, + minHist, + maxHist, + arrayTypeCounts, + 0 /* No empty arrays */); + + const auto [tagLowDbl, valLowDbl] = + std::make_pair(value::TypeTags::NumberDouble, + value::bitcastFrom(std::numeric_limits::quiet_NaN())); + const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd); + value::ValueGuard vgLowStr(tagLowStr, valLowStr); + + // Actual 
cardinality {$lt: 100} = 115. + double expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + tagLowDbl, + valLowDbl, + false /* highInclusive */, + value::TypeTags::NumberInt64, + value::bitcastFrom(100), + true /* includeScalar */); + ASSERT_APPROX_EQUAL(109.9, expectedCard, kErrorBound); + + // Actual cardinality {$gt: 502} = 434. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + value::TypeTags::NumberInt64, + value::bitcastFrom(500), + false /* highInclusive */, + tagLowStr, + valLowStr, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(443.8, expectedCard, kErrorBound); + + // Actual cardinality {$gte: 502} = 437. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + value::TypeTags::NumberInt64, + value::bitcastFrom(500), + false /* highInclusive */, + tagLowStr, + valLowStr, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(448.3, expectedCard, kErrorBound); + + // Actual cardinality {$eq: ''} = 0. + expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true /* includeScalar */); + ASSERT_APPROX_EQUAL(6.69, expectedCard, 0.001); + + // Actual cardinality {$eq: 'DD2'} = 2. + auto [tagStr, valStr] = value::makeNewString("DD2"_sd); + value::ValueGuard vg(tagStr, valStr); + expectedCard = estimateCardEq(arrHist, tagStr, valStr, true /* includeScalar */); + ASSERT_APPROX_EQUAL(5.27, expectedCard, kErrorBound); + + // Actual cardinality {$lte: 'DD2'} = 120. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagLowStr, + valLowStr, + true /* highInclusive */, + tagStr, + valStr, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(160.6, expectedCard, kErrorBound); + + // Actual cardinality {$gt: 'DD2'} = 450. + auto [tagObj, valObj] = value::makeNewObject(); + value::ValueGuard vgObj(tagObj, valObj); + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + tagStr, + valStr, + false /* highInclusive */, + tagObj, + valObj, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(411.2, expectedCard, kErrorBound); + + // Queries with $elemMatch. + const auto [tagInt, valInt] = + std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(603)); + + // Actual cardinality {$match: {a: {$elemMatch: {$eq: 603}}}} = 12. + expectedCard = estimateCardEq(arrHist, tagInt, valInt, false /* includeScalar */); + ASSERT_APPROX_EQUAL(12.0, expectedCard, kErrorBound); + + // Actual cardinality {$match: {a: {$elemMatch: {$lte: 603}}}} = 252. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + tagLowDbl, + valLowDbl, + true /* highInclusive */, + tagInt, + valInt, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(293.0, expectedCard, kErrorBound); + + // Actual cardinality {$match: {a: {$elemMatch: {$gte: 603}}}} = 200. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagInt, + valInt, + false /* highInclusive */, + tagLowStr, + valLowStr, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(250.8, expectedCard, kErrorBound); + + // Actual cardinality {$match: {a: {$elemMatch: {$eq: 'cu'}}}} = 7. + std::tie(tagStr, valStr) = value::makeNewString("cu"_sd); + expectedCard = estimateCardEq(arrHist, tagStr, valStr, false /* includeScalar */); + ASSERT_APPROX_EQUAL(3.8, expectedCard, kErrorBound); + + // Actual cardinality {$match: {a: {$elemMatch: {$gte: 'cu'}}}} = 125. 
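+    // (This range, like the cross-type ranges above, leans on the BSON type order: int64 values
+    //  sort before all strings and strings sort before all objects, so the empty string serves
+    //  as the low string bound and the empty object as "+infinity" for strings.)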
+ expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagStr, + valStr, + false /* highInclusive */, + tagObj, + valObj, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(109.7, expectedCard, kErrorBound); + + // Actual cardinality {$match: {a: {$elemMatch: {$lte: 'cu'}}}} = 141. + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + tagLowStr, + valLowStr, + true /* highInclusive */, + tagStr, + valStr, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(156.1, expectedCard, kErrorBound); +} +} // namespace +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp b/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp new file mode 100644 index 00000000000..7efe1a974ba --- /dev/null +++ b/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp @@ -0,0 +1,221 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/heuristic_estimator.h" +#include "mongo/db/query/ce/test_utils.h" +#include "mongo/db/query/optimizer/props.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/optimizer/utils/utils.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::optimizer::ce { +namespace { +constexpr double kCollCard = 1000.0; +const std::string kCollName = "test"; + +constexpr double kOtherCollCard = 200.0; +const std::string kOtherCollName = "otherTest"; + +constexpr double kThirdCollCard = 50.0; +const std::string kThirdCollName = "thirdTest"; + +class DataflowCETester : public CETester { +public: + DataflowCETester() : CETester(kCollName, kCollCard, kDefaultCETestPhaseSet) {} + +protected: + std::unique_ptr getEstimator() const override { + return std::make_unique(); + } +}; + +bool isRootNodeFn(const ABT& node) { + return node.is(); +} + +TEST(CEDataflowTest, EstimateTrivialNodes) { + DataflowCETester t; + const auto matchCard = t.getMatchCE("{a: 1}", isRootNodeFn); + + // Verify 'CollationNode' estimate returns the input cardinality. 
+ ASSERT_CE(t, "[{$sort: {a: 1}}]", kCollCard); + ASSERT_CE(t, "[{$sort: {a: -1, b: 1}}]", kCollCard); + ASSERT_CE(t, "[{$match: {a: 1}}, {$sort: {a: 1, b: 1}}]", matchCard); + + // Verify 'EvaluationNode' estimate. + ASSERT_CE(t, "[{$project: {a: {$add: [\"$a\", 1]}}}]", kCollCard); + ASSERT_CE(t, "[{$match: {a: 1}}, {$project: {a: {$add: [\"$a\", 1]}}}]", matchCard); +} + +TEST(CEDataflowTest, EstimateUnionNode) { + auto makeUnionBranch = [](const std::string& collName) { + ProjectionName scanVar{"scan_" + collName}; + auto scanNode = make(scanVar, collName); + auto evalPath = + make(make("a", make()), make(scanVar)); + return make("a", std::move(evalPath), std::move(scanNode)); + }; + + // Verify that the estimate of 'UnionNode' always returns the sum of estimates of its children. + // In the following tests we force a simple plan to be generated by passing in a 'manually' + // constructed ABT. + { + DataflowCETester t; + t.addCollection(kOtherCollName, kOtherCollCard, {}); + t.addCollection(kThirdCollName, kThirdCollCard, {}); + { + auto unionNode = make( + ProjectionNameVector{"a"}, + makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName))); + auto rootNode = make( + properties::ProjectionRequirement{ProjectionNameVector{"a"}}, std::move(unionNode)); + ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard); + } + { + auto unionNode = make( + ProjectionNameVector{"a"}, + makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName))); + auto parentUnionNode = + make(ProjectionNameVector{"a"}, + makeSeq(std::move(unionNode), makeUnionBranch(kThirdCollName))); + auto rootNode = + make(properties::ProjectionRequirement{ProjectionNameVector{"a"}}, + std::move(parentUnionNode)); + ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard + kThirdCollCard); + } + } + + // The following plans include a UnionNode. + { + DataflowCETester t; + t.setCollCard(2000); + t.setIndexes( + {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}}); + t.setDisableScan(true); + ASSERT_MATCH_CE(t, {"{a: [12]}"}, 1); + } + { + DataflowCETester t; + t.setIndexes( + {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ false)}, + {"indexB", makeIndexDefinition("b", CollationOp::Ascending, /* isMultiKey */ false)}}); + t.setDisableScan(true); + ASSERT_MATCH_CE(t, {"{a: 1, b: 2}"}, 5.62341); + } +} + +TEST(CEDataflowTest, EstimateLimitSkipNode) { + DataflowCETester t; + const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn); + + // Verify that 'LimitSkipNode' estimate with only a limit set is min(limit, inputCE). + ASSERT_CE(t, "[{$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$limit: 50}]", 50.0); + ASSERT_CE(t, "[{$limit: 1000}]", kCollCard); + ASSERT_CE(t, "[{$limit: 10000}]", kCollCard); + ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 5}]", 5.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 50}]", matchCard); + ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1000}]", matchCard); + + // Verify that 'LimitSkipNode' estimate with only a skip set is max(inputCE - skip, 0). 
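+    // (More generally, limit and skip combine as
+    //      estimate = std::min(limit, std::max(inputCE - skip, 0.0));
+    //  per the LimitSkipNode transport in heuristic_estimator.cpp; this one formula predicts
+    //  every combination exercised below.)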
+ ASSERT_CE(t, "[{$skip: 0}]", kCollCard); + ASSERT_CE(t, "[{$skip: 1}]", kCollCard - 1.0); + ASSERT_CE(t, "[{$skip: 50}]", kCollCard - 50.0); + ASSERT_CE(t, "[{$skip: 1000}]", 0.0); + ASSERT_CE(t, "[{$skip: 10000}]", 0.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1}]", matchCard - 1.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 5}]", matchCard - 5.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 50}]", 0.0); + ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1000}]", 0.0); + + // Test estimates for combinations of $limit & $skip. + ASSERT_CE(t, "[{$limit: 1}, {$skip: 1}]", 0.0); + ASSERT_CE(t, "[{$skip: 1}, {$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$limit: 1}, {$skip: 50}]", 0.0); + ASSERT_CE(t, "[{$skip: 50}, {$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$limit: 50}, {$skip: 1}]", 49.0); + ASSERT_CE(t, "[{$skip: 1}, {$limit: 50}]", 50.0); + ASSERT_CE(t, "[{$limit: 50}, {$skip: 50}]", 0.0); + ASSERT_CE(t, "[{$skip: 50}, {$limit: 50}]", 50.0); + ASSERT_CE(t, "[{$limit: 1000}, {$skip: 50}]", kCollCard - 50.0); + ASSERT_CE(t, "[{$skip: 50}, {$limit: 1000}]", kCollCard - 50.0); + ASSERT_CE(t, "[{$limit: 50}, {$skip: 1000}]", 0.0); + ASSERT_CE(t, "[{$skip: 1000}, {$limit: 50}]", 0.0); + ASSERT_CE(t, "[{$limit: 1000}, {$skip: 1000}]", 0.0); + ASSERT_CE(t, "[{$skip: 1000}, {$limit: 1000}]", 0.0); + + // Test estimates for combinations of $limit & $skip separated by a $match. + ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 1}]", 0.0); + ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 50}]", 0.0); + + // Input card to $match: 50. $match selectivity here is sqrt(50)/50. + ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1}]", 6.07107); + ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 50}]", 0.0); + ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1000}]", 0.0); + + // Input card to $match is kCollCard. However, our estimate is larger than matchCard because we + // have a FilterNode that does not get converted to a SargableNode in this case. The $match + // selectivity here is sqrt(1000)/1000. + ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1}]", 30.6228); + ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 20}]", 11.6228); + ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1000}]", 0.0); + + // Input card to $match: 999. $match selectivity here is sqrt(999)/999. + ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 20}]", 20.0); + ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1000}]", 31.607); + + // Input card to $match: 950. $match selectivity here is sqrt(950)/950. + ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1}]", 1.0); + ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 20}]", 20.0); + ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1000}]", 30.8221); + + // Input card to $match is 0.0. + ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 50}]", 0.0); + ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 1000}]", 0.0); +} + +TEST(CEDataflowTest, EstimateUnwindNode) { + DataflowCETester t; + const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn); + + // We assume that arrays on average have ~10 elements, so we estimate this as inputCard*10. 
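+    // (The multiplier is kDefaultAverageArraySize = 10.0 from heuristic_estimator.cpp; the
+    //  UnwindNode transport returns kDefaultAverageArraySize * childResult, so e.g. 1000 input
+    //  documents unwind to an estimated 10000 documents.)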
+ ASSERT_CE(t, "[{$unwind: '$a'}]", 10 * kCollCard); + ASSERT_CE(t, "[{$match: {a: 1}}, {$unwind: '$a'}]", 10 * matchCard); + ASSERT_CE(t, "[{$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]", 10 * kCollCard); + ASSERT_CE(t, + "[{$match: {a: 1}}, {$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]", + 10 * matchCard); + + // TODO SERVER-70035: implement histogram estimation of $unwind. +} + +} // namespace +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/heuristic_estimator.cpp b/src/mongo/db/query/ce/heuristic_estimator.cpp new file mode 100644 index 00000000000..88421015f39 --- /dev/null +++ b/src/mongo/db/query/ce/heuristic_estimator.cpp @@ -0,0 +1,600 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/heuristic_estimator.h" + +#include "mongo/db/query/optimizer/cascades/memo.h" +#include "mongo/db/query/optimizer/utils/ce_math.h" +#include "mongo/util/assert_util.h" + +namespace mongo::optimizer::ce { +// Invalid estimate - an arbitrary negative value used for initialization. +constexpr SelectivityType kInvalidSel = -1.0; + +constexpr SelectivityType kDefaultFilterSel = 0.1; +constexpr SelectivityType kDefaultExistsSel = 0.70; + +// The selectivities used in the piece-wise function for open-range intervals. +// Note that we assume a smaller input cardinality will result in a less selective range. +constexpr SelectivityType kSmallCardOpenRangeSel = 0.70; +constexpr SelectivityType kMediumCardOpenRangeSel = 0.45; +constexpr SelectivityType kLargeCardOpenRangeSel = 0.33; + +// The selectivities used in the piece-wise function for closed-range intervals. +// Note that we assume a smaller input cardinality will result in a less selective range. +constexpr SelectivityType kSmallCardClosedRangeSel = 0.50; +constexpr SelectivityType kMediumCardClosedRangeSel = 0.33; +constexpr SelectivityType kLargeCardClosedRangeSel = 0.20; + +// Global and Local selectivity should multiply to the Complete selectivity. 
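+// (Indeed kDefaultLocalGroupSel * kDefaultGlobalGroupSel = 0.02 * 0.5 = 0.01 =
+// kDefaultCompleteGroupSel, so a Local/Global pair of group nodes estimates the same output
+// cardinality as a single Complete group.)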
+constexpr SelectivityType kDefaultCompleteGroupSel = 0.01;
+constexpr SelectivityType kDefaultLocalGroupSel = 0.02;
+constexpr SelectivityType kDefaultGlobalGroupSel = 0.5;
+
+// The following constants are the steps used in the piece-wise functions that select
+// selectivities based on input cardinality.
+constexpr CEType kSmallLimit = 20.0;
+constexpr CEType kMediumLimit = 100.0;
+
+// Assumed average number of elements in an array.
+constexpr CEType kDefaultAverageArraySize = 10.0;
+
+/**
+ * Default selectivity of equalities. To avoid super small selectivities for small
+ * cardinalities, which would result in a cardinality of 0 for many small inputs, the
+ * estimate is scaled as inputCard grows. The bigger inputCard, the smaller the
+ * selectivity.
+ */
+SelectivityType equalitySel(const CEType inputCard) {
+    uassert(6716604, "Zero cardinality must be handled by the caller.", inputCard > 0.0);
+    if (inputCard <= 1.0) {
+        // If the input has at most 1 value, it cannot be reduced any further by a condition.
+        return 1.0;
+    }
+    return std::sqrt(inputCard) / inputCard;
+}
+
+/**
+ * Default selectivity of intervals with bounds on both ends. These intervals are
+ * considered less selective than equalities.
+ * Examples: (a > 'abc' AND a < 'hta'), (0 < b <= 13)
+ */
+SelectivityType closedRangeSel(const CEType inputCard) {
+    SelectivityType sel = kInvalidSel;
+    if (inputCard < kSmallLimit) {
+        sel = kSmallCardClosedRangeSel;
+    } else if (inputCard < kMediumLimit) {
+        sel = kMediumCardClosedRangeSel;
+    } else {
+        sel = kLargeCardClosedRangeSel;
+    }
+    return sel;
+}
+
+/**
+ * Default selectivity of intervals open on one end. These intervals are
+ * considered less selective than those with both ends specified by the user query.
+ * Examples: (a > 'xyz'), (b <= 13)
+ */
+SelectivityType openRangeSel(const CEType inputCard) {
+    SelectivityType sel = kInvalidSel;
+    if (inputCard < kSmallLimit) {
+        sel = kSmallCardOpenRangeSel;
+    } else if (inputCard < kMediumLimit) {
+        sel = kMediumCardOpenRangeSel;
+    } else {
+        sel = kLargeCardOpenRangeSel;
+    }
+    return sel;
+}
+
+mongo::sbe::value::TypeTags constType(const Constant* constBoundPtr) {
+    if (constBoundPtr == nullptr) {
+        return mongo::sbe::value::TypeTags::Nothing;
+    }
+    const auto [tag, val] = constBoundPtr->get();
+    return tag;
+}
+
+mongo::sbe::value::TypeTags boundType(const BoundRequirement& bound) {
+    return constType(bound.getBound().cast<Constant>());
+}
+
+SelectivityType intervalSel(const IntervalRequirement& interval, const CEType inputCard) {
+    SelectivityType sel = kInvalidSel;
+    if (interval.isFullyOpen()) {
+        sel = 1.0;
+    } else if (interval.isEquality()) {
+        sel = equalitySel(inputCard);
+    } else if (interval.getHighBound().isPlusInf() || interval.getLowBound().isMinusInf() ||
+               boundType(interval.getLowBound()) != boundType(interval.getHighBound())) {
+        // The interval has an actual bound only on one of its ends if:
+        // - one of the bounds is infinite, or
+        // - the bounds are of different types - this is the case when, due to type bracketing,
+        //   one of the bounds is the lowest/highest value of the previous/next type.
+        // TODO: Notice that sometimes type bracketing uses a min/max value from the same type,
+        // so sometimes we may not detect an open-ended interval.
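+        // (For reference, the piece-wise defaults defined above step through:
+        //      inputCard < 20:   open 0.70, closed 0.50
+        //      inputCard < 100:  open 0.45, closed 0.33
+        //      otherwise:        open 0.33, closed 0.20
+        //  i.e. smaller inputs are deliberately treated as less selective.)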
+        sel = openRangeSel(inputCard);
+    } else {
+        sel = closedRangeSel(inputCard);
+    }
+    uassert(6716603, "Invalid selectivity.", validSelectivity(sel));
+    return sel;
+}
+
+SelectivityType negationSel(SelectivityType sel) {
+    return 1.0 - sel;
+}
+
+SelectivityType operationSel(const Operations op, const CEType inputCard) {
+    switch (op) {
+        case Operations::Eq:
+            return equalitySel(inputCard);
+        case Operations::Neq:
+            return negationSel(equalitySel(inputCard));
+        case Operations::EqMember:
+            // Reached when the query has $in. We don't handle it yet.
+            return kDefaultFilterSel;
+        case Operations::Gt:
+        case Operations::Gte:
+        case Operations::Lt:
+        case Operations::Lte:
+            return openRangeSel(inputCard);
+        default:
+            MONGO_UNREACHABLE;
+    }
+}
+
+SelectivityType intervalSel(const PathCompare& left,
+                            const PathCompare& right,
+                            const CEType inputCard) {
+    if (left.op() == Operations::EqMember || right.op() == Operations::EqMember) {
+        // Reached when the query has $in. We don't handle it yet.
+        return kDefaultFilterSel;
+    }
+
+    bool lowBoundUnknown = false;
+    bool highBoundUnknown = false;
+    boost::optional<mongo::sbe::value::TypeTags> lowBoundType;
+    boost::optional<mongo::sbe::value::TypeTags> highBoundType;
+
+    for (const auto& compare : {left, right}) {
+        switch (compare.op()) {
+            case Operations::Eq: {
+                // This branch is reached when we have a conjunction of equalities on the same
+                // path.
+                uassert(6777601,
+                        "Expected conjunction of equalities.",
+                        left.op() == Operations::Eq && right.op() == Operations::Eq);
+
+                const auto leftConst = left.getVal().cast<Constant>();
+                const auto rightConst = right.getVal().cast<Constant>();
+                if (leftConst && rightConst && !(*leftConst == *rightConst)) {
+                    // Equality comparison on different constants is a contradiction.
+                    return 0.0;
+                }
+                // We can't tell if the equalities result in a contradiction or not, so we use the
+                // default equality selectivity.
+                return equalitySel(inputCard);
+            }
+            case Operations::Gt:
+            case Operations::Gte:
+                lowBoundUnknown = lowBoundUnknown || compare.getVal().is<Variable>();
+                lowBoundType = constType(compare.getVal().cast<Constant>());
+                break;
+            case Operations::Lt:
+            case Operations::Lte:
+                highBoundUnknown = highBoundUnknown || compare.getVal().is<Variable>();
+                highBoundType = constType(compare.getVal().cast<Constant>());
+                break;
+            default:
+                MONGO_UNREACHABLE;
+        }
+    }
+
+    if (lowBoundType && highBoundType &&
+        (lowBoundType == highBoundType || lowBoundUnknown || highBoundUnknown)) {
+        // Interval is closed only if:
+        // - it has low and high bounds
+        // - the bounds are of the same type
+        //
+        // If the bounds are of different types, it implies that one bound is the
+        // lowest/highest value of the previous/next type and has been added for type bracketing
+        // purposes. We treat such bounds as infinity.
+        //
+        // If there are unknown boundaries (Variables), we assume that they are of the same type
+        // as the other bound.
+        //
+        // TODO: Notice that sometimes type bracketing uses a min/max value from the same type,
+        // so sometimes we may not detect an open-ended interval.
+        return closedRangeSel(inputCard);
+    }
+
+    if (lowBoundType || highBoundType) {
+        return openRangeSel(inputCard);
+    }
+
+    MONGO_UNREACHABLE;
+}
+
+/**
+ * Heuristic selectivity estimation for EvalFilter nodes. Used for estimating cardinalities of
+ * FilterNodes. The estimate is computed by traversing the tree bottom-up, applying default
+ * selectivity functions to atomic predicates (comparisons), and combining child selectivities of
+ * disjunctions and conjunctions via simple addition and multiplication.
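+ *
+ * Under the independence assumption used here, conjunction multiplies selectivities and
+ * disjunction applies inclusion-exclusion:
+ *
+ *     conjunctionSel(l, r) = l * r
+ *     disjunctionSel(l, r) = l + r - l * r
+ *
+ * For example, over 100 documents an equality has selectivity sqrt(100)/100 = 0.1, so
+ * {$or: [{a: 1}, {b: 1}]} is estimated at 0.1 + 0.1 - 0.01 = 0.19, i.e. 19 documents.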
+ */ +class EvalFilterSelectivityTransport { +public: + /** + * Helper class for holding values passed from child to parent nodes when traversing the tree. + */ + struct EvalFilterSelectivityResult { + // Each item represents a field in a dotted path. + // Collected while traversing a path expression. + // Used for deciding whether a conjunction of comparisons is an interval or not. + FieldPathType path; + // When handling a PathComposeM, we need to access its child comparisons which might be + // hidden under path expressions. + const PathCompare* compare; + // The selectivity estimate. + SelectivityType selectivity; + }; + + EvalFilterSelectivityResult transport(const EvalFilter& /*node*/, + CEType /*inputCard*/, + EvalFilterSelectivityResult pathResult, + EvalFilterSelectivityResult /*inputResult*/) { + return pathResult; + } + + EvalFilterSelectivityResult transport(const PathGet& node, + CEType /*inputCard*/, + EvalFilterSelectivityResult childResult) { + childResult.path.push_back(node.name()); + return childResult; + } + + EvalFilterSelectivityResult transport(const PathTraverse& node, + CEType /*inputCard*/, + EvalFilterSelectivityResult childResult) { + return childResult; + } + + EvalFilterSelectivityResult transport(const PathCompare& node, + CEType inputCard, + EvalFilterSelectivityResult /*childResult*/) { + // Note that the result will be ignored if this operation is part of an interval. + const SelectivityType sel = operationSel(node.op(), inputCard); + return {{}, &node, sel}; + } + + EvalFilterSelectivityResult transport(const PathComposeM& node, + CEType inputCard, + EvalFilterSelectivityResult leftChildResult, + EvalFilterSelectivityResult rightChildResult) { + const bool isInterval = leftChildResult.compare && rightChildResult.compare && + leftChildResult.path == rightChildResult.path; + + const SelectivityType sel = isInterval + ? intervalSel(*leftChildResult.compare, *rightChildResult.compare, inputCard) + : conjunctionSel(leftChildResult.selectivity, rightChildResult.selectivity); + + return {{}, nullptr, sel}; + } + + EvalFilterSelectivityResult transport(const PathComposeA& node, + CEType /*inputCard*/, + EvalFilterSelectivityResult leftChildResult, + EvalFilterSelectivityResult rightChildResult) { + const SelectivityType sel = + disjunctionSel(leftChildResult.selectivity, rightChildResult.selectivity); + + return {{}, nullptr, sel}; + } + + EvalFilterSelectivityResult transport(const UnaryOp& node, + CEType /*inputCard*/, + EvalFilterSelectivityResult childResult) { + switch (node.op()) { + case Operations::Not: + childResult.selectivity = negationSel(childResult.selectivity); + return childResult; + case Operations::Neg: + // If we see negation (-) in a UnaryOp, we ignore it for CE purposes. + return childResult; + default: + MONGO_UNREACHABLE; + } + } + + EvalFilterSelectivityResult transport(const PathConstant& /*node*/, + CEType /*inputCard*/, + EvalFilterSelectivityResult childResult) { + return childResult; + } + + EvalFilterSelectivityResult transport(const PathDefault& node, + CEType inputCard, + EvalFilterSelectivityResult childResult) { + if (node.getDefault() == Constant::boolean(false)) { + // We have a {$exists: true} predicate on this path if we have a Constant[false] child + // here. Note that ${exists: false} is handled by the presence of a negation expression + // higher in the ABT. + childResult.selectivity = kDefaultExistsSel; + } + return childResult; + } + + template + EvalFilterSelectivityResult transport(const T& /*node*/, Ts&&...) 
+
+    template <typename T, typename... Ts>
+    EvalFilterSelectivityResult transport(const T& /*node*/, Ts&&...) {
+        return {{}, nullptr, kDefaultFilterSel};
+    }
+
+    static SelectivityType derive(const CEType inputCard, const ABT::reference_type ref) {
+        EvalFilterSelectivityTransport instance;
+        const auto result = algebra::transport<false>(ref, instance, inputCard);
+        return result.selectivity;
+    }
+
+private:
+    SelectivityType negationSel(const SelectivityType in) {
+        return 1.0 - in;
+    }
+
+    SelectivityType conjunctionSel(const SelectivityType left, const SelectivityType right) {
+        return left * right;
+    }
+
+    SelectivityType disjunctionSel(const SelectivityType left, const SelectivityType right) {
+        // We sum the selectivities and subtract the overlapping part so that it's only counted
+        // once.
+        return left + right - left * right;
+    }
+};
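The three private combinators above are the entire algebra of this estimator. A minimal standalone sketch of the arithmetic (illustration only, not part of this patch; the 0.33 figure is the open-range selectivity the heuristics apply at high input cardinality, and both results line up with expectations in heuristic_estimator_test.cpp — 1089 of 10,000 documents for the conjunction, 551.1 of 1,000 for the disjunction):

    #include <cassert>
    #include <cmath>

    int main() {
        const double s1 = 0.33;  // e.g. selectivity of {a0: {$gt: 44}} at high cardinality
        const double s2 = 0.33;  // e.g. selectivity of {b0: {$lt: 9}} at high cardinality
        const double conj = s1 * s2;            // AND: 0.1089 -> 1089 of 10,000 docs
        const double disj = s1 + s2 - s1 * s2;  // OR:  0.5511 -> 551.1 of 1,000 docs
        const double neg = 1.0 - s1;            // NOT: 0.67
        assert(std::abs(conj - 0.1089) < 1e-9);
        assert(std::abs(disj - 0.5511) < 1e-9);
        assert(std::abs(neg - 0.67) < 1e-9);
        return 0;
    }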
+
+class HeuristicTransport {
+public:
+    CEType transport(const ScanNode& node, CEType /*bindResult*/) {
+        // Default cardinality estimate.
+        const CEType metadataCE = _metadata._scanDefs.at(node.getScanDefName()).getCE();
+        return (metadataCE < 0.0) ? kDefaultCard : metadataCE;
+    }
+
+    CEType transport(const ValueScanNode& node, CEType /*bindResult*/) {
+        return node.getArraySize();
+    }
+
+    CEType transport(const MemoLogicalDelegatorNode& node) {
+        return properties::getPropertyConst<properties::CardinalityEstimate>(
+                   _memo.getLogicalProps(node.getGroupId()))
+            .getEstimate();
+    }
+
+    CEType transport(const FilterNode& node, CEType childResult, CEType /*exprResult*/) {
+        if (childResult == 0.0) {
+            // Early out and return 0 since we don't expect to get more results.
+            return 0.0;
+        }
+        if (node.getFilter() == Constant::boolean(true)) {
+            // Trivially true filter.
+            return childResult;
+        }
+        if (node.getFilter() == Constant::boolean(false)) {
+            // Trivially false filter.
+            return 0.0;
+        }
+
+        const SelectivityType sel =
+            EvalFilterSelectivityTransport::derive(childResult, node.getFilter().ref());
+
+        return std::max(sel * childResult, kMinCard);
+    }
+
+    CEType transport(const EvaluationNode& node, CEType childResult, CEType /*exprResult*/) {
+        // Evaluations do not change cardinality.
+        return childResult;
+    }
+
+    CEType transport(const SargableNode& node,
+                     CEType childResult,
+                     CEType /*bindsResult*/,
+                     CEType /*refsResult*/) {
+        // Early out and return 0 since we don't expect to get more results.
+        if (childResult == 0.0) {
+            return 0.0;
+        }
+
+        SelectivityType topLevelSel = 1.0;
+        std::vector<SelectivityType> topLevelSelectivities;
+        for (const auto& [key, req] : node.getReqMap()) {
+            if (req.getIsPerfOnly()) {
+                // Ignore perf-only requirements.
+                continue;
+            }
+
+            SelectivityType disjSel = 1.0;
+            std::vector<SelectivityType> disjSelectivities;
+            // Intervals are in DNF.
+            const auto intervalDNF = req.getIntervals();
+            const auto disjuncts = intervalDNF.cast<IntervalReqExpr::Disjunction>()->nodes();
+            for (const auto& disjunct : disjuncts) {
+                const auto& conjuncts = disjunct.cast<IntervalReqExpr::Conjunction>()->nodes();
+                SelectivityType conjSel = 1.0;
+                std::vector<SelectivityType> conjSelectivities;
+                for (const auto& conjunct : conjuncts) {
+                    const auto& interval = conjunct.cast<IntervalReqExpr::Atom>()->getExpr();
+                    const SelectivityType sel = intervalSel(interval, childResult);
+                    conjSelectivities.push_back(sel);
+                }
+                conjSel = conjExponentialBackoff(std::move(conjSelectivities));
+                disjSelectivities.push_back(conjSel);
+            }
+            disjSel = disjExponentialBackoff(std::move(disjSelectivities));
+            topLevelSelectivities.push_back(disjSel);
+        }
+
+        if (topLevelSelectivities.empty()) {
+            return 1.0;
+        }
+        // The elements of the PartialSchemaRequirements map represent an implicit conjunction.
+        topLevelSel = conjExponentialBackoff(std::move(topLevelSelectivities));
+        CEType card = std::max(topLevelSel * childResult, kMinCard);
+        uassert(6716602, "Invalid cardinality.", validCardinality(card));
+        return card;
+    }
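    // [Annotation, not part of this patch.] conjExponentialBackoff/disjExponentialBackoff
    // (from optimizer/utils/ce_math) temper the independence assumption used above: plain
    // multiplication underestimates badly when predicates are correlated, so only the four
    // most selective estimates participate, each with exponentially decaying weight.
    // Assuming the ce_math semantics, for selectivities sorted so that s1 <= s2 <= s3 <= s4:
    //
    //     conj(s1..s4) = s1 * s2^(1/2) * s3^(1/4) * s4^(1/8)
    //
    // and a disjunction applies the same scheme to the complements (sorted s1 >= s2 >= ...):
    //
    //     disj(s1..s4) = 1 - (1 - s1) * (1 - s2)^(1/2) * (1 - s3)^(1/4) * (1 - s4)^(1/8)
    //
    // The cutoff at four inputs is what CEAfterMemoSubstitutionPhase_DNF1pathComplex in the
    // tests below relies on.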
+
+    CEType transport(const RIDIntersectNode& node,
+                     CEType /*leftChildResult*/,
+                     CEType /*rightChildResult*/) {
+        // CE for the group should already be derived via the underlying Filter or Evaluation
+        // logical nodes.
+        uasserted(6624038, "Should not be necessary to derive CE for RIDIntersectNode");
+    }
+
+    CEType transport(const RIDUnionNode& node,
+                     CEType /*leftChildResult*/,
+                     CEType /*rightChildResult*/) {
+        // CE for the group should already be derived via the underlying Filter or Evaluation
+        // logical nodes.
+        uasserted(7016301, "Should not be necessary to derive CE for RIDUnionNode");
+    }
+
+    CEType transport(const BinaryJoinNode& node,
+                     CEType leftChildResult,
+                     CEType rightChildResult,
+                     CEType /*exprResult*/) {
+        const auto& filter = node.getFilter();
+
+        SelectivityType selectivity = kDefaultFilterSel;
+        if (filter == Constant::boolean(false)) {
+            selectivity = 0.0;
+        } else if (filter == Constant::boolean(true)) {
+            selectivity = 1.0;
+        }
+        return leftChildResult * rightChildResult * selectivity;
+    }
+
+    CEType transport(const UnionNode& node,
+                     std::vector<CEType> childResults,
+                     CEType /*bindResult*/,
+                     CEType /*refsResult*/) {
+        // Combine the CE of each child.
+        CEType result = 0;
+        for (auto&& child : childResults) {
+            result += child;
+        }
+        return result;
+    }
+
+    CEType transport(const GroupByNode& node,
+                     CEType childResult,
+                     CEType /*bindAggResult*/,
+                     CEType /*refsAggResult*/,
+                     CEType /*bindGbResult*/,
+                     CEType /*refsGbResult*/) {
+        // TODO: estimate number of groups.
+        switch (node.getType()) {
+            case GroupNodeType::Complete:
+                return kDefaultCompleteGroupSel * childResult;
+
+            // Global and Local selectivity should multiply to Complete selectivity.
+            case GroupNodeType::Global:
+                return kDefaultGlobalGroupSel * childResult;
+            case GroupNodeType::Local:
+                return kDefaultLocalGroupSel * childResult;
+
+            default:
+                MONGO_UNREACHABLE;
+        }
+    }
+
+    CEType transport(const UnwindNode& node,
+                     CEType childResult,
+                     CEType /*bindResult*/,
+                     CEType /*refsResult*/) {
+        return kDefaultAverageArraySize * childResult;
+    }
+
+    CEType transport(const CollationNode& node, CEType childResult, CEType /*refsResult*/) {
+        // Collations do not change cardinality.
+        return childResult;
+    }
+
+    CEType transport(const LimitSkipNode& node, CEType childResult) {
+        const auto limit = node.getProperty().getLimit();
+        const auto skip = node.getProperty().getSkip();
+        const auto cardAfterSkip = std::max(childResult - skip, 0.0);
+        if (limit < cardAfterSkip) {
+            return limit;
+        }
+        return cardAfterSkip;
+    }
+
+    CEType transport(const ExchangeNode& node, CEType childResult, CEType /*refsResult*/) {
+        // Exchanges do not change cardinality.
+        return childResult;
+    }
+
+    CEType transport(const RootNode& node, CEType childResult, CEType /*refsResult*/) {
+        // Root node does not change cardinality.
+        return childResult;
+    }
+
+    /**
+     * Other ABT types.
+     */
+    template <typename T, typename... Ts>
+    CEType transport(const T& /*node*/, Ts&&...) {
+        static_assert(!canBeLogicalNode<T>(), "Logical node must implement its CE derivation.");
+        return 0.0;
+    }
+
+    static CEType derive(const Metadata& metadata,
+                         const cascades::Memo& memo,
+                         const ABT::reference_type logicalNodeRef) {
+        HeuristicTransport instance(metadata, memo);
+        return algebra::transport<false>(logicalNodeRef, instance);
+    }
+
+private:
+    HeuristicTransport(const Metadata& metadata, const cascades::Memo& memo)
+        : _metadata(metadata), _memo(memo) {}
+
+    // We don't own this.
+    const Metadata& _metadata;
+    const cascades::Memo& _memo;
+};
+
+CEType HeuristicEstimator::deriveCE(const Metadata& metadata,
+                                    const cascades::Memo& memo,
+                                    const properties::LogicalProps& /*logicalProps*/,
+                                    const ABT::reference_type logicalNodeRef) const {
+    return HeuristicTransport::derive(metadata, memo, logicalNodeRef);
+}
+
+} // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/heuristic_estimator.h b/src/mongo/db/query/ce/heuristic_estimator.h
new file mode 100644
index 00000000000..0cfef17d6c2
--- /dev/null
+++ b/src/mongo/db/query/ce/heuristic_estimator.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/query/optimizer/cascades/interfaces.h"
+
+namespace mongo::optimizer::ce {
+
+/**
+ * Default cardinality estimation in the absence of statistics.
+ * Relies purely on heuristics.
+ * We currently do not use logical properties for heuristic CE.
+ */
+class HeuristicEstimator : public cascades::CardinalityEstimator {
+public:
+    CEType deriveCE(const Metadata& metadata,
+                    const cascades::Memo& memo,
+                    const properties::LogicalProps& /*logicalProps*/,
+                    ABT::reference_type logicalNodeRef) const override final;
+};
+
+} // namespace mongo::optimizer::ce
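For orientation, a minimal sketch of how this interface is consumed (hypothetical wiring; `metadata`, `memo`, `logicalProps`, and `rootNode` stand for values an optimizer context provides, such as the one CETester builds in the tests below):

    std::unique_ptr<cascades::CardinalityEstimator> estimator =
        std::make_unique<HeuristicEstimator>();
    const CEType card = estimator->deriveCE(metadata, memo, logicalProps, rootNode.ref());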
diff --git a/src/mongo/db/query/ce/heuristic_estimator_test.cpp b/src/mongo/db/query/ce/heuristic_estimator_test.cpp
new file mode 100644
index 00000000000..f92f63edde9
--- /dev/null
+++ b/src/mongo/db/query/ce/heuristic_estimator_test.cpp
@@ -0,0 +1,978 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <memory>
+#include <string>
+
+#include "mongo/db/query/ce/heuristic_estimator.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/optimizer/cascades/logical_props_derivation.h"
+#include "mongo/db/query/optimizer/cascades/memo.h"
+#include "mongo/db/query/optimizer/defs.h"
+#include "mongo/db/query/optimizer/explain.h"
+#include "mongo/db/query/optimizer/metadata.h"
+#include "mongo/db/query/optimizer/opt_phase_manager.h"
+#include "mongo/db/query/optimizer/props.h"
+#include "mongo/db/query/optimizer/utils/unit_test_utils.h"
+#include "mongo/db/query/optimizer/utils/utils.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+constexpr double kCollCard = 10000.0;
+const std::string collName = "test";
+
+class HeuristicCETester : public CETester {
+public:
+    HeuristicCETester(std::string collName,
+                      const OptPhaseManager::PhaseSet& optPhases = kDefaultCETestPhaseSet)
+        : CETester(collName, kCollCard, optPhases) {}
+
+protected:
+    std::unique_ptr<cascades::CardinalityEstimator> getEstimator() const override {
+        return std::make_unique<HeuristicEstimator>();
+    }
+};
+
+TEST(CEHeuristicTest, CEWithoutOptimizationGtLtNum) {
+    std::string query = "{a0 : {$gt : 14, $lt : 21}}";
+    HeuristicCETester ht(collName, kNoOptPhaseSet);
+    ASSERT_MATCH_CE(ht, query, 1089.0);
+}
+
+TEST(CEHeuristicTest, CEWithoutOptimizationEqNum) {
+    std::string query = "{a: 123}";
+    HeuristicCETester ht(collName, kNoOptPhaseSet);
+    ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0);
+}
+
+TEST(CEHeuristicTest, CEWithoutOptimizationEqStr) {
+    std::string query = "{a: 'foo'}";
+    HeuristicCETester ht(collName, kNoOptPhaseSet);
+    ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0);
+}
+
+TEST(CEHeuristicTest, CEWithoutOptimizationGtNum) {
+    
std::string query = "{a: {$gt: 44}}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationGtStr) { + std::string query = "{a: {$gt: 'foo'}}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationLtNum) { + std::string query = "{a: {$lt: 44}}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 330.0, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationDNF1pathSimple) { + std::string query = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 44}}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 6.6591, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 36.0354, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 205.941, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj1) { + std::string query = + "{$or: [" + "{a: {$lt: 3}}," + "{$and: [{b: {$gt:5}}, {c: {$lt: 10}}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 7.623, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 55.5761, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 402.963, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj2) { + std::string query = + "{$and: [" + "{a: {$lt: 3}}," + "{$or: [{b: {$gt:5}}, {b: {$lt: 10}}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 5.733, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 31.0736, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 181.863, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj3) { + std::string query = + "{$and: [" + "{$and: [{a: {$gt: 5}}, {a: {$lt: 10}}]}," + "{$and: [" + " {b: {$gt: 15}}," + " {c: {$lt: 110}}," + " {$or: [{a1: 1}, {b1: 2}, {c1: 3}]}" + "]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.52063, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 4.15975, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 9.11877, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationNestedConjAndDisj4) { + std::string query = + "{$or: [" + "{$or: [{a: {$gt: 5}}, {a: {$lt: 10}}]}," + "{$or: [" + " {b: {$gt: 15}}," + " {c: {$lt: 110}}," + " {$and: [{a1: 1}, {b1: 2}, {c1: 3}]}" + "]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 8.9298, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 89.9501, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 798.495, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationTraverseSelectivityDoesNotAccumulate) { + std::string query = + "{$or: [" + "{a0: 1}," + "{a0: {$lt: -4}}," + "{b0: {$gt: 10}}" + "]}"; + std::string queryWithLongPaths = + "{$or: [" + "{'a0.a1.a2.a3.a4.a5.a6.a7.a8.a9': 1}," + "{'a0.a1.a2.a3.a4.a5.a6.a7.a8.a9': {$lt: -4}}," + "{'b0.b1.b3': {$gt: 
10}}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + auto ce1 = ht.getMatchCE(query); + auto ce2 = ht.getMatchCE(queryWithLongPaths); + ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithEqOnSameValue) { + std::string query = + "{$or: [" + "{a: 1}," + "{$and: [{a: 2}, {a: 2}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 5.0, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 18.8997, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 62.2456, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithEqOnDifferentValues) { + std::string query = + "{$or: [" + "{a: 1}," + "{$and: [{a: 2}, {a: 3}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.0, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 9.94987, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 31.6228, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationConjunctionWithIn) { + std::string query = + "{$or: [" + "{a: 1}," + "{$and: [{a: 2}, {a: {$in: [2, 3, 4]}}]}" + "]}"; + HeuristicCETester ht(collName, kNoOptPhaseSet); + // Estimation for $in is not implemented yet, so we assume it has the default filter selectivity + // of 0.1. + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.6, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 18.8549, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 128.46, 1000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationOneLowBoundWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make(make("a", make(Operations::Gt, Constant::int64(42))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationOneHighBoundWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make(make("a", make(Operations::Lt, Constant::int64(42))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationTwoLowBoundsWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make(make("a", + make( + make(Operations::Gt, Constant::int64(5)), + make(Operations::Gt, Constant::int64(10)))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 
33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationTwoHighBoundsWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make(make("a", + make( + make(Operations::Lt, Constant::int64(5)), + make(Operations::Lt, Constant::int64(10)))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make(make("a", + make( + make(Operations::Gt, Constant::int64(7)), + make(Operations::Lt, Constant::int64(13)))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); + ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); + ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationIntervalWithDifferentTypes) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + "a", + make(make(Operations::Gt, Constant::int64(5)), + make(Operations::Lt, Constant::str("foo")))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWithPathExpr) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make( + make("a1", + make( + make(Operations::Gt, Constant::int64(5)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel)), + make( + "a0", + make( + make("a1", + make( + make(Operations::Lt, Constant::int64(10)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); + ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); + ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationClosedRangeWith1Variable) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make( + make("a1", + make( + make(Operations::Gt, Constant::int64(5)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel)), + make( + "a0", + make( + make("a1", + 
make( + make(Operations::Lt, make("test")), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 1.5, 3.0); + ASSERT_CE_CARD(ht, rootNode, 3.5, 7.0); + ASSERT_CE_CARD(ht, rootNode, 5.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 20.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 2000.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationOpenRangeWith1Variable) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make( + make("a1", + make( + make(Operations::Lt, Constant::int64(5)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel)), + make( + "a0", + make( + make("a1", + make( + make(Operations::Lt, make("test")), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.1, 3.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.0, 10.0); + ASSERT_CE_CARD(ht, rootNode, 33.0, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3300.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationConjunctionOfBoundsWithDifferentPaths) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make( + make("a1", + make( + make(Operations::Gt, Constant::int64(5)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel)), + make( + "b0", + make( + make("b1", + make( + make(Operations::Lt, Constant::int64(10)), + PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 1.47, 3.0); + ASSERT_CE_CARD(ht, rootNode, 3.43, 7.0); + ASSERT_CE_CARD(ht, rootNode, 4.9, 10.0); + ASSERT_CE_CARD(ht, rootNode, 10.89, 100.0); + ASSERT_CE_CARD(ht, rootNode, 1089.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationDisjunctionOnSamePathWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make("a1", make(Operations::Gt, Constant::int64(5)))), + make( + "a0", + make("a1", make(Operations::Eq, Constant::int64(100))))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.61962, 3.0); + ASSERT_CE_CARD(ht, rootNode, 5.69373, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.94868, 10.0); + ASSERT_CE_CARD(ht, rootNode, 39.7, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3367.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationDisjunctionOnDifferentPathsWithoutTraverse) { + ABT scanNode = make("test", "test"); + + ABT filterNode = make( + make( + make( + make( + "a0", + make("a1", make(Operations::Gt, Constant::int64(5)))), + make( + "b0", + make("b1", 
make(Operations::Eq, Constant::int64(100))))), + make("test")), + std::move(scanNode)); + + ABT rootNode = make(properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + std::move(filterNode)); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ASSERT_CE_CARD(ht, rootNode, 0.0, 0.0); + ASSERT_CE_CARD(ht, rootNode, 2.61962, 3.0); + ASSERT_CE_CARD(ht, rootNode, 5.69373, 7.0); + ASSERT_CE_CARD(ht, rootNode, 7.94868, 10.0); + ASSERT_CE_CARD(ht, rootNode, 39.7, 100.0); + ASSERT_CE_CARD(ht, rootNode, 3367.0, 10000.0); +} + +TEST(CEHeuristicTest, CEWithoutOptimizationEquivalentConjunctions) { + ABT rootNode1 = make( + properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + make( + make( + make( + make( + "a0", + make(make(Operations::Gt, Constant::int64(5)), + PathTraverse::kSingleLevel)), + make( + "b0", + make(make(Operations::Gt, Constant::int64(10)), + PathTraverse::kSingleLevel))), + make("test")), + make("test", "test"))); + + ABT rootNode2 = make( + properties::ProjectionRequirement{ProjectionNameVector{"test"}}, + make( + make(make("a0", + make(make(Operations::Gt, + Constant::int64(5)), + PathTraverse::kSingleLevel)), + make("test")), + make( + make( + make( + "b0", + make(make(Operations::Gt, Constant::int64(10)), + PathTraverse::kSingleLevel)), + make("test")), + make("test", "test")))); + + HeuristicCETester ht(collName, kNoOptPhaseSet); + ht.setCollCard(kCollCard); + auto ce1 = ht.getCE(rootNode1); + auto ce2 = ht.getCE(rootNode2); + ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Eq) { + std::string query = "{a : 123}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 0.1, 0.1); + ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt) { + std::string query = "{a: {$gt: 44}}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 0.01, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 0.7, 1.0); + ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 330, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt_Lt) { + std::string query = "{a: {$gt: 44, $lt: 99}}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 0.585662, 1.0); + ASSERT_MATCH_CE_CARD(ht, query, 5.27096, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 29.885, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 189.571, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND2Eq) { + std::string query = "{a : 13, b : 42}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 1.31607, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.62658, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 10.0, 10000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND3Eq) { + std::string query = "{a : 13, b : 42, c : 69}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 1.1472, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.27537, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.33352, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10000.0); +} + +TEST(CEHeuristicTest, 
CEAfterMemoSubstitutionPhase_OR1path) { + std::string query = "{$or: [{a0: {$gt: 44}}, {a0: {$lt: 9}}]}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 7.52115, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 58.6188, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 451.581, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_OR2paths) { + std::string query = "{$or: [{a0: {$gt:44}}, {b0: {$lt: 9}}]}"; + HeuristicCETester ht(collName, kOnlySubPhaseSet); + // Disjunctions on different paths are not SARGable. + ASSERT_MATCH_CE_CARD(ht, query, 8.19, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 69.0525, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 551.1, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathSimple) { + std::string query = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 44}}]}" + "]}"; + HeuristicCETester ht(collName); + ASSERT_MATCH_CE_CARD(ht, query, 6.42792, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 37.0586, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 225.232, 1000.0); +} + + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathComplex) { + HeuristicCETester ht(collName, kOnlySubPhaseSet); + // Each disjunct has different number of conjuncts, + // so that its selectivity is different. We need 5 disjuncts to test exponential backoff which + // cuts off at the first 4. The conjuncts are in selectivity order. + std::string query1 = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}, {a0: {$lt: 77}}]}" + "]}"; + auto ce1 = ht.getMatchCE(query1); + // The conjuncts are in inverse selectivity order. + std::string query2 = + "{$or: [" + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}, {a0: {$lt: 77}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}" + "]}"; + auto ce2 = ht.getMatchCE(query2); + ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF2paths) { + std::string query = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{b0: {$gt:40}}, {b0: {$lt: 44}}]}" + "]}"; + HeuristicCETester ht(collName, kOnlySubPhaseSet); + // Disjunctions on different paths are not SARGable. 
+    ASSERT_MATCH_CE_CARD(ht, query, 6.6591, 9.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 36.0354, 99.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 205.941, 1000.0);
+}
+
+TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF1path) {
+    std::string query =
+        "{$and : ["
+        "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]},"
+        "{$or : [ {a0 : {$gt : 77}}, {a0 : {$eq : 51}} ]}"
+        "]}";
+    HeuristicCETester ht(collName);
+    ASSERT_MATCH_CE_CARD(ht, query, 6.21212, 9.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 36.4418, 99.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 228.935, 1000.0);
+}
+
+TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF2paths) {
+    std::string query =
+        "{$and : ["
+        "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]},"
+        "{$or : [ {b0 : {$gt : 77}}, {b0 : {$eq : 51}} ]}"
+        "]}";
+    HeuristicCETester ht(collName);
+    ASSERT_MATCH_CE_CARD(ht, query, 6.21212, 9.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 36.4418, 99.0);
+    ASSERT_MATCH_CE_CARD(ht, query, 228.935, 1000.0);
+}
+
+TEST(CEHeuristicTest, CEAfterMemoSubstitutionExplorationPhases) {
+    HeuristicCETester ht(collName);
+    ASSERT_MATCH_CE(ht, "{a : 13, b : 42}", 10.0);
+}
+
+TEST(CEHeuristicTest, CENotEquality) {
+    double collCard = kCollCard;
+    HeuristicCETester opt(collName);
+
+    // We avoid optimizing in order to verify the heuristic estimate of the FilterNode subtree.
+    // Note that we do not generate SargableNodes for $not predicates, but we do generate
+    // SargableNodes without the $not; for the purposes of this test, we want to demonstrate
+    // that $not returns the inverse of the FilterNode estimate.
+    HeuristicCETester noOpt(collName, kNoOptPhaseSet);
+
+    // Equality selectivity is sqrt(kCollCard)/kCollCard = 0.01. When we see a UnaryOp [Not]
+    // above this subtree, we invert the selectivity: 1.0 - 0.01 = 0.99.
+    double ce = 100.0;
+    double inverseCE = collCard - ce;
+    ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE);
+    ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE);
+
+    // Update cardinality to 25.
+    collCard = 25;
+    opt.setCollCard(collCard);
+    noOpt.setCollCard(collCard);
+
+    // Selectivity is sqrt(25)/25.
+    ce = 5.0;
+    inverseCE = collCard - ce;
+    ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE);
+    ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE);
+
+    // Update cardinality to 9.
+    collCard = 9;
+    opt.setCollCard(collCard);
+    noOpt.setCollCard(collCard);
+
+    // Selectivity is sqrt(9)/9.
+    ce = 3.0;
+    inverseCE = collCard - ce;
+    ASSERT_MATCH_CE(noOpt, "{a: {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{a: {$not: {$eq: 1}}}", inverseCE);
+    ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$eq: 1}}", ce);
+    ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$eq: 1}}}", inverseCE);
+}
+
+TEST(CEHeuristicTest, CENotOpenRange) {
+    // Repeat the above test for open ranges; the $not cardinality estimate should add up with
+    // the non-$not estimate to the collection cardinality.
+    double collCard = kCollCard;
+    HeuristicCETester opt(collName);
+    HeuristicCETester noOpt(collName, kNoOptPhaseSet);
+
+    // Expect open-range selectivity for input card > 100 (0.33).
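    // [Annotation, not part of this patch.] The heuristic open-range selectivity is a
    // piecewise function of the input cardinality. The branches exercised by this test, per
    // the expected values below:
    //
    //     inputCard < 20         -> 0.70  (e.g. 10 * 0.70 = 7)
    //     inputCard in (20, 100) -> 0.45  (e.g. 25 * 0.45 = 11.25)
    //     inputCard > 100        -> 0.33  (e.g. 10000 * 0.33 = 3300)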
+ double ce = 3300; + double inverseCE = collCard - ce; + + ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); + + // Update cardinality to 25. + collCard = 25; + opt.setCollCard(collCard); + noOpt.setCollCard(collCard); + + // Expect open-range selectivity for input card in range (20, 100) (0.45). + ce = 11.25; + inverseCE = collCard - ce; + + ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); + + // Update cardinality to 10. + collCard = 10.0; + opt.setCollCard(collCard); + noOpt.setCollCard(collCard); + + // Expect open-range selectivity for input card < 20 (0.70). + ce = 7.0; + inverseCE = collCard - ce; + + ASSERT_MATCH_CE(noOpt, "{a: {$lt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$lte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$lte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 1}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 1}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 1}}}", inverseCE); +} + +TEST(CEHeuristicTest, CENotClosedRange) { + // Repeat the above test for closed ranges; the $not cardinality estimate should add up with the + // non-$not estimate to the collection cardinality. + double collCard = kCollCard; + double ce = 1089.0; + double inverseCE = collCard - ce; + HeuristicCETester opt(collName); + HeuristicCETester noOpt(collName, kNoOptPhaseSet); + + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); + + /* + * Update cardinality to 25. Here we observe an interesting edge case where the estimated + * cardinality is not the inverse of the actual cardinality. 
+ * + * Consider the predicate {a: {$gt: 10, $lt: 20}}. This generates two FilterNodes stacked on top + * of each other. However, the predicate {a: {$not: {$gt: 10, $lt: 20}}} generates just one + * FilterNode. + * + * We always use input cardinality to determine which interval selectivity we're going to use. + * However, we have a different input cardinality for the one FilterNode case (collCard) than + * for the two FilterNodes case: the first node gets collCard, and the second node gets a + * smaller value after the selectivity of the first filter is applied. + * + * Because we use a piecewise function to pick the selectivity, and because we go from inputCard + * < 100 to inputCard < 20, we choose different selectivities for the intervals in the second + * FilterNode (0.50) than in the first (0.33). + */ + collCard = 25; + ce = 7.875; + inverseCE = 19.9375; + opt.setCollCard(collCard); + noOpt.setCollCard(collCard); + + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); + + // Update cardinality to 10. + collCard = 10.0; + ce = 4.9; + inverseCE = collCard - ce; + opt.setCollCard(collCard); + noOpt.setCollCard(collCard); + + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lt: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gte: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gte: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{a: {$gt: 10, $lte: 20}}", ce); + ASSERT_MATCH_CE(opt, "{a: {$not: {$gt: 10, $lte: 20}}}", inverseCE); + ASSERT_MATCH_CE(noOpt, "{'validate.long.path.estimate': {$gte: 10, $lt: 20}}", ce); + ASSERT_MATCH_CE(opt, "{'validate.long.path.estimate': {$not: {$gte: 10, $lt: 20}}}", inverseCE); +} + +TEST(CEHeuristicTest, CEExists) { + HeuristicCETester noOpt(collName); + + // Test basic case + $not. + ASSERT_MATCH_CE(noOpt, "{a: {$exists: true}}", 7000); + ASSERT_MATCH_CE(noOpt, "{a: {$exists: false}}", 3000); + ASSERT_MATCH_CE(noOpt, "{a: {$not: {$exists: false}}}", 7000); + ASSERT_MATCH_CE(noOpt, "{a: {$not: {$exists: true}}}", 3000); + + // Test combinations of predicates. + ASSERT_MATCH_CE(noOpt, "{a: {$exists: true, $eq: 123}}", 70); + ASSERT_MATCH_CE(noOpt, "{a: {$exists: false, $eq: null}}", 30); + ASSERT_MATCH_CE(noOpt, "{a: {$exists: false}, b: {$eq: 123}}", 30); + ASSERT_MATCH_CE(noOpt, "{a: {$exists: true, $gt: 123}}", 2310); +} + +} // namespace +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/hinted_estimator.cpp b/src/mongo/db/query/ce/hinted_estimator.cpp new file mode 100644 index 00000000000..b27381268b8 --- /dev/null +++ b/src/mongo/db/query/ce/hinted_estimator.cpp @@ -0,0 +1,100 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. 
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/ce/hinted_estimator.h"
+
+#include "mongo/db/query/ce/heuristic_estimator.h"
+
+namespace mongo::optimizer::ce {
+class HintedTransport {
+public:
+    CEType transport(const ABT& n,
+                     const SargableNode& node,
+                     CEType childResult,
+                     CEType /*bindsResult*/,
+                     CEType /*refsResult*/) {
+        CEType result = childResult;
+        for (const auto& [key, req] : node.getReqMap()) {
+            if (!isIntervalReqFullyOpenDNF(req.getIntervals())) {
+                auto it = _hints.find(key);
+                if (it != _hints.cend()) {
+                    // Assume independence.
+                    result *= it->second;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    template <typename T, typename... Ts>
+    CEType transport(const ABT& n, const T& /*node*/, Ts&&...) {
+        if (canBeLogicalNode<T>()) {
+            return _heuristicCE.deriveCE(_metadata, _memo, _logicalProps, n.ref());
+        }
+        return 0.0;
+    }
+
+    static CEType derive(const Metadata& metadata,
+                         const cascades::Memo& memo,
+                         const PartialSchemaSelHints& hints,
+                         const properties::LogicalProps& logicalProps,
+                         const ABT::reference_type logicalNodeRef) {
+        HintedTransport instance(metadata, memo, logicalProps, hints);
+        return algebra::transport<true>(logicalNodeRef, instance);
+    }
+
+private:
+    HintedTransport(const Metadata& metadata,
+                    const cascades::Memo& memo,
+                    const properties::LogicalProps& logicalProps,
+                    const PartialSchemaSelHints& hints)
+        : _heuristicCE(),
+          _metadata(metadata),
+          _memo(memo),
+          _logicalProps(logicalProps),
+          _hints(hints) {}
+
+    HeuristicEstimator _heuristicCE;
+
+    // We don't own this.
+    const Metadata& _metadata;
+    const cascades::Memo& _memo;
+    const properties::LogicalProps& _logicalProps;
+    const PartialSchemaSelHints& _hints;
+};
+
+CEType HintedEstimator::deriveCE(const Metadata& metadata,
+                                 const cascades::Memo& memo,
+                                 const properties::LogicalProps& logicalProps,
+                                 const ABT::reference_type logicalNodeRef) const {
+    return HintedTransport::derive(metadata, memo, _hints, logicalProps, logicalNodeRef);
+}
+
+} // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/hinted_estimator.h b/src/mongo/db/query/ce/hinted_estimator.h
new file mode 100644
index 00000000000..766a1a1f03c
--- /dev/null
+++ b/src/mongo/db/query/ce/hinted_estimator.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/query/optimizer/cascades/interfaces.h"
+
+namespace mongo::optimizer::ce {
+
+using PartialSchemaSelHints =
+    std::map<PartialSchemaKey, SelectivityType, PartialSchemaKeyLessComparator>;
+
+/**
+ * Estimation based on hints. The hints are organized in a PartialSchemaSelHints structure.
+ * SargableNodes are estimated based on the matching PartialSchemaKeys.
+ */
+class HintedEstimator : public cascades::CardinalityEstimator {
+public:
+    HintedEstimator(PartialSchemaSelHints hints) : _hints(std::move(hints)) {}
+
+    CEType deriveCE(const Metadata& metadata,
+                    const cascades::Memo& memo,
+                    const properties::LogicalProps& logicalProps,
+                    ABT::reference_type logicalNodeRef) const override final;
+
+private:
+    // Selectivity hints per PartialSchemaKey.
+    PartialSchemaSelHints _hints;
+};
+
+} // namespace mongo::optimizer::ce
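A minimal usage sketch (illustration only, not part of this patch; the PartialSchemaKey construction is an assumption about how a predicate on field "a" of scan projection "root" is encoded):

    // Pin every non-fully-open SargableNode requirement on path "a" to selectivity 0.1;
    // all other nodes fall back to HeuristicEstimator, as implemented above.
    PartialSchemaSelHints hints;
    hints.emplace(PartialSchemaKey{"root", make<PathGet>("a", make<PathIdentity>())},
                  SelectivityType{0.1});
    HintedEstimator estimator{std::move(hints)};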
diff --git a/src/mongo/db/query/ce/histogram_array_data_test.cpp b/src/mongo/db/query/ce/histogram_array_data_test.cpp
new file mode 100644
index 00000000000..7f8bb92fc51
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_array_data_test.cpp
@@ -0,0 +1,298 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <cmath>
+#include <sstream>
+#include <vector>
+
+#include "mongo/db/exec/sbe/values/value.h"
+#include "mongo/db/query/ce/histogram_predicate_estimation.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/query_test_service_context.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+namespace value = sbe::value;
+
+using stats::ArrayHistogram;
+using stats::ScalarHistogram;
+using stats::TypeCounts;
+
+/**
+ * Structure representing a range query and its estimated and actual cardinalities.
+ * Used to record hand-crafted queries over a pre-generated dataset.
+ */
+struct QuerySpec {
+    // Lower bound of the query range.
+    int32_t low;
+    // Upper bound of the query range.
+    int32_t high;
+    // Estimated cardinality of the $match query.
+    double estMatch;
+    // Actual cardinality of the $match query.
+    double actMatch;
+    // Estimated cardinality of the $elemMatch query.
+    double estElemMatch;
+    // Actual cardinality of the $elemMatch query.
+    double actElemMatch;
+};
+
+static std::pair<double, double> computeErrors(size_t actualCard, double estimatedCard) {
+    double error = estimatedCard - actualCard;
+    double relError = (actualCard == 0) ? (estimatedCard == 0 ? 0.0 : -1.0) : error / actualCard;
+    return std::make_pair(error, relError);
+}
+
+static std::string serializeQuery(QuerySpec& q, bool isElemMatch) {
+    std::ostringstream os;
+    os << "{$match: {a: {";
+    if (isElemMatch) {
+        os << "$elemMatch: {";
+    }
+    os << "$gt: " << q.low;
+    os << ", $lt: " << q.high;
+    if (isElemMatch) {
+        os << "}";
+    }
+    os << "}}}\n";
+    return os.str();
+}
+
+static std::string computeRMSE(std::vector<QuerySpec>& querySet, bool isElemMatch) {
+    double rms = 0.0, relRms = 0.0, meanAbsSelErr = 0.0;
+    size_t trialSize = querySet.size();
+    const size_t dataSize = 1000;
+
+    std::ostringstream os;
+    os << "\nQueries:\n";
+    for (auto& q : querySet) {
+        double estimatedCard = isElemMatch ? q.estElemMatch : q.estMatch;
+        double actualCard = isElemMatch ?
q.actElemMatch : q.actMatch; + + auto [error, relError] = computeErrors(actualCard, estimatedCard); + rms += error * error; + relRms += relError * relError; + meanAbsSelErr += std::abs(error); + os << serializeQuery(q, isElemMatch); + os << "Estimated: " << estimatedCard << " Actual " << actualCard << " (Error: " << error + << " RelError: " << relError << ")\n\n"; + } + rms = std::sqrt(rms / trialSize); + relRms = std::sqrt(relRms / trialSize); + meanAbsSelErr /= (trialSize * dataSize); + + os << "=====" << (isElemMatch ? " ElemMatch errors: " : "Match errors:") << "=====\n"; + os << "RMSE : " << rms << " RelRMSE : " << relRms + << " MeanAbsSelectivityError: " << meanAbsSelErr << std::endl; + return os.str(); +} + +TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) { + std::vector scalarData{{}}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector minData{{0, 5.0, 0.0, 0.0}, + {553, 2.0, 935.0, 303.0}, + {591, 4.0, 2.0, 1.0}, + {656, 2.0, 21.0, 12.0}, + {678, 3.0, 6.0, 3.0}, + {693, 2.0, 1.0, 1.0}, + {730, 1.0, 6.0, 3.0}, + {788, 1.0, 2.0, 2.0}, + {847, 2.0, 4.0, 1.0}, + {867, 1.0, 0.0, 0.0}}; + + const ScalarHistogram aMinHist = createHistogram(minData); + + std::vector maxData{{117, 1.0, 0.0, 0.0}, + {210, 1.0, 1.0, 1.0}, + {591, 1.0, 8.0, 4.0}, + {656, 1.0, 0.0, 0.0}, + {353, 2.0, 18.0, 9.0}, + {610, 5.0, 125.0, 65.0}, + {733, 8.0, 134.0, 53.0}, + {768, 6.0, 50.0, 16.0}, + {957, 8.0, 448.0, 137.0}, + {1000, 7.0, 176.0, 40.0}}; + + const ScalarHistogram aMaxHist = createHistogram(maxData); + + std::vector uniqueData{{0, 5.0, 0.0, 0.0}, + {16, 11.0, 74.0, 13.0}, + {192, 13.0, 698.0, 148.0}, + {271, 9.0, 312.0, 70.0}, + {670, 7.0, 1545.0, 355.0}, + {712, 9.0, 159.0, 32.0}, + {776, 11.0, 247.0, 54.0}, + {869, 9.0, 361.0, 85.0}, + {957, 8.0, 323.0, 76.0}, + {1000, 7.0, 188.0, 40.0}}; + + const ScalarHistogram aUniqueHist = createHistogram(uniqueData); + + TypeCounts typeCounts; + TypeCounts arrayTypeCounts; + // Dataset generated as 1000 arrays of size between 3 to 5. + typeCounts.insert({value::TypeTags::Array, 1000}); + arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996}); + + const ArrayHistogram arrHist(scalarHist, + typeCounts, + aUniqueHist, + aMinHist, + aMaxHist, + arrayTypeCounts, + 0 /* emptyArrayCount */); + + std::vector querySet{{10, 20, 35.7, 93.0, 37.8, 39.0}, + {10, 60, 103.3, 240.0, 158.0, 196.0}, + {320, 330, 554.5, 746.0, 26.0, 30.0}, + {320, 400, 672.9, 832.0, 231.5, 298.0}, + {980, 990, 88.8, 101.0, 36.5, 41.0}, + {970, 1050, 129.7, 141.0, 129.7, 141.0}}; + + for (const auto q : querySet) { + // $match query, includeScalar = true. + double estCard = estimateCardRange(arrHist, + false /* lowInclusive */, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom(q.low), + false /* highInclusive */, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom(q.high), + true /* includeScalar */); + ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1); + + // $elemMatch query, includeScalar = false. 
+        estCard = estimateCardRange(arrHist,
+                                    false /* lowInclusive */,
+                                    value::TypeTags::NumberInt32,
+                                    sbe::value::bitcastFrom<int32_t>(q.low),
+                                    false /* highInclusive */,
+                                    value::TypeTags::NumberInt32,
+                                    sbe::value::bitcastFrom<int32_t>(q.high),
+                                    false /* includeScalar */);
+        ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+    }
+    std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
+    std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
+}
+
+TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) {
+    std::vector<BucketData> scalarData{{}};
+    const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+    std::vector<BucketData> minData{{0, 2.0, 0.0, 0.0},
+                                    {1324, 4.0, 925.0, 408.0},
+                                    {1389, 5.0, 7.0, 5.0},
+                                    {1521, 2.0, 16.0, 10.0},
+                                    {1621, 2.0, 13.0, 7.0},
+                                    {1852, 5.0, 10.0, 9.0},
+                                    {1864, 2.0, 0.0, 0.0},
+                                    {1971, 1.0, 3.0, 3.0},
+                                    {2062, 2.0, 0.0, 0.0},
+                                    {2873, 1.0, 0.0, 0.0}};
+
+    const ScalarHistogram aMinHist = createHistogram(minData);
+
+    std::vector<BucketData> maxData{{2261, 1.0, 0.0, 0.0},
+                                    {2673, 1.0, 0.0, 0.0},
+                                    {2930, 1.0, 1.0, 1.0},
+                                    {3048, 2.0, 2.0, 2.0},
+                                    {3128, 3.0, 1.0, 1.0},
+                                    {3281, 2.0, 0.0, 0.0},
+                                    {3378, 2.0, 7.0, 5.0},
+                                    {3453, 4.0, 2.0, 2.0},
+                                    {3763, 6.0, 44.0, 23.0},
+                                    {5000, 1.0, 920.0, 416.0}};
+
+    const ScalarHistogram aMaxHist = createHistogram(maxData);
+
+    std::vector<BucketData> uniqueData{{0, 2.0, 0.0, 0.0},
+                                       {1106, 9.0, 1970.0, 704.0},
+                                       {1542, 11.0, 736.0, 280.0},
+                                       {3267, 6.0, 3141.0, 1097.0},
+                                       {3531, 6.0, 461.0, 175.0},
+                                       {3570, 7.0, 48.0, 20.0},
+                                       {4573, 8.0, 1851.0, 656.0},
+                                       {4619, 6.0, 65.0, 30.0},
+                                       {4782, 5.0, 265.0, 99.0},
+                                       {5000, 1.0, 342.0, 135.0}};
+
+    const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
+
+    TypeCounts typeCounts;
+    TypeCounts arrayTypeCounts;
+    // Dataset generated as 1000 arrays of sizes between 8 and 10.
+    typeCounts.insert({value::TypeTags::Array, 1000});
+    arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940});
+
+    const ArrayHistogram arrHist(scalarHist,
+                                 typeCounts,
+                                 aUniqueHist,
+                                 aMinHist,
+                                 aMaxHist,
+                                 arrayTypeCounts,
+                                 0 /* emptyArrayCount */);
+
+    std::vector<QuerySpec> querySet{{10, 20, 13.7, 39.0, 9.7, 26.0},
+                                    {10, 60, 41.6, 108.0, 55.7, 101.0},
+                                    {1000, 1010, 705.4, 861.0, 9.7, 7.0},
+                                    {1000, 1050, 733.3, 884.0, 55.7, 87.0},
+                                    {3250, 3300, 988.0, 988.0, 59.3, 86.0},
+                                    {4970, 4980, 23.3, 53.0, 8.5, 16.0}};
+
+    for (const auto q : querySet) {
+        // $match query, includeScalar = true.
+        double estCard = estimateCardRange(arrHist,
+                                           false /* lowInclusive */,
+                                           value::TypeTags::NumberInt32,
+                                           sbe::value::bitcastFrom<int32_t>(q.low),
+                                           false /* highInclusive */,
+                                           value::TypeTags::NumberInt32,
+                                           sbe::value::bitcastFrom<int32_t>(q.high),
+                                           true /* includeScalar */);
+        ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
+
+        // $elemMatch query, includeScalar = false.
+        estCard = estimateCardRange(arrHist,
+                                    false /* lowInclusive */,
+                                    value::TypeTags::NumberInt32,
+                                    sbe::value::bitcastFrom<int32_t>(q.low),
+                                    false /* highInclusive */,
+                                    value::TypeTags::NumberInt32,
+                                    sbe::value::bitcastFrom<int32_t>(q.high),
+                                    false /* includeScalar */);
+        ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+    }
+    std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
+    std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
+}
+}  // namespace
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/histogram_edge_cases_test.cpp b/src/mongo/db/query/ce/histogram_edge_cases_test.cpp
new file mode 100644
index 00000000000..051d3134dcc
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_edge_cases_test.cpp
@@ -0,0 +1,1007 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/pipeline/abt/utils.h"
+#include "mongo/db/query/ce/histogram_predicate_estimation.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/optimizer/utils/ce_math.h"
+#include "mongo/db/query/sbe_stage_builder_helpers.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/db/query/stats/maxdiff_test_utils.h"
+#include "mongo/db/query/stats/value_utils.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+namespace value = sbe::value;
+
+using stats::ArrayHistogram;
+using stats::makeInt64Value;
+using stats::SBEValue;
+using stats::ScalarHistogram;
+using stats::TypeCounts;
+
+constexpr double kErrorBound = 0.01;
+
+TEST(EstimatorTest, OneBucketIntHistogram) {
+    // Data set of 10 values, each with frequency 3, in the range (-inf, 100].
+    // Example: {-100, -20, 0, 20, 50, 60, 70, 80, 90, 100}.
+    std::vector<BucketData> data{{100, 3.0, 27.0, 9.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(30.0, getTotals(hist).card);
+
+    // Estimates with the bucket bound.
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual));
+
+    // Estimates with a value inside the bucket.
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+    // No interpolation is possible for estimates of inequalities in a single bucket. The
+    // estimates are based on the default cardinality of half a bucket +/- the estimate of
+    // equality inside the bucket.
+    ASSERT_EQ(10.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+    ASSERT_EQ(13.5, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual));
+    ASSERT_EQ(16.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+    ASSERT_EQ(19.5, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual));
+
+    // Estimates for a value larger than the last bucket bound.
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLessOrEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreaterOrEqual));
+}
+
+TEST(EstimatorTest, OneExclusiveBucketIntHistogram) {
+    // Data set of a single value.
+    // By "exclusive bucket" we mean a bucket with only a boundary value, that is, both the range
+    // frequency and the NDV are zero.
+    std::vector<BucketData> data{{100, 2.0, 0.0, 0.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(2.0, getTotals(hist).card);
+
+    // Estimates with the bucket boundary.
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kLess));
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 0, EstimationType::kGreater));
+
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+TEST(EstimatorTest, OneBucketTwoIntValuesHistogram) {
+    // Data set of two values, example {5, 100, 100}.
+    std::vector<BucketData> data{{100, 2.0, 1.0, 1.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(3.0, getTotals(hist).card);
+
+    // Estimates with the bucket boundary.
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+    // Default estimate of half of the bucket's range frequency = 0.5.
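+    // E.g. with rangeFreq = 1.0 here, kLess(10) defaults to 1.0 / 2 = 0.5, and
+    // kGreater(10) is the total minus that half-bucket estimate: 3.0 - 0.5 = 2.5.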
+    ASSERT_EQ(0.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+    ASSERT_EQ(2.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+TEST(EstimatorTest, OneBucketTwoIntValuesHistogram2) {
+    // Similar to the above test with a higher frequency for the second value.
+    // Example {5, 5, 5, 100, 100}.
+    std::vector<BucketData> data{{100, 2.0, 3.0, 1.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(5.0, getTotals(hist).card);
+
+    // Estimates with the bucket boundary.
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+    // Default estimate of half of the bucket's range frequency = 1.5.
+    ASSERT_EQ(1.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+    ASSERT_EQ(3.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+    ASSERT_EQ(5.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+TEST(EstimatorTest, TwoBucketsIntHistogram) {
+    // Data set of 10 distinct values (30 in total) in the range [1, 100].
+    std::vector<BucketData> data{{1, 1.0, 0.0, 0.0}, {100, 3.0, 26.0, 8.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(30.0, getTotals(hist).card);
+
+    // Estimates for a value smaller than the first bucket.
+    ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLess));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLessOrEqual));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreater));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreaterOrEqual));
+
+    // Estimates with bucket bounds.
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 1, EstimationType::kLess));
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kLessOrEqual));
+    ASSERT_EQ(29.0, estimateIntValCard(hist, 1, EstimationType::kGreater));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, 1, EstimationType::kGreaterOrEqual));
+
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual));
+
+    // Estimates with a value inside the bucket. The estimates use interpolation.
+    // The interpolated fraction of the bucket for the value 10 is smaller than the estimate
+    // for equality, so the estimates for Less and LessOrEqual coincide.
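+    // Sketch of the arithmetic (assuming linear interpolation over the bucket (1, 100]):
+    // eq ~= rangeFreq / ndv = 26 / 8 = 3.25, and kLess(10) ~= 1 + 26 * (10 - 1) / (100 - 1)
+    // ~= 3.36. The interpolated fraction (~2.36) is below the equality estimate, so no
+    // equality mass is subtracted and kLess == kLessOrEqual.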
+    ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 10, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(3.36, estimateIntValCard(hist, 10, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        3.36, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), kErrorBound);
+
+    ASSERT_APPROX_EQUAL(26.64, estimateIntValCard(hist, 10, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        26.64, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), kErrorBound);
+
+    // Different estimates for Less and LessOrEqual for the value of 50.
+    ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(10.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        13.87, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(16.13, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        19.38, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound);
+}
+
+TEST(EstimatorTest, ThreeExclusiveBucketsIntHistogram) {
+    std::vector<BucketData> data{{1, 1.0, 0.0, 0.0}, {10, 8.0, 0.0, 0.0}, {100, 1.0, 0.0, 0.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(10.0, getTotals(hist).card);
+
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 5, EstimationType::kEqual));
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLess));
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLessOrEqual));
+    ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreater));
+    ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreaterOrEqual));
+}
+
+TEST(EstimatorTest, OneBucketStrHistogram) {
+    std::vector<BucketData> data{{"xyz", 3.0, 27.0, 9.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(30.0, getTotals(hist).card);
+
+    // Estimates with the bucket bound.
+    auto [tag, value] = value::makeNewString("xyz"_sd);
+    value::ValueGuard vg(tag, value);
+    double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(3.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(27.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+    ASSERT_EQ(30.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+    ASSERT_EQ(3.0, expectedCard);
+
+    // Estimates for a value inside the bucket. Since there is no low bound value in the
+    // histogram, all values smaller than the upper bound are estimated the same way, using
+    // half of the bucket cardinality.
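+    // E.g. for "a" below: eq = rangeFreq / ndv = 27 / 9 = 3.0, kLessOrEqual = 27 / 2 = 13.5,
+    // and kLess = 13.5 - 3.0 = 10.5, i.e. half the range frequency +/- the equality estimate.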
+ std::tie(tag, value) = value::makeNewString("a"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(10.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(13.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(16.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(19.5, expectedCard); + + std::tie(tag, value) = value::makeNewString(""_sd); + // In the special case of a single string bucket, we estimate empty string equality as for any + // other string value. In practice if there are at least 2 buckets for the string data and an + // empty string in the data set, it will be chosen as a bound for the first bucket and produce + // precise estimates. + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(30.0, expectedCard); + + // Estimates for a value larger than the upper bound. + std::tie(tag, value) = value::makeNewString("z"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(30.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); +} + +TEST(EstimatorTest, TwoBucketsStrHistogram) { + // Data set of 100 strings in the range ["abc", "xyz"], with average frequency of 2. + std::vector data{{"abc", 2.0, 0.0, 0.0}, {"xyz", 3.0, 95.0, 48.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(100.0, getTotals(hist).card); + + // Estimates for a value smaller than the first bucket bound. + auto [tag, value] = value::makeNewString("a"_sd); + value::ValueGuard vg(tag, value); + + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + + // Estimates with bucket bounds. 
+ std::tie(tag, value) = value::makeNewString("abc"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(98.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + + std::tie(tag, value) = value::makeNewString("xyz"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(97.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(3.0, expectedCard); + + // Estimates for a value inside the bucket. + std::tie(tag, value) = value::makeNewString("sun"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(74.39, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(76.37, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(23.64, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_APPROX_EQUAL(25.62, expectedCard, kErrorBound); + + // Estimate for a value very close to the bucket bound. + std::tie(tag, value) = value::makeNewString("xyw"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(95.02, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(96.99, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_APPROX_EQUAL(4.98, expectedCard, kErrorBound); +} + +TEST(EstimatorTest, TwoBucketsDateHistogram) { + // June 6, 2017 -- June 7, 2017. 
+    const int64_t startInstant = 1496777923000LL;
+    const int64_t endInstant = 1496864323000LL;
+    const auto startDate = Date_t::fromMillisSinceEpoch(startInstant);
+    const auto endDate = Date_t::fromMillisSinceEpoch(endInstant);
+
+    std::vector<BucketData> data{{Value(startDate), 3.0, 0.0, 0.0},
+                                 {Value(endDate), 1.0, 96.0, 48.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(100.0, getTotals(hist).card);
+
+    const auto valueBefore = value::bitcastFrom<int64_t>(startInstant - 1);
+    double expectedCard =
+        estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kGreater).card;
+    ASSERT_EQ(100.0, expectedCard);
+
+    const auto valueStart = value::bitcastFrom<int64_t>(startInstant);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kEqual).card;
+    ASSERT_EQ(3.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kGreater).card;
+    ASSERT_EQ(97.0, expectedCard);
+
+    const auto valueEnd = value::bitcastFrom<int64_t>(endInstant);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kLess).card;
+    ASSERT_EQ(99.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+
+    const auto valueIn = value::bitcastFrom<int64_t>(startInstant + 43000000);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kEqual).card;
+    ASSERT_EQ(2.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(48.77, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kGreater).card;
+    ASSERT_APPROX_EQUAL(49.22, expectedCard, kErrorBound);
+
+    const auto valueAfter = value::bitcastFrom<int64_t>(endInstant + 100);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kLess).card;
+    ASSERT_EQ(100.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoBucketsTimestampHistogram) {
+    // June 6, 2017 -- June 7, 2017 in seconds.
+    const int64_t startInstant = 1496777923LL;
+    const int64_t endInstant = 1496864323LL;
+    const Timestamp startTs{Seconds(startInstant), 0};
+    const Timestamp endTs{Seconds(endInstant), 0};
+
+    std::vector<BucketData> data{{Value(startTs), 3.0, 0.0, 0.0}, {Value(endTs), 1.0, 96.0, 48.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(100.0, getTotals(hist).card);
+
+    const auto valueBefore = value::bitcastFrom<uint64_t>(startTs.asULL() - 1);
+    double expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kGreater).card;
+    ASSERT_EQ(100.0, expectedCard);
+
+    const auto valueStart = value::bitcastFrom<uint64_t>(
+        startTs.asULL());  // NB: startTs.asInt64() produces a different value.
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kEqual).card;
+    ASSERT_EQ(3.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kGreater).card;
+    ASSERT_EQ(97.0, expectedCard);
+
+    const auto valueEnd = value::bitcastFrom<uint64_t>(endTs.asULL());
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kLess).card;
+    ASSERT_EQ(99.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+
+    const auto valueIn = value::bitcastFrom<uint64_t>((startTs.asULL() + endTs.asULL()) / 2);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kEqual).card;
+    ASSERT_EQ(2.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kGreater).card;
+    ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound);
+
+    const auto valueAfter = value::bitcastFrom<uint64_t>(endTs.asULL() + 100);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kLess).card;
+    ASSERT_EQ(100.0, expectedCard);
+    expectedCard =
+        estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoBucketsObjectIdHistogram) {
+    const auto startOid = OID("63340d8d27afef2de7357e8d");
+    const auto endOid = OID("63340dbed6cd8af737d4139a");
+    ASSERT_TRUE(startOid < endOid);
+
+    std::vector<BucketData> data{{Value(startOid), 2.0, 0.0, 0.0},
+                                 {Value(endOid), 1.0, 97.0, 77.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(100.0, getTotals(hist).card);
+
+    auto [tag, value] = value::makeNewObjectId();
+    value::ValueGuard vg(tag, value);
+    const auto oidBefore = OID("63340d8d27afef2de7357e8c");
+    oidBefore.view().readInto(value::getObjectIdView(value));
+
+    double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(100.0, expectedCard);
+
+    // Bucket bounds.
+    startOid.view().readInto(value::getObjectIdView(value));
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(2.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(98.0, expectedCard);
+
+    endOid.view().readInto(value::getObjectIdView(value));
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(99.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+
+    // ObjectId value inside the bucket.
+    const auto oidInside = OID("63340db2cd4d46ff39178e9d");
+    oidInside.view().readInto(value::getObjectIdView(value));
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(1.25, expectedCard, kErrorBound);
+
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(83.95, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_APPROX_EQUAL(14.78, expectedCard, kErrorBound);
+
+    const auto oidAfter = OID("63340dbed6cd8af737d4139b");
+    oidAfter.view().readInto(value::getObjectIdView(value));
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(0.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(100.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
+    // Data set of mixed data types: 3 integers and 5 strings.
+    std::vector<BucketData> data{{1, 3.0, 0.0, 0.0}, {"abc", 5.0, 0.0, 0.0}};
+    const ScalarHistogram hist = createHistogram(data);
+    const ArrayHistogram arrHist(
+        hist, TypeCounts{{value::TypeTags::NumberInt64, 3}, {value::TypeTags::StringSmall, 5}});
+
+    const auto [tagLowDbl, valLowDbl] =
+        std::make_pair(value::TypeTags::NumberDouble,
+                       value::bitcastFrom<double>(std::numeric_limits<double>::quiet_NaN()));
+
+    // (NaN, 1).
+    double expectedCard = estimateCardRange(arrHist,
+                                            false /* lowInclusive */,
+                                            tagLowDbl,
+                                            valLowDbl,
+                                            false /* highInclusive */,
+                                            value::TypeTags::NumberInt32,
+                                            value::bitcastFrom<int32_t>(1),
+                                            true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound);
+
+    // (NaN, 5).
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     tagLowDbl,
+                                     valLowDbl,
+                                     false /* highInclusive */,
+                                     value::TypeTags::NumberInt32,
+                                     value::bitcastFrom<int32_t>(5),
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
+
+    const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
+    value::ValueGuard vgLowStr(tagLowStr, valLowStr);
+    auto [tag, value] = value::makeNewString("a"_sd);
+    value::ValueGuard vg(tag, value);
+
+    // [0, "").
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     value::TypeTags::NumberInt32,
+                                     value::bitcastFrom<int32_t>(0),
+                                     false /* highInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
+
+    // ["", "a"].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+
+    ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound);
+
+    std::tie(tag, value) = value::makeNewString("xyz"_sd);
+    // ["", "xyz"].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+
+    ASSERT_APPROX_EQUAL(5.0, expectedCard, kErrorBound);
+}
+
+TEST(EstimatorTest, TwoBucketsMixedHistogram) {
+    // Data set of mixed data types: 20 integers and 80 strings.
+    // Histogram with one bucket per data type.
+    std::vector<BucketData> data{{100, 3.0, 17.0, 9.0}, {"pqr", 5.0, 75.0, 25.0}};
+    const ScalarHistogram hist = createHistogram(data);
+    const ArrayHistogram arrHist(
+        hist, TypeCounts{{value::TypeTags::NumberInt64, 20}, {value::TypeTags::StringSmall, 80}});
+
+    ASSERT_EQ(100.0, getTotals(hist).card);
+
+    // Estimates with the bucket bounds.
+    ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+    ASSERT_EQ(17.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+    ASSERT_EQ(80.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+    auto [tag, value] = value::makeNewString("pqr"_sd);
+    value::ValueGuard vg(tag, value);
+    double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_EQ(5.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_EQ(95.0, expectedCard);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_EQ(0.0, expectedCard);
+
+    // Estimates for a value smaller than the first bucket bound.
+    ASSERT_APPROX_EQUAL(1.88, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(6.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        8.49, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(91.5, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        93.39, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound);
+
+    // Estimates for a value between bucket bounds.
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 105, EstimationType::kEqual));
+
+    std::tie(tag, value) = value::makeNewString("a"_sd);
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(54.5, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+    ASSERT_APPROX_EQUAL(57.5, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+    ASSERT_APPROX_EQUAL(42.5, expectedCard, kErrorBound);
+    expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+    ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound);
+
+    // Range estimates, including min/max values per data type.
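+    // Note on the bounds used below: NaN sorts before all other numbers in this ordering
+    // and "" is the smallest string, so a range such as [NaN, "") spans exactly the
+    // numeric portion of the data set (all 20 integers).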
+    const auto [tagLowDbl, valLowDbl] =
+        std::make_pair(value::TypeTags::NumberDouble,
+                       value::bitcastFrom<double>(std::numeric_limits<double>::quiet_NaN()));
+    const auto [tagHighInt, valHighInt] =
+        std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(1000000));
+
+    // [NaN, 25].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowDbl,
+                                     valLowDbl,
+                                     true /* highInclusive */,
+                                     value::TypeTags::NumberInt32,
+                                     value::bitcastFrom<int32_t>(25),
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(8.49, expectedCard, kErrorBound);
+
+    // [25, 1000000].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     value::TypeTags::NumberInt32,
+                                     value::bitcastFrom<int32_t>(25),
+                                     true /* highInclusive */,
+                                     tagHighInt,
+                                     valHighInt,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(13.38, expectedCard, kErrorBound);
+
+    // [NaN, 1000000].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowDbl,
+                                     valLowDbl,
+                                     true /* highInclusive */,
+                                     tagHighInt,
+                                     valHighInt,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound);
+
+    const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
+    value::ValueGuard vgLowStr(tagLowStr, valLowStr);
+
+    // [NaN, "").
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowDbl,
+                                     valLowDbl,
+                                     false /* highInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound);
+
+    // [25, "").
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     value::TypeTags::NumberInt32,
+                                     value::bitcastFrom<int32_t>(25),
+                                     false /* highInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(13.39, expectedCard, kErrorBound);
+
+    // ["", "a"].
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     true /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+
+    ASSERT_APPROX_EQUAL(37.49, expectedCard, kErrorBound);
+
+    // ["", {}).
+    auto [tagObj, valObj] = value::makeNewObject();
+    value::ValueGuard vgObj(tagObj, valObj);
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tagLowStr,
+                                     valLowStr,
+                                     false /* highInclusive */,
+                                     tagObj,
+                                     valObj,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(80.0, expectedCard, kErrorBound);
+
+    // ["a", {}).
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     tag,
+                                     value,
+                                     false /* highInclusive */,
+                                     tagObj,
+                                     valObj,
+                                     true /* includeScalar */);
+
+    ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound);
+}
+
+// TODO: enable the following test after SERVER-71376 Fix histogram generation on MacOs
+#if 0
+/**
+ * Tests for cardinality estimates for queries over minimum values of date, timestamp, and objectId
+ * types. When the histogram has at least 2 buckets per data type, the minimum value, if present in
+ * the data, is picked as a bound for the first bucket for the corresponding data type. In this
+ * case the cardinality estimates are precise. To test the approximate estimation, we force the
+ * histogram generation to use one bucket per type (except the first numeric type).
+ */
+TEST(EstimatorTest, MinValueMixedHistogramFromData) {
+    const int64_t startInstant = 1506777923000LL;
+    const int64_t endInstant = 1516864323000LL;
+    const Timestamp startTs{Seconds(1516864323LL), 0};
+    const Timestamp endTs{Seconds(1526864323LL), 0};
+    const auto startOid = OID("63340d8d27afef2de7357e8d");
+    // const auto endOid = OID("63340dbed6cd8af737d4139a");
+
+    std::vector<SBEValue> data;
+    data.emplace_back(value::TypeTags::Date, value::bitcastFrom<int64_t>(startInstant));
+    data.emplace_back(value::TypeTags::Date, value::bitcastFrom<int64_t>(endInstant));
+
+    data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom<uint64_t>(startTs.asULL()));
+    data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom<uint64_t>(endTs.asULL()));
+
+    auto [tag, val] = makeInt64Value(100);
+    data.emplace_back(tag, val);
+    std::tie(tag, val) = makeInt64Value(1000);
+    data.emplace_back(tag, val);
+
+    auto [strTag, strVal] = value::makeNewString("abc"_sd);
+    value::ValueGuard strVG(strTag, strVal);
+    auto [copyTag, copyVal] = value::copyValue(strTag, strVal);
+    data.emplace_back(copyTag, copyVal);
+    std::tie(strTag, strVal) = value::makeNewString("xyz"_sd);
+    std::tie(copyTag, copyVal) = value::copyValue(strTag, strVal);
+    data.emplace_back(copyTag, copyVal);
+
+    auto [objTag, objVal] = value::makeNewObjectId();
+    value::ValueGuard objVG(objTag, objVal);
+    startOid.view().readInto(value::getObjectIdView(objVal));
+    std::tie(tag, val) = copyValue(objTag, objVal);
+    data.emplace_back(tag, val);
+    /* TODO: add another objectId value when mapping to double is fixed by SERVER-71205.
+    endOid.view().readInto(value::getObjectIdView(objVal));
+    std::tie(tag, val) = copyValue(objTag, objVal);
+    data.emplace_back(tag, val);
+    */
+
+    sortValueVector(data);
+
+    // Force each type except numbers to use a single bucket. This way there is no bucket for the
+    // min value if present in the data and it needs to be estimated.
+    const ScalarHistogram& hist = makeHistogram(data, 6);
+    // Mixed data is sorted in the histogram according to the BSON order, as defined in
+    // bsontypes.h by the canonicalizeBSONTypeUnsafeLookup function.
+    if constexpr (kCETestLogOnly) {
+        std::cout << printValueArray(data) << "\n";
+        std::cout << "Mixed types " << hist.dump();
+    }
+
+    // Minimum ObjectId.
+    auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId);
+    auto [minOidTag, minOidVal] = minOid->cast<Constant>()->get();
+    double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Minimum date.
+    const auto&& [minDate, inclDate] =
+        getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date);
+    const auto [minDateTag, minDateVal] = minDate->cast<Constant>()->get();
+    expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Minimum timestamp.
+    auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp);
+    auto [minTsTag, minTsVal] = minTs->cast<Constant>()->get();
+    expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Add minimum values to the data set and create another histogram.
+    const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
+    value::ValueGuard vgLowStr(tagLowStr, valLowStr);
+    std::tie(copyTag, copyVal) = value::copyValue(tagLowStr, valLowStr);
+    data.emplace_back(copyTag, copyVal);
+    data.emplace_back(minDateTag, minDateVal);
+    data.emplace_back(minTsTag, minTsVal);
+
+    sortValueVector(data);
+    const ScalarHistogram& hist2 = makeHistogram(data, 6);
+    if constexpr (kCETestLogOnly) {
+        std::cout << printValueArray(data) << "\n";
+        std::cout << "Mixed types " << hist2.dump();
+    }
+
+    // Precise estimate for equality to the empty string, as it is a bucket boundary.
+    expectedCard = estimate(hist2, tagLowStr, valLowStr, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    // Equality to the minimum date/ts value is estimated by range_frequency/NDV.
+    expectedCard = estimate(hist2, minDateTag, minDateVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    expectedCard = estimate(hist2, minTsTag, minTsVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Inequality predicates using min values.
+    const ArrayHistogram arrHist(hist2,
+                                 TypeCounts{
+                                     {value::TypeTags::NumberInt64, 2},
+                                     {value::TypeTags::StringSmall, 3},
+                                     {value::TypeTags::ObjectId, 1},
+                                     {value::TypeTags::Date, 3},
+                                     {value::TypeTags::Timestamp, 3},
+                                 });
+    // [minDate, startInstant], estimated by half of the date bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Date,
+                                     value::bitcastFrom<int64_t>(startInstant),
+                                     true /* includeScalar */);
+    ASSERT_EQ(1.0, expectedCard);
+
+    // [minDate, endInstant], estimated by the entire date bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Date,
+                                     value::bitcastFrom<int64_t>(endInstant),
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minDate, minTs), estimated by the entire date bucket.
+    // (is this interval possible or is it better to have maxDate upper bound?).
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     false /* highInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minTs, startTs], estimated by half of the timestamp bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Timestamp,
+                                     value::bitcastFrom<uint64_t>(startTs.asULL()),
+                                     true /* includeScalar */);
+    ASSERT_EQ(1.0, expectedCard);
+
+    // [minTs, endTs], estimated by the entire timestamp bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Timestamp,
+                                     value::bitcastFrom<uint64_t>(endTs.asULL()),
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minTs, maxTs], estimated by the entire timestamp bucket.
+ auto&& [maxTs, inclMaxTs] = getMinMaxBoundForType(false /*isMin*/, value::TypeTags::Timestamp); + const auto [maxTsTag, maxTsVal] = maxTs->cast()->get(); + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + minTsTag, + minTsVal, + true /* highInclusive */, + maxTsTag, + maxTsVal, + true /* includeScalar */); + ASSERT_EQ(3.0, expectedCard); +} +#endif + +TEST(EstimatorTest, MinValueMixedHistogramFromBuckets) { + const auto endOid = OID("63340dbed6cd8af737d4139a"); + const auto endDate = Date_t::fromMillisSinceEpoch(1526864323000LL); + const Timestamp endTs{Seconds(1526864323LL), 0}; + + std::vector data{ + {0, 1.0, 0.0, 0.0}, + {100, 4.0, 95.0, 30.0}, + {"xyz", 5.0, 95.0, 25.0}, + {Value(endOid), 5.0, 95.0, 50.0}, + {Value(endDate), 4.0, 96.0, 24.0}, + {Value(endTs), 5.0, 95.0, 50.0}, + }; + const ScalarHistogram hist = createHistogram(data); + if constexpr (kCETestLogOnly) { + std::cout << "Mixed types " << hist.dump(); + } + ASSERT_EQ(500.0, getTotals(hist).card); + + // Minimum ObjectId. + auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId); + auto [minOidTag, minOidVal] = minOid->cast()->get(); + double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound); + + // Minimum date. + const auto&& [minDate, inclDate] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date); + const auto [minDateTag, minDateVal] = minDate->cast()->get(); + expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card; + ASSERT_EQ(4.0, expectedCard); + + // Minimum timestamp. + auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp); + auto [minTsTag, minTsVal] = minTs->cast()->get(); + expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound); + + // Inequality predicates using min values. + const ArrayHistogram arrHist(hist, + TypeCounts{ + {value::TypeTags::NumberInt64, 100}, + {value::TypeTags::StringSmall, 100}, + {value::TypeTags::ObjectId, 100}, + {value::TypeTags::Date, 100}, + {value::TypeTags::Timestamp, 100}, + }); + // [minDate, innerDate], estimated by the half of the date bucket. + const int64_t innerDate = 1516864323000LL; + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + minDateTag, + minDateVal, + true /* highInclusive */, + value::TypeTags::Date, + value::bitcastFrom(innerDate), + true /* includeScalar */); + ASSERT_APPROX_EQUAL(48.0, expectedCard, kErrorBound); + + // [minTs, innerTs], estimated by the half of the timestamp bucket. + const Timestamp innerTs{Seconds(1516864323LL), 0}; + expectedCard = estimateCardRange(arrHist, + true /* lowInclusive */, + minTsTag, + minTsVal, + true /* highInclusive */, + value::TypeTags::Timestamp, + value::bitcastFrom(innerTs.asULL()), + true /* includeScalar */); + ASSERT_APPROX_EQUAL(47.5, expectedCard, kErrorBound); +} +} // namespace +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp deleted file mode 100644 index cd1e52219c2..00000000000 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. 
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#include "mongo/db/query/ce/histogram_estimation.h"
-#include "mongo/db/exec/sbe/abt/abt_lower.h"
-#include "mongo/db/pipeline/abt/utils.h"
-#include "mongo/db/query/ce/value_utils.h"
-#include "mongo/db/query/optimizer/rewrites/const_eval.h"
-#include "mongo/db/query/optimizer/syntax/expr.h"
-#include "mongo/db/query/optimizer/utils/ce_math.h"
-#include "mongo/db/query/optimizer/utils/interval_utils.h"
-
-namespace mongo::ce {
-using namespace sbe;
-using namespace optimizer;
-
-std::pair<value::TypeTags, value::Value> getConstTypeVal(const ABT& abt) {
-    const auto* constant = abt.cast<Constant>();
-    tassert(7051102, "Interval ABTs passed in for estimation must have Constant bounds.", constant);
-    return constant->get();
-};
-
-boost::optional<std::pair<value::TypeTags, value::Value>> getBound(
-    const BoundRequirement& boundReq) {
-    const ABT& bound = boundReq.getBound();
-    if (bound.is<Constant>()) {
-        return getConstTypeVal(bound);
-    }
-    return boost::none;
-};
-
-IntervalRequirement getMinMaxIntervalForType(value::TypeTags type) {
-    // Note: This function works based on the assumption that there are no intervals that include
-    // values from more than one type. That is why the MinMax interval of a type will include all
-    // possible intervals over that type.
-
-    auto&& [min, minInclusive] = getMinMaxBoundForType(true /*isMin*/, type);
-    tassert(7051103, str::stream() << "Type " << type << " has no minimum", min);
-
-    auto&& [max, maxInclusive] = getMinMaxBoundForType(false /*isMin*/, type);
-    tassert(7051104, str::stream() << "Type " << type << " has no maximum", max);
-
-    return IntervalRequirement{BoundRequirement(minInclusive, *min),
-                               BoundRequirement(maxInclusive, *max)};
-}
-
-bool isIntervalSubsetOfType(const IntervalRequirement& interval, value::TypeTags type) {
-    // Create a conjunction of the interval and the min-max interval for the type as input for the
-    // intersection function.
-    auto intervals = IntervalReqExpr::make<IntervalReqExpr::Disjunction>(IntervalReqExpr::NodeVector{
-        IntervalReqExpr::make<IntervalReqExpr::Conjunction>(IntervalReqExpr::NodeVector{
-            IntervalReqExpr::make<IntervalReqExpr::Atom>(interval),
-            IntervalReqExpr::make<IntervalReqExpr::Atom>(getMinMaxIntervalForType(type))})});
-
-    return intersectDNFIntervals(intervals, ConstEval::constFold).has_value();
-}
-
-EstimationResult getTotals(const ScalarHistogram& h) {
-    if (h.empty()) {
-        return {0.0, 0.0};
-    }
-
-    const Bucket& last = h.getBuckets().back();
-    return {last._cumulativeFreq, last._cumulativeNDV};
-}
-
-/**
- * Helper function that uses linear interpolation to estimate the cardinality and NDV for a value
- * that falls inside of a histogram bucket.
- */
-EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
-                                             value::TypeTags tag,
-                                             value::Value val,
-                                             EstimationType type,
-                                             size_t bucketIndex) {
-
-    const Bucket& bucket = h.getBuckets().at(bucketIndex);
-    const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex);
-
-    double resultCard = bucket._cumulativeFreq - bucket._equalFreq - bucket._rangeFreq;
-    double resultNDV = bucket._cumulativeNDV - bucket._ndv - 1.0;
-
-    // Check if the estimate is at the point of type brackets switch. If the current bucket is the
-    // first bucket of a new type bracket and the value is of another type, estimate cardinality
-    // from the current bucket as 0.
-    //
-    // For example, let bound 1 = 1000, bound 2 = "abc". The value 100000000 falls in bucket 2, the
-    // first bucket for strings, but should not get a cardinality/NDV fraction from it.
-    if (!sameTypeBracket(tag, boundTag)) {
-        if (type == EstimationType::kEqual) {
-            return {0.0, 0.0};
-        } else {
-            return {resultCard, resultNDV};
-        }
-    }
-
-    // Estimate for equality frequency inside of the bucket.
-    const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv;
-
-    if (type == EstimationType::kEqual) {
-        return {innerEqFreq, 1.0};
-    }
-
-    // If the value is minimal for its type, and the operation is $lt or $lte, return the
-    // cardinality up to the previous bucket.
-    auto&& [minConstant, inclusive] = getMinMaxBoundForType(true /*isMin*/, tag);
-    auto [minTag, minVal] = getConstTypeVal(*minConstant);
-    if (compareValues(minTag, minVal, tag, val) == 0) {
-        return {resultCard, resultNDV};
-    }
-
-    // For $lt and $lte operations, use linear interpolation to take a fraction of the bucket
-    // cardinality and NDV if there is a preceding bucket with a bound of the same type. Use half
-    // of the bucket estimates otherwise.
-    double ratio = 0.5;
-    if (bucketIndex > 0) {
-        const auto [lowBoundTag, lowBoundVal] = h.getBounds().getAt(bucketIndex - 1);
-        if (sameTypeBracket(lowBoundTag, boundTag)) {
-            double doubleLowBound = valueToDouble(lowBoundTag, lowBoundVal);
-            double doubleUpperBound = valueToDouble(boundTag, boundVal);
-            double doubleVal = valueToDouble(tag, val);
-            ratio = (doubleVal - doubleLowBound) / (doubleUpperBound - doubleLowBound);
-        }
-    }
-
-    const double bucketFreqRatio = bucket._rangeFreq * ratio;
-    resultCard += bucketFreqRatio;
-    resultNDV += bucket._ndv * ratio;
-
-    if (type == EstimationType::kLess) {
-        // Subtract from the estimate the cardinality and NDV corresponding to the equality
-        // operation, if they are larger than the ratio taken from this bucket.
-        const double innerEqFreqCorrection = (bucketFreqRatio < innerEqFreq) ? 0.0 : innerEqFreq;
-        const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0;
-        resultCard -= innerEqFreqCorrection;
-        resultNDV -= innerEqNdv;
-    }
-    return {resultCard, resultNDV};
-}
-
-EstimationResult estimate(const ScalarHistogram& h,
-                          value::TypeTags tag,
-                          value::Value val,
-                          EstimationType type) {
-    switch (type) {
-        case EstimationType::kGreater:
-            return getTotals(h) - estimate(h, tag, val, EstimationType::kLessOrEqual);
-
-        case EstimationType::kGreaterOrEqual:
-            return getTotals(h) - estimate(h, tag, val, EstimationType::kLess);
-
-        default:
-            // Continue.
-            break;
-    }
-
-    size_t bucketIndex = 0;
-    {
-        size_t len = h.getBuckets().size();
-        while (len > 0) {
-            const size_t half = len >> 1;
-            const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex + half);
-
-            if (compareValues(boundTag, boundVal, tag, val) < 0) {
-                bucketIndex += half + 1;
-                len -= half + 1;
-            } else {
-                len = half;
-            }
-        }
-    }
-    if (bucketIndex == h.getBuckets().size()) {
-        // Value beyond the largest endpoint.
-        switch (type) {
-            case EstimationType::kEqual:
-                return {0.0, 0.0};
-
-            case EstimationType::kLess:
-            case EstimationType::kLessOrEqual:
-                return getTotals(h);
-
-            default:
-                MONGO_UNREACHABLE;
-        }
-    }
-
-    const Bucket& bucket = h.getBuckets().at(bucketIndex);
-    const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex);
-    const bool isEndpoint = compareValues(boundTag, boundVal, tag, val) == 0;
-
-    if (isEndpoint) {
-        switch (type) {
-            case EstimationType::kEqual: {
-                return {bucket._equalFreq, 1.0};
-            }
-
-            case EstimationType::kLess: {
-                double resultCard = bucket._cumulativeFreq - bucket._equalFreq;
-                double resultNDV = bucket._cumulativeNDV - 1.0;
-                return {resultCard, resultNDV};
-            }
-
-            case EstimationType::kLessOrEqual: {
-                double resultCard = bucket._cumulativeFreq;
-                double resultNDV = bucket._cumulativeNDV;
-                return {resultCard, resultNDV};
-            }
-
-            default:
-                MONGO_UNREACHABLE;
-        }
-    } else {
-        return interpolateEstimateInBucket(h, tag, val, type, bucketIndex);
-    }
-}
-
-/**
- * Returns how many values of the given type are known by the array histogram.
- */
-double getTypeCard(const ArrayHistogram& ah, value::TypeTags tag, bool includeScalar) {
-    double count = 0.0;
-
-    // TODO SERVER-70936: booleans are estimated by different type counters (unless in arrays).
-    if (tag == sbe::value::TypeTags::Boolean) {
-        uasserted(7051101, "Cannot estimate boolean types yet with histogram CE.");
-    }
-
-    // Note that if we are asked by the optimizer to estimate an interval whose bounds are arrays,
-    // this means we are trying to estimate equality on nested arrays. In this case, we do not want
-    // to include the "scalar" type counter for the array type, because this will cause us to
-    // estimate the nested array case as counting all arrays, regardless of whether or not they are
-    // nested.
-    if (includeScalar && tag != value::TypeTags::Array) {
-        auto typeIt = ah.getTypeCounts().find(tag);
-        if (typeIt != ah.getTypeCounts().end()) {
-            count += typeIt->second;
-        }
-    }
-    if (ah.isArray()) {
-        auto typeIt = ah.getArrayTypeCounts().find(tag);
-        if (typeIt != ah.getArrayTypeCounts().end()) {
-            count += typeIt->second;
-        }
-    }
-    return count;
-}
-
-/**
- * Estimates equality to the given tag/value using histograms.
- */
-double estimateCardEq(const ArrayHistogram& ah,
-                      value::TypeTags tag,
-                      value::Value val,
-                      bool includeScalar) {
-    double card = 0.0;
-    if (includeScalar) {
-        card = estimate(ah.getScalar(), tag, val, EstimationType::kEqual).card;
-    }
-    if (ah.isArray()) {
-        card += estimate(ah.getArrayUnique(), tag, val, EstimationType::kEqual).card;
-    }
-    return card;
-}
-
-static EstimationResult estimateRange(const ScalarHistogram& histogram,
-                                      bool lowInclusive,
-                                      value::TypeTags tagLow,
-                                      value::Value valLow,
-                                      bool highInclusive,
-                                      value::TypeTags tagHigh,
-                                      value::Value valHigh) {
-    const EstimationType highType =
-        highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess;
-    const EstimationResult highEstimate = estimate(histogram, tagHigh, valHigh, highType);
-
-    const EstimationType lowType =
-        lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual;
-    const EstimationResult lowEstimate = estimate(histogram, tagLow, valLow, lowType);
-
-    return highEstimate - lowEstimate;
-}
-
-/**
- * Computes an estimate for a range query on array data with the formula:
- * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow))
- */
-static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin,
-                                                  const ScalarHistogram& histogramAmax,
-                                                  bool lowInclusive,
-                                                  value::TypeTags tagLow,
-                                                  value::Value valLow,
-                                                  bool highInclusive,
-                                                  value::TypeTags tagHigh,
-                                                  value::Value valHigh) {
-    const EstimationType highType =
-        highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess;
-    const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType);
-
-    const EstimationType lowType =
-        lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual;
-    const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType);
-
-    return highEstimate - lowEstimate;
-}
-
-double estimateCardRange(const ArrayHistogram& ah,
-                         /* Define lower bound. */
-                         bool lowInclusive,
-                         value::TypeTags tagLow,
-                         value::Value valLow,
-                         /* Define upper bound. */
-                         bool highInclusive,
-                         value::TypeTags tagHigh,
-                         value::Value valHigh,
-                         bool includeScalar,
-                         EstimationAlgo estimationAlgo) {
-    uassert(6695701,
-            "Low bound must not be higher than high",
-            compareValues(tagLow, valLow, tagHigh, valHigh) <= 0);
-
-    // Helper lambda to shorten code for legibility.
-    auto estRange = [&](const ScalarHistogram& h) {
-        return estimateRange(h, lowInclusive, tagLow, valLow, highInclusive, tagHigh, valHigh);
-    };
-
-    double result = 0.0;
-    if (ah.isArray()) {
-
-        if (includeScalar) {
-            // Range query on array data.
-            const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(),
-                                                                                ah.getArrayMax(),
-                                                                                lowInclusive,
-                                                                                tagLow,
-                                                                                valLow,
-                                                                                highInclusive,
-                                                                                tagHigh,
-                                                                                valHigh);
-            result += rangeCardOnArray.card;
-        } else {
-            // $elemMatch query on array data.
-            const auto arrayMinEst = estRange(ah.getArrayMin());
-            const auto arrayMaxEst = estRange(ah.getArrayMax());
-            const auto arrayUniqueEst = estRange(ah.getArrayUnique());
-
-            // TODO: try using ah.getArrayCount() - ah.getEmptyArrayCount()
-            // when the number of empty arrays is provided by the statistics.
-            const double totalArrayCount = ah.getArrayCount();
-
-            uassert(
-                6715101, "Array histograms should contain at least one array", totalArrayCount > 0);
-            switch (estimationAlgo) {
-                case EstimationAlgo::HistogramV1: {
-                    const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0)
0.0 - : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv)); - result = - std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity); - break; - } - case EstimationAlgo::HistogramV2: { - const double avgArraySize = - getTotals(ah.getArrayUnique()).card / totalArrayCount; - const double adjustedUniqueCard = (avgArraySize == 0.0) - ? 0.0 - : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount); - result = - std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); - break; - } - case EstimationAlgo::HistogramV3: { - const double adjustedUniqueCard = - 0.85 * std::min(arrayUniqueEst.card, totalArrayCount); - result = - std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); - break; - } - default: - MONGO_UNREACHABLE; - } - } - } - - if (includeScalar) { - const auto scalarEst = estRange(ah.getScalar()); - result += scalarEst.card; - } - - return result; -} - -double estimateIntervalCardinality(const ce::ArrayHistogram& ah, - const IntervalRequirement& interval, - CEType childResult, - bool includeScalar) { - if (interval.isFullyOpen()) { - return childResult; - } else if (interval.isEquality()) { - auto maybeConstBound = getBound(interval.getLowBound()); - if (!maybeConstBound) { - return kInvalidEstimate; - } - - auto [tag, val] = *maybeConstBound; - if (canEstimateTypeViaHistogram(tag)) { - return estimateCardEq(ah, tag, val, includeScalar); - } - - // Otherwise, we return the cardinality for the type of the intervals. - return getTypeCard(ah, tag, includeScalar); - } - - // Otherwise, we have a range. - auto lowBound = interval.getLowBound(); - auto maybeConstLowBound = getBound(lowBound); - if (!maybeConstLowBound) { - return kInvalidEstimate; - } - - auto highBound = interval.getHighBound(); - auto maybeConstHighBound = getBound(highBound); - if (!maybeConstHighBound) { - return kInvalidEstimate; - } - - auto [lowTag, lowVal] = *maybeConstLowBound; - auto [highTag, highVal] = *maybeConstHighBound; - - // Check if we estimated this interval using histograms. One of the tags may not be of a type we - // know how to estimate using histograms; however, it should still be possible to estimate the - // interval if the other one is of the appropriate type. - if (canEstimateTypeViaHistogram(lowTag) || canEstimateTypeViaHistogram(highTag)) { - return estimateCardRange(ah, - lowBound.isInclusive(), - lowTag, - lowVal, - highBound.isInclusive(), - highTag, - highVal, - includeScalar); - } - - // Otherwise, this interval was not in our histogram. We may be able to estimate this interval - // via type counts- if so, we just return the total count for the type. - - // If the bound tags are equal, we can estimate this in the same way that we do equalities on - // non-histogrammable types. Otherwise, we need to figure out which type(s) are included by this - // range. - if (lowTag == highTag || isIntervalSubsetOfType(interval, lowTag)) { - return getTypeCard(ah, lowTag, includeScalar); - } else if (isIntervalSubsetOfType(interval, highTag)) { - return getTypeCard(ah, highTag, includeScalar); - } - - // If we reach here, we've given up estimating, because our interval intersected both high & low - // type intervals (and possibly more types). - // TODO: could we aggregate type counts across all intersected types here? 
- return 0.0; -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/histogram_estimation.h b/src/mongo/db/query/ce/histogram_estimation.h deleted file mode 100644 index f0291b42cd8..00000000000 --- a/src/mongo/db/query/ce/histogram_estimation.h +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/optimizer/defs.h" -#include "mongo/db/query/optimizer/index_bounds.h" - -namespace mongo::ce { - -constexpr double kInvalidEstimate = -1.0; - -enum class EstimationType { kEqual, kLess, kLessOrEqual, kGreater, kGreaterOrEqual }; -enum class EstimationAlgo { HistogramV1, HistogramV2, HistogramV3 }; - -const stdx::unordered_map estimationTypeName = { - {EstimationType::kEqual, "eq"}, - {EstimationType::kLess, "lt"}, - {EstimationType::kLessOrEqual, "lte"}, - {EstimationType::kGreater, "gt"}, - {EstimationType::kGreaterOrEqual, "gte"}}; - -struct EstimationResult { - double card; - double ndv; - - EstimationResult operator-(const EstimationResult& other) const { - return {card - other.card, ndv - other.ndv}; - } -}; - -/** - * Returns cumulative total statistics for a histogram. - */ -EstimationResult getTotals(const ScalarHistogram& h); - -/** - * Compute an estimate for a given value and estimation type. Use linear interpolation for values - * that fall inside of histogram buckets. - */ -EstimationResult estimate(const ScalarHistogram& h, - sbe::value::TypeTags tag, - sbe::value::Value val, - EstimationType type); - -/** - * Given an array histogram, an interval, and the input cardinality, estimates the cardinality of - * the interval. - */ -double estimateIntervalCardinality(const ArrayHistogram& estimator, - const optimizer::IntervalRequirement& interval, - optimizer::CEType inputCardinality, - bool includeScalar); - -/** - * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and - * type tag pair. 
- */ -double estimateCardEq(const ArrayHistogram& ah, - sbe::value::TypeTags tag, - sbe::value::Value val, - bool includeScalar); - -/** - * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. - * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array - * values. The other fields define the range of the estimation. - */ -double estimateCardRange(const ArrayHistogram& ah, - bool lowInclusive, - sbe::value::TypeTags tagLow, - sbe::value::Value valLow, - bool highInclusive, - sbe::value::TypeTags tagHigh, - sbe::value::Value valHigh, - bool includeScalar, - EstimationAlgo estAlgo = EstimationAlgo::HistogramV2); - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/histogram_estimator.cpp b/src/mongo/db/query/ce/histogram_estimator.cpp new file mode 100644 index 00000000000..6978ad4a307 --- /dev/null +++ b/src/mongo/db/query/ce/histogram_estimator.cpp @@ -0,0 +1,272 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/histogram_estimator.h" + +#include "mongo/db/pipeline/abt/utils.h" +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/optimizer/utils/abt_hash.h" +#include "mongo/db/query/optimizer/utils/ce_math.h" +#include "mongo/db/query/optimizer/utils/memo_utils.h" + +namespace mongo::optimizer::ce { +namespace { +/** + * This transport combines chains of PathGets and PathTraverses into an MQL-like string path. + */ +class PathDescribeTransport { +public: + std::string transport(const PathTraverse& /*node*/, std::string childResult) { + return childResult; + } + + std::string transport(const PathGet& node, std::string childResult) { + return str::stream() << node.name() << (childResult.length() > 0 ? "." : "") << childResult; + } + + std::string transport(const EvalFilter& node, std::string pathResult, std::string inputResult) { + return pathResult; + } + + std::string transport(const PathIdentity& node) { + return ""; + } + + template + std::string transport(const T& node, Ts&&... 
/* args */) { + uasserted(6903900, "Unexpected node in path serialization."); + } +}; + +std::string serializePath(const ABT& path) { + PathDescribeTransport pdt; + auto str = algebra::transport(path, pdt); + return str; +} + +} // namespace + +class HistogramTransport { +public: + HistogramTransport(std::shared_ptr stats, + std::unique_ptr fallbackCE) + : _stats(stats), + _fallbackCE(std::move(fallbackCE)), + _arrayOnlyInterval(*defaultConvertPathToInterval(make())) {} + + CEType transport(const ABT& n, + const ScanNode& node, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + CEType /*bindResult*/) { + return _stats->getCardinality(); + } + + /** + * This struct is used to track an intermediate representation of the intervals in the + * requirements map. In particular, grouping intervals along each path in the map allows us to + * determine which paths should be estimated as $elemMatches without relying on a particular + * order of entries in the requirements map. + */ + struct SargableConjunct { + bool includeScalar; + const stats::ArrayHistogram& histogram; + std::vector> intervals; + }; + + CEType transport(const ABT& n, + const SargableNode& node, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + CEType childResult, + CEType /*bindsResult*/, + CEType /*refsResult*/) { + // Early out and return 0 since we don't expect to get more results. + if (childResult == 0.0) { + return 0.0; + } + + // First pass through the requirements map to extract information about each path. + std::map conjunctRequirements; + for (const auto& [key, req] : node.getReqMap()) { + if (req.getIsPerfOnly()) { + // Ignore perf-only requirements. + continue; + } + + const auto serializedPath = serializePath(key._path.ref()); + const auto& interval = req.getIntervals(); + const bool isPathArrInterval = + (_arrayOnlyInterval == interval) && !pathEndsInTraverse(key._path.ref()); + + // Check if we have already seen this path. + if (auto conjunctIt = conjunctRequirements.find({serializedPath}); + conjunctIt != conjunctRequirements.end()) { + auto& conjunctReq = conjunctIt->second; + if (isPathArrInterval) { + // We should estimate this path's intervals using $elemMatch semantics. + // Don't push back the interval for estimation; instead, we use it to change how + // we estimate other intervals along this path. + conjunctReq.includeScalar = false; + } else { + // We will need to estimate this interval. + conjunctReq.intervals.push_back(interval); + } + continue; + } + + // Fallback if there is no histogram. + auto histogram = _stats->getHistogram(serializedPath); + if (!histogram) { + // For now, because of the structure of SargableNode and the implementation of + // the fallback (currently HeuristicCE), we can't combine heuristic & histogram + // estimates. In this case, default to Heuristic if we don't have a histogram for + // any of the predicates. + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + + // Add this path to the map. If this is not a 'PathArr' interval, add it to the vector + // of intervals we will be estimating.
+ SargableConjunct sc{!isPathArrInterval, *histogram, {}}; + if (sc.includeScalar) { + sc.intervals.push_back(interval); + } + conjunctRequirements.emplace(serializedPath, std::move(sc)); + } + + std::vector topLevelSelectivities; + for (const auto& [_, conjunctReq] : conjunctRequirements) { + const CEType totalCard = _stats->getCardinality(); + + if (conjunctReq.intervals.empty() && !conjunctReq.includeScalar) { + // In this case there is a single 'PathArr' interval for this field. + // The selectivity of this interval is: (count of all arrays) / totalCard + double pathArrSel = conjunctReq.histogram.getArrayCount() / totalCard; + topLevelSelectivities.push_back(pathArrSel); + } + + // Intervals are in DNF. + for (const IntervalReqExpr::Node& intervalDNF : conjunctReq.intervals) { + std::vector disjSelectivities; + + const auto disjuncts = intervalDNF.cast()->nodes(); + for (const auto& disjunct : disjuncts) { + const auto& conjuncts = disjunct.cast()->nodes(); + + std::vector conjSelectivities; + for (const auto& conjunct : conjuncts) { + const auto& interval = conjunct.cast()->getExpr(); + auto cardinality = + ce::estimateIntervalCardinality(conjunctReq.histogram, + interval, + childResult, + conjunctReq.includeScalar); + + // We may still not have been able to estimate the interval using + // histograms, for instance if the interval bounds were non-Constant. In + // this case, we should fallback to heuristics. + if (cardinality < 0) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + + // We have to convert the cardinality to a selectivity. The histogram + // returns the cardinality for the entire collection; however, fewer records + // may be expected at the SargableNode. + conjSelectivities.push_back(cardinality / totalCard); + } + + auto backoff = ce::conjExponentialBackoff(std::move(conjSelectivities)); + disjSelectivities.push_back(backoff); + } + + auto backoff = ce::disjExponentialBackoff(std::move(disjSelectivities)); + topLevelSelectivities.push_back(backoff); + } + } + + // The elements of the PartialSchemaRequirements map represent an implicit conjunction. + if (!topLevelSelectivities.empty()) { + auto backoff = ce::conjExponentialBackoff(std::move(topLevelSelectivities)); + childResult *= backoff; + } + return childResult; + } + + CEType transport(const ABT& n, + const RootNode& node, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + CEType childResult, + CEType /*refsResult*/) { + // Root node does not change cardinality. + return childResult; + } + + /** + * Use fallback for other ABT types. + */ + template + CEType transport(const ABT& n, + const T& /*node*/, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + Ts&&...) { + if (canBeLogicalNode()) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + return 0.0; + } + +private: + std::shared_ptr _stats; + std::unique_ptr _fallbackCE; + + // This is a special interval indicating that we expect to use $elemMatch semantics when + // estimating the current path. 
+ const IntervalReqExpr::Node _arrayOnlyInterval; +}; + +HistogramEstimator::HistogramEstimator(std::shared_ptr stats, + std::unique_ptr fallbackCE) + : _transport(std::make_unique(stats, std::move(fallbackCE))) {} + +HistogramEstimator::~HistogramEstimator() {} + +CEType HistogramEstimator::deriveCE(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + const ABT::reference_type logicalNodeRef) const { + return algebra::transport( + logicalNodeRef, *this->_transport, metadata, memo, logicalProps); +} + +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/histogram_estimator.h b/src/mongo/db/query/ce/histogram_estimator.h new file mode 100644 index 00000000000..ebcf008bdd3 --- /dev/null +++ b/src/mongo/db/query/ce/histogram_estimator.h @@ -0,0 +1,54 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/optimizer/cascades/interfaces.h" +#include "mongo/db/query/stats/collection_statistics.h" + +namespace mongo::optimizer::ce { + +class HistogramTransport; + +class HistogramEstimator : public cascades::CardinalityEstimator { +public: + HistogramEstimator(std::shared_ptr stats, + std::unique_ptr fallbackCE); + ~HistogramEstimator(); + + CEType deriveCE(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + ABT::reference_type logicalNodeRef) const final; + +private: + std::unique_ptr _transport; +}; + +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/histogram_estimator_test.cpp b/src/mongo/db/query/ce/histogram_estimator_test.cpp new file mode 100644 index 00000000000..bdc7d95dea4 --- /dev/null +++ b/src/mongo/db/query/ce/histogram_estimator_test.cpp @@ -0,0 +1,1161 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
 + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/histogram_estimator.h" +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/ce/test_utils.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/db/query/stats/collection_statistics_mock.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::optimizer::ce { +namespace { +namespace value = sbe::value; + +using stats::ArrayHistogram; +using stats::Bucket; +using stats::CollectionStatistics; +using stats::CollectionStatisticsMock; +using stats::ScalarHistogram; +using stats::TypeCounts; + +std::string collName("test"); + +class CEHistogramTester : public CETester { +public: + CEHistogramTester(std::string collName, double numRecords) + : CETester(collName, numRecords), _stats{new CollectionStatisticsMock(numRecords)} {} + + void addHistogram(const std::string& path, std::shared_ptr histogram) { + _stats->addHistogram(path, histogram); + } + +protected: + std::unique_ptr getEstimator() const override { + // Making a copy of CollectionStatistics to override. + return std::make_unique(_stats, makeHeuristicCE()); + } + +private: + std::shared_ptr _stats; +}; + +struct TestBucket { + Value val; + double equalFreq; + double rangeFreq = 0.0; + double ndv = 1.0; /* ndv including bucket boundary */ +}; +using TestBuckets = std::vector; + +ScalarHistogram getHistogramFromData(TestBuckets testBuckets) { + sbe::value::Array bounds; + std::vector buckets; + + double cumulativeFreq = 0.0; + double cumulativeNDV = 0.0; + for (const auto& b : testBuckets) { + // Add bucket boundary value to bounds. + auto [tag, val] = stage_builder::makeValue(b.val); + bounds.push_back(tag, val); + + cumulativeFreq += b.equalFreq + b.rangeFreq; + cumulativeNDV += b.ndv; + + // Create a histogram bucket. + buckets.emplace_back(b.equalFreq, + b.rangeFreq, + cumulativeFreq, + b.ndv - 1, /* ndv excluding bucket boundary */ + cumulativeNDV); + } + + return ScalarHistogram(std::move(bounds), std::move(buckets)); +} + +TypeCounts getTypeCountsFromData(TestBuckets testBuckets) { + TypeCounts typeCounts; + for (const auto& b : testBuckets) { + // Convert the bucket boundary value to an SBE tag/value pair. + auto sbeVal = stage_builder::makeValue(b.val); + auto [tag, val] = sbeVal; + + // Increment count of values for each type tag.
+ if (auto it = typeCounts.find(tag); it != typeCounts.end()) { + it->second += b.equalFreq + b.rangeFreq; + } else { + typeCounts[tag] = b.equalFreq + b.rangeFreq; + } + } + return typeCounts; +} + +std::unique_ptr getArrayHistogramFromData(TestBuckets testBuckets, + TypeCounts additionalScalarData = {}) { + TypeCounts dataTypeCounts = getTypeCountsFromData(testBuckets); + dataTypeCounts.merge(additionalScalarData); + return std::make_unique(getHistogramFromData(testBuckets), + std::move(dataTypeCounts)); +} + +std::unique_ptr getArrayHistogramFromData(TestBuckets scalarBuckets, + TestBuckets arrayUniqueBuckets, + TestBuckets arrayMinBuckets, + TestBuckets arrayMaxBuckets, + TypeCounts arrayTypeCounts, + double totalArrayCount, + double emptyArrayCount = 0, + TypeCounts additionalScalarData = {}) { + + // Set up scalar type counts. + TypeCounts dataTypeCounts = getTypeCountsFromData(scalarBuckets); + dataTypeCounts[value::TypeTags::Array] = totalArrayCount; + dataTypeCounts.merge(additionalScalarData); + + // Set up histograms. + auto arrayMinHist = getHistogramFromData(arrayMinBuckets); + auto arrayMaxHist = getHistogramFromData(arrayMaxBuckets); + return std::make_unique(getHistogramFromData(scalarBuckets), + std::move(dataTypeCounts), + getHistogramFromData(arrayUniqueBuckets), + std::move(arrayMinHist), + std::move(arrayMaxHist), + std::move(arrayTypeCounts), + emptyArrayCount); +} + +TEST(CEHistogramTest, AssertSmallMaxDiffHistogramEstimatesAtomicPredicates) { + constexpr auto kCollCard = 8; + CEHistogramTester t(collName, kCollCard); + + // Construct a histogram with two buckets: one for 3 ints equal to 1, another for 5 strings + // equal to "ing". + const std::string& str = "ing"; + t.addHistogram("a", + getArrayHistogramFromData({ + {Value(1), 3 /* frequency */}, + {Value(str), 5 /* frequency */}, + })); + + // Test $eq. + ASSERT_MATCH_CE(t, "{a: {$eq: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$eq: 2}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$eq: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$eq: \"foo\"}}", 0.0); + + // Test case when field doesn't match fieldpath of histogram. This falls back to heuristics. + ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 2.82843); + + // Test $gt. + ASSERT_MATCH_CE(t, "{a: {$gt: 3}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 1}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 0}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"bar\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"ing\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"zap\"}}", 0.0); + + // Test $lt. + ASSERT_MATCH_CE(t, "{a: {$lt: 3}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 1}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 0}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"bar\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"ing\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"zap\"}}", 5.0); + + // Test $gte. + ASSERT_MATCH_CE(t, "{a: {$gte: 3}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 0}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"bar\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"zap\"}}", 0.0); + + // Test $lte. 
 + ASSERT_MATCH_CE(t, "{a: {$lte: 3}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lte: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lte: 0}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"bar\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"zap\"}}", 5.0); +} + +TEST(CEHistogramTest, AssertSmallHistogramEstimatesComplexPredicates) { + constexpr auto kCollCard = 9; + CEHistogramTester t(collName, kCollCard); + + // Construct a histogram with three int buckets for field 'a'. + t.addHistogram("a", + getArrayHistogramFromData({ + {Value(1), 3 /* frequency */}, + {Value(2), 5 /* frequency */}, + {Value(3), 1 /* frequency */}, + })); + + // Construct a histogram with two int buckets for field 'b'. + t.addHistogram("b", + getArrayHistogramFromData({ + {Value(22), 3 /* frequency */}, + {Value(33), 6 /* frequency */}, + })); + + + // Test simple conjunctions on one field. Note the first example: the range we expect to see + // here is (1, 3); however, the structure in the SargableNode gives us a conjunction of two + // intervals instead: (1, "") ^ (nan, 3). This is then estimated using exponential backoff to + // give us a less accurate result. The correct cardinality here would be 5. + ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lt: 3}}", 5.66); + ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lte: 3}}", 6.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lt: 3}}", 8.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lte: 3}}", 9.0); + + // Test ranges which exclude each other. + ASSERT_MATCH_CE(t, "{a: {$lt: 1}, a: {$gt: 3}}", 0.0); + + // Test overlapping ranges. This is a similar case to {a: {$gt: 1}, a: {$lt: 3}} above: we + // expect to see the range [2, 2]; instead, we see the range [nan, 2] ^ [2, ""). + ASSERT_MATCH_CE(t, "{a: {$lte: 2}, a: {$gte: 2}}", 5.66); + + // Test conjunctions over multiple fields for which we have histograms. Here we expect a + // cardinality estimated by exponential backoff. + ASSERT_MATCH_CE(t, "{a: {$eq: 2}, b: {$eq: 22}}", 2.24); + ASSERT_MATCH_CE(t, "{a: {$eq: 11}, b: {$eq: 22}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 11}, a: {$lte: 100}, b: {$eq: 22}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 3}, a: {$gte: 1}, b: {$lt: 100}, b: {$gt: 30}}", 5.66); + + // Test conjunctions over multiple fields for which we may not have histograms. This falls back + // to heuristic estimation. + ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 1.73205); + ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 1.73205); +} + +TEST(CEHistogramTest, SanityTestEmptyHistogram) { + constexpr auto kCollCard = 0; + CEHistogramTester t(collName, kCollCard); + t.addHistogram("empty", std::make_unique()); + + ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}}", 0.0); + ASSERT_MATCH_CE(t, "{empty: {$lt: 1.0}, empty: {$gt: 0.0}}", 0.0); + ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}, other: {$eq: \"anything\"}}", 0.0); + ASSERT_MATCH_CE(t, "{other: {$eq: \"anything\"}, empty: {$eq: 1.0}}", 0.0); +} + +TEST(CEHistogramTest, TestOneBucketOneIntHistogram) { + constexpr auto kCollCard = 50; + CEHistogramTester t(collName, kCollCard); + + // Create a histogram with a single bucket that contains exactly one int (42) with a frequency + // of 50 (equal to the collection cardinality). + t.addHistogram("soloInt", + getArrayHistogramFromData({ + {Value(42), kCollCard /* frequency */}, + })); + + // Check against a variety of intervals that include 42 as a bound.
+ ASSERT_MATCH_CE(t, "{soloInt: {$eq: 42}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lte: 42}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 42}}", kCollCard); + + // Check against a variety of intervals that include 42 only as one bound. + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 42}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 42}}", kCollCard); + + // Check against a variety of intervals close to 42 using a lower bound of 41 and a higher bound + // of 43. + ASSERT_MATCH_CE(t, "{soloInt: {$eq: 41}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$eq: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$lte: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 43}}", kCollCard); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 43}}", kCollCard); + + // Check against different types. + ASSERT_MATCH_CE(t, "{soloInt: {$eq: \"42\"}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: \"42\"}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42.1}}", kCollCard); +} + +TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { + constexpr auto kCollCard = 51; + CEHistogramTester t(collName, kCollCard); + t.addHistogram("intRange", + getArrayHistogramFromData({ + {Value(10), 5 /* frequency */}, + {Value(20), 1 /* frequency */, 45 /* range frequency */, 10 /* ndv */}, + })); + + // Test ranges that overlap only with the lower bound. + // Note: 5 values equal 10. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 10}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}, intRange: {$gte: 10}}", 5.0); + + // Test ranges that overlap only with the upper bound. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 11}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 15}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 15.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 41.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); + + // Test ranges that partially overlap with the entire histogram. 
+ ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 9.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 22.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); + + // Test ranges that include all values in the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$gte: 10}, intRange: {$lte: 20}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lte: 30}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lt: 30}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lte: 30}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lt: 30}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 100}}", kCollCard); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 100}}", kCollCard); + + // Test ranges that are fully included in the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); + + // Test ranges that don't overlap with the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 20.1}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 21}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 21}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 20}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 100}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lte: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lt: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lt: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lte: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lte: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); + + // Because we don't specify any indexes here, these intervals do not go through simplification. + // This means that instead of having one key in the requirements map of the generated sargable + // node corresponding to the path "intRange", we have two keys and two ranges, both + // corresponding to the same path. As a consequence, we combine the estimates for the intervals + // using exponential backoff, which results in an overestimate. 
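To make the backoff arithmetic above concrete, here is a minimal sketch of how a conjunction of interval selectivities can be combined. This is only an illustration consistent with the expected values in these tests ('conjBackoffSketch' is a hypothetical name; the production implementation is optimizer::ce::conjExponentialBackoff in ce_math.cpp, which may additionally cap the number of terms):

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch only: combine conjunct selectivities (each in [0, 1]) as
// s1 * s2^(1/2) * s3^(1/4) * ..., with s1 <= s2 <= ... sorted ascending.
double conjBackoffSketch(std::vector<double> sels) {
    std::sort(sels.begin(), sels.end());  // Most selective term keeps full weight.
    double result = 1.0;
    double exponent = 1.0;
    for (double s : sels) {
        result *= std::pow(s, exponent);
        exponent /= 2.0;  // Each further conjunct contributes exponentially less.
    }
    return result;
}

For the first assertion below, the two generated intervals individually estimate roughly 41.5 and 50 of the 51 documents, and conjBackoffSketch({41.5 / 51, 50.0 / 51}) * 51 comes out to roughly 41.09, matching the expected overestimate.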
+ ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 41.09); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 23.42); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 24.96); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 36.53); + + // When we specify that there is a non-multikey index on 'intRange', we expect to see interval + // simplification occurring, which should provide a better estimate for the following ranges. + t.setIndexes( + {{"intRangeIndex", + makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 40.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 18.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 31.0); +} + +TEST(CEHistogramTest, TestHistogramOnNestedPaths) { + constexpr auto kCollCard = 50; + CEHistogramTester t(collName, kCollCard); + + // Create a histogram with a single bucket that contains exactly one int (42) with a frequency + // of 50 (equal to the collection cardinality). + t.addHistogram("path", + getArrayHistogramFromData({ + {Value(42), kCollCard /* frequency */}, + })); + t.addHistogram("a.histogram.path", + getArrayHistogramFromData({ + {Value(42), kCollCard /* frequency */}, + })); + + ASSERT_MATCH_CE(t, "{\"not.a.histogram.path\": {$eq: 42}}", 7.071 /* heuristic */); + ASSERT_MATCH_CE(t, "{\"a.histogram.path\": {$eq: 42}}", kCollCard); + ASSERT_MATCH_CE( + t, "{\"a.histogram.path.with.no.histogram\": {$eq: 42}}", 7.071 /* heuristic */); + + // When a predicate can't be precisely translated to a SargableNode (such as $elemMatch on a + // dotted path), we may still be able to translate an over-approximation. We generate a + // SargableNode with all predicates marked perfOnly, and keep the original Filter. The Filter + // ensures the results are correct, while the SargableNode hopefully will be answerable by an + // index. + // + // On the logical level, perfOnly predicates don't do anything, so we don't consider them in + // cardinality estimates. But when we split a SargableNode into an indexed part and a fetch + // part, we remove the perfOnly flag from the indexed part, and we should consider them to + // estimate how many index keys are returned. + // + // In this test, we want to exercise the histogram estimate for the SargableNode generated by + // $elemMatch on a dotted path. So we create an index on this field to ensure the SargableNode + // is split, and the predicates marked non-perfOnly. + // + // We also mark the index multikey, to prevent non-CE rewrites from removing the predicate + // entirely. (This scenario could happen if you remove all the arrays, and refresh the + // statistics.) 
+ IndexDefinition ix{ + IndexCollationSpec{ + IndexCollationEntry{ + makeIndexPath({"a", "histogram", "path"}), + CollationOp::Ascending, + }, + }, + true /* isMultiKey */, + }; + t.setIndexes({{"a_histogram_path_1", std::move(ix)}}); + ASSERT_MATCH_CE_NODE(t, "{\"a.histogram.path\": {$elemMatch: {$eq: 42}}}", 0.0, isSargable2); +} + +TEST(CEHistogramTest, TestArrayHistogramOnAtomicPredicates) { + constexpr auto kCollCard = 6; + CEHistogramTester t(collName, kCollCard); + t.addHistogram( + "a", + // Generate a histogram for this data: + // {a: 1}, {a: 2}, {a: [1, 2, 3, 2, 2]}, {a: [10]}, {a: [2, 3, 3, 4, 5, 5, 6]}, {a: []} + // - scalars: [1, 2] + // - unique values: [1, 2, 3], [10], [2, 3, 4, 5, 6] + // -> [1, 2, 2, 3, 3, 4, 5, 6, 10] + // - min values: [1], [10], [2] -> [1, 1, 2, 2, 10] + // - max values: [3], [10], [6] -> [1, 2, 3, 6, 10] + getArrayHistogramFromData( + {// Scalar buckets. + {Value(1), 1 /* frequency */}, + {Value(2), 1 /* frequency */}}, + { + // Array unique buckets. + {Value(1), 1 /* frequency */}, + {Value(2), 2 /* frequency */}, + {Value(3), 2 /* frequency */}, + {Value(4), 1 /* frequency */}, + {Value(5), 1 /* frequency */}, + {Value(6), 1 /* frequency */}, + {Value(10), 1 /* frequency */}, + }, + { + // Array min buckets. + {Value(1), 1 /* frequency */}, + {Value(2), 1 /* frequency */}, + {Value(10), 1 /* frequency */}, + }, + { + // Array max buckets. + {Value(3), 1 /* frequency */}, + {Value(6), 1 /* frequency */}, + {Value(10), 1 /* frequency */}, + }, + {{sbe::value::TypeTags::NumberInt32, 13}}, // Array type counts. + 3, // 3 arrays total. + 1 // 1 empty array. + )); + + // Test simple predicates against 'a'. Note: in the $elemMatch case, we exclude scalar + // estimates. Without $elemMatch, we add the array histogram and scalar histogram estimates + // together. + + // Test equality predicates. + ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 0}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 1}"); + ASSERT_EQ_ELEMMATCH_CE(t, 3.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 2}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 3}"); + ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 4}"); + ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 6}"); + ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 10}"); + ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 11}"); + + // Test histogram boundary values. 
+ ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$lt: 1}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$lte: 1}"); + ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$gt: 10}"); + ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$gte: 10}"); + + ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 10}"); + ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 10}"); + ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 1}"); + ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 1}"); + + ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$gt: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.40822 /* $elemMatch CE */, "a", "{$gte: 5}"); + + ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 2.40822 /* $elemMatch CE */, "a", "{$gt: 2, $lt: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lt: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 2, $lte: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lte: 5}"); +} + +TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { + constexpr auto kCollCard = 175; + CEHistogramTester t(collName, kCollCard); + + // A scalar histogram with values in the range [1,10], most of which are in the middle bucket. + t.addHistogram("scalar", + getArrayHistogramFromData({ + {Value(1), 10 /* frequency */}, + {Value(2), 10 /* frequency */}, + {Value(3), 20 /* frequency */, 120 /* range frequency */, 5 /* ndv */}, + {Value(8), 5 /* frequency */, 10 /* range frequency */, 3 /* ndv */}, + })); + + // An array histogram built on the following arrays with 35 occurrences of each: + // [{[1, 2, 3]: 35}, {[5, 5, 5, 5, 5]: 35}, {[6]: 35}, {[]: 35}, {[8, 9, 10]: 35}] + t.addHistogram( + "array", + getArrayHistogramFromData( + {/* No scalar buckets. */}, + { + // Array unique buckets. + {Value(2), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, + {Value(5), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, + {Value(6), 35 /* frequency */}, + {Value(10), 35 /* frequency */, 105 /* range frequency */, 3 /* ndv */}, + }, + { + // Array min buckets. + {Value(1), 35 /* frequency */}, + {Value(5), 35 /* frequency */}, + {Value(6), 35 /* frequency */}, + {Value(8), 35 /* frequency */}, + }, + { + // Array max buckets. + {Value(3), 35 /* frequency */}, + {Value(5), 35 /* frequency */}, + {Value(6), 35 /* frequency */}, + {Value(10), 35 /* frequency */}, + }, + {{sbe::value::TypeTags::NumberInt32, 420}}, // Array type count = 3*35+5*35+1*35+3*35. + kCollCard, // kCollCard arrays total. + 35 // 35 empty arrays + )); + + t.addHistogram( + "mixed", + // The mixed histogram has 87 scalars that follow approximately the same distribution as + // in the pure scalar case, and 88 arrays with the following distribution: + // [{[1, 2, 3]: 17}, {[5, 5, 5, 5, 5]: 17}, {[6]: 17}, {[]: 20}, {[8, 9, 10]: 17}] + getArrayHistogramFromData( + { + // Scalar buckets. These are half the number of values from the "scalar" histogram. 
 + {Value(1), 5 /* frequency */}, + {Value(2), 5 /* frequency */}, + {Value(3), 10 /* frequency */, 60 /* range frequency */, 5 /* ndv */}, + {Value(8), 2 /* frequency */, 5 /* range frequency */, 3 /* ndv */}, + }, + { + // Array unique buckets. + {Value(2), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, + {Value(5), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, + {Value(6), 17 /* frequency */}, + {Value(10), 17 /* frequency */, 34 /* range frequency */, 3 /* ndv */}, + }, + { + // Array min buckets. + {Value(1), 17 /* frequency */}, + {Value(5), 17 /* frequency */}, + {Value(6), 17 /* frequency */}, + {Value(8), 17 /* frequency */}, + }, + { + // Array max buckets. + {Value(3), 17 /* frequency */}, + {Value(5), 17 /* frequency */}, + {Value(6), 17 /* frequency */}, + {Value(10), 17 /* frequency */}, + }, + {{sbe::value::TypeTags::NumberInt32, 289}}, // Array type count = 3*17+5*17+6*17+3*17. + 88, // 88 arrays total. + 20 // 20 empty arrays. + )); + + // Test cardinality of individual predicates. + ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 0.0 /* $elemMatch CE */, "scalar", "{$eq: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 35.0 /* CE */, 35.0 /* $elemMatch CE */, "array", "{$eq: 5}"); + ASSERT_EQ_ELEMMATCH_CE(t, 19.5 /* CE */, 17.0 /* $elemMatch CE */, "mixed", "{$eq: 5}"); + + // Test cardinality of predicate combinations; the following tests make sure we correctly track + // which paths have $elemMatches and which don't. Some notes: + // - Whenever we use 'scalar' + $elemMatch, we expect an estimate of 0 because $elemMatch never + // returns documents on non-array paths. + // - Whenever we use 'mixed' + $elemMatch, we expect the estimate to decrease because we omit + // scalar values in 'mixed' from our estimate. + // - We do not expect the estimate on 'array' to be affected by the presence of $elemMatch, + // since we only have array values for this field. + + // Composite predicate on 'scalar' and 'array' fields. + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$eq: 5}}", 2.236); + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 2.236); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); + + // Composite predicate on 'mixed' and 'array' fields. + ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$eq: 5}}", 8.721); + ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 8.721); + ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 7.603); + + // Composite predicate on 'scalar' and 'mixed' fields. + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}}", 1.669); + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}}", 1.559); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 0.0); + + // Composite predicate on all three fields without '$elemMatch' on 'array'. + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$eq: 5}}", 1.116); + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 1.042); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$eq: 5}}", 0.0); + + // Composite predicate on all three fields with '$elemMatch' on 'array' (same expected results + // as above). + ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 1.116); + + // Test case where the same path has both $match and $elemMatch (same as $elemMatch case).
+ ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, scalar: {$eq: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 17.0); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0); + + // Test case with multiple predicates and ranges. + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 70.2156); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.4848); + + // Test multiple $elemMatches. + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 0.0); + ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 7.603); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}}", 0.0); + ASSERT_MATCH_CE( + t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); + ASSERT_MATCH_CE( + t, + "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", + 1.042); + ASSERT_MATCH_CE( + t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 0.0); + ASSERT_MATCH_CE(t, + "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: " + "{$elemMatch: {$eq: 5}}}", + 0.0); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$lt: 5}}}", 34.1434); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$gt: 5}}}", 45.5246); + + // Verify that we still return an estimate of 0.0 for any $elemMatch predicate on a scalar + // field when we have a non-multikey index. + t.setIndexes({{"aScalarIndex", + makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}}); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}}", 0.0); + ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$gt: 1, $lt: 10}}}", 0.0); + + // Test how we estimate singular PathArr sargable predicate. + ASSERT_MATCH_CE_NODE(t, "{array: {$elemMatch: {}}}", 175.0, isSargable); + ASSERT_MATCH_CE_NODE(t, "{mixed: {$elemMatch: {}}}", 88.0, isSargable); + + // Take into account both empty and non-empty arrays. + auto makePathArrABT = [&](const FieldNameType& fieldName) { + const ProjectionName scanProjection{"scan_0"}; + auto scanNode = make(scanProjection, collName); + auto filterNode = + make(make(make(std::move(fieldName), make()), + make(scanProjection)), + std::move(scanNode)); + return make( + properties::ProjectionRequirement{ProjectionNameVector{scanProjection}}, + std::move(filterNode)); + }; + + // There are no arrays in the 'scalar' field. + ABT scalarABT = makePathArrABT("scalar"); + ASSERT_CE(t, scalarABT, 0.0); + + // About half the values of this field are arrays. + ABT mixedABT = makePathArrABT("mixed"); + ASSERT_CE(t, mixedABT, 88.0); + + // This field is always an array. + ABT arrayABT = makePathArrABT("array"); + ASSERT_CE(t, arrayABT, kCollCard); +} + +TEST(CEHistogramTest, TestMixedElemMatchAndNonElemMatch) { + constexpr auto kCollCard = 1; + CEHistogramTester t(collName, kCollCard); + + // A very simple histogram encoding a collection with one document {a: [3, 10]}. + t.addHistogram("a", + getArrayHistogramFromData({/* No scalar buckets. */}, + { + // Array unique buckets. + {Value(3), 1 /* frequency */}, + {Value(10), 1 /* frequency */}, + }, + { + // Array min buckets. + {Value(3), 1 /* frequency */}, + }, + { + // Array max buckets. + {Value(10), 1 /* frequency */}, + }, + {{sbe::value::TypeTags::NumberInt32, 2}}, + // Array type counts. + 1, + 0)); + + // Tests without indexes. 
+ ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); + + // Tests with multikey index (note that the index on "a" must be multikey due to arrays). + t.setIndexes( + {{"anIndex", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}}); + ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); +} + +TEST(CEHistogramTest, TestTypeCounters) { + constexpr double kCollCard = 1000.0; + CEHistogramTester t(collName, kCollCard); + + // This test is designed such that for each document, we have the following fields: + // 1. scalar: Scalar histogram with no buckets, only type-counted data. + // 2. array: Array histogram with no buckets, only type-counted data inside of arrays. + // 3. mixed: Mixed histogram with no buckets, only type-counted data, both scalars and arrays. + constexpr double kNumObj = 200.0; + constexpr double kNumNull = 300.0; + t.addHistogram("scalar", + getArrayHistogramFromData({/* No histogram data. */}, + {{sbe::value::TypeTags::Object, kNumObj}, + {sbe::value::TypeTags::Null, kNumNull}})); + t.addHistogram("array", + getArrayHistogramFromData({/* No scalar buckets. */}, + {/* No array unique buckets. */}, + {/* No array min buckets. */}, + {/* No array max buckets. */}, + {{sbe::value::TypeTags::Object, kNumObj}, + {sbe::value::TypeTags::Null, kNumNull}}, + kCollCard)); + + // Count of each type in array type counters for field "mixed". + constexpr double kNumObjMA = 50.0; + constexpr double kNumNullMA = 100.0; + // For the purposes of this test, we have one array of each value of a non-histogrammable type. + constexpr double kNumArr = kNumObjMA + kNumNullMA; + const TypeCounts mixedArrayTC{{sbe::value::TypeTags::Object, kNumObjMA}, + {sbe::value::TypeTags::Null, kNumNullMA}}; + + // Count of each type in scalar type counters for field "mixed". + constexpr double kNumObjMS = 150.0; + constexpr double kNumNullMS = 200.0; + const TypeCounts mixedScalarTC{{sbe::value::TypeTags::Object, kNumObjMS}, + {sbe::value::TypeTags::Null, kNumNullMS}}; + + // Quick sanity check of test setup for the "mixed" histogram. The idea is that we want a + // portion of objects inside arrays, and the rest as scalars, but we want the total count of + // objects to be + ASSERT_EQ(kNumObjMA + kNumObjMS, kNumObj); + ASSERT_EQ(kNumNullMA + kNumNullMS, kNumNull); + + t.addHistogram("mixed", + getArrayHistogramFromData({/* No scalar buckets. */}, + {/* No array unique buckets. */}, + {/* No array min buckets. */}, + {/* No array max buckets. */}, + mixedArrayTC, + kNumArr, + 0 /* Empty array count. */, + mixedScalarTC)); + + // Set up indexes. 
+ t.setIndexes({{"scalarIndex", + makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}}); + t.setIndexes({{"arrayIndex", + makeIndexDefinition("array", CollationOp::Ascending, /* isMultiKey */ true)}}); + t.setIndexes({{"mixedIndex", + makeIndexDefinition("mixed", CollationOp::Ascending, /* isMultiKey */ true)}}); + + // Tests for scalar type counts only. + // For object-only intervals in a scalar histogram, we always return object count, no matter + // what the bounds are. Since we have a scalar histogram for "scalar", we expect all $elemMatch + // queries to have a cardinality of 0. + + // Test object equality. + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {a: 1}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$eq: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lte: {b: 2, c: 3}}"); + + // Test intervals including the empty object. Note that range queries on objects do not generate + // point equalities, so these fall back onto logic in interval estimation that identifies that + // the generated intervals are subsets of the object type interval. Note: we don't even generate + // a SargableNode for the first case. The generated bounds are: + // [{}, {}) because {} is the "minimum" value for the object type. + ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "scalar", "{$lt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$gte: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "scalar", "{$lte: {}}"); + + // Rather than combining the intervals together, in the following cases we generate two + // object-only intervals in the requirements map with the following bounds. Each individual + // interval is estimated as having a cardinality of 'kNumObj', before we apply conjunctive + // exponential backoff to combine them. + constexpr double k2ObjCard = 89.4427; // == 200/1000 * sqrt(200/1000) * 1000 + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {}, $lt: {b: 2, c: 3}}"); + + // Test intervals including {a: 1}. Similar to the above case, we have two intervals in the + // requirements map. + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gte: {a: 1}, $lt: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, 0.0, "scalar", "{$gt: {a: 1}, $lt: {a: 3}}"); + + // Test that for null, we always return null count. + // Note that for ranges including null (e.g. 
{$lt: null}) we don't generate any SargableNodes. + ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, 0.0, "scalar", "{$eq: null}"); + + // TODO SERVER-70936: Add tests for booleans. + // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, 0.0, "scalar", "{$eq: true}"); + // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, 0.0, "scalar", "{$eq: false}"); + + // Tests for array type counts only. + // For object-only intervals in an array histogram, if we're using $elemMatch on an object-only + // interval, we always return object count. While we have no scalar type counts for "array", + // non-$elemMatch queries should also match objects embedded in arrays, so we still return + // object count in that case. + + // Test object equality. + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {a: 1}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$eq: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lte: {b: 2, c: 3}}"); + + // Test intervals including the empty object. + // Note: we don't even generate a SargableNode for the first case. The generated bounds are: + // [{}, {}) because {} is the "minimum" value for the object type. + ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "array", "{$lt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$gte: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObj, "array", "{$lte: {}}"); + + // Similar to above, here we have two object intervals for non-$elemMatch queries. However, for + // $elemMatch queries, we have the following intervals in the requirements map: + // 1. [[], BinData(0, )) with CE 1000 + // 2. The actual object interval, e.g. ({}, {b: 2, c: 3}] with CE 200 + constexpr double kArrEMCard = kNumObj; // == 200/1000 * sqrt(1000/1000) * 1000 + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {}, $lt: {b: 2, c: 3}}"); + + // Test intervals including {a: 1}; similar to above, we have two object intervals. + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gte: {a: 1}, $lt: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kArrEMCard, "array", "{$gt: {a: 1}, $lt: {a: 3}}"); + + // Test that for null, we always return null count. + // Note that for ranges including null (e.g. {$lt: null}) we don't generate any SargableNodes. 
+ ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, kNumNull, "array", "{$eq: null}"); + + // TODO SERVER-70936: Add tests for booleans. + // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBool, "array", "{$eq: true}"); + // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBool, "array", "{$eq: false}"); + + // Tests for mixed type counts only. Regular match predicates should be estimated as the sum of + // the scalar and array counts (e.g. for objects, 'kNumObj'), while elemMatch predicates + // should be estimated without scalars, returning the array type count (for objects this is + // 'kNumObjMA'). + + // Test object equality. + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {a: 1}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$eq: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lte: {b: 2, c: 3}}"); + + // Test intervals including the empty object. + // Note: we don't even generate a SargableNode for the first case. The generated bounds are: + // [{}, {}) because {} is the "minimum" value for the object type. + ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "mixed", "{$lt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gt: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$gte: {}}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, kNumObjMA, "mixed", "{$lte: {}}"); + + // Similar to above, here we have two object intervals for non-$elemMatch queries. However, for + // $elemMatch queries, we have the following intervals in the requirements map: + // 1. [[], BinData(0, )) with CE 1000 + // 2. The actual object interval, e.g. ({}, {b: 2, c: 3}] with CE 50 + constexpr double kMixEMCard = kNumObjMA; // == 50/1000 * sqrt(1000/1000) * 1000 + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {}, $lt: {b: 2, c: 3}}"); + + // Test intervals including {a: 1}; similar to above, we have two object intervals. + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lte: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lt: {b: 2, c: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lte: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gte: {a: 1}, $lt: {a: 3}}"); + ASSERT_EQ_ELEMMATCH_CE(t, k2ObjCard, kMixEMCard, "mixed", "{$gt: {a: 1}, $lt: {a: 3}}"); + + // Test that for null, we always return null count. + // Note that for ranges including null (e.g. {$lt: null}) we don't generate any SargableNodes. + ASSERT_EQ_ELEMMATCH_CE(t, kNumNull, kNumNullMA, "mixed", "{$eq: null}"); + + // TODO SERVER-70936: Add tests for booleans. 
+    // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBoolMA, "mixed", "{$eq: true}");
+    // ASSERT_EQ_ELEMMATCH_CE(t, kNumBool, kNumBoolMA, "mixed", "{$eq: false}");
+
+    // Test combinations of the three fields/type counters.
+    constexpr double k3ObjCard =
+        59.814;  // == 200/1000 * sqrt(200/1000) * sqrt(sqrt(200/1000)) * 1000
+    constexpr double k4ObjCard =
+        48.914;  // == 0.2 * sqrt(0.2) * sqrt(sqrt(0.2)) * sqrt(sqrt(sqrt(0.2))) * 1000
+    ASSERT_MATCH_CE_NODE(t,
+                         "{scalar: {$eq: {a: 1}}, mixed: {$eq: {b: 1}}, array: {$eq: {c: 1}}}",
+                         k3ObjCard,
+                         isSargable3);
+    ASSERT_MATCH_CE_NODE(
+        t,
+        "{scalar: {$eq: {}}, mixed: {$lt: {b: 1}}, array: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}",
+        k4ObjCard,
+        isSargable4);
+
+    // Should always get a 0.0 cardinality for an $elemMatch on a scalar predicate.
+    ASSERT_MATCH_CE(t,
+                    "{scalar: {$elemMatch: {$eq: {a: 1}}}, mixed: {$elemMatch: {$eq: {b: 1}}},"
+                    " array: {$elemMatch: {$eq: {c: 1}}}}",
+                    0.0);
+    ASSERT_MATCH_CE(t,
+                    "{scalar: {$elemMatch: {$eq: {}}}, mixed: {$elemMatch: {$lt: {b: 1}}},"
+                    " array: {$elemMatch: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}}",
+                    0.0);
+
+    // The 'mixed' interval estimate is 50, while the 'array' interval estimate is 200.
+    constexpr double kArrMixObjEMCard = 22.3607;  // == 50/1000 * sqrt(200/1000) * 1000
+    ASSERT_MATCH_CE_NODE(t,
+                         "{mixed: {$elemMatch: {$eq: {b: 1}}}, array: {$elemMatch: {$eq: {c: 1}}}}",
+                         kArrMixObjEMCard,
+                         isSargable4);
+    ASSERT_MATCH_CE_NODE(t,
+                         "{mixed: {$elemMatch: {$lt: {b: 1}}},"
+                         " array: {$elemMatch: {$gt: {a: 1}, $lte: {a: 2, b: 4, c: 3}}}}",
+                         kArrMixObjEMCard,
+                         isSargable4);
+}
+
+TEST(CEHistogramTest, TestNestedArrayTypeCounterPredicates) {
+    // This test validates the correct behavior of both the nested-array type counter as well as
+    // combinations of type counters and histogram estimates.
+    constexpr double kCollCard = 1000.0;
+    constexpr double kNumArr = 600.0;      // Total number of arrays.
+    constexpr double kNumNestArr = 500.0;  // Frequency of nested arrays, e.g. [[1, 2, 3]].
+    constexpr double kNumNonNestArr = 100.0;
+    constexpr double kNum1 = 2.0;      // Frequency of 1.
+    constexpr double kNum2 = 3.0;      // Frequency of 2.
+    constexpr double kNum3 = 5.0;      // Frequency of 3.
+    constexpr double kNumArr1 = 20.0;  // Frequency of [1].
+    constexpr double kNumArr2 = 30.0;  // Frequency of [2].
+    constexpr double kNumArr3 = 50.0;  // Frequency of [3].
+    constexpr double kNumObj = 390.0;  // Total number of scalar objects.
+
+    // Sanity test numbers.
+    ASSERT_EQ(kNumArr1 + kNumArr2, kNumArr3);
+    ASSERT_EQ(kNumNonNestArr + kNumNestArr, kNumArr);
+    ASSERT_EQ(kNumObj + kNumArr + kNum1 + kNum2 + kNum3, kCollCard);
+
+    // Define histogram buckets.
+    TestBuckets scalarBuckets{{Value(1), kNum1}, {Value(2), kNum2}, {Value(3), kNum3}};
+    TestBuckets arrUniqueBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}};
+    TestBuckets arrMinBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}};
+    TestBuckets arrMaxBuckets{{Value(1), kNumArr1}, {Value(2), kNumArr2}, {Value(3), kNumArr3}};
+
+    // Define type counts.
+    TypeCounts arrayTypeCounts{{sbe::value::TypeTags::Array, kNumNestArr},
+                               {sbe::value::TypeTags::NumberInt32, kNumNonNestArr}};
+    TypeCounts scalarTypeCounts{{sbe::value::TypeTags::Object, kNumObj}};
+
+    CEHistogramTester t(collName, kCollCard);
+    t.addHistogram("na",
+                   getArrayHistogramFromData(std::move(scalarBuckets),
+                                             std::move(arrUniqueBuckets),
+                                             std::move(arrMinBuckets),
+                                             std::move(arrMaxBuckets),
+                                             std::move(arrayTypeCounts),
+                                             kNumArr,
+                                             0 /* Empty array count.
*/, + std::move(scalarTypeCounts))); + t.setIndexes( + {{"index", makeIndexDefinition("na", CollationOp::Ascending, /* isMultiKey */ true)}}); + + // Some equality tests on types that are not present in the type counters should return 0.0. + // TODO SERVER-70936: Add tests for booleans. + // ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: false}"); + // ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: true}"); + ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$eq: null}"); + // We don't have any objects in arrays, so don't count them. + ASSERT_EQ_ELEMMATCH_CE(t, kNumObj, 0.0, "na", "{$eq: {a: 1}}"); + + // Quick equality test to see if regular array histogram estimation still works as expected. + ASSERT_EQ_ELEMMATCH_CE(t, kNumArr1 + kNum1, kNumArr1, "na", "{$eq: 1}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumArr2 + kNum2, kNumArr2, "na", "{$eq: 2}"); + ASSERT_EQ_ELEMMATCH_CE(t, kNumArr3 + kNum3, kNumArr3, "na", "{$eq: 3}"); + + // Test a range predicate. + // - For simple $lt, we correctly return both scalar and array counts that could match. + // - For $elemMatch + $lt, we have two entries in the requirements map. + // - The PathArr interval, estimated correctly as 'kNumArr'. + // - The interval {$lt: 3}, estimated as an array histogram range interval. + // We then combine the estimates for the two using conjunctive exponential backoff. + constexpr double elemMatchRange = 71.5485; + ASSERT_EQ_ELEMMATCH_CE( + t, kNumArr1 + kNum1 + kNumArr2 + kNum2, elemMatchRange, "na", "{$lt: 3}"); + ASSERT_EQ_ELEMMATCH_CE(t, 0.0, 0.0, "na", "{$lt: 1}"); + + // Test equality to arrays. + // - $elemMatch, estimation, as expected, will return the count of nested arrays. + // - For the case where we see equality to the array, we have a disjunction of intervals in the + // same entry of the SargableNode requirements map. For the case of {$eq: [1]}, for example, we + // have: [[1], [1]] U [1, 1]. As a result, we estimate each point interval separately: + // - [[1], [1]]: We estimate the nested array interval as 'kNumNestArr'. + // - [1, 1]: We estimate the regular point interval as 'kNumArr1' + 'kNum1'. + // We then combine the results by exponential backoff. Note that we will NOT match {na: 1}; + // however, because of the way the interval is defined, our estimate suggests that we would. + // TODO: is there a way to know this on the CE side? + constexpr double kArr1EqCard = 505.531; // (1 - (1 - 500.0/1000) * sqrt(1 - 22.0/1000)) * 1000 + constexpr double kArr2EqCard = 508.319; // (1 - (1 - 500.0/1000) * sqrt(1 - 33.0/1000)) * 1000 + constexpr double kArr3EqCard = 513.944; // (1 - (1 - 500.0/1000) * sqrt(1 - 55.0/1000)) * 1000 + ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr1EqCard, kNumNestArr, "na", "{$eq: [1]}", isSargable); + ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr2EqCard, kNumNestArr, "na", "{$eq: [2]}", isSargable); + ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr3EqCard, kNumNestArr, "na", "{$eq: [3]}", isSargable); + // For the last case, we have the interval [[1, 2, 3], [1, 2, 3]] U [1, 1]. + // TODO: is this interval semantically correct? + ASSERT_EQ_ELEMMATCH_CE_NODE(t, kArr1EqCard, kNumNestArr, "na", "{$eq: [1, 2, 3]}", isSargable); + + // Now, we test the case of nested arrays. + // - $elemMatch, once again, returns the number of nested arrays. + // - Simple equality generates two intervals. We estimate both intervals using the nested array + // type count. 
For {$eq: [[1, 2, 3]]}, we get:
+    //   - [[1, 2, 3], [1, 2, 3]] U [[[1, 2, 3]], [[1, 2, 3]]]
+    constexpr double kNestedEqCard =
+        646.447;  // (1 - (1 - 500.0/1000) * sqrt(1 - 500.0/1000)) * 1000
+    ASSERT_EQ_ELEMMATCH_CE_NODE(
+        t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[1, 2, 3]]}", isSargable);
+    ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[1]]}", isSargable);
+    ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[2]]}", isSargable);
+    ASSERT_EQ_ELEMMATCH_CE_NODE(t, kNestedEqCard, kNumNestArr, "na", "{$eq: [[3]]}", isSargable);
+
+    // Note: we can't convert range queries on arrays to SargableNodes yet. If we ever can, we
+    // should add some more tests here.
+}
+
+TEST(CEHistogramTest, TestFallbackForNonConstIntervals) {
+    // This is a sanity test to validate fallback for an interval with non-const bounds.
+    IntervalRequirement intervalLowNonConst{
+        BoundRequirement(true /*inclusive*/, make<Variable>("v1")),
+        BoundRequirement::makePlusInf()};
+
+    IntervalRequirement intervalHighNonConst{
+        BoundRequirement::makeMinusInf(),
+        BoundRequirement(true /*inclusive*/, make<Variable>("v2"))};
+
+    IntervalRequirement intervalEqNonConst{
+        BoundRequirement(true /*inclusive*/, make<Variable>("v3")),
+        BoundRequirement(true /*inclusive*/, make<Variable>("v3"))};
+
+    const auto estInterval = [](const auto& interval) {
+        ArrayHistogram ah;
+        return estimateIntervalCardinality(
+            ah, interval, 100 /* inputCardinality */, true /* includeScalar */);
+    };
+
+    ASSERT_EQ(estInterval(intervalLowNonConst), -1.0);
+    ASSERT_EQ(estInterval(intervalHighNonConst), -1.0);
+    ASSERT_EQ(estInterval(intervalEqNonConst), -1.0);
+}
+}  // namespace
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/histogram_interpolation_test.cpp b/src/mongo/db/query/ce/histogram_interpolation_test.cpp
new file mode 100644
index 00000000000..4ad9d38b4e0
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_interpolation_test.cpp
@@ -0,0 +1,508 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/ce/histogram_predicate_estimation.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/sbe_stage_builder_helpers.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+namespace value = sbe::value;
+
+using stats::ArrayHistogram;
+using stats::ScalarHistogram;
+using stats::TypeCounts;
+
+TEST(EstimatorTest, ManualHistogram) {
+    std::vector<BucketData> data{{0, 1.0, 1.0, 1.0},
+                                 {10, 1.0, 10.0, 5.0},
+                                 {20, 3.0, 15.0, 3.0},
+                                 {30, 1.0, 10.0, 4.0},
+                                 {40, 2.0, 0.0, 0.0},
+                                 {50, 1.0, 10.0, 5.0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    ASSERT_EQ(55.0, getTotals(hist).card);
+
+    ASSERT_EQ(1.0, estimateIntValCard(hist, 0, EstimationType::kEqual));
+    ASSERT_EQ(2.0, estimateIntValCard(hist, 5, EstimationType::kEqual));
+    ASSERT_EQ(0.0, estimateIntValCard(hist, 35, EstimationType::kEqual));
+
+    ASSERT_EQ(15.5, estimateIntValCard(hist, 15, EstimationType::kLess));
+    ASSERT_EQ(20.5, estimateIntValCard(hist, 15, EstimationType::kLessOrEqual));
+    ASSERT_EQ(28, estimateIntValCard(hist, 20, EstimationType::kLess));
+    ASSERT_EQ(31.0, estimateIntValCard(hist, 20, EstimationType::kLessOrEqual));
+
+    ASSERT_EQ(42, estimateIntValCard(hist, 10, EstimationType::kGreater));
+    ASSERT_EQ(43, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual));
+    ASSERT_EQ(19, estimateIntValCard(hist, 25, EstimationType::kGreater));
+    ASSERT_EQ(21.5, estimateIntValCard(hist, 25, EstimationType::kGreaterOrEqual));
+}
+
+TEST(EstimatorTest, UniformIntEstimate) {
+    // This hard-codes a maxdiff histogram with 10 buckets built off a uniform int distribution with
+    // a minimum of 0, a maximum of 1000, and 70 distinct values.
+    std::vector<BucketData> data{{2, 1, 0, 0},
+                                 {57, 3, 2, 1},
+                                 {179, 5, 10, 6},
+                                 {317, 5, 9, 6},
+                                 {344, 3, 0, 0},
+                                 {558, 4, 19, 12},
+                                 {656, 2, 4, 3},
+                                 {798, 3, 7, 4},
+                                 {951, 5, 17, 7},
+                                 {986, 1, 0, 0}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    // Predicates over bucket bound.
+    double expectedCard = estimateIntValCard(hist, 558, EstimationType::kEqual);
+    ASSERT_EQ(4.0, expectedCard);
+    expectedCard = estimateIntValCard(hist, 558, EstimationType::kLess);
+    ASSERT_EQ(57.0, expectedCard);
+    expectedCard = estimateIntValCard(hist, 558, EstimationType::kLessOrEqual);
+    ASSERT_EQ(61.0, expectedCard);
+
+    // Predicates over value inside of a bucket.
+
+    // Query: [{$match: {a: {$eq: 530}}}].
+    expectedCard = estimateIntValCard(hist, 530, EstimationType::kEqual);
+    ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1);  // Actual: 1.
+
+    // Query: [{$match: {a: {$lt: 530}}}].
+    expectedCard = estimateIntValCard(hist, 530, EstimationType::kLess);
+    ASSERT_APPROX_EQUAL(52.9, expectedCard, 0.1);  // Actual: 50.
+
+    // Query: [{$match: {a: {$lte: 530}}}].
+    expectedCard = estimateIntValCard(hist, 530, EstimationType::kLessOrEqual);
+    ASSERT_APPROX_EQUAL(54.5, expectedCard, 0.1);  // Actual: 51.
+
+    // Query: [{$match: {a: {$eq: 400}}}].
+    expectedCard = estimateIntValCard(hist, 400, EstimationType::kEqual);
+    ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1);  // Actual: 1.
+
+    // Query: [{$match: {a: {$lt: 400}}}].
+    expectedCard = estimateIntValCard(hist, 400, EstimationType::kLess);
+    ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1);  // Actual: 39.
+
+    // Query: [{$match: {a: {$lte: 400}}}].
+    expectedCard = estimateIntValCard(hist, 400, EstimationType::kLessOrEqual);
+    ASSERT_APPROX_EQUAL(43.0, expectedCard, 0.1);  // Actual: 40.
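+
+    // Cross-check the $lte interpolation arithmetic by hand. This is a minimal sketch of the
+    // formula, assuming the {bound, eqFreq, rangeFreq, ndv} bucket layout used above: 530 falls
+    // into the bucket bounded by 558 (rangeFreq 19.0, ndv 12.0), the preceding bound is 344, and
+    // the cumulative frequency of all buckets below is 38.0.
+    const double ratio = (530.0 - 344.0) / (558.0 - 344.0);
+    ASSERT_APPROX_EQUAL(38.0 + ratio * 19.0,
+                        estimateIntValCard(hist, 530, EstimationType::kLessOrEqual),
+                        0.1);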
+}
+
+TEST(EstimatorTest, NormalIntEstimate) {
+    // This hard-codes a maxdiff histogram with 10 buckets built off a normal int distribution with
+    // a minimum of 0, a maximum of 1000, and 70 distinct values.
+    std::vector<BucketData> data{{2, 1, 0, 0},
+                                 {317, 8, 20, 15},
+                                 {344, 2, 0, 0},
+                                 {388, 3, 0, 0},
+                                 {423, 4, 2, 2},
+                                 {579, 4, 12, 8},
+                                 {632, 3, 2, 1},
+                                 {696, 3, 5, 3},
+                                 {790, 5, 4, 2},
+                                 {993, 1, 21, 9}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    // Predicates over bucket bound.
+    double expectedCard = estimateIntValCard(hist, 696, EstimationType::kEqual);
+    ASSERT_EQ(3.0, expectedCard);
+    expectedCard = estimateIntValCard(hist, 696, EstimationType::kLess);
+    ASSERT_EQ(66.0, expectedCard);
+    expectedCard = estimateIntValCard(hist, 696, EstimationType::kLessOrEqual);
+    ASSERT_EQ(69.0, expectedCard);
+
+    // Predicates over value inside of a bucket.
+
+    // Query: [{$match: {a: {$eq: 150}}}].
+    expectedCard = estimateIntValCard(hist, 150, EstimationType::kEqual);
+    ASSERT_APPROX_EQUAL(1.3, expectedCard, 0.1);  // Actual: 1.
+
+    // Query: [{$match: {a: {$lt: 150}}}].
+    expectedCard = estimateIntValCard(hist, 150, EstimationType::kLess);
+    ASSERT_APPROX_EQUAL(9.1, expectedCard, 0.1);  // Actual: 9.
+
+    // Query: [{$match: {a: {$lte: 150}}}].
+    expectedCard = estimateIntValCard(hist, 150, EstimationType::kLessOrEqual);
+    ASSERT_APPROX_EQUAL(10.4, expectedCard, 0.1);  // Actual: 10.
+}
+
+TEST(EstimatorTest, UniformStrEstimate) {
+    // This hard-codes a maxdiff histogram with 10 buckets built off a uniform string distribution
+    // with a minimum length of 3, a maximum length of 5, and 80 distinct values.
+    std::vector<BucketData> data{{{"0ejz", 2, 0, 0},
+                                  {"8DCaq", 3, 4, 4},
+                                  {"Cy5Kw", 3, 3, 3},
+                                  {"WXX7w", 3, 31, 20},
+                                  {"YtzS", 2, 0, 0},
+                                  {"fuK", 5, 13, 7},
+                                  {"gLkp", 3, 0, 0},
+                                  {"ixmVx", 2, 6, 2},
+                                  {"qou", 1, 9, 6},
+                                  {"z2b", 1, 9, 6}}};
+    const ScalarHistogram hist = createHistogram(data);
+
+    // Predicates over value inside of a bucket.
+    const auto [tag, value] = value::makeNewString("TTV"_sd);
+    value::ValueGuard vg(tag, value);
+
+    // Query: [{$match: {a: {$eq: 'TTV'}}}].
+    double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(1.55, expectedCard, 0.1);  // Actual: 2.
+
+    // Query: [{$match: {a: {$lt: 'TTV'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(39.8, expectedCard, 0.1);  // Actual: 39.
+
+    // Query: [{$match: {a: {$lte: 'TTV'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+    ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1);  // Actual: 41.
+}
+
+TEST(EstimatorTest, NormalStrEstimate) {
+    // This hard-codes a maxdiff histogram with 10 buckets built off a normal string distribution
+    // with a minimum length of 3, a maximum length of 5, and 80 distinct values.
+    std::vector<BucketData> data{{
+        {"0ejz", 1, 0, 0},
+        {"4FGjc", 3, 5, 3},
+        {"9bU3", 2, 3, 2},
+        {"Cy5Kw", 3, 3, 3},
+        {"Lm4U", 2, 11, 5},
+        {"TTV", 5, 14, 8},
+        {"YtzS", 2, 3, 2},
+        {"o9cD4", 6, 26, 16},
+        {"qfmnP", 1, 4, 2},
+        {"xqbi", 2, 4, 4},
+    }};
+    const ScalarHistogram hist = createHistogram(data);
+
+    // Predicates over bucket bound.
+    auto [tag, value] = value::makeNewString("TTV"_sd);
+    value::ValueGuard vg(tag, value);
+
+    // Query: [{$match: {a: {$eq: 'TTV'}}}].
+    double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1);  // Actual: 5.
+
+    // Query: [{$match: {a: {$lt: 'TTV'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(47.0, expectedCard, 0.1);  // Actual: 47.
+
+    // Query: [{$match: {a: {$lte: 'TTV'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+    ASSERT_APPROX_EQUAL(52.0, expectedCard, 0.1);  // Actual: 52.
+
+    // Predicates over value inside of a bucket.
+    std::tie(tag, value) = value::makeNewString("Pfa"_sd);
+
+    // Query: [{$match: {a: {$eq: 'Pfa'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(1.75, expectedCard, 0.1);  // Actual: 2.
+
+    // Query: [{$match: {a: {$lt: 'Pfa'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+    ASSERT_APPROX_EQUAL(38.3, expectedCard, 0.1);  // Actual: 35.
+
+    // Query: [{$match: {a: {$lte: 'Pfa'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+    ASSERT_APPROX_EQUAL(40.0, expectedCard, 0.1);  // Actual: 37.
+}
+
+TEST(EstimatorTest, UniformIntStrEstimate) {
+    // This hard-codes a maxdiff histogram with 20 buckets built off of a uniform distribution with
+    // two types occurring with equal probability:
+    // - 100 distinct ints between 0 and 1000, and
+    // - 100 distinct strings of length between 2 and 5.
+    std::vector<BucketData> data{{
+        {2, 3, 0, 0},       {19, 4, 1, 1},      {226, 2, 49, 20},  {301, 5, 12, 4},
+        {317, 3, 0, 0},     {344, 2, 3, 1},     {423, 5, 18, 6},   {445, 3, 0, 0},
+        {495, 3, 4, 2},     {542, 5, 9, 3},     {696, 3, 44, 19},  {773, 4, 11, 5},
+        {805, 2, 8, 4},     {931, 5, 21, 8},    {998, 4, 21, 3},   {"8N4", 5, 31, 14},
+        {"MIb", 5, 45, 17}, {"Zgi", 3, 55, 22}, {"pZ", 6, 62, 25}, {"yUwxz", 5, 29, 12},
+    }};
+    const ScalarHistogram hist = createHistogram(data);
+    const ArrayHistogram arrHist(
+        hist, TypeCounts{{value::TypeTags::NumberInt64, 254}, {value::TypeTags::StringSmall, 246}});
+
+    // Predicates over value inside of the last numeric bucket.
+
+    // Query: [{$match: {a: {$eq: 993}}}].
+    double expectedCard = estimateIntValCard(hist, 993, EstimationType::kEqual);
+    ASSERT_APPROX_EQUAL(7.0, expectedCard, 0.1);  // Actual: 9.
+
+    // Query: [{$match: {a: {$lt: 993}}}].
+    expectedCard = estimateIntValCard(hist, 993, EstimationType::kLess);
+    ASSERT_APPROX_EQUAL(241.4, expectedCard, 0.1);  // Actual: 241.
+
+    // Query: [{$match: {a: {$lte: 993}}}].
+    expectedCard = estimateIntValCard(hist, 993, EstimationType::kLessOrEqual);
+    ASSERT_APPROX_EQUAL(248.4, expectedCard, 0.1);  // Actual: 250.
+
+    // Predicates over value inside of the first string bucket.
+    auto [tag, value] = value::makeNewString("04e"_sd);
+    value::ValueGuard vg(tag, value);
+
+    // Query: [{$match: {a: {$eq: '04e'}}}].
+    expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(2.2, expectedCard, 0.1);  // Actual: 3.
+
+    value::TypeTags lowTag = value::TypeTags::NumberInt64;
+    value::Value lowVal = 100000000;
+
+    // Type bracketing: low value of different type than the bucket bound.
+    // Query: [{$match: {a: {$eq: 100000000}}}].
+    expectedCard = estimateCardEq(arrHist, lowTag, lowVal, true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1);  // Actual: 0.
+
+    // No interpolation for inequality to values inside the first string bucket, fallback to half of
+    // the bucket frequency.
+
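+    // For example, '04e' falls into the first string bucket ("8N4": rangeFreq 31, ndv 14), and
+    // the preceding bound (998) belongs to a different type bracket, so the in-bucket ratio falls
+    // back to 0.5 instead of being interpolated:
+    //   card(< '04e') ~= 254 + 0.5 * 31 - 31/14 ~= 267.3   (254 == total int frequency)
+    //   card(<= 100000000) == 254                          (all ints sort below the strings)
+    // so the range estimate below is ~267.3 - 254 ~= 13.3.
+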
+    // Query: [{$match: {a: {$lt: '04e'}}}].
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     lowTag,
+                                     lowVal,
+                                     false /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1);  // Actual: 0.
+
+    // Query: [{$match: {a: {$lte: '04e'}}}].
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     lowTag,
+                                     lowVal,
+                                     true /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1);  // Actual: 3.
+
+    // Value towards the end of the bucket gets the same half bucket estimate.
+    std::tie(tag, value) = value::makeNewString("8B5"_sd);
+
+    // Query: [{$match: {a: {$lt: '8B5'}}}].
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     lowTag,
+                                     lowVal,
+                                     false /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1);  // Actual: 24.
+
+    // Query: [{$match: {a: {$lte: '8B5'}}}].
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     lowTag,
+                                     lowVal,
+                                     true /* highInclusive */,
+                                     tag,
+                                     value,
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1);  // Actual: 29.
+}
+
+TEST(EstimatorTest, UniformIntArrayOnlyEstimate) {
+    // This hard-codes a maxdiff histogram with 10 buckets built off of an array distribution with
+    // arrays between 3 and 5 elements long, each containing 100 distinct ints uniformly distributed
+    // between 0 and 1000. There are no scalar elements.
+    std::vector<BucketData> scalarData{{}};
+    const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+    std::vector<BucketData> minData{{
+        {5, 3, 0, 0},   {19, 5, 2, 1},  {57, 4, 4, 3},  {116, 7, 13, 7}, {198, 3, 15, 6},
+        {228, 2, 3, 2}, {254, 4, 0, 0}, {280, 2, 2, 1}, {335, 3, 5, 3},  {344, 2, 0, 0},
+        {388, 3, 0, 0}, {420, 2, 0, 0}, {454, 1, 6, 3}, {488, 2, 1, 1},  {530, 1, 0, 0},
+        {561, 1, 0, 0}, {609, 1, 0, 0}, {685, 1, 0, 0}, {713, 1, 0, 0},  {758, 1, 0, 0},
+    }};
+    const ScalarHistogram minHist = createHistogram(minData);
+
+    std::vector<BucketData> maxData{{
+        {301, 1, 0, 0},  {408, 2, 0, 0}, {445, 1, 0, 0}, {605, 2, 0, 0},  {620, 1, 0, 0},
+        {665, 1, 1, 1},  {687, 3, 0, 0}, {704, 2, 6, 2}, {718, 2, 2, 1},  {741, 2, 1, 1},
+        {752, 2, 0, 0},  {823, 7, 3, 3}, {827, 1, 0, 0}, {852, 3, 0, 0},  {864, 5, 0, 0},
+        {909, 7, 12, 5}, {931, 2, 3, 1}, {939, 3, 0, 0}, {970, 2, 12, 4}, {998, 1, 10, 4},
+    }};
+    const ScalarHistogram maxHist = createHistogram(maxData);
+
+    std::vector<BucketData> uniqueData{{
+        {5, 3, 0, 0},     {19, 6, 2, 1},    {57, 4, 4, 3},    {116, 7, 15, 8},  {228, 2, 38, 13},
+        {254, 7, 0, 0},   {269, 10, 0, 0},  {280, 7, 3, 1},   {306, 4, 1, 1},   {317, 4, 0, 0},
+        {344, 2, 19, 5},  {423, 2, 27, 8},  {507, 2, 22, 13}, {704, 8, 72, 34}, {718, 6, 3, 1},
+        {758, 3, 13, 4},  {864, 7, 35, 14}, {883, 4, 0, 0},   {939, 5, 32, 10}, {998, 1, 24, 9},
+    }};
+    const ScalarHistogram uniqueHist = createHistogram(uniqueData);
+
+    const ArrayHistogram arrHist(scalarHist,
+                                 TypeCounts{{value::TypeTags::Array, 100}},
+                                 uniqueHist,
+                                 minHist,
+                                 maxHist,
+                                 TypeCounts{},
+                                 0);
+
+    // Query in the middle of the domain: estimate from ArrayUnique histogram.
+    value::TypeTags lowTag = value::TypeTags::NumberInt64;
+    value::Value lowVal = 500;
+    value::TypeTags highTag = value::TypeTags::NumberInt64;
+    value::Value highVal = 600;
+
+    // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}].
+ double expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + lowTag, + lowVal, + false /* highInclusive */, + highTag, + highVal, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(27.0, expectedCard, 0.1); // actual 21. + + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. + // Note: although there are no scalars, the estimate is different than the + // above since we use different formulas. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + lowTag, + lowVal, + false /* highInclusive */, + highTag, + highVal, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(92.0, expectedCard, 0.1); // actual 92. + + // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. + lowVal = 10; + highVal = 110; + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 10, $lt: 110}}}}]. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + lowTag, + lowVal, + false /* highInclusive */, + highTag, + highVal, + false /* includeScalar */); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); // actual 29. + + // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. + expectedCard = estimateCardRange(arrHist, + false /* lowInclusive */, + lowTag, + lowVal, + false /* highInclusive */, + highTag, + highVal, + true /* includeScalar */); + ASSERT_APPROX_EQUAL(27.8, expectedCard, 0.1); // actual 31. +} + +TEST(EstimatorTest, UniformIntMixedArrayEstimate) { + // This hard-codes a maxdiff histogram with 20 buckets built off of a mixed distribution split + // with equal probability between: + // - an array distribution between 3 and 5 elements long, each containing 80 distinct ints + // uniformly distributed between 0 and 1000, and + // - a uniform int distribution with 80 distinct ints between 0 and 1000. 
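+    //
+    // With 'includeScalar' set to true, the range estimates asserted below combine two
+    // components (a rough sketch of the formulas from histogram_predicate_estimation.cpp,
+    // applied to the per-field histograms defined next):
+    //
+    //   card ~= [ArrayMin(< high) - ArrayMax(<= low)]   // range matches on array values
+    //         + [Scalar(< high) - Scalar(<= low)]       // range matches on scalar values
+    //
+    // whereas the $elemMatch variant ('includeScalar' == false) takes the maximum of the
+    // ArrayMin/ArrayMax range estimates and a density-adjusted ArrayUnique estimate.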
+    std::vector<BucketData> scalarData{{
+        {25, 1, 0, 0},  {41, 2, 0, 0},  {142, 2, 3, 3},  {209, 3, 3, 1}, {243, 1, 2, 1},
+        {296, 3, 4, 3}, {321, 5, 4, 2}, {480, 3, 9, 8},  {513, 3, 3, 2}, {554, 1, 0, 0},
+        {637, 3, 3, 2}, {666, 2, 1, 1}, {697, 2, 2, 1},  {750, 3, 3, 2}, {768, 4, 0, 0},
+        {791, 4, 3, 3}, {851, 2, 2, 2}, {927, 2, 10, 6}, {958, 3, 2, 1}, {980, 3, 0, 0},
+    }};
+    const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+    std::vector<BucketData> minData{{
+        {3, 3, 0, 0},   {5, 8, 0, 0},   {9, 3, 0, 0},   {19, 2, 0, 0},  {49, 7, 4, 2},
+        {69, 6, 0, 0},  {115, 3, 5, 3}, {125, 2, 0, 0}, {146, 1, 2, 1}, {198, 2, 4, 3},
+        {214, 2, 0, 0}, {228, 3, 0, 0}, {260, 3, 4, 1}, {280, 1, 2, 2}, {330, 2, 2, 1},
+        {344, 6, 0, 0}, {388, 2, 0, 0}, {420, 2, 0, 0}, {461, 2, 8, 4}, {696, 1, 2, 1},
+    }};
+    const ScalarHistogram minHist = createHistogram(minData);
+
+    std::vector<BucketData> maxData{{
+        {301, 1, 0, 0},  {445, 1, 0, 0}, {491, 1, 0, 0}, {533, 3, 0, 0}, {605, 3, 0, 0},
+        {620, 2, 0, 0},  {647, 3, 0, 0}, {665, 4, 0, 0}, {713, 3, 10, 4}, {741, 3, 0, 0},
+        {814, 3, 2, 2},  {839, 2, 1, 1}, {864, 1, 2, 2}, {883, 3, 0, 0},  {893, 7, 0, 0},
+        {898, 5, 0, 0},  {909, 1, 12, 3}, {931, 2, 2, 1}, {953, 6, 3, 2}, {993, 1, 7, 5},
+    }};
+    const ScalarHistogram maxHist = createHistogram(maxData);
+
+    std::vector<BucketData> uniqueData{{
+        {3, 3, 0, 0},     {19, 5, 11, 2},  {49, 7, 5, 3},    {69, 8, 0, 0},    {75, 3, 0, 0},
+        {125, 2, 10, 5},  {228, 3, 27, 14}, {260, 4, 5, 1},  {344, 6, 36, 13}, {423, 4, 20, 8},
+        {605, 4, 61, 28}, {665, 8, 12, 6},  {758, 4, 41, 16}, {768, 5, 0, 0},  {776, 3, 0, 0},
+        {864, 3, 15, 10}, {883, 8, 0, 0},   {911, 2, 28, 6},  {953, 6, 8, 4},  {993, 1, 7, 5},
+    }};
+    const ScalarHistogram uniqueHist = createHistogram(uniqueData);
+
+    TypeCounts typeCounts{{value::TypeTags::NumberInt64, 106}, {value::TypeTags::Array, 94}};
+    const ArrayHistogram arrHist(scalarHist,
+                                 typeCounts,
+                                 uniqueHist,
+                                 minHist,
+                                 maxHist,
+                                 TypeCounts{{value::TypeTags::NumberInt64, 375}},
+                                 0);
+
+    value::TypeTags lowTag = value::TypeTags::NumberInt64;
+    value::Value lowVal = 500;
+    value::TypeTags highTag = value::TypeTags::NumberInt64;
+    value::Value highVal = 550;
+
+    // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}].
+    double expectedCard = estimateCardRange(arrHist,
+                                            false /* lowInclusive */,
+                                            lowTag,
+                                            lowVal,
+                                            false /* highInclusive */,
+                                            highTag,
+                                            highVal,
+                                            true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(92.9, expectedCard, 0.1);  // Actual: 94.
+
+    // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}].
+    expectedCard = estimateCardRange(arrHist,
+                                     false /* lowInclusive */,
+                                     lowTag,
+                                     lowVal,
+                                     false /* highInclusive */,
+                                     highTag,
+                                     highVal,
+                                     false /* includeScalar */);
+    ASSERT_APPROX_EQUAL(11.0, expectedCard, 0.1);  // Actual: 8.
+}
+
+}  // namespace
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/histogram_predicate_estimation.cpp b/src/mongo/db/query/ce/histogram_predicate_estimation.cpp
new file mode 100644
index 00000000000..25d1658807d
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_predicate_estimation.cpp
@@ -0,0 +1,496 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/ce/histogram_predicate_estimation.h"
+
+#include "mongo/db/exec/sbe/abt/abt_lower.h"
+#include "mongo/db/pipeline/abt/utils.h"
+
+#include "mongo/db/query/optimizer/rewrites/const_eval.h"
+#include "mongo/db/query/optimizer/syntax/expr.h"
+#include "mongo/db/query/optimizer/utils/ce_math.h"
+#include "mongo/db/query/optimizer/utils/interval_utils.h"
+#include "mongo/db/query/stats/value_utils.h"
+
+namespace mongo::optimizer::ce {
+namespace value = sbe::value;
+
+using stats::ArrayHistogram;
+using stats::Bucket;
+using stats::compareValues;
+using stats::sameTypeBracket;
+using stats::ScalarHistogram;
+using stats::valueToDouble;
+
+std::pair<value::TypeTags, value::Value> getConstTypeVal(const ABT& abt) {
+    const auto* constant = abt.cast<Constant>();
+    tassert(7051102, "Interval ABTs passed in for estimation must have Constant bounds.", constant);
+    return constant->get();
+};
+
+boost::optional<std::pair<value::TypeTags, value::Value>> getBound(
+    const BoundRequirement& boundReq) {
+    const ABT& bound = boundReq.getBound();
+    if (bound.is<Constant>()) {
+        return getConstTypeVal(bound);
+    }
+    return boost::none;
+};
+
+IntervalRequirement getMinMaxIntervalForType(value::TypeTags type) {
+    // Note: This function works based on the assumption that there are no intervals that include
+    // values from more than one type. That is why the MinMax interval of a type will include all
+    // possible intervals over that type.
+
+    auto&& [min, minInclusive] = getMinMaxBoundForType(true /*isMin*/, type);
+    tassert(7051103, str::stream() << "Type " << type << " has no minimum", min);
+
+    auto&& [max, maxInclusive] = getMinMaxBoundForType(false /*isMin*/, type);
+    tassert(7051104, str::stream() << "Type " << type << " has no maximum", max);
+
+    return IntervalRequirement{BoundRequirement(minInclusive, *min),
+                               BoundRequirement(maxInclusive, *max)};
+}
+
+bool isIntervalSubsetOfType(const IntervalRequirement& interval, value::TypeTags type) {
+    // Create a conjunction of the interval and the min-max interval for the type as input for the
+    // intersection function.
+    auto intervals =
+        IntervalReqExpr::make<IntervalReqExpr::Disjunction>(IntervalReqExpr::NodeVector{
+            IntervalReqExpr::make<IntervalReqExpr::Conjunction>(IntervalReqExpr::NodeVector{
+                IntervalReqExpr::make<IntervalReqExpr::Atom>(interval),
+                IntervalReqExpr::make<IntervalReqExpr::Atom>(getMinMaxIntervalForType(type))})});
+
+    return intersectDNFIntervals(intervals, ConstEval::constFold).has_value();
+}
+
+EstimationResult getTotals(const ScalarHistogram& h) {
+    if (h.empty()) {
+        return {0.0, 0.0};
+    }
+
+    const Bucket& last = h.getBuckets().back();
+    return {last._cumulativeFreq, last._cumulativeNDV};
+}
+
+/**
+ * Helper function that uses linear interpolation to estimate the cardinality and NDV for a value
+ * that falls inside of a histogram bucket.
+ */
+EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
+                                             value::TypeTags tag,
+                                             value::Value val,
+                                             EstimationType type,
+                                             size_t bucketIndex) {
+
+    const Bucket& bucket = h.getBuckets().at(bucketIndex);
+    const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex);
+
+    double resultCard = bucket._cumulativeFreq - bucket._equalFreq - bucket._rangeFreq;
+    double resultNDV = bucket._cumulativeNDV - bucket._ndv - 1.0;
+
+    // Check if the estimate is at the point of type brackets switch. If the current bucket is the
+    // first bucket of a new type bracket and the value is of another type, estimate cardinality
+    // from the current bucket as 0.
+    //
+    // For example, let bound 1 = 1000, bound 2 = "abc". The value 100000000 falls in bucket 2, the
+    // first bucket for strings, but should not get any cardinality/NDV fraction from it.
+    if (!sameTypeBracket(tag, boundTag)) {
+        if (type == EstimationType::kEqual) {
+            return {0.0, 0.0};
+        } else {
+            return {resultCard, resultNDV};
+        }
+    }
+
+    // Estimate for equality frequency inside of the bucket.
+    const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv;
+
+    if (type == EstimationType::kEqual) {
+        return {innerEqFreq, 1.0};
+    }
+
+    // If the value is minimal for its type, and the operation is $lt or $lte return cardinality up
+    // to the previous bucket.
+    auto&& [minConstant, inclusive] = getMinMaxBoundForType(true /*isMin*/, tag);
+    auto [minTag, minVal] = getConstTypeVal(*minConstant);
+    if (compareValues(minTag, minVal, tag, val) == 0) {
+        return {resultCard, resultNDV};
+    }
+
+    // For $lt and $lte operations use linear interpolation to take a fraction of the bucket
+    // cardinality and NDV if there is a preceding bucket with a bound of the same type. Use half
+    // of the bucket estimates otherwise.
+    double ratio = 0.5;
+    if (bucketIndex > 0) {
+        const auto [lowBoundTag, lowBoundVal] = h.getBounds().getAt(bucketIndex - 1);
+        if (sameTypeBracket(lowBoundTag, boundTag)) {
+            double doubleLowBound = valueToDouble(lowBoundTag, lowBoundVal);
+            double doubleUpperBound = valueToDouble(boundTag, boundVal);
+            double doubleVal = valueToDouble(tag, val);
+            ratio = (doubleVal - doubleLowBound) / (doubleUpperBound - doubleLowBound);
+        }
+    }
+
+    const double bucketFreqRatio = bucket._rangeFreq * ratio;
+    resultCard += bucketFreqRatio;
+    resultNDV += bucket._ndv * ratio;
+
+    if (type == EstimationType::kLess) {
+        // Subtract from the estimate the cardinality and NDV corresponding to the equality
+        // operation, unless they are larger than the fraction taken from this bucket.
+        const double innerEqFreqCorrection = (bucketFreqRatio < innerEqFreq) ? 0.0 : innerEqFreq;
+        const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ?
0.0 : 1.0; + resultCard -= innerEqFreqCorrection; + resultNDV -= innerEqNdv; + } + return {resultCard, resultNDV}; +} + +EstimationResult estimate(const ScalarHistogram& h, + value::TypeTags tag, + value::Value val, + EstimationType type) { + switch (type) { + case EstimationType::kGreater: + return getTotals(h) - estimate(h, tag, val, EstimationType::kLessOrEqual); + + case EstimationType::kGreaterOrEqual: + return getTotals(h) - estimate(h, tag, val, EstimationType::kLess); + + default: + // Continue. + break; + } + + size_t bucketIndex = 0; + { + size_t len = h.getBuckets().size(); + while (len > 0) { + const size_t half = len >> 1; + const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex + half); + + if (compareValues(boundTag, boundVal, tag, val) < 0) { + bucketIndex += half + 1; + len -= half + 1; + } else { + len = half; + } + } + } + if (bucketIndex == h.getBuckets().size()) { + // Value beyond the largest endpoint. + switch (type) { + case EstimationType::kEqual: + return {0.0, 0.0}; + + case EstimationType::kLess: + case EstimationType::kLessOrEqual: + return getTotals(h); + + default: + MONGO_UNREACHABLE; + } + } + + const Bucket& bucket = h.getBuckets().at(bucketIndex); + const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex); + const bool isEndpoint = compareValues(boundTag, boundVal, tag, val) == 0; + + if (isEndpoint) { + switch (type) { + case EstimationType::kEqual: { + return {bucket._equalFreq, 1.0}; + } + + case EstimationType::kLess: { + double resultCard = bucket._cumulativeFreq - bucket._equalFreq; + double resultNDV = bucket._cumulativeNDV - 1.0; + return {resultCard, resultNDV}; + } + + case EstimationType::kLessOrEqual: { + double resultCard = bucket._cumulativeFreq; + double resultNDV = bucket._cumulativeNDV; + return {resultCard, resultNDV}; + } + + default: + MONGO_UNREACHABLE; + } + } else { + return interpolateEstimateInBucket(h, tag, val, type, bucketIndex); + } +} + +/** + * Returns how many values of the given type are known by the array histogram. + */ +double getTypeCard(const ArrayHistogram& ah, value::TypeTags tag, bool includeScalar) { + double count = 0.0; + + // TODO SERVER-70936: booleans are estimated by different type counters (unless in arrays). + if (tag == sbe::value::TypeTags::Boolean) { + uasserted(7051101, "Cannot estimate boolean types yet with histogram CE."); + } + + // Note that if we are asked by the optimizer to estimate an interval whose bounds are arrays, + // this means we are trying to estimate equality on nested arrays. In this case, we do not want + // to include the "scalar" type counter for the array type, because this will cause us to + // estimate the nested array case as counting all arrays, regardless of whether or not they are + // nested. + if (includeScalar && tag != value::TypeTags::Array) { + auto typeIt = ah.getTypeCounts().find(tag); + if (typeIt != ah.getTypeCounts().end()) { + count += typeIt->second; + } + } + if (ah.isArray()) { + auto typeIt = ah.getArrayTypeCounts().find(tag); + if (typeIt != ah.getArrayTypeCounts().end()) { + count += typeIt->second; + } + } + return count; +} + +/** + * Estimates equality to the given tag/value using histograms. 
+ */ +double estimateCardEq(const ArrayHistogram& ah, + value::TypeTags tag, + value::Value val, + bool includeScalar) { + double card = 0.0; + if (includeScalar) { + card = estimate(ah.getScalar(), tag, val, EstimationType::kEqual).card; + } + if (ah.isArray()) { + card += estimate(ah.getArrayUnique(), tag, val, EstimationType::kEqual).card; + } + return card; +} + +static EstimationResult estimateRange(const ScalarHistogram& histogram, + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + const EstimationType highType = + highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess; + const EstimationResult highEstimate = estimate(histogram, tagHigh, valHigh, highType); + + const EstimationType lowType = + lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual; + const EstimationResult lowEstimate = estimate(histogram, tagLow, valLow, lowType); + + return highEstimate - lowEstimate; +} + +/** + * Compute an estimate for range query on array data with formula: + * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow)) + */ +static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin, + const ScalarHistogram& histogramAmax, + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + const EstimationType highType = + highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess; + const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType); + + const EstimationType lowType = + lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual; + const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType); + + return highEstimate - lowEstimate; +} + +double estimateCardRange(const ArrayHistogram& ah, + /* Define lower bound. */ + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + /* Define upper bound. */ + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh, + bool includeScalar, + EstimationAlgo estimationAlgo) { + uassert(6695701, + "Low bound must not be higher than high", + compareValues(tagLow, valLow, tagHigh, valHigh) <= 0); + + // Helper lambda to shorten code for legibility. + auto estRange = [&](const ScalarHistogram& h) { + return estimateRange(h, lowInclusive, tagLow, valLow, highInclusive, tagHigh, valHigh); + }; + + double result = 0.0; + if (ah.isArray()) { + + if (includeScalar) { + // Range query on array data. + const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(), + ah.getArrayMax(), + lowInclusive, + tagLow, + valLow, + highInclusive, + tagHigh, + valHigh); + result += rangeCardOnArray.card; + } else { + // $elemMatch query on array data. + const auto arrayMinEst = estRange(ah.getArrayMin()); + const auto arrayMaxEst = estRange(ah.getArrayMax()); + const auto arrayUniqueEst = estRange(ah.getArrayUnique()); + + // ToDo: try using ah.getArrayCount() - ah.getEmptyArrayCount(); + // when the number of empty arrays is provided by the statistics. + const double totalArrayCount = ah.getArrayCount(); + + uassert( + 6715101, "Array histograms should contain at least one array", totalArrayCount > 0); + switch (estimationAlgo) { + case EstimationAlgo::HistogramV1: { + const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0) + ? 
0.0
+                        : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv));
+                    result =
+                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity);
+                    break;
+                }
+                case EstimationAlgo::HistogramV2: {
+                    const double avgArraySize =
+                        getTotals(ah.getArrayUnique()).card / totalArrayCount;
+                    const double adjustedUniqueCard = (avgArraySize == 0.0)
+                        ? 0.0
+                        : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount);
+                    result =
+                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
+                    break;
+                }
+                case EstimationAlgo::HistogramV3: {
+                    const double adjustedUniqueCard =
+                        0.85 * std::min(arrayUniqueEst.card, totalArrayCount);
+                    result =
+                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
+                    break;
+                }
+                default:
+                    MONGO_UNREACHABLE;
+            }
+        }
+    }
+
+    if (includeScalar) {
+        const auto scalarEst = estRange(ah.getScalar());
+        result += scalarEst.card;
+    }
+
+    return result;
+}
+
+double estimateIntervalCardinality(const ArrayHistogram& ah,
+                                   const IntervalRequirement& interval,
+                                   CEType childResult,
+                                   bool includeScalar) {
+    if (interval.isFullyOpen()) {
+        return childResult;
+    } else if (interval.isEquality()) {
+        auto maybeConstBound = getBound(interval.getLowBound());
+        if (!maybeConstBound) {
+            return kInvalidEstimate;
+        }
+
+        auto [tag, val] = *maybeConstBound;
+        if (stats::canEstimateTypeViaHistogram(tag)) {
+            return estimateCardEq(ah, tag, val, includeScalar);
+        }
+
+        // Otherwise, we return the cardinality for the type of the interval's bounds.
+        return getTypeCard(ah, tag, includeScalar);
+    }
+
+    // Otherwise, we have a range.
+    auto lowBound = interval.getLowBound();
+    auto maybeConstLowBound = getBound(lowBound);
+    if (!maybeConstLowBound) {
+        return kInvalidEstimate;
+    }
+
+    auto highBound = interval.getHighBound();
+    auto maybeConstHighBound = getBound(highBound);
+    if (!maybeConstHighBound) {
+        return kInvalidEstimate;
+    }
+
+    auto [lowTag, lowVal] = *maybeConstLowBound;
+    auto [highTag, highVal] = *maybeConstHighBound;
+
+    // Check if we can estimate this interval using histograms. One of the tags may not be of a
+    // type we know how to estimate using histograms; however, it should still be possible to
+    // estimate the interval if the other one is of the appropriate type.
+    if (stats::canEstimateTypeViaHistogram(lowTag) || stats::canEstimateTypeViaHistogram(highTag)) {
+        return estimateCardRange(ah,
+                                 lowBound.isInclusive(),
+                                 lowTag,
+                                 lowVal,
+                                 highBound.isInclusive(),
+                                 highTag,
+                                 highVal,
+                                 includeScalar);
+    }
+
+    // Otherwise, this interval was not in our histogram. We may be able to estimate this interval
+    // via type counts; if so, we just return the total count for the type.
+
+    // If the bound tags are equal, we can estimate this in the same way that we do equalities on
+    // non-histogrammable types. Otherwise, we need to figure out which type(s) are included by this
+    // range.
+    if (lowTag == highTag || isIntervalSubsetOfType(interval, lowTag)) {
+        return getTypeCard(ah, lowTag, includeScalar);
+    } else if (isIntervalSubsetOfType(interval, highTag)) {
+        return getTypeCard(ah, highTag, includeScalar);
+    }
+
+    // If we reach here, we've given up estimating, because our interval intersected both high & low
+    // type intervals (and possibly more types).
+    // TODO: could we aggregate type counts across all intersected types here?
+    return 0.0;
+}
+
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/histogram_predicate_estimation.h b/src/mongo/db/query/ce/histogram_predicate_estimation.h
new file mode 100644
index 00000000000..763f6c13a5e
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_predicate_estimation.h
@@ -0,0 +1,106 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/query/optimizer/defs.h"
+#include "mongo/db/query/optimizer/index_bounds.h"
+#include "mongo/db/query/stats/array_histogram.h"
+
+namespace mongo::optimizer::ce {
+
+constexpr double kInvalidEstimate = -1.0;
+
+enum class EstimationType { kEqual, kLess, kLessOrEqual, kGreater, kGreaterOrEqual };
+enum class EstimationAlgo { HistogramV1, HistogramV2, HistogramV3 };
+
+const stdx::unordered_map<EstimationType, std::string> estimationTypeName = {
+    {EstimationType::kEqual, "eq"},
+    {EstimationType::kLess, "lt"},
+    {EstimationType::kLessOrEqual, "lte"},
+    {EstimationType::kGreater, "gt"},
+    {EstimationType::kGreaterOrEqual, "gte"}};
+
+struct EstimationResult {
+    double card;
+    double ndv;
+
+    EstimationResult operator-(const EstimationResult& other) const {
+        return {card - other.card, ndv - other.ndv};
+    }
+};
+
+/**
+ * Returns cumulative total statistics for a histogram.
+ */
+EstimationResult getTotals(const stats::ScalarHistogram& h);
+
+/**
+ * Compute an estimate for a given value and estimation type. Use linear interpolation for values
+ * that fall inside of histogram buckets.
+ */
+EstimationResult estimate(const stats::ScalarHistogram& h,
+                          sbe::value::TypeTags tag,
+                          sbe::value::Value val,
+                          EstimationType type);
+
+/**
+ * Given an array histogram, an interval, and the input cardinality, estimates the cardinality of
+ * the interval.
+ */
+double estimateIntervalCardinality(const stats::ArrayHistogram& estimator,
+                                   const IntervalRequirement& interval,
+                                   CEType inputCardinality,
+                                   bool includeScalar);
+
+/**
+ * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and
+ * type tag pair.
+ */
+double estimateCardEq(const stats::ArrayHistogram& ah,
+                      sbe::value::TypeTags tag,
+                      sbe::value::Value val,
+                      bool includeScalar);
+
+/**
+ * Estimates the cardinality of a range predicate given an ArrayHistogram and the bounds of the
+ * range. Set 'includeScalar' to true to indicate that the provided range should also match
+ * non-array values. The other fields define the range of the estimation.
+ */
+double estimateCardRange(const stats::ArrayHistogram& ah,
+                         bool lowInclusive,
+                         sbe::value::TypeTags tagLow,
+                         sbe::value::Value valLow,
+                         bool highInclusive,
+                         sbe::value::TypeTags tagHigh,
+                         sbe::value::Value valHigh,
+                         bool includeScalar,
+                         EstimationAlgo estAlgo = EstimationAlgo::HistogramV2);
+
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/ce/max_diff.cpp b/src/mongo/db/query/ce/max_diff.cpp
deleted file mode 100644
index 3c265620771..00000000000
--- a/src/mongo/db/query/ce/max_diff.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/**
- * Copyright (C) 2022-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mongo/base/string_data.h" -#include "mongo/bson/bsonobjbuilder.h" -#include "mongo/bson/bsontypes.h" -#include "mongo/db/exec/sbe/values/bson.h" -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/max_diff.h" -#include "mongo/db/query/ce/value_utils.h" -#include "mongo/util/assert_util.h" - - -namespace mongo::ce { - -std::string printDistribution(const DataDistribution& distr, size_t nElems) { - std::ostringstream os; - for (size_t i = 0; i < std::min(nElems, distr._freq.size()); ++i) { - os << "{val: " << distr._bounds[i].get() << ", " << distr._freq[i].toString() << "}\n"; - } - return os.str(); -} - -static double valueSpread(value::TypeTags tag1, - value::Value val1, - value::TypeTags tag2, - value::Value val2) { - double doubleVal1 = valueToDouble(tag1, val1); - double doubleVal2 = valueToDouble(tag2, val2); - uassert(6660502, - "Data distribution values must be monotonically increasing.", - doubleVal2 >= doubleVal1); - return doubleVal2 - doubleVal1; -} - -DataDistribution getDataDistribution(const std::vector& sortedInput) { - if (sortedInput.empty()) { - return {}; - } - - DataDistribution result; - value::TypeTags prevTag; - value::Value prevValue; - bool first = true; - - // Aggregate the values in a sorted dataset into a frequency distribution. - size_t idx = 0; - for (size_t i = 0; i < sortedInput.size(); i++) { - const auto v = sortedInput[i].get(); - const auto comparison = first ? 1 : compareValues(v.first, v.second, prevTag, prevValue); - first = false; - - if (comparison != 0) { - uassert(6660550, "Input is not sorted", comparison > 0); - prevTag = v.first; - prevValue = v.second; - - const auto [tagCopy, valCopy] = copyValue(v.first, v.second); - result._bounds.emplace_back(tagCopy, valCopy); - result._freq.emplace_back(idx, 1); - ++idx; - } else { - ++result._freq.back()._freq; - } - } - - // Calculate the area for all values in the data distribution. - // The current minimum and maximum areas of the values of a type class. - double maxArea = 0.0; - - for (size_t i = 0; i + 1 < result._freq.size(); ++i) { - const auto v1 = result._bounds[i]; - const auto v2 = result._bounds[i + 1]; - const bool newTypeClass = !sameTypeClass(v1.getTag(), v2.getTag()); - - if (newTypeClass) { - const auto res = result.typeClassBounds.emplace(i, maxArea); - uassert(6660551, "There can't be duplicate type class bounds.", res.second); - maxArea = 0.0; - } else if (i == 0) { - const double spread = - valueSpread(v1.getTag(), v1.getValue(), v2.getTag(), v2.getValue()); - maxArea = result._freq[i]._freq * spread; - } - - if (i == 0 || newTypeClass) { - // Make sure we insert bucket boundaries between different types, and also make sure - // first value is picked for a boundary. - result._freq[i]._area = std::numeric_limits::infinity(); - } else { - const double spread = - valueSpread(v1.getTag(), v1.getValue(), v2.getTag(), v2.getValue()); - result._freq[i]._area = result._freq[i]._freq * spread; - maxArea = std::max(maxArea, result._freq[i]._area); - } - } - - // Make sure last value is picked as a histogram bucket boundary. - result._freq.back()._area = std::numeric_limits::infinity(); - const auto res = result.typeClassBounds.emplace(result._freq.size(), maxArea); - uassert(6660503, "There can't be duplicate type class bounds.", res.second); - - // Compute normalized areas. If the spread is 0, the area may also be 0. 
This could happen,
- // for instance, if there is only a single value of a given type.
-    size_t beginIdx = 0;
-    for (const auto [endIdx, area] : result.typeClassBounds) {
-        for (size_t i = beginIdx; i < endIdx; ++i) {
-            result._freq[i]._normArea = area > 0.0 ? (result._freq[i]._area / area) : 0.0;
-        }
-        beginIdx = endIdx;
-    }
-
-    // std::cout << "Distribution sorted by value:\n"
-    //           << printDistribution(result, result._freq.size()) << "\n"
-    //           << std::flush;
-
-    return result;
-}
-
-// TODO: This doesn't seem right -- it looks like we're sorting on the frequency,
-// not the difference between buckets
-static std::vector<ValFreq> generateTopKBuckets(const DataDistribution& dataDistrib,
-                                                size_t numBuckets) {
-    struct AreaComparator {
-        bool operator()(const ValFreq& a, const ValFreq& b) const {
-            return a._normArea > b._normArea;
-        }
-    };
-    std::priority_queue<ValFreq, std::vector<ValFreq>, AreaComparator> pq;
-
-    for (const auto& valFreq : dataDistrib._freq) {
-        if (pq.size() < numBuckets) {
-            pq.emplace(valFreq);
-        } else if (AreaComparator()(valFreq, pq.top())) {
-            pq.pop();
-            pq.emplace(valFreq);
-        }
-    }
-
-    std::vector<ValFreq> result;
-    while (!pq.empty()) {
-        result.push_back(pq.top());
-        pq.pop();
-    }
-
-    std::sort(result.begin(), result.end(), [](const ValFreq& a, const ValFreq& b) {
-        return a._idx < b._idx;
-    });
-
-    return result;
-}
-
-ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistrib, size_t numBuckets) {
-    if (dataDistrib._freq.empty()) {
-        return {};
-    }
-
-    std::vector<ValFreq> topKBuckets = generateTopKBuckets(dataDistrib, numBuckets);
-    uassert(6660504,
-            "Must have bucket boundary on first value",
-            topKBuckets[0]._idx == dataDistrib._freq[0]._idx);
-    uassert(6660505,
-            "Must have bucket boundary on last value",
-            topKBuckets.back()._idx == dataDistrib._freq.back()._idx);
-
-    std::vector<Bucket> buckets;
-    value::Array bounds;
-
-    // Create histogram buckets out of the top-K bucket values.
-    size_t startBucketIdx = 0;
-    double cumulativeFreq = 0.0;
-    double cumulativeNDV = 0.0;
-    for (size_t i = 0; i < std::min(dataDistrib._freq.size(), numBuckets); i++) {
-        const size_t bucketBoundIdx = topKBuckets[i]._idx;
-        const double freq = dataDistrib._freq.at(bucketBoundIdx)._freq;
-
-        // Compute per-bucket statistics.
-        double rangeFreq = 0.0;
-        double ndv = 0.0;
-        while (startBucketIdx < bucketBoundIdx) {
-            rangeFreq += dataDistrib._freq[startBucketIdx++]._freq;
-            ++ndv;
-        }
-        cumulativeFreq += rangeFreq + freq;
-        cumulativeNDV += ndv + 1.0;
-
-        // Add a histogram bucket.
-        const auto v = dataDistrib._bounds[startBucketIdx];
-        const auto [copyTag, copyVal] = value::copyValue(v.getTag(), v.getValue());
-        bounds.push_back(copyTag, copyVal);
-        buckets.emplace_back(freq, rangeFreq, cumulativeFreq, ndv, cumulativeNDV);
-        startBucketIdx++;
-    }
-
-    return {std::move(bounds), std::move(buckets)};
-}
-
-/**
- * Helper that builds the inputs for an array estimator's min, max, and unique histograms from the
- * values in a single array. For each value in 'arrayElements', update the min, max, and unique
- * value vectors. These will be used to generate the corresponding histograms for array values.
- */
-void updateMinMaxUniqArrayVals(std::vector<SBEValue>& arrayElements,
-                               std::vector<SBEValue>& arrayMinData,
-                               std::vector<SBEValue>& arrayMaxData,
-                               std::vector<SBEValue>& arrayUniqueData) {
-
-    if (arrayElements.size() == 0) {
-        return;
-    }
-
-    sortValueVector(arrayElements);
-
-    // Emit values for arrayMin and arrayMax histograms.
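The block that follows walks the sorted array once, opening each type-class run into arrayMinData and closing it into arrayMaxData. A self-contained sketch of the same technique, with a (type class, value) pair standing in for the real SBE tag/value pair (Elem and emitMinMax are hypothetical names, not part of this patch):

#include <string>
#include <utility>
#include <vector>

using Elem = std::pair<std::string, int>;  // (type class, value), sorted by class, then value

void emitMinMax(const std::vector<Elem>& sorted, std::vector<Elem>& mins, std::vector<Elem>& maxs) {
    const Elem* prev = nullptr;
    for (const auto& e : sorted) {
        if (!prev || prev->first != e.first) {
            if (prev) {
                maxs.push_back(*prev);  // close the previous type-class run
            }
            mins.push_back(e);  // first element of a new run is that class's minimum
        }
        prev = &e;
    }
    if (prev) {
        maxs.push_back(*prev);  // the last element closes the final run
    }
}

For sorted = {("num", 1), ("num", 5), ("str", 0), ("str", 3)}, this yields mins = {("num", 1), ("str", 0)} and maxs = {("num", 5), ("str", 3)}, mirroring how arrayMinData and arrayMaxData are filled below.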
- { - boost::optional prev; - for (const auto& element : arrayElements) { - if (!prev) { - arrayMinData.push_back(element); - } else if (!sameTypeClass(prev->getTag(), element.getTag())) { - arrayMaxData.push_back(*prev); - arrayMinData.push_back(element); - } - prev = element; - } - if (prev) { - arrayMaxData.push_back(*prev); - } - } - - // Emit values for arrayUnique histogram. - { - boost::optional prev; - for (const auto& element : arrayElements) { - if (!prev || - compareValues( - prev->getTag(), prev->getValue(), element.getTag(), element.getValue()) < 0) { - arrayUniqueData.push_back(element); - prev = element; - } - } - } -} - -ArrayHistogram createArrayEstimator(const std::vector& arrayData, size_t nBuckets) { - // Values that will be used as inputs to histogram generation code. - std::vector scalarData; - std::vector arrayMinData; - std::vector arrayMaxData; - std::vector arrayUniqueData; - - // Type counters. - TypeCounts typeCounts; - TypeCounts arrayTypeCounts; - - // Value counters. - double emptyArrayCount = 0; - double trueCount = 0; - double falseCount = 0; - - for (const auto& v : arrayData) { - const auto val = v.getValue(); - const auto tag = v.getTag(); - - // Increment type counters. - auto tagCount = typeCounts.insert({tag, 1}); - if (!tagCount.second) { - ++tagCount.first->second; - } - - if (tag == value::TypeTags::Array) { - // If we have an array, we can construct min, max, and unique histograms from its - // elements, provided that they are histogrammable. - std::vector arrayElements; - - value::Array* arr = value::getArrayView(val); - size_t arrSize = arr->size(); - if (arrSize == 0) { - ++emptyArrayCount; - continue; - } - - for (size_t i = 0; i < arrSize; i++) { - const auto [tag, val] = arr->getAt(i); - - // Increment array type tag counts. - auto arrTagCount = arrayTypeCounts.insert({tag, 1}); - if (!arrTagCount.second) { - ++arrTagCount.first->second; - } - - if (!canEstimateTypeViaHistogram(tag)) { - // If the elements of this array are not histogrammable, then we can only update - // the array type counters - continue; - } - - const auto [tagCopy, valCopy] = value::copyValue(tag, val); - arrayElements.emplace_back(tagCopy, valCopy); - } - updateMinMaxUniqArrayVals(arrayElements, arrayMinData, arrayMaxData, arrayUniqueData); - - } else if (tag == value::TypeTags::Boolean) { - // If we have a boolean, we also have counters for true and false values we should - // increment here. - if (value::bitcastTo(val)) { - trueCount++; - } else { - falseCount++; - } - continue; - - } else if (!canEstimateTypeViaHistogram(tag)) { - // If we have a non-histogrammable type, we can only increment the type counters for it; - // we cannot build a scalar histogram on it. - continue; - - } else { - // Assume non-arrays are scalars. Emit values for the scalar histogram. - scalarData.push_back(v); - } - } - - // Lambda helper to construct histogram from an unsorted value vector. 
- const auto makeHistogram = [&nBuckets](std::vector& values) { - sortValueVector(values); - return genMaxDiffHistogram(getDataDistribution(values), nBuckets); - }; - - return {makeHistogram(scalarData), - std::move(typeCounts), - makeHistogram(arrayUniqueData), - makeHistogram(arrayMinData), - makeHistogram(arrayMaxData), - std::move(arrayTypeCounts), - emptyArrayCount, - trueCount, - falseCount}; -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/max_diff.h b/src/mongo/db/query/ce/max_diff.h deleted file mode 100644 index ab69f7001eb..00000000000 --- a/src/mongo/db/query/ce/max_diff.h +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include - -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/query/ce/value_utils.h" - -namespace mongo::ce { - -struct ValFreq { - ValFreq(size_t idx, size_t freq) : _idx(idx), _freq(freq), _area(-1.0), _normArea(-1) {} - - std::string toString() const { - std::ostringstream os; - os << "idx: " << _idx << ", freq: " << _freq << ", area: " << _area - << ", normArea: " << _normArea; - return os.str(); - } - - size_t _idx; // Original index according to value order. - size_t _freq; // Frequency of the value. - double _area; // Derived as: spread * frequency - double _normArea; // Area normalized to the maximum in a type class. -}; - -struct DataDistribution { - std::vector _bounds; - std::vector _freq; - // The min/max areas of each type class. The key is the index of the last boundary of the class. 
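Concretely (hypothetical areas): by the getDataDistribution code above, sorted bounds [1, 5, 9, "a", "z"] form two type classes, so the member declared next would hold two entries, one per class, keyed by where each class's run of bounds ends:

std::map<size_t, double> typeClassBounds{{2, 40.0}, {5, 3.5}};  // numeric run, then string run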
- std::map typeClassBounds; -}; - -/** - Given a set of values sorted in BSON order, generate a data distribution consisting of - counts for each value with the values in sorted order -*/ -DataDistribution getDataDistribution(const std::vector& sortedInput); - -/** - Given a data distribution, generate a scalar histogram with the supplied number of buckets -*/ -ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistrib, size_t numBuckets); - -/** - Given a vector containing SBEValues, generate a set of statistics to summarize the supplied - data. Histograms will use the supplied number of buckets. -*/ -ArrayHistogram createArrayEstimator(const std::vector& arrayData, size_t nBuckets); - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp index 2f1e2185f8d..80364fea0bb 100644 --- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp +++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp @@ -27,30 +27,36 @@ * it in the license file. */ -#include -#include - #include "mongo/db/concurrency/lock_state.h" #include "mongo/db/exec/sbe/abt/sbe_abt_test_util.h" #include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/ce_test_utils.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/ce/max_diff.h" -#include "mongo/db/query/ce/maxdiff_test_utils.h" -#include "mongo/db/query/ce/rand_utils.h" -#include "mongo/db/query/ce/rand_utils_new.h" -#include "mongo/db/query/ce/scalar_histogram.h" +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/ce/test_utils.h" #include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/max_diff.h" +#include "mongo/db/query/stats/maxdiff_test_utils.h" +#include "mongo/db/query/stats/rand_utils.h" +#include "mongo/db/query/stats/rand_utils_new.h" +#include "mongo/db/query/stats/scalar_histogram.h" #include "mongo/logv2/log_component.h" #include "mongo/logv2/log_component_settings.h" #include "mongo/logv2/log_severity.h" #include "mongo/unittest/unittest.h" -namespace mongo::ce::statistics { +namespace mongo::optimizer::ce { namespace { +namespace value = sbe::value; + +using stats::ArrayHistogram; +using stats::Bucket; +using stats::DataDistribution; +using stats::genFixedValueArray; +using stats::getDataDistribution; +using stats::makeHistogram; +using stats::makeInt64Value; +using stats::ScalarHistogram; -using namespace sbe; const double kTolerance = 0.001; class HistogramTest : public ServiceContextTest {}; @@ -266,4 +272,4 @@ TEST_F(HistogramTest, MaxDiffEmptyArrays) { } } // namespace -} // namespace mongo::ce::statistics +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/maxdiff_test_utils.cpp b/src/mongo/db/query/ce/maxdiff_test_utils.cpp deleted file mode 100644 index b27cbb6ec93..00000000000 --- a/src/mongo/db/query/ce/maxdiff_test_utils.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/maxdiff_test_utils.h" - -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/max_diff.h" - -namespace mongo::ce { - -static std::vector convertToJSON(const std::vector& input) { - std::vector result; - - for (size_t i = 0; i < input.size(); i++) { - const auto [objTag, objVal] = value::makeNewObject(); - value::ValueGuard vg(objTag, objVal); - - const auto [tag, val] = input[i].get(); - // Copy the value because objVal owns its value, and the ValueGuard releases not only - // objVal, but also its Value (in the case below - copyVal). - const auto [copyTag, copyVal] = value::copyValue(tag, val); - value::getObjectView(objVal)->push_back("a", copyTag, copyVal); - - std::ostringstream os; - os << std::make_pair(objTag, objVal); - result.push_back(os.str()); - } - - return result; -} - -size_t getActualCard(OperationContext* opCtx, - const std::vector& input, - const std::string& query) { - return mongo::optimizer::runPipeline(opCtx, query, convertToJSON(input)).size(); -} - -std::string makeMatchExpr(const SBEValue& val, EstimationType cmpOp) { - std::stringstream matchExpr; - std::string cmpOpName = estimationTypeName.at(cmpOp); - matchExpr << "[{$match: {a: {$" << cmpOpName << ": " << val.get() << "}}}]"; - return matchExpr.str(); -} - -ScalarHistogram makeHistogram(std::vector& randData, size_t nBuckets) { - sortValueVector(randData); - const DataDistribution& dataDistrib = getDataDistribution(randData); - return genMaxDiffHistogram(dataDistrib, nBuckets); -} - -std::string printValueArray(const std::vector& values) { - std::stringstream strStream; - for (size_t i = 0; i < values.size(); ++i) { - strStream << " " << values[i].get(); - } - return strStream.str(); -} - -std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header) { - std::ostringstream os; - os << header << "\n"; - if (!estimator.getScalar().empty()) { - os << "Scalar histogram:\n" << estimator.getScalar().plot(); - } - if (!estimator.getArrayUnique().empty()) { - os << "Array unique histogram:\n" << estimator.getArrayUnique().plot(); - } - if (!estimator.getArrayMin().empty()) { - os << "Array min histogram:\n" << estimator.getArrayMin().plot(); - } - if (!estimator.getArrayMax().empty()) { - os << "Array max histogram:\n" << estimator.getArrayMax().plot(); - } - if (!estimator.getTypeCounts().empty()) { - os << "Per scalar data type value counts: "; - for (auto tagCount : estimator.getTypeCounts()) { - os << tagCount.first << "=" << tagCount.second << " "; - } - } - if 
(!estimator.getArrayTypeCounts().empty()) { - os << "\nPer array data type value counts: "; - for (auto tagCount : estimator.getArrayTypeCounts()) { - os << tagCount.first << "=" << tagCount.second << " "; - } - } - if (estimator.isArray()) { - os << "\nEmpty array count: " << estimator.getEmptyArrayCount(); - } - os << "\n"; - - return os.str(); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/maxdiff_test_utils.h b/src/mongo/db/query/ce/maxdiff_test_utils.h deleted file mode 100644 index 4ea1244da02..00000000000 --- a/src/mongo/db/query/ce/maxdiff_test_utils.h +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include - - -#include "mongo/db/exec/sbe/abt/sbe_abt_test_util.h" -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/histogram_estimation.h" -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/query/ce/value_utils.h" - -namespace mongo::ce { - -class ArrayHistogram; - -/** - Given a list of SBE values and a query, create a collection containing the data, - and count the results from the supplied query. - */ -size_t getActualCard(OperationContext* opCtx, - const std::vector& input, - const std::string& query); - -/** - Given a value and a comparison operator, generate a match expression reflecting - x cmpOp val. -*/ -std::string makeMatchExpr(const SBEValue& val, EstimationType cmpOp); - -/** - Given a vector of values, create a histogram reflection the distribution of the vector - with the supplied number of buckets. -*/ -ScalarHistogram makeHistogram(std::vector& randData, size_t nBuckets); - -/** - Serialize a vector of values. -*/ -std::string printValueArray(const std::vector& values); - -/** - Plot a set of statistics as stored in ArrayHistogram. -*/ -std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header); - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/rand_utils.cpp b/src/mongo/db/query/ce/rand_utils.cpp deleted file mode 100644 index 7f317904298..00000000000 --- a/src/mongo/db/query/ce/rand_utils.cpp +++ /dev/null @@ -1,391 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include -#include -#include -#include - -#include "mongo/db/query/ce/rand_utils.h" - -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/util/assert_util.h" - -namespace mongo::ce { - -const std::string DatasetDescriptor::_alphabet = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -DatasetDescriptor::DatasetDescriptor(const DataTypeDistribution& dataTypeDistribution, - size_t intNDV, - int minInt, - int maxInt, - size_t strNDV, - size_t minStrLen, - size_t maxStrLen, - std::shared_ptr nestedDataDescriptor, - double reuseScalarsRatio, - size_t arrNDV, - size_t minArrLen, - size_t maxArrLen) - : _gen{42}, - _reuseScalarsRatio(reuseScalarsRatio), - _intNDV(std::min(intNDV, static_cast(std::abs(maxInt - minInt)))), - _uniformIntDist{minInt, maxInt}, - _arrNDV(arrNDV), - _uniformArrSizeDist{minArrLen, maxArrLen}, - _nestedDataDescriptor(nestedDataDescriptor) { - uassert(6660520, "Maximum integer number must be >= the minimum one.", (maxInt >= minInt)); - uassert(6660521, "Maximum string size must be >= the minimum one.", (maxStrLen >= minStrLen)); - uassert(6660522, - "Array specs must be 0 if there is no array data descriptor.", - _nestedDataDescriptor || (arrNDV == 0 && minArrLen == 0 && maxArrLen == 0)); - uassert(6660523, - "Nested arrays requires sensible array lengths", - !_nestedDataDescriptor || maxArrLen >= minArrLen); - uassert(6660524, "Recursive descriptors are not allowed.", nestedDataDescriptor.get() != this); - uassert(6660525, - "reuseScalarsRatio is a probability, must be in [0, 1].", - reuseScalarsRatio >= 0 && reuseScalarsRatio <= 1.0); - - // Compute absolute ranges given relative weights of each value type. 
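The loop that follows normalizes the weights into cumulative boundaries, so that a data type can later be drawn with a single upper_bound lookup on a uniform [0, 1) draw. A std-only sketch of that lookup technique (pickWeighted and the weights are hypothetical):

#include <map>
#include <random>

char pickWeighted(std::mt19937_64& gen) {
    // Cumulative probability -> outcome: 'a' with 50%, 'b' with 30%, 'c' with 20%.
    static const std::map<double, char> cumulative{{0.5, 'a'}, {0.8, 'b'}, {1.0, 'c'}};
    std::uniform_real_distribution<double> u{0.0, 1.0};
    return cumulative.upper_bound(u(gen))->second;  // first boundary strictly above the draw
}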
- double sumWeights = 0; - for (const auto& weightedType : dataTypeDistribution) { - sumWeights += weightedType.second; - } - double sumRelativeWeights = 0; - auto lastKey = dataTypeDistribution.crbegin()->first; - for (auto it = dataTypeDistribution.cbegin(); it != dataTypeDistribution.cend(); ++it) { - const auto weightedType = *it; - if (weightedType.first != lastKey) { - sumRelativeWeights += weightedType.second / sumWeights; - uassert(6660526, "The sum of weights can't be >= 1", sumRelativeWeights < 1); - } else { - // Due to rounding errors the last relative weight may not be exactly 1.0. Set it - // to 1.0. - sumRelativeWeights = 1.0; - } - _dataTypeDistribution.emplace(sumRelativeWeights, weightedType.first); - } - - // Generate a set of random integers. - mongo::stdx::unordered_set tmpIntSet; - tmpIntSet.reserve(_intNDV); - if (_intNDV == intNDV) { - for (int i = minInt; i <= maxInt; ++i) { - tmpIntSet.insert(i); // This is a dense set of all ints the range. - } - } else { - size_t randCount = 0; - while (tmpIntSet.size() < _intNDV && randCount < 10 * _intNDV) { - int randInt = _uniformIntDist(_gen); - ++randCount; - tmpIntSet.insert(randInt); - } - } - uassert( - 6660527, "Too few integers generated.", (double)tmpIntSet.size() / (double)_intNDV > 0.99); - _intSet.reserve(tmpIntSet.size()); - _intSet.insert(_intSet.end(), tmpIntSet.begin(), tmpIntSet.end()); - _uniformIntIdxDist.param( - std::uniform_int_distribution::param_type(0, _intSet.size() - 1)); - - // Generate a set of random strings with random sizes so that each string can be chosen - // multiple times in the test data set. - _stringSet.reserve(strNDV); - std::uniform_int_distribution uniformStrSizeDistr{minStrLen, maxStrLen}; - for (size_t i = 0; i < strNDV; ++i) { - size_t len = uniformStrSizeDistr(_gen); - const auto randStr = genRandomString(len); - _stringSet.push_back(randStr); - } - _uniformStrIdxDist.param( - std::uniform_int_distribution::param_type(0, _stringSet.size() - 1)); - - // Generate a set of random arrays that are chosen from when generating array data. - fillRandomArraySet(); -} - -std::vector DatasetDescriptor::genRandomDataset(size_t nElems, - DatasetDescriptor* parentDesc) { - std::vector randValues; - randValues.reserve(nElems); - DatasetDescriptor* curDesc = this; - - if (parentDesc) { - double reuseProb = _uniformRandProbability(_gen); - if (reuseProb < parentDesc->_reuseScalarsRatio) { - curDesc = parentDesc; - } - } - - for (size_t i = 0; i < nElems; ++i) { - // Get the data type of the current value to be generated. - value::TypeTags genTag = this->getRandDataType(); - // Generate a random value of the corresponding type. 
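As an aside on the constructor above: it collects distinct integers by repeated insertion into a set, capping the attempts at 10x the requested NDV so a narrow range cannot spin forever. A std-only sketch of that pattern (distinctInts is a hypothetical name):

#include <random>
#include <unordered_set>
#include <vector>

std::vector<int> distinctInts(size_t ndv, int lo, int hi, std::mt19937_64& gen) {
    std::uniform_int_distribution<int> dist{lo, hi};
    std::unordered_set<int> seen;
    for (size_t tries = 0; seen.size() < ndv && tries < 10 * ndv; ++tries) {
        seen.insert(dist(gen));  // duplicates are absorbed by the set
    }
    return {seen.begin(), seen.end()};
}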
- switch (genTag) { - case value::TypeTags::NumberInt64: { - size_t idx = curDesc->_uniformIntIdxDist(_gen); - auto randInt = curDesc->_intSet.at(idx); - const auto [tag, val] = makeInt64Value(randInt); - randValues.emplace_back(tag, val); - break; - } - case value::TypeTags::StringBig: - case value::TypeTags::StringSmall: { - size_t idx = curDesc->_uniformStrIdxDist(_gen); - const auto randStr = curDesc->_stringSet.at(idx); - const auto [tag, val] = value::makeNewString(randStr); - const auto [copyTag, copyVal] = value::copyValue(tag, val); - randValues.emplace_back(copyTag, copyVal); - break; - } - case value::TypeTags::Array: { - if (_nestedDataDescriptor) { - const auto randArray = genRandomArray(); - auto [arrayTag, arrayVal] = value::makeNewArray(); - value::Array* arr = value::getArrayView(arrayVal); - for (const auto& elem : randArray) { - const auto [copyTag, copyVal] = - value::copyValue(elem.getTag(), elem.getValue()); - arr->push_back(copyTag, copyVal); - } - randValues.emplace_back(arrayTag, arrayVal); - } - break; - } - default: - uasserted(6660528, "Unsupported data type"); - } - } - - return randValues; -} - -std::string DatasetDescriptor::genRandomString(size_t len) { - std::string randStr; - randStr.reserve(len); - for (size_t i = 0; i < len; ++i) { - size_t idx = _uniformCharIdxDist(_gen); - const char ch = _alphabet[idx]; - randStr += ch; - } - - return randStr; -} - -std::vector DatasetDescriptor::genRandomArray() { - uassert(6660529, - "There must be a nested data descriptor for random array generation.", - _nestedDataDescriptor); - if (_arrNDV == 0) { - size_t randArraySize = _uniformArrSizeDist(_gen); - return _nestedDataDescriptor->genRandomDataset(randArraySize, this); - } else { - size_t idx = _uniformArrIdxDist(_gen); - return _arraySet.at(idx); - } -} - -void DatasetDescriptor::fillRandomArraySet() { - for (size_t i = 0; i < _arrNDV; ++i) { - size_t randArraySize = _uniformArrSizeDist(_gen); - const auto randArray = _nestedDataDescriptor->genRandomDataset(randArraySize, this); - _arraySet.push_back(randArray); - } - - if (_arrNDV > 0) { - _uniformArrIdxDist.param( - std::uniform_int_distribution::param_type(0, _arraySet.size() - 1)); - } -} - -/** - Generate a random string. It is possible (even expected) that the same parameters - will generate different strings on successive calls -*/ -std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed) { - std::string randStr; - randStr.reserve(len); - const constexpr char* kAlphabet = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - std::uniform_int_distribution uniformDist{0, std::strlen(kAlphabet) - 1}; - - for (size_t i = 0; i < len; ++i) { - size_t idx = uniformDist(gen); - const char ch = kAlphabet[idx]; - randStr += ch; - } - - return randStr; -} - -/** - Generate a string. This string will be deterministic in that the same - parameters will always generate the same string, even on different platforms. -*/ -std::string genString(size_t len, size_t seed) { - std::string str; - str.reserve(len); - - const constexpr char* kAlphabet = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - const int kAlphabetLength = strlen(kAlphabet); - - unsigned long long rand = seed; - for (size_t i = 0; i < len; ++i) { - // Library implementations of rand vary by compiler, naturally, Since we still - // want the appearance of randomness, but consistency across compilers, we use a linear - // congruential generator to choose characters for the string. 
The parameters chosen - // are from Numerical Recipes. We use the upper 32 bits when calculating the character - // index, as the lower 32 are essentially nonrandom -- a weakness of LCGs in general. - rand = 3935559000370003845ULL * rand + 269134368944950781ULL; - - int idx = (rand >> 32) % kAlphabetLength; - str += kAlphabet[idx]; - } - - return str; -} - -/** - Generate an array of values with the required ratio of int to string. This array will be - deterministic in that the same parameters will always generate the same array, even on - different platforms. -*/ -std::vector genFixedValueArray(size_t nElems, double intRatio, double strRatio) { - - std::vector values; - - const int intNDV = static_cast(nElems) / 4; - for (size_t i = 0; i < std::round(nElems * intRatio); ++i) { - const auto [tag, val] = makeInt64Value((i % intNDV) + 1); - values.emplace_back(tag, val); - } - - if (strRatio == 0.0) { - return values; - } - - // Generate a set of strings so that each string can be chosen multiple times in the test - // data set. - const size_t strNDV = nElems / 5; - std::vector stringSet; - stringSet.reserve(strNDV); - for (size_t i = 0; i < strNDV; ++i) { - const auto randStr = genString(8, i); - stringSet.push_back(randStr); - } - - for (size_t i = 0; i < std::round(nElems * strRatio); ++i) { - size_t idx = i % stringSet.size(); - const auto randStr = stringSet[idx]; - const auto [tag, val] = value::makeNewString(randStr); - values.emplace_back(tag, val); - } - - return values; -} - -std::vector genRandomValueArray(size_t nElems, - double intRatio, - double strRatio, - size_t seed) { - std::vector randValues; - const int intNDV = static_cast(nElems) / 4; - const size_t strNDV = nElems / 5; - std::vector stringSet; - stringSet.reserve(strNDV); - - std::mt19937_64 gen{seed}; - std::uniform_int_distribution uniformDist{1, intNDV}; - - for (size_t i = 0; i < std::round(nElems * intRatio); ++i) { - const auto [tag, val] = makeInt64Value(uniformDist(gen)); - randValues.emplace_back(tag, val); - } - - // Generate a set of strings so that each string can be chosen multiple times in the test - // data set. - for (size_t i = 0; i < strNDV; ++i) { - const auto randStr = genRandomString(8, gen, seed); - stringSet.push_back(randStr); - } - - std::uniform_int_distribution idxDistr{0, stringSet.size() - 1}; - for (size_t i = 0; i < std::round(nElems * strRatio); ++i) { - size_t idx = idxDistr(gen); - const auto randStr = stringSet[idx]; - const auto [tag, val] = value::makeNewString(randStr); - randValues.emplace_back(tag, val); - } - - return randValues; -} - -std::vector nestArrays(const std::vector& input, size_t emptyArrayCount) { - std::vector result; - auto [arrayTag, arrayVal] = value::makeNewArray(); - - for (size_t i = 0; i < input.size(); i++) { - const auto v = input[i].get(); - const auto [tagCopy, valCopy] = value::copyValue(v.first, v.second); - - if (i % 10 < 5) { - // 50% of values remain scalar. - result.emplace_back(tagCopy, valCopy); - } else { - // 50% of the values are grouped into arrays of size 10. - value::Array* arr = value::getArrayView(arrayVal); - arr->push_back(tagCopy, valCopy); - if (arr->size() == 10) { - result.emplace_back(arrayTag, arrayVal); - std::tie(arrayTag, arrayVal) = value::makeNewArray(); - } - } - } - - for (size_t i = 0; i < emptyArrayCount; ++i) { - auto [emptyArrayTag, emptyArrayVal] = value::makeNewArray(); - result.emplace_back(emptyArrayTag, emptyArrayVal); - } - - // It's possible that the array still contains something. 
If it's empty, - // we can safely release it. If not, append it to the result. - value::Array* arr = value::getArrayView(arrayVal); - if (arr->size() > 0) { - result.emplace_back(arrayTag, arrayVal); - } else { - value::releaseValue(arrayTag, arrayVal); - } - - return result; -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/rand_utils.h b/src/mongo/db/query/ce/rand_utils.h deleted file mode 100644 index 9a9acad5161..00000000000 --- a/src/mongo/db/query/ce/rand_utils.h +++ /dev/null @@ -1,191 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include - -#include "value_utils.h" - -namespace mongo::ce { - -class SBEValue; - -// A simple histogram describing the distribution of values of each data type. -using DataTypeDistribution = std::map; - -/** - Describes the distribution of a dataset according to type and weight. Other ctor parameters - are used to describe the various data types which can be emitted and correspond to the fields - named similarly - */ -class DatasetDescriptor { -public: - DatasetDescriptor(const DataTypeDistribution& dataTypeDistribution, - size_t intNDV, - int minInt, - int maxInt, - size_t strNDV, - size_t minStrLen, - size_t maxStrLen, - std::shared_ptr nestedDataDescriptor = nullptr, - double reuseScalarsRatio = 0, - size_t arrNDV = 0, - size_t minArrLen = 0, - size_t maxArrLen = 0); - - // Generate a random dataset of 'nElems' according to the data distribution characteristics in - // this object. - std::vector genRandomDataset(size_t nElems, DatasetDescriptor* parentDesc = nullptr); - -private: - // Select a random value data type. - value::TypeTags getRandDataType() { - double key = _uniformRandProbability(_gen); - return (*_dataTypeDistribution.upper_bound(key)).second; - } - - // Generate a random string with size 'len'. - std::string genRandomString(size_t len); - - // Generate a random array with length determined uniformly between minArrLen and maxArrLen - std::vector genRandomArray(); - - // Generate a set of random arrays that are chosen from when generating array data. 
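Putting the class together, a hedged usage sketch (the mix and parameters are hypothetical, assuming the declarations in this header): half 64-bit integers and half strings, with bounded NDVs and string lengths:

DataTypeDistribution mix{{value::TypeTags::NumberInt64, 0.5},
                         {value::TypeTags::StringBig, 0.5}};
DatasetDescriptor desc(mix, 100 /*intNDV*/, 0 /*minInt*/, 1000 /*maxInt*/,
                       50 /*strNDV*/, 5 /*minStrLen*/, 10 /*maxStrLen*/);
std::vector<SBEValue> dataset = desc.genRandomDataset(10000);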
- void fillRandomArraySet(); - -private: - using InternalDataTypeDistribution = std::map; - /* - * General distribution charecteristics. - */ - - // Pseudo-random generator. - std::mt19937_64 _gen; - // Random probabilities. Used to: - // - Select Value data types as random indexes in '_dataTypeDistribution'. - // - Select the source of values - either existing scalars or new. - std::uniform_real_distribution _uniformRandProbability{0.0, 1.0}; - // Distribution of different SBE data types. There will be %percent values of each type. - InternalDataTypeDistribution _dataTypeDistribution; - double _reuseScalarsRatio; - - /* - * Integer data parameters. - */ - - // Number of distinct integer values. - const size_t _intNDV; - // A set of integers to choose from while generating random integers. - std::vector _intSet; - // Generator of random integers with uniform distribution. - std::uniform_int_distribution _uniformIntDist; - // Generator of random indexes into the set of integers '_intSet'. - std::uniform_int_distribution _uniformIntIdxDist; - - /* - * String data parameters. - */ - - // All strings draw characters from this alphabet. - static const std::string _alphabet; - // A set of random strings to choose from. In theory there can be duplicates, but this is very - // unlikely. We don't care much if there are a few duplicates anyway. - std::vector _stringSet; - // Generator of random indexes into the set of characters '_alphabet'. - std::uniform_int_distribution _uniformCharIdxDist{0, _alphabet.size() - 1}; - // Generator of random indexes into the set of strings '_stringSet'. - std::uniform_int_distribution _uniformStrIdxDist; - - /* - * Array data parameters. - */ - - // Number of distinct arrays. - // TODO: currently not used. The idea is to use it in the same way as arrays - pre-generate - // '_arrNDV' arrays, then select randomly from this initial set. - size_t _arrNDV; - // Set of arrays to pick from when generating random data. - std::vector> _arraySet; - // Generator of random array sizes. - std::uniform_int_distribution _uniformArrSizeDist; - // Descriptor of the dataset within each array. - std::shared_ptr _nestedDataDescriptor; - // Generator of random indexes into the set of arrays '_arraySet'. - std::uniform_int_distribution _uniformArrIdxDist; -}; // namespace mongo::ce - -/** - Generate a pseudorandom string of length n - * The alphabet is fixed as [0-9][a-z][A-Z] - * Characters are chosed uniformly from the alphabet - * Randomness is implemented such that it is independent of the platform, - i.e. given the same length and seed on any platform, we will produce the - same string. -*/ -std::string genString(size_t len, size_t seed); - -/** - Generate a set of elements consisting of strings and ints in the - requested ratio. The generated array will contain the same values given the same - inputs on all platforms. - */ -std::vector genFixedValueArray(size_t nElems, double intRatio, double strRatio); - -/** - Generate a random string of length len. - * The alphabet is fixed as [0-9][a-z][A-Z]. - * Characters are chosed uniformly from the alphabet. - * Generated strings are likely to differ by platform, so derived values depending on them - are also likely to change. - */ -std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed); - - -/** - Generate a uniformly random set of elements consisting of string and ints in the - requested ratio. The resulting array is very likely to differ between platforms, even - with the same seed. 
Thus, derived values are also likely to change.
-
-   Prefer genFixedValueArray when comparing derived values against constants.
- */
-std::vector<SBEValue> genRandomValueArray(size_t nElems,
-                                          double intRatio,
-                                          double strRatio,
-                                          size_t seed);
-
-/**
-   Generate a set of values consisting of half scalars and half arrays of length 10.
-
-   Values contained in the result will be drawn from the input vector.
- */
-std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input, size_t emptyArrayCount);
-
-}  // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/rand_utils_new.cpp b/src/mongo/db/query/ce/rand_utils_new.cpp
deleted file mode 100644
index 038e69dde04..00000000000
--- a/src/mongo/db/query/ce/rand_utils_new.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/**
- * Copyright (C) 2022-present MongoDB, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the Server Side Public License, version 1,
- * as published by MongoDB, Inc.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * Server Side Public License for more details.
- *
- * You should have received a copy of the Server Side Public License
- * along with this program. If not, see
- * <http://www.mongodb.com/licensing/server-side-public-license>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the Server Side Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */ - -#include -#include -#include -#include -#include - -#include "mongo/db/query/ce/rand_utils_new.h" - -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/util/assert_util.h" - -namespace mongo::ce { - -const std::string StrDistribution::_alphabet = - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -void DataTypeDistrNew::generate(std::vector& randValues, std::mt19937_64& gen) { - if (_nullsRatio > 0 && _nullSelector(gen) < _nullsRatio) { - auto [tag, val] = makeNullValue(); - randValues.emplace_back(tag, val); - } else { - size_t idx = (*_idxDist)(gen); - const auto val = _valSet.at(idx); - auto [copyTag, copyVal] = copyValue(val.getTag(), val.getValue()); - randValues.emplace_back(copyTag, copyVal); - } -} - -void DataTypeDistrNew::generate(value::Array* randValueArray, std::mt19937_64& gen) { - if (_nullsRatio > 0 && _nullSelector(gen) < _nullsRatio) { - auto [tag, val] = makeNullValue(); - randValueArray->push_back(tag, val); - } else { - size_t idx = (*_idxDist)(gen); - const auto val = _valSet.at(idx); - auto [copyTag, copyVal] = copyValue(val.getTag(), val.getValue()); - randValueArray->push_back(copyTag, copyVal); - } -} - -IntDistribution::IntDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - int minInt, - int maxInt, - double nullsRatio) - : DataTypeDistrNew(distrDescriptor, - value::TypeTags::NumberInt64, - weight, - std::min(ndv, static_cast(std::abs(maxInt - minInt))), - nullsRatio), - _minInt(minInt), - _maxInt(maxInt) { - uassert(6660507, "Maximum integer number must be >= the minimum one.", (maxInt >= minInt)); -} - -void IntDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { - std::set tmpIntSet; - std::uniform_int_distribution uniformIntDist{_minInt, _maxInt}; - - if (_ndv == static_cast(std::abs(_maxInt - _minInt))) { - // This is a dense set of all ints in the range. - for (int i = _minInt; i <= _maxInt; ++i) { - tmpIntSet.insert(i); - } - } else { - size_t randCount = 0; - while (tmpIntSet.size() < _ndv && randCount < 10 * _ndv) { - int randInt = uniformIntDist(gen); - ++randCount; - tmpIntSet.insert(randInt); - } - } - uassert(6660508, "Too few integers generated.", (double)tmpIntSet.size() / (double)_ndv > 0.99); - _valSet.reserve(tmpIntSet.size()); - for (const auto randInt : tmpIntSet) { - const auto [tag, val] = makeInt64Value(randInt); - _valSet.emplace_back(tag, val); - } - - _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); -} - -StrDistribution::StrDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - size_t minStrLen, - size_t maxStrLen, - double nullsRatio) - : DataTypeDistrNew(distrDescriptor, value::TypeTags::StringBig, weight, ndv, nullsRatio), - _minStrLen(minStrLen), - _maxStrLen(maxStrLen) { - uassert(6660509, "Maximum string size must be >= the minimum one.", (maxStrLen >= minStrLen)); -} - -void StrDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { - // Generate a set of random strings with random sizes between _minStrLen and _maxStrLen. 
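Note the division of labor used by all of these distributions: init() materializes the _ndv distinct values once, and generate() afterwards only draws an index, so per-value cost stays constant. A std-only sketch of the pattern (Pool is a hypothetical name):

#include <random>
#include <string>
#include <vector>

struct Pool {
    std::vector<std::string> vals;  // filled once by an init()-style step

    const std::string& sample(std::mt19937_64& gen) const {
        std::uniform_int_distribution<size_t> idx{0, vals.size() - 1};
        return vals[idx(gen)];  // O(1) draw; values are never re-generated
    }
};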
- _valSet.reserve(_ndv); - std::uniform_int_distribution uniformStrSizeDistr{_minStrLen, _maxStrLen}; - for (size_t i = 0; i < _ndv; ++i) { - size_t len = uniformStrSizeDistr(gen); - const auto randStr = genRandomString(len, gen); - const auto [tag, val] = value::makeNewString(randStr); - _valSet.emplace_back(tag, val); - } - - _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); -} - -std::string StrDistribution::genRandomString(size_t len, std::mt19937_64& gen) { - std::string randStr; - randStr.reserve(len); - for (size_t i = 0; i < len; ++i) { - size_t idx = _uniformCharIdxDist(gen); - const char ch = _alphabet[idx]; - randStr += ch; - } - - return randStr; -} - -ArrDistribution::ArrDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - size_t minArrLen, - size_t maxArrLen, - std::unique_ptr arrayDataDescriptor, - double reuseScalarsRatio, - double nullsRatio) - : DataTypeDistrNew(distrDescriptor, value::TypeTags::Array, weight, ndv, nullsRatio), - _uniformArrSizeDist{minArrLen, maxArrLen}, - _arrayDataDescriptor(std::move(arrayDataDescriptor)), - _reuseScalarsRatio(reuseScalarsRatio) { - uassert(6660510, - "Array specs must be 0 if there is no array data descriptor.", - _arrayDataDescriptor || (ndv == 0 && minArrLen == 0 && maxArrLen == 0)); - uassert(6660511, - "Nested arrays requires sensible array lengths.", - !_arrayDataDescriptor || maxArrLen >= minArrLen); - uassert(6660512, - "reuseScalarsRatio must be in [0, 1].", - reuseScalarsRatio >= 0 && reuseScalarsRatio <= 1.0); -} - -void ArrDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { - uassert(6660513, "There must always be a parent data descriptor.", parentDesc); - - // Extract the per-type probabilities from the parent descriptor, but set the array probability - // to 0 to avoid self-recursion. - std::vector parentProbabilities; - for (const auto& dtd : parentDesc->_dataTypeDistributions) { - double prob = (dtd->tag() == value::TypeTags::Array) ? 0 : dtd->weight(); - parentProbabilities.push_back(prob); - } - std::discrete_distribution parentDataTypeSelector; - parentDataTypeSelector.param(std::discrete_distribution::param_type( - parentProbabilities.begin(), parentProbabilities.end())); - - // Generate _ndv distinct arrays, and store them in _valSet. - for (size_t i = 0; i < _ndv; ++i) { - auto [arrayTag, arrayVal] = value::makeNewArray(); - value::Array* arr = value::getArrayView(arrayVal); - size_t randArraySize = _uniformArrSizeDist(gen); - arr->reserve(randArraySize); - // Generate the data for one random array. - for (size_t j = 0; j < randArraySize; ++j) { - DataTypeDistrNew* dtd = nullptr; - size_t idx; - double reuseParentProb = _uniformRandProbability(gen); - if (reuseParentProb < _reuseScalarsRatio) { - // Pick a random data type descriptor from the parent. - idx = parentDataTypeSelector(gen); - dtd = parentDesc->_dataTypeDistributions.at(idx).get(); - } else { - idx = _arrayDataDescriptor->_dataTypeSelector(gen); - dtd = _arrayDataDescriptor->_dataTypeDistributions.at(idx).get(); - } - dtd->generate(arr, gen); - } - _valSet.emplace_back(arrayTag, arrayVal); - } - - _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); -} - -DatasetDescriptorNew::DatasetDescriptorNew(TypeDistrVector dataTypeDistributions, - std::mt19937_64& gen) - : _dataTypeDistributions(std::move(dataTypeDistributions)), _gen{gen} { - - // The probability of each type to be chosen. 
Extracted into a vector in order to setup a - // discrete_distribution. - std::vector probabilities; - probabilities.reserve(_dataTypeDistributions.size()); - for (auto& dtd : _dataTypeDistributions) { - dtd->init(this, gen); - probabilities.push_back(dtd->weight()); - } - _dataTypeSelector.param( - std::discrete_distribution::param_type(probabilities.begin(), probabilities.end())); -} - -DataTypeDistrNew* DatasetDescriptorNew::getRandDataTypeDist() { - size_t idx = _dataTypeSelector(_gen); - return _dataTypeDistributions[idx].get(); -} - -std::vector DatasetDescriptorNew::genRandomDataset(size_t nElems) { - std::vector randValues; - randValues.reserve(nElems); - - for (size_t i = 0; i < nElems; ++i) { - DataTypeDistrNew* dtd = getRandDataTypeDist(); - dtd->generate(randValues, _gen); - } - - return randValues; -} - - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/rand_utils_new.h b/src/mongo/db/query/ce/rand_utils_new.h deleted file mode 100644 index 0420b990dce..00000000000 --- a/src/mongo/db/query/ce/rand_utils_new.h +++ /dev/null @@ -1,354 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include - -#include "value_utils.h" - -namespace mongo::ce { - -class SBEValue; -class DatasetDescriptorNew; - -/** - * A base class for wrappers of STL random distributions that produce size_t values within a range. - * This class enables polymorphic usage of random distributions, for instance to implement a mix of - * distributions. 
- */
-class RandomDistribution {
-public:
-    RandomDistribution() = default;
-    RandomDistribution(const RandomDistribution&) = default;
-    RandomDistribution(RandomDistribution&&) = default;
-    RandomDistribution& operator=(const RandomDistribution&) = default;
-    RandomDistribution& operator=(RandomDistribution&&) = default;
-    virtual ~RandomDistribution() = default;
-
-    virtual size_t operator()(std::mt19937_64& gen) = 0;
-};
-
-/**
-   A uniform random distribution of size_t within a range
- */
-class UniformDistr : public RandomDistribution {
-public:
-    UniformDistr(size_t min, size_t max) : _distr{min, max}, _min(min), _max(max) {}
-
-    size_t operator()(std::mt19937_64& gen) override {
-        size_t result = _distr(gen);
-        uassert(6660540, "Random index out of range", result >= _min && result <= _max);
-        return result;
-    }
-
-private:
-    std::uniform_int_distribution<size_t> _distr;
-    size_t _min;
-    size_t _max;
-};
-
-/**
- * Wrapper of normal distribution that is guaranteed to produce size_t values within a certain
- * range. The STL class normal_distribution takes a mean and standard deviation. This class
- * computes a suitable mean and standard deviation from the required [min,max] boundaries.
- */
-class NormalDistr : public RandomDistribution {
-public:
-    NormalDistr(size_t min, size_t max)
-        : _distr{(double)(min + max) / 2.0, (double)(max - min) / 4.0},
-          _backup{min, max},
-          _min((double)min),
-          _max((double)max) {}
-
-    size_t operator()(std::mt19937_64& gen) override {
-        size_t result = std::round(_distr(gen));
-        size_t trials = 0;
-        // If the result is outside the range (an event with low probability), try 10 more times to
-        // get a number in the range.
-        while (!(result >= _min && result <= _max) && trials < 10) {
-            double randNum = _distr(gen);
-            if (randNum < _min) {
-                result = std::ceil(randNum);
-            } else if (randNum > _max) {
-                result = std::floor(randNum);
-            } else {
-                result = std::round(randNum);
-            }
-            ++trials;
-        }
-        if (result < _min || result > _max) {
-            // We couldn't generate a number in [min,max] within 10 attempts. Generate a uniform
-            // number.
-            result = _backup(gen);
-        }
-        uassert(6660541, "Random index out of range", result >= _min && result <= _max);
-        return result;
-    }
-
-private:
-    std::normal_distribution<double> _distr;
-    std::uniform_int_distribution<size_t> _backup;
-    double _min;
-    double _max;
-};
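A note on the parameters chosen above: with mean (min + max) / 2 and standard deviation (max - min) / 4, the interval [min, max] sits two standard deviations either side of the mean, so roughly 95% of raw draws land in range and the retry loop rarely reaches the uniform backup. A quick std-only check of that rate for min = 0, max = 100:

#include <cstdio>
#include <random>

int main() {
    std::mt19937_64 gen{42};
    std::normal_distribution<double> d{50.0, 25.0};  // mean (0+100)/2, sigma (100-0)/4
    int inRange = 0;
    for (int i = 0; i < 100000; ++i) {
        const double x = d(gen);
        if (x >= 0.0 && x <= 100.0) {
            ++inRange;
        }
    }
    std::printf("%d of 100000 in range\n", inRange);  // expect roughly 95000
    return 0;
}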
- result = _backup(gen); - } - uassert(6660541, "Random index out of range", result >= _min && result <= _max); - return result; - } - -private: - std::normal_distribution _distr; - std::uniform_int_distribution _backup; - double _min; - double _max; -}; - -enum class DistrType { kUniform, kNormal }; - -using MixedDistributionDescriptor = std::vector>; - -/** - * Generator for mixed distribution, where mixing is on the type of distribution, in the - * probabilities specified in distrProbabilities - */ -class MixedDistribution { -public: - MixedDistribution(std::vector> distrMix, - std::vector& distrProbabilities) - : _distrMix(std::move(distrMix)) { - _distDist.param(std::discrete_distribution::param_type(distrProbabilities.begin(), - distrProbabilities.end())); - } - - static std::unique_ptr make(MixedDistributionDescriptor& descriptor, - size_t min, - size_t max) { - std::vector distrProbabilities; - std::vector> distrMix; - - for (const auto& [distrType, weight] : descriptor) { - distrProbabilities.push_back(weight); - switch (distrType) { - case DistrType::kUniform: - distrMix.emplace_back(std::make_unique(min, max)); - break; - case DistrType::kNormal: - distrMix.emplace_back(std::make_unique(min, max)); - break; - default: - MONGO_UNREACHABLE; - } - } - - return std::make_unique(std::move(distrMix), distrProbabilities); - } - - size_t operator()(std::mt19937_64& gen) { - size_t distIdx = _distDist(gen); - size_t result = (*_distrMix.at(distIdx))(gen); - return result; - } - -private: - // Mix of different distributions. There can be instances of the same type of distribution, - // because they can still be defined differently. - std::vector> _distrMix; - // Distribution of distributions - select the current distribution with a certain probability. - std::discrete_distribution _distDist; -}; - -/** - * Descriptor of a typed data distribution - */ -class DataTypeDistrNew {
-public: - DataTypeDistrNew(MixedDistributionDescriptor distrDescriptor, - value::TypeTags tag, - double weight, - size_t ndv, - double nullsRatio = 0.0) - : _mixedDistrDescriptor(distrDescriptor), - _tag(tag), - _weight(weight), - _ndv(ndv), - _nullsRatio(nullsRatio) { - uassert(6660542, "NDV must be > 0.", ndv > 0); - uassert(6660543, "nullsRatio must be in [0, 1].", nullsRatio >= 0 && nullsRatio <= 1); - } - - virtual ~DataTypeDistrNew() = default; - - /** - * Generate all unique values that generation chooses from, and store them in '_valSet'. - * Different data types provide different implementations. - * @todo: The 'parentDesc' parameter is used only by array generation. Consider a different way - * of passing it only to that type. - */ - virtual void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) = 0; - - /** - * Generate a single random value, and store it in 'randValues' vector. - */ - void generate(std::vector& randValues, std::mt19937_64& gen); - - /** - * Generate a single random value, and store it in 'randValueArray' array. - */ - void generate(value::Array* randValueArray, std::mt19937_64& gen); - - /** - * Custom equality comparison for storage in sets. There can be only one datatype in a set. - */ - bool operator==(const DataTypeDistrNew& d) const { - return this->_tag == d._tag; - } - - value::TypeTags tag() const { - return _tag; - } - - double weight() const { - return _weight; - } - -protected: - MixedDistributionDescriptor _mixedDistrDescriptor; - value::TypeTags _tag; - // Weight that determines the probability of a value of this type.
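- // For example, with an integer distribution of weight 0.8 and a string - // distribution of weight 0.2, roughly 80% of the generated values are expected - // to be integers: std::discrete_distribution normalizes the registered weights - // into selection probabilities.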
- const double _weight; - const size_t _ndv; - // A set of (randomly generated) values to choose from when generating random datasets. - std::vector _valSet; - // Generator of random indexes into a set of values. - // std::uniform_int_distribution _idxDist; - std::unique_ptr _idxDist; - // Percent of null values in the dataset. - double _nullsRatio; - std::uniform_real_distribution _nullSelector{0, 1}; - - friend class DatasetDescriptorNew; -}; - -using TypeDistrVector = std::vector>; - -/** - * Integer data distribution. - */ -class IntDistribution : public DataTypeDistrNew { -public: - IntDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - int minInt, - int maxInt, - double nullsRatio = 0); - - /* - * Generate a set of random integers, and store them in _valSet. - */ - void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; - -protected: - int _minInt; - int _maxInt; -}; - -/** - * String data distribution. - */ -class StrDistribution : public DataTypeDistrNew { -public: - StrDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - size_t minStrLen, - size_t maxStrLen, - double nullsRatio = 0); - - /* - * Generate a set of random strings, and store them in _valSet. - */ - void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; - -protected: - std::string genRandomString(size_t len, std::mt19937_64& gen); - - size_t _minStrLen; - size_t _maxStrLen; - // All strings draw characters from this alphabet. - static const std::string _alphabet; - // Generator of random indexes into the set of characters '_alphabet'. - std::uniform_int_distribution _uniformCharIdxDist{0, _alphabet.size() - 1}; -}; - -/** - * SBE array data distribution. - */ -class ArrDistribution : public DataTypeDistrNew { -public: - ArrDistribution(MixedDistributionDescriptor distrDescriptor, - double weight, - size_t ndv, - size_t minArrLen, - size_t maxArrLen, - std::unique_ptr arrayDataDescriptor, - double reuseScalarsRatio = 0, - double nullsRatio = 0); - -private: - void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override; - - // Generator of random array sizes. - std::uniform_int_distribution _uniformArrSizeDist; - // Descriptor of the dataset within each array. - std::unique_ptr _arrayDataDescriptor; - // Randomly select a parent or a child distribution when generating random values. - std::uniform_real_distribution _uniformRandProbability{0.0, 1.0}; - double _reuseScalarsRatio; -}; - -/** - Given a list of typed data distributions, this class is used to generate a vector of values - according to the distribution weights. -*/ -class DatasetDescriptorNew { -public: - DatasetDescriptorNew(TypeDistrVector dataTypeDistributions, std::mt19937_64& gen); - - // Generate a random dataset of 'nElems' according to the data distribution characteristics in - // this object. - std::vector genRandomDataset(size_t nElems); - -private: - // Select a random value data type. - DataTypeDistrNew* getRandDataTypeDist(); - - // Distribution of different SBE data types. Each type receives a share of the values in - // proportion to its weight. - // TODO: is it a better idea to store shared_ptr or raw pointers to enable reuse? - TypeDistrVector _dataTypeDistributions; - // Pseudo-random generator. - std::mt19937_64& _gen; - // Select a random data type distribution.
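- // (Type selection uses the discrete distribution declared below.) A minimal - // usage sketch of this class, with hypothetical arguments, assuming the element - // type elided above is SBEValue (forward-declared in this header): - //     std::mt19937_64 gen{42}; - //     TypeDistrVector distrs; - //     distrs.emplace_back(std::make_unique<IntDistribution>( - //         MixedDistributionDescriptor{{DistrType::kUniform, 1.0}}, - //         1.0 /*weight*/, 100 /*ndv*/, 0 /*minInt*/, 1000 /*maxInt*/)); - //     DatasetDescriptorNew desc{std::move(distrs), gen}; - //     std::vector<SBEValue> dataset = desc.genRandomDataset(10'000);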
- std::discrete_distribution _dataTypeSelector; - - friend class ArrDistribution; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/sampling_estimator.cpp b/src/mongo/db/query/ce/sampling_estimator.cpp new file mode 100644 index 00000000000..85fac93b0a2 --- /dev/null +++ b/src/mongo/db/query/ce/sampling_estimator.cpp @@ -0,0 +1,341 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/sampling_estimator.h" + +#include "mongo/db/exec/sbe/abt/abt_lower.h" +#include "mongo/db/query/cqf_command_utils.h" +#include "mongo/db/query/optimizer/explain.h" +#include "mongo/db/query/optimizer/index_bounds.h" +#include "mongo/db/query/optimizer/props.h" +#include "mongo/db/query/optimizer/utils/abt_hash.h" +#include "mongo/db/query/optimizer/utils/memo_utils.h" +#include "mongo/logv2/log.h" + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery + +namespace mongo::optimizer::ce { +class SamplingPlanExtractor { +public: + SamplingPlanExtractor(const cascades::Memo& memo, + const OptPhaseManager& phaseManager, + const size_t sampleSize) + : _memo(memo), _sampleSize(sampleSize), _phaseManager(phaseManager) {} + + void transport(ABT& n, const MemoLogicalDelegatorNode& node) { + n = extract(_memo.getLogicalNodes(node.getGroupId()).front()); + } + + void transport(ABT& n, const ScanNode& /*node*/, ABT& /*binder*/) { + // We will lower the scan node in a sampling context here. + // TODO: for now just return the documents in random order. + n = make(properties::LimitSkipRequirement(_sampleSize, 0), std::move(n)); + } + + void transport(ABT& n, const FilterNode& /*node*/, ABT& childResult, ABT& /*exprResult*/) { + // Skip over filters. + n = childResult; + } + + void transport(ABT& /*n*/, + const EvaluationNode& /*node*/, + ABT& /*childResult*/, + ABT& /*exprResult*/) { + // Keep Eval nodes. + } + + void transport(ABT& n, const SargableNode& node, ABT& childResult, ABT& refs, ABT& binds) { + ABT result = childResult; + // Retain only output bindings without applying filters. 
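+        // Each requirement below is re-lowered with a fully-open interval +        // (IntervalReqExpr::makeSingularDNF() with no bounds), so the sample still +        // materializes the projections that later stages reference while the +        // predicate itself filters nothing here; predicate selectivity is instead +        // estimated per requirement by SamplingTransport.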
+ for (const auto& [key, req] : node.getReqMap()) { + if (const auto& boundProjName = req.getBoundProjectionName()) { + lowerPartialSchemaRequirement( + key, + PartialSchemaRequirement{ + boundProjName, IntervalReqExpr::makeSingularDNF(), req.getIsPerfOnly()}, + result, + _phaseManager.getPathToInterval()); + } + } + std::swap(n, result); + } + + void transport(ABT& n, const CollationNode& /*node*/, ABT& childResult, ABT& refs) { + // Skip over collation nodes. + n = childResult; + } + + template + void transport(ABT& /*n*/, const T& /*node*/, Ts&&...) { + if constexpr (std::is_base_of_v) { + uasserted(6624242, "Should not be seeing other types of nodes here."); + } + } + + ABT extract(ABT node) { + algebra::transport(node, *this); + return node; + } + +private: + const cascades::Memo& _memo; + const size_t _sampleSize; + const OptPhaseManager& _phaseManager; +}; + +class SamplingTransport { + static constexpr size_t kMaxSampleSize = 1000; + +public: + SamplingTransport(OperationContext* opCtx, + OptPhaseManager phaseManager, + const int64_t numRecords, + std::unique_ptr fallbackCE) + : _phaseManager(std::move(phaseManager)), + _opCtx(opCtx), + _sampleSize(std::min(numRecords, kMaxSampleSize)), + _fallbackCE(std::move(fallbackCE)) {} + + CEType transport(const ABT& n, + const FilterNode& node, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + CEType childResult, + CEType /*exprResult*/) { + if (!properties::hasProperty(logicalProps)) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + + SamplingPlanExtractor planExtractor(memo, _phaseManager, _sampleSize); + // Create a plan with all eval nodes so far and the filter last. + ABT abtTree = make(node.getFilter(), planExtractor.extract(n)); + + return estimateFilterCE(metadata, memo, logicalProps, n, std::move(abtTree), childResult); + } + + CEType transport(const ABT& n, + const SargableNode& node, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + CEType childResult, + CEType /*bindResult*/, + CEType /*refsResult*/) { + if (!properties::hasProperty(logicalProps)) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + + SamplingPlanExtractor planExtractor(memo, _phaseManager, _sampleSize); + ABT extracted = planExtractor.extract(n); + + // Estimate individual requirements separately by potentially re-using cached results. + // Here we assume that each requirement is independent. + // TODO: consider estimating together the entire set of requirements (but caching!) + CEType result = childResult; + for (const auto& [key, req] : node.getReqMap()) { + if (req.getIsPerfOnly()) { + // Ignore perf-only requirements. + continue; + } + + if (!isIntervalReqFullyOpenDNF(req.getIntervals())) { + ABT lowered = extracted; + // Lower requirement without an output binding. + lowerPartialSchemaRequirement( + key, + PartialSchemaRequirement{boost::none /*boundProjectionName*/, + req.getIntervals(), + req.getIsPerfOnly()}, + lowered, + _phaseManager.getPathToInterval()); + uassert(6624243, "Expected a filter node", lowered.is()); + result = + estimateFilterCE(metadata, memo, logicalProps, n, std::move(lowered), result); + } + } + + return result; + } + + /** + * Other ABT types. + */ + template + CEType transport(const ABT& n, + const T& /*node*/, + const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + Ts&&...) 
{ + if (canBeLogicalNode()) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + return 0.0; + } + + CEType derive(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + const ABT::reference_type logicalNodeRef) { + return algebra::transport(logicalNodeRef, *this, metadata, memo, logicalProps); + } + +private: + CEType estimateFilterCE(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + const ABT& n, + ABT abtTree, + CEType childResult) { + auto it = _selectivityCacheMap.find(abtTree); + if (it != _selectivityCacheMap.cend()) { + // Cache hit. + return it->second * childResult; + } + + const auto [success, selectivity] = estimateSelectivity(abtTree); + if (!success) { + return _fallbackCE->deriveCE(metadata, memo, logicalProps, n.ref()); + } + + _selectivityCacheMap.emplace(std::move(abtTree), selectivity); + + OPTIMIZER_DEBUG_LOG(6264805, + 5, + "CE sampling estimated filter selectivity", + "selectivity"_attr = selectivity); + return selectivity * childResult; + } + + std::pair estimateSelectivity(ABT abtTree) { + // Add a group by to count number of documents. + const ProjectionName sampleSumProjection = "sum"; + abtTree = + make(ProjectionNameVector{}, + ProjectionNameVector{sampleSumProjection}, + makeSeq(make("$sum", makeSeq(Constant::int64(1)))), + std::move(abtTree)); + abtTree = make( + properties::ProjectionRequirement{ProjectionNameVector{sampleSumProjection}}, + std::move(abtTree)); + + + OPTIMIZER_DEBUG_LOG(6264806, + 5, + "Estimate selectivity ABT", + "explain"_attr = ExplainGenerator::explainV2(abtTree)); + + _phaseManager.optimize(abtTree); + + auto env = VariableEnvironment::build(abtTree); + SlotVarMap slotMap; + boost::optional ridSlot; + sbe::value::SlotIdGenerator ids; + SBENodeLowering g{env, + slotMap, + ridSlot, + ids, + _phaseManager.getMetadata(), + _phaseManager.getNodeToGroupPropsMap(), + _phaseManager.getRIDProjections(), + true /*randomScan*/}; + auto sbePlan = g.optimize(abtTree); + tassert(6624261, "Unexpected rid slot", !ridSlot); + + // TODO: return errors instead of exceptions? + uassert(6624244, "Lowering failed", sbePlan != nullptr); + uassert(6624245, "Invalid slot map size", slotMap.size() == 1); + + sbePlan->attachToOperationContext(_opCtx); + sbe::CompileCtx ctx(std::make_unique()); + sbePlan->prepare(ctx); + + std::vector accessors; + for (auto& [name, slot] : slotMap) { + accessors.emplace_back(sbePlan->getAccessor(ctx, slot)); + } + + sbePlan->open(false); + ON_BLOCK_EXIT([&] { sbePlan->close(); }); + + while (sbePlan->getNext() != sbe::PlanState::IS_EOF) { + const auto [tag, value] = accessors.at(0)->getViewOfValue(); + if (tag == sbe::value::TypeTags::NumberInt64) { + // TODO: check if we get exactly one result from the groupby? + return {true, static_cast(value) / _sampleSize}; + } + return {false, {}}; + }; + + // If nothing passes the filter, estimate 0.0 selectivity. HashGroup will return 0 results. + return {true, 0.0}; + } + + struct NodeRefHash { + size_t operator()(const ABT& node) const { + return ABTHashGenerator::generate(node); + } + }; + + struct NodeRefCompare { + bool operator()(const ABT& left, const ABT& right) const { + return left == right; + } + }; + + // Cache a logical node reference to computed selectivity. Used for Filter and Sargable nodes. + opt::unordered_map _selectivityCacheMap; + + OptPhaseManager _phaseManager; + + // We don't own this. 
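+    // (It is supplied by the caller that constructed this transport.) +    // +    // Worked example of the arithmetic in estimateSelectivity() and +    // estimateFilterCE() above, with illustrative numbers: if the optimized +    // sampling plan counts 37 of 1000 sampled documents passing a predicate, the +    // estimated selectivity is 37 / 1000 = 0.037; scaling a child cardinality of +    // 50,000 by it yields a CE of 1,850 for the filter.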
+ OperationContext* _opCtx; + + const int64_t _sampleSize; + std::unique_ptr _fallbackCE; +}; + +SamplingEstimator::SamplingEstimator(OperationContext* opCtx, + OptPhaseManager phaseManager, + const int64_t numRecords, + std::unique_ptr fallbackCE) + : _transport(std::make_unique( + opCtx, std::move(phaseManager), numRecords, std::move(fallbackCE))) {} + +SamplingEstimator::~SamplingEstimator() {} + +CEType SamplingEstimator::deriveCE(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + const ABT::reference_type logicalNodeRef) const { + return _transport->derive(metadata, memo, logicalProps, logicalNodeRef); +} + +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/sampling_estimator.h b/src/mongo/db/query/ce/sampling_estimator.h new file mode 100644 index 00000000000..cf9d0973a39 --- /dev/null +++ b/src/mongo/db/query/ce/sampling_estimator.h @@ -0,0 +1,56 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/optimizer/cascades/interfaces.h" +#include "mongo/db/query/optimizer/opt_phase_manager.h" + +namespace mongo::optimizer::ce { + +class SamplingTransport; + +class SamplingEstimator : public cascades::CardinalityEstimator { +public: + SamplingEstimator(OperationContext* opCtx, + OptPhaseManager phaseManager, + int64_t numRecords, + std::unique_ptr fallbackCE); + ~SamplingEstimator(); + + CEType deriveCE(const Metadata& metadata, + const cascades::Memo& memo, + const properties::LogicalProps& logicalProps, + ABT::reference_type logicalNodeRef) const final; + +private: + std::unique_ptr _transport; +}; + +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/scalar_histogram.cpp b/src/mongo/db/query/ce/scalar_histogram.cpp deleted file mode 100644 index 604af42da13..00000000000 --- a/src/mongo/db/query/ce/scalar_histogram.cpp +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/exec/sbe/values/bson.h" -#include "mongo/db/exec/sbe/values/value.h" - -namespace mongo::ce { - -using namespace sbe; - -Bucket::Bucket( - double equalFreq, double rangeFreq, double cumulativeFreq, double ndv, double cumulativeNDV) - : _equalFreq(equalFreq), - _rangeFreq(rangeFreq), - _cumulativeFreq(cumulativeFreq), - _ndv(ndv), - _cumulativeNDV(cumulativeNDV) { - uassert(6695702, "Invalid equalFreq", _equalFreq >= 0.0); - uassert(6695703, "Invalid rangeFreq", _rangeFreq >= 0.0); - uassert(6695704, "Invalid ndv", _ndv <= _rangeFreq); - uassert(6695705, "Invalid cumulative frequency", _cumulativeFreq >= _equalFreq + _rangeFreq); - uassert(6695706, "Invalid cumulative ndv", _cumulativeNDV >= _ndv + 1.0); -} - -std::string Bucket::toString() const { - std::ostringstream os; - os << "equalFreq: " << _equalFreq << ", rangeFreq: " << _rangeFreq - << ", cumulativeFreq: " << _cumulativeFreq << ", ndv: " << _ndv - << ", cumulativeNDV: " << _cumulativeNDV; - return os.str(); -} - -std::string Bucket::dump() const { - std::ostringstream os; - os << _equalFreq << ", " << _rangeFreq << ", " << _ndv; - return os.str(); -} - -BSONObj Bucket::serialize() const { - BSONObjBuilder bob; - bob.appendNumber("boundaryCount", _equalFreq); - bob.appendNumber("rangeCount", _rangeFreq); - bob.appendNumber("rangeDistincts", _ndv); - bob.appendNumber("cumulativeCount", _cumulativeFreq); - bob.appendNumber("cumulativeDistincts", _cumulativeNDV); - bob.doneFast(); - return bob.obj(); -} - -ScalarHistogram::ScalarHistogram() : ScalarHistogram({}, {}) {} - -ScalarHistogram::ScalarHistogram(const StatsHistogram& histogram) { - for (const auto& bucket : histogram.getBuckets()) { - Bucket b(bucket.getBoundaryCount(), - bucket.getRangeCount(), - bucket.getCumulativeCount(), - bucket.getRangeDistincts(), - bucket.getCumulativeDistincts()); - _buckets.push_back(std::move(b)); - } - for (const auto& bound : histogram.getBounds()) { - // We cannot insert a view here, because the lifetime of the bound is shorter than - // that of the histogram. In the case of a larger type, e.g. BigString/bsonString, we need - // to copy over the entire string as well, not just a pointer to memory which may be - // deallocated before we need it.
- auto value = sbe::bson::convertFrom(bound.getElement()); - _bounds.push_back(value.first, value.second); - } -} - -ScalarHistogram::ScalarHistogram(value::Array bounds, std::vector buckets) - : _bounds(std::move(bounds)), _buckets(std::move(buckets)) { - uassert(6695707, "Invalid sizes", bounds.size() == buckets.size()); -} - -std::string ScalarHistogram::toString() const { - std::ostringstream os; - os << "["; - for (size_t i = 0; i < _buckets.size(); i++) { - os << "{val: " << _bounds.getAt(i) << ", " << _buckets.at(i).toString() << "}"; - if (_buckets.size() - i > 1) - os << ","; - } - os << "]"; - return os.str(); -} - -std::string ScalarHistogram::plot() const { - std::ostringstream os; - double maxFreq = 0; - const double maxBucketSize = 100; - - for (const auto& bucket : _buckets) { - double maxBucketFreq = std::max(bucket._equalFreq, bucket._rangeFreq); - maxFreq = std::max(maxFreq, maxBucketFreq); - } - - std::vector> headers; - size_t maxHeaderSize = 0; - for (size_t i = 0; i < _buckets.size(); ++i) { - std::ostringstream rngHeader; - std::ostringstream eqlHeader; - double scaledRngF = maxBucketSize * _buckets[i]._rangeFreq / maxFreq; - double scaledEqlF = maxBucketSize * _buckets[i]._equalFreq / maxFreq; - rngHeader << _bounds.getAt(i) << ": " << _buckets[i]._rangeFreq; - eqlHeader << _bounds.getAt(i) << ": " << _buckets[i]._equalFreq; - auto rngStr = rngHeader.str(); - maxHeaderSize = std::max(maxHeaderSize, rngStr.size()); - headers.emplace_back(scaledRngF, rngStr); - auto eqlStr = eqlHeader.str(); - maxHeaderSize = std::max(maxHeaderSize, eqlStr.size()); - headers.emplace_back(scaledEqlF, eqlStr); - } - - const std::string maxLine(maxBucketSize + maxHeaderSize + 3, '-'); - os << maxLine << "\n"; - for (size_t j = 0; j < headers.size(); ++j) { - auto header = headers.at(j); - header.second.resize(maxHeaderSize, ' '); - const std::string bar(std::round(header.first), '*'); - os << header.second << " | " << bar << "\n"; - } - os << maxLine << "\n"; - - return os.str(); -} - -std::string ScalarHistogram::dump() const { - std::ostringstream os; - os << "Histogram:\n{"; - for (size_t i = 0; i < _buckets.size(); i++) { - os << "{" << _bounds.getAt(i) << ", " << _buckets.at(i).dump() << "},\n"; - } - os << "}"; - return os.str(); -} - -const value::Array& ScalarHistogram::getBounds() const { - return _bounds; -} - -const std::vector& ScalarHistogram::getBuckets() const { - return _buckets; -} - -BSONObj ScalarHistogram::serialize() const { - BSONObjBuilder histogramBuilder; - - // Construct bucket BSON. - auto buckets = getBuckets(); - BSONArrayBuilder bucketsBuilder(histogramBuilder.subarrayStart("buckets")); - for (const auto& bucket : buckets) { - bucketsBuilder.append(bucket.serialize()); - } - bucketsBuilder.doneFast(); - - // Construct bucket bounds BSON. - auto bounds = getBounds(); - BSONArrayBuilder boundsBuilder(histogramBuilder.subarrayStart("bounds")); - sbe::bson::convertToBsonObj(boundsBuilder, &bounds); - boundsBuilder.doneFast(); - - histogramBuilder.doneFast(); - return histogramBuilder.obj(); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/scalar_histogram.h b/src/mongo/db/query/ce/scalar_histogram.h deleted file mode 100644 index c368e0dd50b..00000000000 --- a/src/mongo/db/query/ce/scalar_histogram.h +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include -#include -#include - -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/stats_gen.h" - -namespace mongo::ce { - -/** - * Statistics related to a single ScalarHistogram bucket. The boundary value is kept in a separate - * array, so that each bucket has a corresponding boundary value. The reason for this is to manage - * the memory of values. - */ -struct Bucket { - Bucket(double equalFreq, - double rangeFreq, - double cumulativeFreq, - double ndv, - double cumulativeNDV); - - std::string toString() const; - // Helper function to dump the bucket content as needed by histogram creation in the unit tests. - std::string dump() const; - - // Frequency of the bound value itself. - double _equalFreq; - - // Frequency of other values. - double _rangeFreq; - - // Sum of frequencies of preceding buckets to avoid recomputing. Includes both _equalFreq and - // _rangeFreq. - double _cumulativeFreq; - - // Number of distinct values in this bucket, excludes the bound. - double _ndv; - - // Sum of distinct values in preceding buckets including this bucket. - double _cumulativeNDV; - - // Serialize to BSON for storage in stats collection. - BSONObj serialize() const; -}; - -/** - * A ScalarHistogram over a set of values. The ScalarHistogram consists of two parallel vectors - - * one with the individual value statistics, and another one with the actual boundary values. - */ -class ScalarHistogram { -public: - ScalarHistogram(); - ScalarHistogram(const StatsHistogram& histogram); - ScalarHistogram(sbe::value::Array bounds, std::vector buckets); - - // Print a human-readable representation of a histogram. - std::string toString() const; - std::string plot() const; - // Helper function to dump the content of the histogram as needed by the manual histogram - // creation in the unit tests (without cumulative frequency and NDV). - std::string dump() const; - - const sbe::value::Array& getBounds() const; - const std::vector& getBuckets() const; - // Return the total number of histogrammed values.
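-    // For example, with bounds [1, 2, 3] and buckets {1, 0, 1, 0, 1}, -    // {2, 5, 8, 1, 2}, {3, 4, 15, 2, 6} (fields in constructor order: equalFreq, -    // rangeFreq, cumulativeFreq, ndv, cumulativeNDV), the bound value 2 appears -    // twice, five values fall strictly between 1 and 2, and the cardinality -    // returned here is the last bucket's _cumulativeFreq, i.e. 15.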
- size_t getCardinality() const { - if (_buckets.empty()) { - return 0.0; - } - return _buckets.back()._cumulativeFreq; - } - - bool empty() const { - return _buckets.empty(); - } - - // Serialize to BSON for storage in stats collection. - BSONObj serialize() const; - - static constexpr size_t kMaxBuckets = 100; - -private: - // Bucket bounds representing the **highest** value in each bucket. - sbe::value::Array _bounds; - - std::vector _buckets; -}; - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/stats.idl b/src/mongo/db/query/ce/stats.idl deleted file mode 100644 index eb6220d45b9..00000000000 --- a/src/mongo/db/query/ce/stats.idl +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (C) 2022-present MongoDB, Inc. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the Server Side Public License, version 1, -# as published by MongoDB, Inc. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Server Side Public License for more details. -# -# You should have received a copy of the Server Side Public License -# along with this program. If not, see -# . -# -# As a special exception, the copyright holders give permission to link the -# code of portions of this program with the OpenSSL library under certain -# conditions as described in each individual source file and distribute -# linked combinations including the program with the OpenSSL library. You -# must comply with the Server Side Public License in all respects for -# all of the code used other than as permitted herein. If you modify file(s) -# with this exception, you may extend this exception to your version of the -# file(s), but you are not obligated to do so. If you do not wish to do so, -# delete this exception statement from your version. If you delete this -# exception statement from all source files in the program, then also delete -# it in the license file. 
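-# The structs below define the storage schema that ScalarHistogram and -# ArrayHistogram serialize to (see ScalarHistogram::serialize()) and that -# StatsPath::parse() reads back in the stats cache loader; the plain 'array' -# fields correspond to the 'buckets' and 'bounds' BSON arrays built there.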
-# -global: - cpp_namespace: "mongo" - -imports: - - "mongo/db/basic_types.idl" - -structs: - StatsBucket: - description: "Histogram bucket" - fields: - boundaryCount: - type: double - rangeCount: - type: double - rangeDistincts: - type: double - cumulativeCount: - type: double - cumulativeDistincts: - type: double - - StatsHistogram: - description: "MaxDiff Histogram" - fields: - buckets: - type: array - bounds: - type: array - - TypeTag: - description: "SBE types and their corresponding frequencies in the histogram" - fields: - typeName: - type: string - count: - type: double - - StatsArrayHistogram: - description: "Array Histogram" - fields: - minHistogram: - type: StatsHistogram - maxHistogram: - type: StatsHistogram - uniqueHistogram: - type: StatsHistogram - typeCount: - type: array - - Statistics: - description: "Serialized representation of data statistics for a key path" - fields: - documents: - type: double - trueCount: - type: double - falseCount: - type: double - emptyArrayCount: - type: double - typeCount: - type: array - scalarHistogram: - type: StatsHistogram - arrayStatistics: - type: StatsArrayHistogram - optional: true - - StatsPath: - description: "Key path to statistics" - fields: - _id: - type: string - statistics: - type: Statistics diff --git a/src/mongo/db/query/ce/stats_cache.cpp b/src/mongo/db/query/ce/stats_cache.cpp deleted file mode 100644 index 2fb2be400a6..00000000000 --- a/src/mongo/db/query/ce/stats_cache.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file.
- */ - - -#include "mongo/platform/basic.h" - -#include "mongo/db/query/ce/stats_cache.h" - -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/util/read_through_cache.h" - -#include "mongo/logv2/log.h" - -#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery - - -namespace mongo { -using namespace mongo::ce; - -namespace { - -const auto statsCacheDecoration = ServiceContext::declareDecoration>(); - -} // namespace - -StatsCache::StatsCache(ServiceContext* service, - std::unique_ptr cacheLoader, - ThreadPoolInterface& threadPool, - int size) - : ReadThroughCache( - _mutex, - service, - threadPool, - [this](OperationContext* opCtx, - const StatsPathString& statsPath, - const ValueHandle& stats) { return _lookupStats(opCtx, statsPath, stats); }, - size), - _statsCacheLoader(std::move(cacheLoader)) {} - -StatsCache::LookupResult StatsCache::_lookupStats(OperationContext* opCtx, - const StatsPathString& statsPath, - const StatsCacheValueHandle& stats) { - - try { - invariant(_statsCacheLoader); - auto newStats = _statsCacheLoader->getStats(opCtx, statsPath).get(); - return LookupResult(std::move(newStats)); - } catch (const DBException& ex) { - if (ex.code() == ErrorCodes::NamespaceNotFound) { - return StatsCache::LookupResult(boost::none); - } - throw; - } -} - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache.h b/src/mongo/db/query/ce/stats_cache.h deleted file mode 100644 index 65b5bcd19b4..00000000000 --- a/src/mongo/db/query/ce/stats_cache.h +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/string_data.h" -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/db/query/ce/stats_cache_loader.h" -#include "mongo/util/concurrency/thread_pool.h" -#include "mongo/util/read_through_cache.h" - -namespace mongo { - -using namespace mongo::ce; - -using StatsCacheType = ReadThroughCache; -using StatsCacheValueHandle = StatsCacheType::ValueHandle; - -/** - * Collection statistics read-through cache. It reads from the persistent storage but never writes - * to it.
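- * - * On a cache miss, the lookup is dispatched to the thread pool and delegates to - * the StatsCacheLoader; a NamespaceNotFound error from the loader is converted - * into an empty cached entry (boost::none) rather than rethrown, so a missing - * statistics document is not an error for readers of the cache.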
- */ -class StatsCache : public StatsCacheType { -public: - /** - * The constructor provides the Service context under which this cache has been instantiated, - * and a Thread pool to be used for invoking the blocking 'lookup' calls. The size is the number - * of entries the underlying LRU cache will hold. - */ - StatsCache(ServiceContext* service, - std::unique_ptr cacheLoader, - ThreadPoolInterface& threadPool, - int size); - - /** - * Returns the statsCacheLoader; currently used for testing only. - */ - StatsCacheLoader* getStatsCacheLoader() { - invariant(_statsCacheLoader); - - return _statsCacheLoader.get(); - } - -private: - /** - * Reads collection stats from the underlying storage if they are not found in the in-memory - * cache. - */ - LookupResult _lookupStats(OperationContext* opCtx, - const StatsPathString& statsPath, - const ValueHandle& stats); - - Mutex _mutex = MONGO_MAKE_LATCH("StatsCache::_mutex"); - - std::unique_ptr _statsCacheLoader; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader.h b/src/mongo/db/query/ce/stats_cache_loader.h deleted file mode 100644 index a6ba3935c43..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader.h +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/stdx/thread.h" - -namespace mongo { - -using namespace mongo::ce; - -using StatsPathString = std::pair; -using StatsCacheVal = std::shared_ptr; - -class StatsCacheLoader { -public: - /** - * Non-blocking call, which returns CollectionStatistics from the persistent metadata store. - * - * If for some reason the asynchronous fetch operation cannot be dispatched (for example on - * shutdown), throws a DBException.
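- * - * Callers are expected to chain continuations on the returned SemiFuture, or to - * block on it explicitly as StatsCache::_lookupStats does with .get(), rather - * than assume the statistics are available synchronously.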
- */ - virtual SemiFuture getStats(OperationContext* opCtx, - const StatsPathString& statsPath) = 0; - - virtual void setStatsReturnValueForTest(StatusWith swStats){}; - - virtual ~StatsCacheLoader() {} - - static constexpr StringData kStatsPrefix = "system.statistics"_sd; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_impl.cpp b/src/mongo/db/query/ce/stats_cache_loader_impl.cpp deleted file mode 100644 index bd4d54c4e17..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_impl.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - - -#include "mongo/platform/basic.h" - -#include "mongo/db/query/ce/stats_cache_loader_impl.h" - -#include "mongo/db/dbdirectclient.h" -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/stats_gen.h" -#include "mongo/logv2/log.h" -#include "mongo/stdx/thread.h" - -#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery - -namespace mongo { - -SemiFuture StatsCacheLoaderImpl::getStats(OperationContext* opCtx, - const StatsPathString& statsPath) { - - std::string statsColl(kStatsPrefix + "." 
+ statsPath.first.coll()); - - NamespaceString statsNss(statsPath.first.db(), statsColl); - DBDirectClient client(opCtx); - - - FindCommandRequest findRequest{statsNss}; - BSONObj filter = BSON("_id" << statsPath.second); - LOGV2_DEBUG(7085600, 1, "findRequest filter", "filter"_attr = filter.toString()); - findRequest.setFilter(filter.getOwned()); - - try { - auto cursor = client.find(std::move(findRequest)); - - if (!cursor) { - uasserted(ErrorCodes::OperationFailed, - str::stream() - << "Failed to establish a cursor for reading " << statsPath.first.ns() - << ", path " << statsPath.second << " from local storage"); - } - - if (cursor->more()) { - IDLParserContext ctx("StatsPath"); - BSONObj document = cursor->nextSafe().getOwned(); - auto parsedStats = StatsPath::parse(ctx, document); - StatsCacheVal statsPtr(new ArrayHistogram(parsedStats.getStatistics())); - return makeReadyFutureWith([this, statsPtr] { return statsPtr; }).semi(); - } - - uasserted(ErrorCodes::NamespaceNotFound, - str::stream() << "Stats do not exist for " << statsNss.ns() << ", path " - << statsPath.second); - } catch (const DBException& ex) { - uassertStatusOK(ex.toStatus()); - } - MONGO_UNREACHABLE -} - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_impl.h b/src/mongo/db/query/ce/stats_cache_loader_impl.h deleted file mode 100644 index b461d1d51c6..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_impl.h +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file.
- */ - -#pragma once - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/db/query/ce/stats_cache_loader.h" -#include "mongo/stdx/thread.h" - -namespace mongo { - -using namespace mongo::ce; - -class StatsCacheLoaderImpl : public StatsCacheLoader { -public: - SemiFuture getStats(OperationContext* opCtx, - const StatsPathString& statsPath) override; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_mock.cpp b/src/mongo/db/query/ce/stats_cache_loader_mock.cpp deleted file mode 100644 index ddf343bd026..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_mock.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - - -#include "mongo/platform/basic.h" - -#include "mongo/db/query/ce/stats_cache_loader_mock.h" - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/stdx/thread.h" - -namespace mongo { - -const Status StatsCacheLoaderMock::kInternalErrorStatus = { - ErrorCodes::InternalError, "Stats cache loader received unexpected request"}; - -SemiFuture StatsCacheLoaderMock::getStats(OperationContext* opCtx, - const StatsPathString& statsPath) { - - return makeReadyFutureWith([this] { return _swStatsReturnValueForTest; }).semi(); -} - -void StatsCacheLoaderMock::setStatsReturnValueForTest(StatusWith swStats) { - _swStatsReturnValueForTest = std::move(swStats); -} -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_mock.h b/src/mongo/db/query/ce/stats_cache_loader_mock.h deleted file mode 100644 index 0b105d5858a..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_mock.h +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/db/query/ce/stats_cache_loader.h" -#include "mongo/stdx/thread.h" - -namespace mongo { - -using namespace mongo::ce; - -class StatsCacheLoaderMock : public StatsCacheLoader { -public: - SemiFuture getStats(OperationContext* opCtx, - const StatsPathString& statsPath) override; - - void setStatsReturnValueForTest(StatusWith swStats); - - static const Status kInternalErrorStatus; - -private: - StatusWith _swStatsReturnValueForTest{kInternalErrorStatus}; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_test.cpp b/src/mongo/db/query/ce/stats_cache_loader_test.cpp deleted file mode 100644 index 9fc003e524a..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_test.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/bson/oid.h" -#include "mongo/db/catalog/collection_write_path.h" -#include "mongo/db/db_raii.h" -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/query/ce/stats_cache_loader_impl.h" -#include "mongo/db/query/ce/stats_cache_loader_test_fixture.h" -#include "mongo/db/query/ce/stats_gen.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/assert_util.h" -#include "mongo/util/fail_point.h" - -namespace mongo { -namespace { - -class StatsCacheLoaderTest : public StatsCacheLoaderTestFixture { -protected: - void createStatsCollection(NamespaceString nss); - StatsCacheLoaderImpl _statsCacheLoader; -}; - -void StatsCacheLoaderTest::createStatsCollection(NamespaceString nss) { - auto opCtx = operationContext(); - AutoGetCollection autoColl(opCtx, nss, MODE_IX); - auto db = autoColl.ensureDbExists(opCtx); - WriteUnitOfWork wuow(opCtx); - ASSERT(db->createCollection(opCtx, nss)); - wuow.commit(); -} - -TEST_F(StatsCacheLoaderTest, VerifyStatsLoad) { - // Initialize histogram buckets. - constexpr double doubleCount = 15.0; - constexpr double trueCount = 12.0; - constexpr double falseCount = 16.0; - constexpr double numDocs = doubleCount + trueCount + falseCount; - std::vector buckets{ - ce::Bucket{1.0, 0.0, 1.0, 0.0, 1.0}, - ce::Bucket{2.0, 5.0, 8.0, 1.0, 2.0}, - ce::Bucket{3.0, 4.0, 15.0, 2.0, 6.0}, - }; - - // Initialize histogram bounds. - auto [boundsTag, boundsVal] = sbe::value::makeNewArray(); - sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal}; - auto bounds = sbe::value::getArrayView(boundsVal); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 1.0); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 2.0); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 3.0); - - // Create a scalar histogram. - ce::TypeCounts tc{ - {sbe::value::TypeTags::NumberDouble, doubleCount}, - {sbe::value::TypeTags::Boolean, trueCount + falseCount}, - }; - ce::ScalarHistogram sh(*bounds, buckets); - ce::ArrayHistogram ah(sh, tc, trueCount, falseCount); - auto expectedSerialized = ah.serialize(); - - // Serialize histogram into a stats path. - std::string path = "somePath"; - auto serialized = stats::makeStatsPath(path, numDocs, ah); - - // Initialize stats collection. - NamespaceString nss("test", "stats"); - std::string statsColl(StatsCacheLoader::kStatsPrefix + "." + nss.coll()); - NamespaceString statsNss(nss.db(), statsColl); - createStatsCollection(statsNss); - - // Write serialized stats path to collection. - AutoGetCollection autoColl(operationContext(), statsNss, MODE_IX); - const CollectionPtr& coll = autoColl.getCollection(); - { - WriteUnitOfWork wuow(operationContext()); - ASSERT_OK(collection_internal::insertDocument( - operationContext(), coll, InsertStatement(serialized), nullptr)); - wuow.commit(); - } - - // Read stats path & verify values are consistent with what we expect. - auto actualAH = _statsCacheLoader.getStats(operationContext(), std::make_pair(nss, path)).get(); - auto actualSerialized = actualAH->serialize(); - - ASSERT_BSONOBJ_EQ(expectedSerialized, actualSerialized); -} - -} // namespace -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_test_fixture.cpp b/src/mongo/db/query/ce/stats_cache_loader_test_fixture.cpp deleted file mode 100644 index 20510a19203..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_test_fixture.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc.
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include - -#include "mongo/db/query/ce/stats_cache_loader_test_fixture.h" - -#include "mongo/db/repl/replication_coordinator_mock.h" -#include "mongo/db/repl/storage_interface_impl.h" -#include "mongo/db/service_context_d_test_fixture.h" - -namespace mongo { - -void StatsCacheLoaderTestFixture::setUp() { - // Set up mongod. - ServiceContextMongoDTest::setUp(); - - auto service = getServiceContext(); - _storage = std::make_unique(); - _opCtx = cc().makeOperationContext(); - - // Set up ReplicationCoordinator and ensure that we are primary. - auto replCoord = std::make_unique(service); - ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_PRIMARY)); - repl::ReplicationCoordinator::set(service, std::move(replCoord)); - - // Set up oplog collection. If the WT storage engine is used, the oplog collection is expected - // to exist when fetching the next opTime (LocalOplogInfo::getNextOpTimes) to use for a write. - repl::createOplog(operationContext()); -} - -void StatsCacheLoaderTestFixture::tearDown() { - _storage.reset(); - _opCtx.reset(); - - // Tear down mongod. - ServiceContextMongoDTest::tearDown(); -} - -OperationContext* StatsCacheLoaderTestFixture::operationContext() { - return _opCtx.get(); -} - -repl::StorageInterface* StatsCacheLoaderTestFixture::storageInterface() { - return _storage.get(); -} - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_loader_test_fixture.h b/src/mongo/db/query/ce/stats_cache_loader_test_fixture.h deleted file mode 100644 index 6ffc992b9f8..00000000000 --- a/src/mongo/db/query/ce/stats_cache_loader_test_fixture.h +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. 
- * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/operation_context.h" -#include "mongo/db/query/ce/stats_cache_loader.h" -#include "mongo/db/repl/storage_interface_impl.h" -#include "mongo/db/service_context_d_test_fixture.h" - -namespace mongo { - -/** - * Sets up and provides a repl::StorageInterface and OperationContext. - * Database data are cleared between test runs. - */ -class StatsCacheLoaderTestFixture : public ServiceContextMongoDTest { -public: - explicit StatsCacheLoaderTestFixture(Options options = {}) - : ServiceContextMongoDTest(std::move(options)) {} - - OperationContext* operationContext(); - repl::StorageInterface* storageInterface(); - -protected: - void setUp() override; - void tearDown() override; - -private: - ServiceContext::UniqueOperationContext _opCtx; - std::unique_ptr _storage; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_cache_test.cpp b/src/mongo/db/query/ce/stats_cache_test.cpp deleted file mode 100644 index 4e92a9ea2ca..00000000000 --- a/src/mongo/db/query/ce/stats_cache_test.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/platform/basic.h" - -#include - -#include "mongo/db/client.h" -#include "mongo/db/concurrency/locker_noop_service_context_test_fixture.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/query/ce/stats_cache.h" -#include "mongo/db/query/ce/stats_cache_loader_mock.h" -#include "mongo/unittest/barrier.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/concurrency/thread_pool.h" -#include "mongo/util/read_through_cache.h" -#include "mongo/util/scopeguard.h" - -#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault - -namespace mongo { -namespace { - -using unittest::assertGet; - -/** - * Fixture for tests, which do not need to exercise the multi-threading capabilities of the cache - * and as such do not require control over the creation/destruction of their operation contexts. - */ -class StatsCacheTest : public LockerNoopServiceContextTest { -protected: - // Extends StatsCache and automatically provides it with a thread pool, which will be - // shutdown and joined before the StatsCache is destroyed (which is part of the contract of - // ReadThroughCache) - class CacheWithThreadPool : public StatsCache { - public: - CacheWithThreadPool(ServiceContext* service, - std::unique_ptr cacheLoaderMock, - size_t size) - : StatsCache(service, std::move(cacheLoaderMock), _threadPool, size) { - _threadPool.startup(); - } - - private: - ThreadPool _threadPool{[] { - ThreadPool::Options options; - options.poolName = "StatsCacheTest"; - options.minThreads = 1; - options.maxThreads = 1; - return options; - }()}; - }; - - const ServiceContext::UniqueOperationContext _opCtxHolder{makeOperationContext()}; - OperationContext* const _opCtx{_opCtxHolder.get()}; -}; - -TEST(StatsCacheTest, StandaloneValueHandle) { - StatsCacheVal statsPtr(new ArrayHistogram()); - StatsCache::ValueHandle standaloneHandle(std::move(statsPtr)); - ASSERT(standaloneHandle.isValid()); -} - -TEST_F(StatsCacheTest, KeyDoesNotExist) { - Status namespaceNotFoundErrorStatus = {ErrorCodes::NamespaceNotFound, - "The key does not exists"}; - auto cacheLoaderMock = std::make_unique(); - auto cache = CacheWithThreadPool(getServiceContext(), std::move(cacheLoaderMock), 1); - cache.getStatsCacheLoader()->setStatsReturnValueForTest( - std::move(namespaceNotFoundErrorStatus)); - auto handle = cache.acquire(_opCtx, std::make_pair(NamespaceString("db", "coll"), "somePath")); - ASSERT(!handle); -} - -/* -TEST_F(StatsCacheTest, LoadStats) { - auto cacheLoaderMock = std::make_unique(); - auto cache = CacheWithThreadPool(getServiceContext(), std::move(cacheLoaderMock), 1); - - auto stats1 = CollectionStatistics(1); - auto stats2 = CollectionStatistics(2); - - cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(stats1)); - - auto handle = cache.acquire(_opCtx, NamespaceString("db", "coll1")); - ASSERT(handle.isValid()); - ASSERT_EQ(1, handle->getCardinality()); - - // Make all requests to StatsCacheLoader to throw an exception to ensre that test returns value - // from cache. 
- Status internalErrorStatus = {ErrorCodes::InternalError, - "Stats cache loader received unexpected request"}; - cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(internalErrorStatus)); - - handle = cache.acquire(_opCtx, NamespaceString("db", "coll1")); - ASSERT(handle.isValid()); - ASSERT_EQ(1, handle->getCardinality()); - - cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(stats2)); - handle = cache.acquire(_opCtx, NamespaceString("db", "coll2")); - ASSERT(handle.isValid()); - ASSERT_EQ(2, handle->getCardinality()); -} -*/ - -} // namespace -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_catalog.cpp b/src/mongo/db/query/ce/stats_catalog.cpp deleted file mode 100644 index d8b65d09e72..00000000000 --- a/src/mongo/db/query/ce/stats_catalog.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - - -#include "mongo/platform/basic.h" - -#include "mongo/db/query/ce/stats_cache.h" -#include "mongo/db/query/ce/stats_catalog.h" - -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/util/read_through_cache.h" - -#include "mongo/logv2/log.h" - -#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery - -namespace mongo { -using namespace mongo::ce; - -namespace { - -const auto statsCatalogDecoration = - ServiceContext::declareDecoration>(); - -} // namespace - -StatsCatalog::StatsCatalog(ServiceContext* service, - std::unique_ptr statsCacheLoader) - : _executor(std::make_shared([] { - ThreadPool::Options options; - options.poolName = "StatsCache"; - options.minThreads = 0; - options.maxThreads = 2; - return options; - }())), - _statsCache(service, std::move(statsCacheLoader), *_executor, 1000) { - _executor->startup(); -} - -StatsCatalog::~StatsCatalog() { - // The executor is used by the StatsCatalog, so it must be joined, before this cache is - // destroyed, per the contract of ReadThroughCache. 
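// (Editorial note: shutdown() below stops new lookups from being scheduled and join() blocks
// until in-flight lookups drain; only after both calls return is the '_statsCache' member,
// which uses this executor, destroyed.)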
- _executor->shutdown(); - _executor->join(); -} - -void StatsCatalog::set(ServiceContext* serviceContext, std::unique_ptr cache) { - auto& statsCatalog = statsCatalogDecoration(serviceContext); - invariant(!statsCatalog); - - statsCatalog = std::move(cache); -} - -StatsCatalog& StatsCatalog::get(ServiceContext* serviceContext) { - auto& statsCatalog = statsCatalogDecoration(serviceContext); - invariant(statsCatalog); - - return *statsCatalog; -} - -StatsCatalog& StatsCatalog::get(OperationContext* opCtx) { - return get(opCtx->getServiceContext()); -} - -StatusWith> StatsCatalog::getHistogram(OperationContext* opCtx, - const NamespaceString& nss, - const std::string& path) { - try { - auto handle = _statsCache.acquire(opCtx, std::make_pair(nss, path)); - uassert(ErrorCodes::NamespaceNotFound, - str::stream() << "path " << nss << " : " << path << " not found", - handle); - - return *(handle.get()); - } catch (const DBException& ex) { - return ex.toStatus(); - } -} - -Status StatsCatalog::invalidatePath(const NamespaceString& nss, const std::string& path) { - try { - _statsCache.invalidateKey(std::make_pair(nss, path)); - return Status::OK(); - } catch (const DBException& ex) { - return ex.toStatus(); - } -} -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_catalog.h b/src/mongo/db/query/ce/stats_catalog.h deleted file mode 100644 index efd53178c94..00000000000 --- a/src/mongo/db/query/ce/stats_catalog.h +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/string_data.h" -#include "mongo/db/namespace_string.h" -#include "mongo/db/query/ce/collection_statistics.h" -#include "mongo/db/query/ce/stats_cache.h" -#include "mongo/db/query/ce/stats_cache_loader.h" -#include "mongo/util/concurrency/thread_pool.h" - -namespace mongo { - -using namespace mongo::ce; - -/** - * This class owns statsCache and manages executor lifetime. - */ -class StatsCatalog { -public: - /** - * Stores the catalog on the specified service context. May only be called once for the lifetime - * of the service context. 
- */ - static void set(ServiceContext* serviceContext, std::unique_ptr catalog); - - static StatsCatalog& get(ServiceContext* serviceContext); - static StatsCatalog& get(OperationContext* opCtx); - - /** - * The constructor provides the Service context under which the cache needs to be instantiated, - * and a Thread pool to be used for invoking the blocking 'lookup' calls. The size is the number - * of entries the underlying LRU cache will hold. - */ - StatsCatalog(ServiceContext* service, std::unique_ptr cacheLoader); - - ~StatsCatalog(); - - StatusWith> getHistogram(OperationContext* opCtx, - const NamespaceString& nss, - const std::string& path); - - Status invalidatePath(const NamespaceString& nss, const std::string& path); - -private: - /** - * The executor is used by the cache. - */ - std::shared_ptr _executor; - StatsCache _statsCache; -}; - -} // namespace mongo diff --git a/src/mongo/db/query/ce/stats_path_test.cpp b/src/mongo/db/query/ce/stats_path_test.cpp deleted file mode 100644 index c4ed743e987..00000000000 --- a/src/mongo/db/query/ce/stats_path_test.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include "mongo/bson/bsonobjbuilder.h" -#include "mongo/db/exec/sbe/values/bson.h" -#include "mongo/db/exec/sbe/values/value.h" -#include "mongo/db/query/ce/array_histogram.h" -#include "mongo/db/query/ce/scalar_histogram.h" -#include "mongo/db/query/ce/stats_gen.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/assert_util.h" - -namespace mongo { -namespace { - -IDLParserContext ctx("StatsPath"); - -/** - * Validate round trip conversion for histogram bucket - */ -TEST(StatsPath, BasicValidStatsBucketDouble) { - // Create & parse StatsBucket. - auto serializedBucket = ce::Bucket{3.0, 4.0, 15.0, 2.0, 6.0}.serialize(); - auto parsedBucket = StatsBucket::parse(ctx, serializedBucket); - - // Round-trip conversion. - auto bucketToBSON = parsedBucket.toBSON(); - ASSERT_BSONOBJ_EQ(serializedBucket, bucketToBSON); -} - -/** - * Validate round-trip conversion for StatsPath datatype. - */ -TEST(StatsPath, BasicValidStatsPath) { - // Initialize histogram buckets. 
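// (Editorial note: each Bucket below is presumed to be {equalFreq, rangeFreq, cumulativeFreq,
// ndv, cumulativeNDV}, matching the field order used by the createHistogram() helper added in
// test_utils.cpp by this patch.)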
- constexpr double doubleCount = 15.0; - constexpr double trueCount = 12.0; - constexpr double falseCount = 16.0; - constexpr double numDocs = doubleCount + trueCount + falseCount; - std::vector buckets{ - ce::Bucket{1.0, 0.0, 1.0, 0.0, 1.0}, - ce::Bucket{2.0, 5.0, 8.0, 1.0, 2.0}, - ce::Bucket{3.0, 4.0, 15.0, 2.0, 6.0}, - }; - - // Initialize histogram bounds. - auto [boundsTag, boundsVal] = sbe::value::makeNewArray(); - sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal}; - auto bounds = sbe::value::getArrayView(boundsVal); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 1.0); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 2.0); - bounds->push_back(sbe::value::TypeTags::NumberDouble, 3.0); - - // Create a scalar histogram. - ce::TypeCounts tc{ - {sbe::value::TypeTags::NumberDouble, doubleCount}, - {sbe::value::TypeTags::Boolean, trueCount + falseCount}, - }; - ce::ScalarHistogram sh(*bounds, buckets); - ce::ArrayHistogram ah(sh, tc, trueCount, falseCount); - - // Serialize to BSON. - auto serializedPath = stats::makeStatsPath("somePath", numDocs, ah); - - // Parse StatsPath via IDL & serialize to BSON. - auto parsedPath = StatsPath::parse(ctx, serializedPath); - auto parsedPathToBSON = parsedPath.toBSON(); - - // We should end up with the same serialized BSON in the end. - ASSERT_BSONOBJ_EQ(serializedPath, parsedPathToBSON); -} - -/** - * Validate round-trip conversion for StatsPath datatype. - */ -TEST(StatsPath, BasicValidEmptyStatsPath) { - // Initialize histogram buckets. - constexpr double numDocs = 0.0; - std::vector buckets; - - // Initialize histogram bounds. - auto [boundsTag, boundsVal] = sbe::value::makeNewArray(); - sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal}; - auto bounds = sbe::value::getArrayView(boundsVal); - - // Create an empty scalar histogram. - ce::TypeCounts tc; - ce::ScalarHistogram sh(*bounds, buckets); - ce::ArrayHistogram ah(sh, tc); - - // Serialize to BSON. - auto serializedPath = stats::makeStatsPath("someEmptyPath", numDocs, ah); - - // Parse StatsPath via IDL & serialize to BSON. - auto parsedPath = StatsPath::parse(ctx, serializedPath); - auto parsedPathToBSON = parsedPath.toBSON(); - - // We should end up with the same serialized BSON in the end. - ASSERT_BSONOBJ_EQ(serializedPath, parsedPathToBSON); -} - -} // namespace -} // namespace mongo diff --git a/src/mongo/db/query/ce/test_utils.cpp b/src/mongo/db/query/ce/test_utils.cpp new file mode 100644 index 00000000000..55bf1645f12 --- /dev/null +++ b/src/mongo/db/query/ce/test_utils.cpp @@ -0,0 +1,214 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/test_utils.h" + +#include "mongo/db/pipeline/abt/utils.h" +#include "mongo/db/query/optimizer/explain.h" +#include "mongo/db/query/optimizer/metadata_factory.h" +#include "mongo/db/query/optimizer/opt_phase_manager.h" +#include "mongo/db/query/optimizer/rewrites/const_eval.h" +#include "mongo/db/query/optimizer/utils/unit_test_pipeline_utils.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::optimizer::ce { +namespace value = sbe::value; + +CETester::CETester(std::string collName, + double collCard, + const OptPhaseManager::PhaseSet& optPhases) + : _optPhases(optPhases), _hints(), _metadata({}), _collName(collName) { + addCollection(collName, collCard); +} + +CEType CETester::getMatchCE(const std::string& queryPredicate, + std::function nodePredicate) const { + return getCE("[{$match: " + queryPredicate + "}]", nodePredicate); +} + +CEType CETester::getCE(const std::string& pipeline, + std::function nodePredicate) const { + if constexpr (kCETestLogOnly) { + std::cout << "\n\nQuery: " << pipeline << "\n"; + } + + // Construct ABT from pipeline and optimize. + ABT abt = translatePipeline(pipeline, _collName); + + // Get cardinality estimate. + return getCE(abt, nodePredicate); +} + +CEType CETester::getCE(ABT& abt, std::function nodePredicate) const { + if constexpr (kCETestLogOnly) { + std::cout << ExplainGenerator::explainV2(abt) << std::endl; + } + + OptPhaseManager phaseManager{_optPhases, + _prefixId, + false /*requireRID*/, + _metadata, + getEstimator(), + makeHeuristicCE(), + makeCostEstimator(), + defaultConvertPathToInterval, + ConstEval::constFold, + DebugInfo::kDefaultForTests, + _hints}; + phaseManager.optimize(abt); + + const auto& memo = phaseManager.getMemo(); + if constexpr (kCETestLogOnly) { + std::cout << ExplainGenerator::explainMemo(memo) << std::endl; + } + + auto cht = getEstimator(); + + // If we are running no optimization phases, we are ensuring that we get the correct estimate on + // the original ABT (usually testing the CE for FilterNodes). The memo won't have any groups for + // us to estimate directly yet. + if (_optPhases.empty()) { + auto card = cht->deriveCE(_metadata, memo, {}, abt.ref()); + + if constexpr (kCETestLogOnly) { + std::cout << "CE: " << card << std::endl; + } + + return card; + } + + CEType outCard = kInvalidCardinality; + for (size_t groupId = 0; groupId < memo.getGroupCount(); groupId++) { + // Note that we always verify CE for MemoLogicalDelegatorNodes when calling getCE(). + + // If the 'optPhases' either ends with the MemoSubstitutionPhase or the + // MemoImplementationPhase, we should have exactly one logical node per group. However, if + // we have indexes, or a $group, we may have multiple logical nodes. In this case, we still + // want to pick the first node. 
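// (Editorial note: 'memoCE' below is the estimate recorded in the group's logical properties
// during optimization, while 'card' re-derives the estimate for this node from scratch; when
// kCETestLogOnly is false, the ASSERT_APPROX_EQUAL below checks that the two agree to within
// kMaxCEError.)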
+ const auto& node = memo.getLogicalNodes(groupId).front(); + + // This gets the cardinality estimate actually produced during optimization. + const auto& logicalProps = memo.getLogicalProps(groupId); + auto memoCE = properties::getPropertyConst(logicalProps) + .getEstimate(); + + // Conversely, here we call deriveCE() on the ABT produced by the optimization phases, which + // has all its delegators dereferenced. + auto card = cht->deriveCE(_metadata, memo, logicalProps, node.ref()); + + if constexpr (!kCETestLogOnly) { + // Ensure that the CE stored for the logical nodes of each group is what we would expect + // when estimating that node directly. Note that this check will fail if we are testing + // histogram estimation and only using the MemoSubstitutionPhase because the memo always + // uses heuristic estimation in this case. + ASSERT_APPROX_EQUAL(card, memoCE, kMaxCEError); + } else { + if (std::abs(memoCE - card) > kMaxCEError) { + std::cout << "ERROR: CE Group(" << groupId << ") " << card << " vs. " << memoCE + << std::endl; + std::cout << ExplainGenerator::explainV2(node) << std::endl; + } + } + + if (nodePredicate(node)) { + // We want to return the cardinality for the memo group matching the 'nodePredicate'. + outCard = memoCE; + } + } + + ASSERT_NOT_EQUALS(outCard, kInvalidCardinality); + + if constexpr (kCETestLogOnly) { + std::cout << "CE: " << outCard << std::endl; + } + + return outCard; +} + +ScanDefinition& CETester::getCollScanDefinition() { + auto it = _metadata._scanDefs.find(_collName); + invariant(it != _metadata._scanDefs.end()); + return it->second; +} + + +void CETester::setCollCard(double card) { + auto& scanDef = getCollScanDefinition(); + addCollection(_collName, card, scanDef.getIndexDefs()); +} + +void CETester::setIndexes(opt::unordered_map indexes) { + auto& scanDef = getCollScanDefinition(); + addCollection(_collName, scanDef.getCE(), indexes); +} + +void CETester::addCollection(std::string collName, + double numRecords, + opt::unordered_map indexes) { + _metadata._scanDefs.insert_or_assign(collName, + createScanDef({}, + indexes, + ConstEval::constFold, + {DistributionType::Centralized}, + true /*exists*/, + numRecords)); +} + +stats::ScalarHistogram createHistogram(const std::vector& data) { + value::Array bounds; + std::vector buckets; + + double cumulativeFreq = 0.0; + double cumulativeNDV = 0.0; + + for (size_t i = 0; i < data.size(); i++) { + const auto& item = data.at(i); + const auto [tag, val] = stage_builder::makeValue(item._v); + bounds.push_back(tag, val); + + cumulativeFreq += item._equalFreq + item._rangeFreq; + cumulativeNDV += item._ndv + 1.0; + buckets.emplace_back( + item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); + } + + return {std::move(bounds), std::move(buckets)}; +} + +double estimateIntValCard(const stats::ScalarHistogram& hist, + const int v, + const EstimationType type) { + const auto [tag, val] = + std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(v)); + return estimate(hist, tag, val, type).card; +}; + +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/test_utils.h b/src/mongo/db/query/ce/test_utils.h new file mode 100644 index 00000000000..1f84fe9a1a8 --- /dev/null +++ b/src/mongo/db/query/ce/test_utils.h @@ -0,0 +1,231 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/optimizer/cascades/interfaces.h" +#include "mongo/db/query/optimizer/opt_phase_manager.h" +#include "mongo/db/query/stats/scalar_histogram.h" + +namespace mongo::optimizer::ce { +// Enable this flag to log all estimates, and let all tests pass. +constexpr bool kCETestLogOnly = false; + +const double kMaxCEError = 0.01; +const CEType kInvalidCardinality = -1.0; + +const OptPhaseManager::PhaseSet kDefaultCETestPhaseSet{OptPhase::MemoSubstitutionPhase, + OptPhase::MemoExplorationPhase, + OptPhase::MemoImplementationPhase}; + +const OptPhaseManager::PhaseSet kOnlySubPhaseSet{OptPhase::MemoSubstitutionPhase}; + +const OptPhaseManager::PhaseSet kNoOptPhaseSet{}; + +/** + * Helpful macros for asserting that the CE of a $match predicate is approximately what we were + * expecting. + */ + +#define _ASSERT_CE(estimatedCE, expectedCE) \ + if constexpr (kCETestLogOnly) { \ + if (std::abs(estimatedCE - expectedCE) > kMaxCEError) { \ + std::cout << "ERROR: expected " << expectedCE << std::endl; \ + } \ + ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \ + } else { \ + ASSERT_APPROX_EQUAL(estimatedCE, expectedCE, kMaxCEError); \ + } +#define _PREDICATE(field, predicate) (str::stream() << "{" << field << ": " << predicate "}") +#define _ELEMMATCH_PREDICATE(field, predicate) \ + (str::stream() << "{" << field << ": {$elemMatch: " << predicate << "}}") + +// This macro verifies the cardinality of a pipeline or an input ABT. +#define ASSERT_CE(ce, pipeline, expectedCE) _ASSERT_CE(ce.getCE(pipeline), (expectedCE)) + +// This macro does the same as above but also sets the collection cardinality. +#define ASSERT_CE_CARD(ce, pipeline, expectedCE, collCard) \ + ce.setCollCard(collCard); \ + ASSERT_CE(ce, pipeline, expectedCE) + +// This macro verifies the cardinality of a pipeline with a single $match predicate. +#define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ + _ASSERT_CE(ce.getMatchCE(predicate), (expectedCE)) + +#define ASSERT_MATCH_CE_NODE(ce, queryPredicate, expectedCE, nodePredicate) \ + _ASSERT_CE(ce.getMatchCE(queryPredicate, nodePredicate), (expectedCE)) + +// This macro does the same as above but also sets the collection cardinality. 
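// Usage sketch for the two macros above (tester and numbers are hypothetical; any CETester
// subclass that implements getEstimator() will do):
//   ASSERT_MATCH_CE(t, "{a: {$gt: 5}}", 330.0);
//   ASSERT_MATCH_CE_NODE(t, "{a: {$gt: 5}}", 330.0, isSargable1);
// The _CARD variant defined next additionally calls setCollCard() first.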
+#define ASSERT_MATCH_CE_CARD(ce, predicate, expectedCE, collCard) \ + ce.setCollCard(collCard); \ + ASSERT_MATCH_CE(ce, predicate, expectedCE) + +// This macro tests cardinality of two versions of the predicate; with and without $elemMatch. +#define ASSERT_EQ_ELEMMATCH_CE(tester, expectedCE, elemMatchExpectedCE, field, predicate) \ + ASSERT_MATCH_CE(tester, _PREDICATE(field, predicate), expectedCE); \ + ASSERT_MATCH_CE(tester, _ELEMMATCH_PREDICATE(field, predicate), elemMatchExpectedCE) + +#define ASSERT_EQ_ELEMMATCH_CE_NODE(tester, expectedCE, elemMatchExpectedCE, field, predicate, n) \ + ASSERT_MATCH_CE_NODE(tester, _PREDICATE(field, predicate), expectedCE, n); \ + ASSERT_MATCH_CE_NODE(tester, _ELEMMATCH_PREDICATE(field, predicate), elemMatchExpectedCE, n) + +// Some commonly used functions for picking nodes in the memo for testing estimation. +template +bool isSargableNode(const ABT& n) { + if constexpr (NumReq == 0) { + return n.is(); + } + + // Sometimes SargableNodes get split and placed into different memo groups, but we are looking + // for a SargableNode with a specific number of predicates. For tests, we only care about + // verifying the cardinality of that one. + if (auto* sargable = n.cast()) { + return sargable->getReqMap().size() == NumReq; + } + return false; +} +const auto isSargable = isSargableNode<0>; +const auto isSargable1 = isSargableNode<1>; +const auto isSargable2 = isSargableNode<2>; +const auto isSargable3 = isSargableNode<3>; +const auto isSargable4 = isSargableNode<4>; +const auto isRoot = [](const ABT& n) -> bool { return n.is(); }; + +/** + * A test utility class for helping verify the cardinality of CE transports on a given $match + * predicate. + */ +class CETester { +public: + /** + * The tester initializes at least one collection with the name 'collName' and the cardinality + * 'numRecords' in the metadata. + */ + CETester(std::string collName, + double numRecords, + const OptPhaseManager::PhaseSet& optPhases = kDefaultCETestPhaseSet); + + /** + * Returns the estimated cardinality of a given 'matchPredicate'. + * + * 'nodePredicate' identifies the node in the memo we want to estimate. + */ + CEType getMatchCE(const std::string& matchPredicate, + std::function nodePredicate = isRoot) const; + + /** + * Returns the estimated cardinality of a given 'pipeline'. + * + * 'nodePredicate' identifies the node in the memo we want to estimate. + */ + CEType getCE(const std::string& pipeline, + std::function nodePredicate = isRoot) const; + + /** + * Returns the estimated cardinality of a given 'abt'. + * + * 'nodePredicate' identifies the node in the memo we want to estimate. + */ + CEType getCE(ABT& abt, std::function nodePredicate = isRoot) const; + + /** + * Updates the cardinality of the collection '_collName'. + */ + void setCollCard(double card); + + /** + * Updates the indexes used by the collection '_collName'. + */ + void setIndexes(opt::unordered_map indexes); + + /** + * Adds a ScanDefinition for an additional collection for the test. + */ + void addCollection(std::string collName, + double numRecords, + opt::unordered_map indexes = {}); + + /** + * Prevents the optimizer from generating collection scan plans. + */ + void setDisableScan(bool disableScan) { + _hints._disableScan = disableScan; + } + +protected: + /** + * Subclasses need to override this method to initialize the cardinality estimators they are + * testing. 
+ */ + virtual std::unique_ptr getEstimator() const = 0; + +private: + /** + * Helper to find the ScanDefinition of '_collName' in _metadata. + */ + ScanDefinition& getCollScanDefinition(); + + // Phases to use when optimizing an input query. + const OptPhaseManager::PhaseSet& _optPhases; + + // Used to initialize the OptPhaseManager. + mutable PrefixId _prefixId; + + // Allows us to pass hints to the optimizer. + QueryHints _hints; + + // Stores the ScanDefinitions for all collections defined in the test. + Metadata _metadata; + + // Name of the collection tests will be executed against. + std::string _collName; +}; + +/** + * Test utility for helping with creation of manual histograms in the unit tests. + */ +struct BucketData { + Value _v; + double _equalFreq; + double _rangeFreq; + double _ndv; + + BucketData(Value v, double equalFreq, double rangeFreq, double ndv) + : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} + BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} + BucketData(int v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} +}; + +stats::ScalarHistogram createHistogram(const std::vector& data); + +double estimateIntValCard(const stats::ScalarHistogram& hist, int v, EstimationType type); +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce/value_utils.cpp b/src/mongo/db/query/ce/value_utils.cpp deleted file mode 100644 index 46e3f143b16..00000000000 --- a/src/mongo/db/query/ce/value_utils.cpp +++ /dev/null @@ -1,254 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/query/ce/value_utils.h" - -#include "mongo/db/query/ce/max_diff.h" -#include "mongo/db/query/ce/scalar_histogram.h" - -namespace mongo::ce { - -using namespace sbe; - -SBEValue::SBEValue(value::TypeTags tag, value::Value val) : _tag(tag), _val(val) {} - -SBEValue::SBEValue(std::pair v) : SBEValue(v.first, v.second) {} - -SBEValue::SBEValue(const SBEValue& other) { - auto [tag, val] = copyValue(other._tag, other._val); - _tag = tag; - _val = val; -} - -SBEValue::SBEValue(SBEValue&& other) { - _tag = other._tag; - _val = other._val; - - other._tag = value::TypeTags::Nothing; - other._val = 0; -} - -SBEValue::~SBEValue() { - value::releaseValue(_tag, _val); -} - -SBEValue& SBEValue::operator=(const SBEValue& other) { - value::releaseValue(_tag, _val); - - auto [tag, val] = copyValue(other._tag, other._val); - _tag = tag; - _val = val; - return *this; -} - -SBEValue& SBEValue::operator=(SBEValue&& other) { - value::releaseValue(_tag, _val); - - _tag = other._tag; - _val = other._val; - - other._tag = value::TypeTags::Nothing; - other._val = 0; - - return *this; -} - -std::pair SBEValue::get() const { - return std::make_pair(_tag, _val); -} - -value::TypeTags SBEValue::getTag() const { - return _tag; -} - -value::Value SBEValue::getValue() const { - return _val; -} - -std::pair makeInt64Value(int v) { - return std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(v)); -}; - -std::pair makeNullValue() { - return std::make_pair(value::TypeTags::Null, 0); -}; - -bool sameTypeClass(value::TypeTags tag1, value::TypeTags tag2) { - if (tag1 == tag2) { - return true; - } - - static constexpr const char* kTempFieldName = "temp"; - - BSONObjBuilder minb1; - minb1.appendMinForType(kTempFieldName, value::tagToType(tag1)); - const BSONObj min1 = minb1.obj(); - - BSONObjBuilder minb2; - minb2.appendMinForType(kTempFieldName, value::tagToType(tag2)); - const BSONObj min2 = minb2.obj(); - - return min1.woCompare(min2) == 0; -} - -bool sameTypeBracket(value::TypeTags tag1, value::TypeTags tag2) { - if (tag1 == tag2) { - return true; - } - return ((value::isNumber(tag1) && value::isNumber(tag2)) || - (value::isString(tag1) && value::isString(tag2))); -} - -int32_t compareValues(value::TypeTags tag1, - value::Value val1, - value::TypeTags tag2, - value::Value val2) { - const auto [compareTag, compareVal] = value::compareValue(tag1, val1, tag2, val2); - uassert(6660547, "Invalid comparison result", compareTag == value::TypeTags::NumberInt32); - return value::bitcastTo(compareVal); -} - -void sortValueVector(std::vector& sortVector) { - const auto cmp = [](const SBEValue& a, const SBEValue& b) { - return compareValues(a.getTag(), a.getValue(), b.getTag(), b.getValue()) < 0; - }; - std::sort(sortVector.begin(), sortVector.end(), cmp); -} - -double valueToDouble(value::TypeTags tag, value::Value val) { - double result = 0; - if (value::isNumber(tag)) { - result = value::numericCast(tag, val); - } else if (value::isString(tag)) { - const StringData sd = value::getStringView(tag, val); - - // Convert a prefix of the string to a double. 
- const size_t maxPrecision = std::min(sd.size(), sizeof(double)); - for (size_t i = 0; i < maxPrecision; ++i) { - const char ch = sd[i]; - const double charToDbl = ch / std::pow(2, i * 8); - result += charToDbl; - } - } else if (tag == value::TypeTags::Date || tag == value::TypeTags::Timestamp) { - int64_t v = value::bitcastTo<int64_t>(val); - result = value::numericCast<double>(value::TypeTags::NumberInt64, v); - - } else if (tag == value::TypeTags::ObjectId) { - auto objView = - ConstDataView(reinterpret_cast<const char*>(sbe::value::getObjectIdView(val)->data())); - // Take the first 8 bytes of the ObjectId. - // TODO: consider using the entire ObjectId or other parts of it. - // auto v = objView.read<LittleEndian<uint64_t>>(sizeof(uint32_t)); - auto v = objView.read<LittleEndian<uint64_t>>(); - result = value::numericCast<double>(value::TypeTags::NumberInt64, v); - } else { - uassert(6844500, "Unexpected value type", false); - } - - return result; -} - -bool canEstimateTypeViaHistogram(value::TypeTags tag) { - if (sbe::value::isNumber(tag) || value::isString(tag)) { - return true; - } - - switch (tag) { - // Other types that we can/do build histograms on: - // - Date/time types. - case value::TypeTags::Date: - case value::TypeTags::Timestamp: - // - ObjectId. - case value::TypeTags::ObjectId: - return true; - - // Types that can only be estimated via the type-counters. - case value::TypeTags::Object: - case value::TypeTags::Array: - case value::TypeTags::Null: - case value::TypeTags::Nothing: - case value::TypeTags::Boolean: - return false; - - // Trying to estimate any other type should result in an error. - default: - uasserted(7051100, - str::stream() - << "Type " << tag << " is not supported by histogram estimation."); - } - - MONGO_UNREACHABLE; -} - -std::string serialize(value::TypeTags tag) { - std::ostringstream os; - os << tag; - return os.str(); -} - -// TODO: does this belong in SBE value utils? -value::TypeTags deserialize(const std::string& name) { - if ("NumberInt32" == name) { - return value::TypeTags::NumberInt32; - } else if ("NumberInt64" == name) { - return value::TypeTags::NumberInt64; - } else if ("NumberDecimal" == name) { - return value::TypeTags::NumberDecimal; - } else if ("NumberDouble" == name) { - return value::TypeTags::NumberDouble; - } else if ("StringBig" == name) { - return value::TypeTags::StringBig; - } else if ("StringSmall" == name) { - return value::TypeTags::StringSmall; - } else if ("bsonString" == name) { - return value::TypeTags::bsonString; - } else if ("Date" == name) { - return value::TypeTags::Date; - } else if ("Timestamp" == name) { - return value::TypeTags::Timestamp; - } else if ("ObjectId" == name) { - return value::TypeTags::ObjectId; - } else if ("Object" == name) { - return value::TypeTags::Object; - } else if ("Boolean" == name) { - return value::TypeTags::Boolean; - } else if ("Array" == name) { - return value::TypeTags::Array; - } else if ("Null" == name) { - return value::TypeTags::Null; - } else if ("Nothing" == name) { - return value::TypeTags::Nothing; - } - - // Trying to deserialize any other type should result in an error. - uasserted(6660600, - str::stream() << "String " << name << " is not convertible to SBE type tag."); -} - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/value_utils.h b/src/mongo/db/query/ce/value_utils.h deleted file mode 100644 index 0191b4e2c26..00000000000 --- a/src/mongo/db/query/ce/value_utils.h +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc.
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/exec/sbe/values/value.h" - -namespace mongo::ce { - -using namespace sbe; - -/** - Container object for SBE value/tag pairs. Supplied values are owned by this object - and are released on destruction -*/ -class SBEValue { -public: - SBEValue(value::TypeTags tag, value::Value val); - SBEValue(std::pair v); - ~SBEValue(); - - SBEValue(const SBEValue& other); - SBEValue(SBEValue&& other); - - SBEValue& operator=(const SBEValue& other); - SBEValue& operator=(SBEValue&& other); - - std::pair get() const; - value::TypeTags getTag() const; - value::Value getValue() const; - -private: - value::TypeTags _tag; - value::Value _val; -}; - -/** - Generate an SBE Value pair that represents the supplied int with - type Int64 -*/ -std::pair makeInt64Value(int v); - -/** - Generate an SBE Value pair representing a BSON null value -*/ -std::pair makeNullValue(); - -/** - Do the supplied type tags represent the same BSON type? -*/ -bool sameTypeClass(value::TypeTags tag1, value::TypeTags tag2); - -/** - Do the supplied type tags represent the same BSON type? - TODO: This may be the same as sameTypeClass. @timourk? -*/ -bool sameTypeBracket(value::TypeTags tag1, value::TypeTags tag2); - -/** - Compare a pair of SBE values. - - The return will be - <0 if val1 < val2 in BSON order - 0 if val1 == val2 in BSON order - >0 if val1 > val2 in BSON order -*/ -int32_t compareValues(value::TypeTags tag1, - value::Value val1, - value::TypeTags tag2, - value::Value val2); - -/** - Sort a vector of values in place in BSON order -*/ -void sortValueVector(std::vector& sortVector); - -/** - Convert a value of any supported type into a double according to some metric. This - metric will be consistent with ordering in the type. -*/ -double valueToDouble(value::TypeTags tag, value::Value val); - -/** - * Returns true for types that can be estimated via histograms, and false for types that need type - * counters. Any other type results in a uassert. - * - * NOTE: This should be kept in sync with 'valueToDouble' above. 
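 * For example (per the implementation in value_utils.cpp above): numeric and string tags, Date,
 * Timestamp, and ObjectId are estimable via histograms, while Object, Array, Null, Nothing, and
 * Boolean are estimated from type counts instead.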
- */ -bool canEstimateTypeViaHistogram(value::TypeTags tag); - -/** - * Serialize/Deserialize a TypeTag to a string for TypeCount storage in the stats collection. - */ -std::string serialize(value::TypeTags tag); -value::TypeTags deserialize(const std::string& name); - -} // namespace mongo::ce diff --git a/src/mongo/db/query/ce_mode_parameter.cpp b/src/mongo/db/query/ce_mode_parameter.cpp index 2099c65ab1d..f54831bf626 100644 --- a/src/mongo/db/query/ce_mode_parameter.cpp +++ b/src/mongo/db/query/ce_mode_parameter.cpp @@ -30,11 +30,11 @@ #include "mongo/db/query/ce_mode_parameter.h" #include "mongo/db/query/query_knobs_gen.h" -namespace mongo::ce { +namespace mongo::optimizer::ce { Status validateCEMode(const std::string& value, const boost::optional&) { if (value == kHeuristic || value == kHistogram || value == kSampling) { return Status::OK(); } return Status(ErrorCodes::Error{6695700}, "Invalid cardinality estimation mode."); } -} // namespace mongo::ce +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce_mode_parameter.h b/src/mongo/db/query/ce_mode_parameter.h index e103b82e927..f253c60e5fd 100644 --- a/src/mongo/db/query/ce_mode_parameter.h +++ b/src/mongo/db/query/ce_mode_parameter.h @@ -34,7 +34,7 @@ #include "mongo/base/status.h" #include "mongo/db/tenant_id.h" -namespace mongo::ce { +namespace mongo::optimizer::ce { /** * Defines cardinality estimation modes. @@ -45,4 +45,4 @@ const std::string kSampling = "sampling"; Status validateCEMode(const std::string& value, const boost::optional&); -} // namespace mongo::ce +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/ce_mode_parameter_test.cpp b/src/mongo/db/query/ce_mode_parameter_test.cpp index 9769127e5b3..011c5f210d3 100644 --- a/src/mongo/db/query/ce_mode_parameter_test.cpp +++ b/src/mongo/db/query/ce_mode_parameter_test.cpp @@ -31,7 +31,7 @@ #include "mongo/unittest/unittest.h" -namespace mongo::ce { +namespace mongo::optimizer::ce { TEST(CEModeParameterTest, ValidatesValidCEModes) { ASSERT_OK(validateCEMode("heuristic", boost::none)); @@ -44,4 +44,4 @@ TEST(CEModeParameterTest, RejectsInvalidCEModes) { ASSERT_NOT_OK(validateCEMode("", boost::none)); } -} // namespace mongo::ce +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/cost_model/SConscript b/src/mongo/db/query/cost_model/SConscript index d8ce836096a..44649d53978 100644 --- a/src/mongo/db/query/cost_model/SConscript +++ b/src/mongo/db/query/cost_model/SConscript @@ -7,7 +7,7 @@ env = env.Clone() env.Library( target="query_cost_model", source=[ - 'cost_estimator.cpp', + 'cost_estimator_impl.cpp', 'cost_model.idl', 'cost_model_manager.cpp', 'cost_model_utils.cpp', diff --git a/src/mongo/db/query/cost_model/cost_estimator.cpp b/src/mongo/db/query/cost_model/cost_estimator.cpp deleted file mode 100644 index 3dea08ebd1f..00000000000 --- a/src/mongo/db/query/cost_model/cost_estimator.cpp +++ /dev/null @@ -1,418 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . 
* - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/query/cost_model/cost_estimator.h" - -#include "mongo/db/query/optimizer/defs.h" - -namespace mongo::cost_model { - -using namespace optimizer; -using namespace optimizer::properties; -using optimizer::cascades::Memo; - -namespace { -struct CostAndCEInternal { - CostAndCEInternal(double cost, CEType ce) : _cost(cost), _ce(ce) { - uassert(7034000, "Invalid cost.", !std::isnan(cost) && cost >= 0.0); - uassert(7034001, "Invalid cardinality.", std::isfinite(ce) && ce >= 0.0); - } - double _cost; - CEType _ce; -}; - -class CostDerivation { -public: - CostAndCEInternal operator()(const ABT& /*n*/, const PhysicalScanNode& /*node*/) { - // Default estimate for scan. - const double collectionScanCost = _coefficients.getScanStartupCost() + - _coefficients.getScanIncrementalCost() * _cardinalityEstimate; - return {collectionScanCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const CoScanNode& /*node*/) { - // Assumed to be free. - return {_coefficients.getDefaultStartupCost(), _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const IndexScanNode& node) { - const double indexScanCost = _coefficients.getIndexScanStartupCost() + - _coefficients.getIndexScanIncrementalCost() * _cardinalityEstimate; - return {indexScanCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const SeekNode& /*node*/) { - // SeekNode should deliver one result via cardinality estimate override. - // TODO: consider using node.getProjectionMap()._fieldProjections.size() to make the cost - // dependent on the size of the projection. - const double seekCost = - _coefficients.getSeekStartupCost() + _coefficients.getSeekCost() * _cardinalityEstimate; - return {seekCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const MemoLogicalDelegatorNode& node) { - const LogicalProps& childLogicalProps = _memo.getLogicalProps(node.getGroupId()); - // Notice that unlike all physical nodes, this logical node takes its cardinality directly - // from the memo group logical property, ignoring _cardinalityEstimate. - CEType baseCE = getPropertyConst<CardinalityEstimate>(childLogicalProps).getEstimate(); - - if (hasProperty<IndexingRequirement>(_physProps)) { - const auto& indexingReq = getPropertyConst<IndexingRequirement>(_physProps); - if (indexingReq.getIndexReqTarget() == IndexReqTarget::Seek) { - // If we are performing a seek, normalize against the scan group cardinality.
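// (Editorial example: if the scan group is estimated at 1000 documents and this group at
// 2000, baseCE becomes 2000/1000 = 2 below, i.e. the expected number of documents produced
// per individual seek; the result then flows through getAdjustedCE().)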
- const GroupIdType scanGroupId = - getPropertyConst(childLogicalProps).getScanGroupId(); - if (scanGroupId == node.getGroupId()) { - baseCE = 1.0; - } else { - const CEType scanGroupCE = - getPropertyConst(_memo.getLogicalProps(scanGroupId)) - .getEstimate(); - if (scanGroupCE > 0.0) { - baseCE /= scanGroupCE; - } - } - } - } - - return {0.0, getAdjustedCE(baseCE, _physProps)}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const MemoPhysicalDelegatorNode& /*node*/) { - uasserted(7034002, "Should not be costing physical delegator nodes."); - } - - CostAndCEInternal operator()(const ABT& /*n*/, const FilterNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - double filterCost = childResult._cost; - if (!isTrivialExpr(node.getFilter())) { - // Non-trivial filter. - filterCost += _coefficients.getFilterStartupCost() + - _coefficients.getFilterIncrementalCost() * childResult._ce; - } - return {filterCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const EvaluationNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - double evalCost = childResult._cost; - if (!isTrivialExpr(node.getProjection())) { - // Non-trivial projection. - evalCost += _coefficients.getEvalStartupCost() + - _coefficients.getEvalIncrementalCost() * _cardinalityEstimate; - } - return {evalCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const BinaryJoinNode& node) { - CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); - CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); - const double joinCost = _coefficients.getBinaryJoinStartupCost() + - _coefficients.getBinaryJoinIncrementalCost() * - (leftChildResult._ce + rightChildResult._ce) + - leftChildResult._cost + rightChildResult._cost; - return {joinCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const HashJoinNode& node) { - CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); - CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); - - // TODO: distinguish build side and probe side. - const double hashJoinCost = _coefficients.getHashJoinStartupCost() + - _coefficients.getHashJoinIncrementalCost() * - (leftChildResult._ce + rightChildResult._ce) + - leftChildResult._cost + rightChildResult._cost; - return {hashJoinCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const MergeJoinNode& node) { - CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); - CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); - - const double mergeJoinCost = _coefficients.getMergeJoinStartupCost() + - _coefficients.getMergeJoinIncrementalCost() * - (leftChildResult._ce + rightChildResult._ce) + - leftChildResult._cost + rightChildResult._cost; - - return {mergeJoinCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const UnionNode& node) { - const ABTVector& children = node.nodes(); - // UnionNode with one child is optimized away before lowering, therefore - // its cost is the cost of its child. - if (children.size() == 1) { - CostAndCEInternal childResult = deriveChild(children[0], 0); - return {childResult._cost, _cardinalityEstimate}; - } - - double totalCost = _coefficients.getUnionStartupCost(); - // The cost is the sum of the costs of its children and the cost to union each child. 
- for (size_t childIdx = 0; childIdx < children.size(); childIdx++) { - CostAndCEInternal childResult = deriveChild(children[childIdx], childIdx); - const double childCost = childResult._cost + - (childIdx > 0 ? _coefficients.getUnionIncrementalCost() * childResult._ce : 0); - totalCost += childCost; - } - return {totalCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const GroupByNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - double groupByCost = _coefficients.getGroupByStartupCost(); - - // TODO: for now pretend global group by is free. - if (node.getType() == GroupNodeType::Global) { - groupByCost += childResult._cost; - } else { - // TODO: consider RepetitionEstimate since this is a stateful operation. - groupByCost += - _coefficients.getGroupByIncrementalCost() * childResult._ce + childResult._cost; - } - return {groupByCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const UnwindNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - // Unwind probably depends mostly on its output size. - const double unwindCost = - _coefficients.getUnwindIncrementalCost() * _cardinalityEstimate + childResult._cost; - return {unwindCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const UniqueNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - const double uniqueCost = _coefficients.getUniqueStartupCost() + - _coefficients.getUniqueIncrementalCost() * childResult._ce + childResult._cost; - return {uniqueCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const CollationNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - // TODO: consider RepetitionEstimate since this is a stateful operation. - - double logFactor = childResult._ce; - double incrConst = _coefficients.getCollationIncrementalCost(); - if (hasProperty(_physProps)) { - if (auto limit = getPropertyConst(_physProps).getAbsoluteLimit(); - limit < logFactor) { - logFactor = limit; - incrConst = _coefficients.getCollationWithLimitIncrementalCost(); - } - } - - // Notice that log2(x) < 0 for any x < 1, and log2(1) = 0. Generally it makes sense that - // there is no cost to sort 1 document, so the only cost left is the startup cost. - const double sortCost = _coefficients.getCollationStartupCost() + childResult._cost + - ((logFactor <= 1.0) - ? 0.0 - // TODO: The cost formula below is based on 1 field, mix of int and str. Instead we - // have to take into account the number and size of sorted fields. - : incrConst * childResult._ce * std::log2(logFactor)); - return {sortCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const LimitSkipNode& node) { - // Assumed to be free. 
- CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - const double limitCost = _coefficients.getLimitSkipStartupCost() + childResult._cost + - _cardinalityEstimate * _coefficients.getLimitSkipIncrementalCost(); - return {limitCost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const ExchangeNode& node) { - CostAndCEInternal childResult = deriveChild(node.getChild(), 0); - double localCost = _coefficients.getExchangeStartupCost() + - _coefficients.getExchangeIncrementalCost() * _cardinalityEstimate; - - switch (node.getProperty().getDistributionAndProjections()._type) { - case DistributionType::Replicated: - localCost *= 2.0; - break; - - case DistributionType::HashPartitioning: - case DistributionType::RangePartitioning: - localCost *= 1.1; - break; - - default: - break; - } - - return {localCost + childResult._cost, _cardinalityEstimate}; - } - - CostAndCEInternal operator()(const ABT& /*n*/, const RootNode& node) { - return deriveChild(node.getChild(), 0); - } - - /** - * Other ABT types. - */ - template - CostAndCEInternal operator()(const ABT& /*n*/, const T& /*node*/, Ts&&...) { - static_assert(!canBePhysicalNode(), "Physical node must implement its cost derivation."); - return {0.0, 0.0}; - } - - static CostAndCEInternal derive(const Metadata& metadata, - const Memo& memo, - const PhysProps& physProps, - const ABT::reference_type physNodeRef, - const ChildPropsType& childProps, - const NodeCEMap& nodeCEMap, - const CostModelCoefficients& coefficients) { - CostAndCEInternal result = deriveInternal( - metadata, memo, physProps, physNodeRef, childProps, nodeCEMap, coefficients); - - switch (getPropertyConst(physProps) - .getDistributionAndProjections() - ._type) { - case DistributionType::Centralized: - case DistributionType::Replicated: - break; - - case DistributionType::RoundRobin: - case DistributionType::HashPartitioning: - case DistributionType::RangePartitioning: - case DistributionType::UnknownPartitioning: - result._cost /= metadata._numberOfPartitions; - break; - - default: - MONGO_UNREACHABLE; - } - - return result; - } - -private: - CostDerivation(const Metadata& metadata, - const Memo& memo, - const CEType ce, - const PhysProps& physProps, - const ChildPropsType& childProps, - const NodeCEMap& nodeCEMap, - const CostModelCoefficients& coefficients) - : _metadata(metadata), - _memo(memo), - _physProps(physProps), - _cardinalityEstimate(getAdjustedCE(ce, _physProps)), - _childProps(childProps), - _nodeCEMap(nodeCEMap), - _coefficients(coefficients) {} - - template - static bool isTrivialExpr(const ABT& n) { - if (n.is() || n.is()) { - return true; - } - if (const auto* ptr = n.cast(); ptr != nullptr && - ptr->getPath().template is() && isTrivialExpr(ptr->getInput())) { - return true; - } - return false; - } - - static CostAndCEInternal deriveInternal(const Metadata& metadata, - const Memo& memo, - const PhysProps& physProps, - const ABT::reference_type physNodeRef, - const ChildPropsType& childProps, - const NodeCEMap& nodeCEMap, - const CostModelCoefficients& coefficients) { - auto it = nodeCEMap.find(physNodeRef.cast()); - bool found = (it != nodeCEMap.cend()); - uassert(7034003, - "Only MemoLogicalDelegatorNode can be missing from nodeCEMap.", - found || physNodeRef.is()); - const CEType ce = (found ? 
it->second : 0.0); - - CostDerivation instance(metadata, memo, ce, physProps, childProps, nodeCEMap, coefficients); - CostAndCEInternal costCEestimates = physNodeRef.visit(instance); - return costCEestimates; - } - - CostAndCEInternal deriveChild(const ABT& child, const size_t childIndex) { - PhysProps physProps = _childProps.empty() ? _physProps : _childProps.at(childIndex).second; - return deriveInternal( - _metadata, _memo, physProps, child.ref(), {}, _nodeCEMap, _coefficients); - } - - static CEType getAdjustedCE(CEType baseCE, const PhysProps& physProps) { - CEType result = baseCE; - - // First: correct for un-enforced limit. - if (hasProperty(physProps)) { - const auto limit = getPropertyConst(physProps).getAbsoluteLimit(); - if (result > limit) { - result = limit; - } - } - - // Second: correct for enforced limit. - if (hasProperty(physProps)) { - const auto limit = getPropertyConst(physProps).getEstimate(); - if (result > limit) { - result = limit; - } - } - - // Third: correct for repetition. - if (hasProperty(physProps)) { - result *= getPropertyConst(physProps).getEstimate(); - } - - return result; - } - - // We don't own this. - const Metadata& _metadata; - const Memo& _memo; - const PhysProps& _physProps; - const CEType _cardinalityEstimate; - const ChildPropsType& _childProps; - const NodeCEMap& _nodeCEMap; - const CostModelCoefficients& _coefficients; -}; -} // namespace - -CostAndCE CostEstimator::deriveCost(const Metadata& metadata, - const Memo& memo, - const PhysProps& physProps, - const ABT::reference_type physNodeRef, - const ChildPropsType& childProps, - const NodeCEMap& nodeCEMap) const { - const CostAndCEInternal result = CostDerivation::derive( - metadata, memo, physProps, physNodeRef, childProps, nodeCEMap, _coefficients); - return {CostType::fromDouble(result._cost), result._ce}; -} - -} // namespace mongo::cost_model diff --git a/src/mongo/db/query/cost_model/cost_estimator.h b/src/mongo/db/query/cost_model/cost_estimator.h deleted file mode 100644 index 763351324b4..00000000000 --- a/src/mongo/db/query/cost_model/cost_estimator.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (C) 2022-present MongoDB, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the Server Side Public License, version 1, - * as published by MongoDB, Inc. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * Server Side Public License for more details. - * - * You should have received a copy of the Server Side Public License - * along with this program. If not, see - * . - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the Server Side Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#pragma once - -#include "mongo/db/query/cost_model/cost_model_gen.h" -#include "mongo/db/query/optimizer/cascades/interfaces.h" -#include "mongo/db/query/optimizer/cascades/memo.h" - -namespace mongo::cost_model { -/** - * Default costing for physical nodes with logical delegator (not-yet-optimized) inputs. - */ -class CostEstimator : public optimizer::cascades::CostingInterface { -public: - CostEstimator(CostModelCoefficients coefficicients) - : _coefficients{std::move(coefficicients)} {} - - optimizer::CostAndCE deriveCost(const optimizer::Metadata& metadata, - const optimizer::cascades::Memo& memo, - const optimizer::properties::PhysProps& physProps, - optimizer::ABT::reference_type physNodeRef, - const optimizer::ChildPropsType& childProps, - const optimizer::NodeCEMap& nodeCEMap) const override final; - -private: - const CostModelCoefficients _coefficients; -}; - -} // namespace mongo::cost_model diff --git a/src/mongo/db/query/cost_model/cost_estimator_impl.cpp b/src/mongo/db/query/cost_model/cost_estimator_impl.cpp new file mode 100644 index 00000000000..55d2bf6ab5c --- /dev/null +++ b/src/mongo/db/query/cost_model/cost_estimator_impl.cpp @@ -0,0 +1,418 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/cost_model/cost_estimator_impl.h" + +#include "mongo/db/query/optimizer/defs.h" + +namespace mongo::cost_model { + +using namespace optimizer; +using namespace optimizer::properties; +using optimizer::cascades::Memo; + +namespace { +struct CostAndCEInternal { + CostAndCEInternal(double cost, CEType ce) : _cost(cost), _ce(ce) { + uassert(7034000, "Invalid cost.", !std::isnan(cost) && cost >= 0.0); + uassert(7034001, "Invalid cardinality", std::isfinite(ce) && ce >= 0.0); + } + double _cost; + CEType _ce; +}; + +class CostDerivation { +public: + CostAndCEInternal operator()(const ABT& /*n*/, const PhysicalScanNode& /*node*/) { + // Default estimate for scan. 
+ const double collectionScanCost = _coefficients.getScanStartupCost() + + _coefficients.getScanIncrementalCost() * _cardinalityEstimate; + return {collectionScanCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const CoScanNode& /*node*/) { + // Assumed to be free. + return {_coefficients.getDefaultStartupCost(), _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const IndexScanNode& node) { + const double indexScanCost = _coefficients.getIndexScanStartupCost() + + _coefficients.getIndexScanIncrementalCost() * _cardinalityEstimate; + return {indexScanCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const SeekNode& /*node*/) { + // SeekNode should deliver one result via cardinality estimate override. + // TODO: consider using node.getProjectionMap()._fieldProjections.size() to make the cost + // dependent on the size of the projection + const double seekCost = + _coefficients.getSeekStartupCost() + _coefficients.getSeekCost() * _cardinalityEstimate; + return {seekCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const MemoLogicalDelegatorNode& node) { + const LogicalProps& childLogicalProps = _memo.getLogicalProps(node.getGroupId()); + // Notice that unlike all physical nodes, this logical node takes it cardinality directly + // from the memo group logical property, ignoring _cardinalityEstimate. + CEType baseCE = getPropertyConst(childLogicalProps).getEstimate(); + + if (hasProperty(_physProps)) { + const auto& indexingReq = getPropertyConst(_physProps); + if (indexingReq.getIndexReqTarget() == IndexReqTarget::Seek) { + // If we are performing a seek, normalize against the scan group cardinality. + const GroupIdType scanGroupId = + getPropertyConst(childLogicalProps).getScanGroupId(); + if (scanGroupId == node.getGroupId()) { + baseCE = 1.0; + } else { + const CEType scanGroupCE = + getPropertyConst(_memo.getLogicalProps(scanGroupId)) + .getEstimate(); + if (scanGroupCE > 0.0) { + baseCE /= scanGroupCE; + } + } + } + } + + return {0.0, getAdjustedCE(baseCE, _physProps)}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const MemoPhysicalDelegatorNode& /*node*/) { + uasserted(7034002, "Should not be costing physical delegator nodes."); + } + + CostAndCEInternal operator()(const ABT& /*n*/, const FilterNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + double filterCost = childResult._cost; + if (!isTrivialExpr(node.getFilter())) { + // Non-trivial filter. + filterCost += _coefficients.getFilterStartupCost() + + _coefficients.getFilterIncrementalCost() * childResult._ce; + } + return {filterCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const EvaluationNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + double evalCost = childResult._cost; + if (!isTrivialExpr(node.getProjection())) { + // Non-trivial projection. 
+ evalCost += _coefficients.getEvalStartupCost() + + _coefficients.getEvalIncrementalCost() * _cardinalityEstimate; + } + return {evalCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const BinaryJoinNode& node) { + CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); + CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); + const double joinCost = _coefficients.getBinaryJoinStartupCost() + + _coefficients.getBinaryJoinIncrementalCost() * + (leftChildResult._ce + rightChildResult._ce) + + leftChildResult._cost + rightChildResult._cost; + return {joinCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const HashJoinNode& node) { + CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); + CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); + + // TODO: distinguish build side and probe side. + const double hashJoinCost = _coefficients.getHashJoinStartupCost() + + _coefficients.getHashJoinIncrementalCost() * + (leftChildResult._ce + rightChildResult._ce) + + leftChildResult._cost + rightChildResult._cost; + return {hashJoinCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const MergeJoinNode& node) { + CostAndCEInternal leftChildResult = deriveChild(node.getLeftChild(), 0); + CostAndCEInternal rightChildResult = deriveChild(node.getRightChild(), 1); + + const double mergeJoinCost = _coefficients.getMergeJoinStartupCost() + + _coefficients.getMergeJoinIncrementalCost() * + (leftChildResult._ce + rightChildResult._ce) + + leftChildResult._cost + rightChildResult._cost; + + return {mergeJoinCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const UnionNode& node) { + const ABTVector& children = node.nodes(); + // UnionNode with one child is optimized away before lowering, therefore + // its cost is the cost of its child. + if (children.size() == 1) { + CostAndCEInternal childResult = deriveChild(children[0], 0); + return {childResult._cost, _cardinalityEstimate}; + } + + double totalCost = _coefficients.getUnionStartupCost(); + // The cost is the sum of the costs of its children and the cost to union each child. + for (size_t childIdx = 0; childIdx < children.size(); childIdx++) { + CostAndCEInternal childResult = deriveChild(children[childIdx], childIdx); + const double childCost = childResult._cost + + (childIdx > 0 ? _coefficients.getUnionIncrementalCost() * childResult._ce : 0); + totalCost += childCost; + } + return {totalCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const GroupByNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + double groupByCost = _coefficients.getGroupByStartupCost(); + + // TODO: for now pretend global group by is free. + if (node.getType() == GroupNodeType::Global) { + groupByCost += childResult._cost; + } else { + // TODO: consider RepetitionEstimate since this is a stateful operation. + groupByCost += + _coefficients.getGroupByIncrementalCost() * childResult._ce + childResult._cost; + } + return {groupByCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const UnwindNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + // Unwind probably depends mostly on its output size. 
+ const double unwindCost = + _coefficients.getUnwindIncrementalCost() * _cardinalityEstimate + childResult._cost; + return {unwindCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const UniqueNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + const double uniqueCost = _coefficients.getUniqueStartupCost() + + _coefficients.getUniqueIncrementalCost() * childResult._ce + childResult._cost; + return {uniqueCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const CollationNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + // TODO: consider RepetitionEstimate since this is a stateful operation. + + double logFactor = childResult._ce; + double incrConst = _coefficients.getCollationIncrementalCost(); + if (hasProperty(_physProps)) { + if (auto limit = getPropertyConst(_physProps).getAbsoluteLimit(); + limit < logFactor) { + logFactor = limit; + incrConst = _coefficients.getCollationWithLimitIncrementalCost(); + } + } + + // Notice that log2(x) < 0 for any x < 1, and log2(1) = 0. Generally it makes sense that + // there is no cost to sort 1 document, so the only cost left is the startup cost. + const double sortCost = _coefficients.getCollationStartupCost() + childResult._cost + + ((logFactor <= 1.0) + ? 0.0 + // TODO: The cost formula below is based on 1 field, mix of int and str. Instead we + // have to take into account the number and size of sorted fields. + : incrConst * childResult._ce * std::log2(logFactor)); + return {sortCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const LimitSkipNode& node) { + // Assumed to be free. + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + const double limitCost = _coefficients.getLimitSkipStartupCost() + childResult._cost + + _cardinalityEstimate * _coefficients.getLimitSkipIncrementalCost(); + return {limitCost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const ExchangeNode& node) { + CostAndCEInternal childResult = deriveChild(node.getChild(), 0); + double localCost = _coefficients.getExchangeStartupCost() + + _coefficients.getExchangeIncrementalCost() * _cardinalityEstimate; + + switch (node.getProperty().getDistributionAndProjections()._type) { + case DistributionType::Replicated: + localCost *= 2.0; + break; + + case DistributionType::HashPartitioning: + case DistributionType::RangePartitioning: + localCost *= 1.1; + break; + + default: + break; + } + + return {localCost + childResult._cost, _cardinalityEstimate}; + } + + CostAndCEInternal operator()(const ABT& /*n*/, const RootNode& node) { + return deriveChild(node.getChild(), 0); + } + + /** + * Other ABT types. + */ + template + CostAndCEInternal operator()(const ABT& /*n*/, const T& /*node*/, Ts&&...) 
{ + static_assert(!canBePhysicalNode(), "Physical node must implement its cost derivation."); + return {0.0, 0.0}; + } + + static CostAndCEInternal derive(const Metadata& metadata, + const Memo& memo, + const PhysProps& physProps, + const ABT::reference_type physNodeRef, + const ChildPropsType& childProps, + const NodeCEMap& nodeCEMap, + const CostModelCoefficients& coefficients) { + CostAndCEInternal result = deriveInternal( + metadata, memo, physProps, physNodeRef, childProps, nodeCEMap, coefficients); + + switch (getPropertyConst(physProps) + .getDistributionAndProjections() + ._type) { + case DistributionType::Centralized: + case DistributionType::Replicated: + break; + + case DistributionType::RoundRobin: + case DistributionType::HashPartitioning: + case DistributionType::RangePartitioning: + case DistributionType::UnknownPartitioning: + result._cost /= metadata._numberOfPartitions; + break; + + default: + MONGO_UNREACHABLE; + } + + return result; + } + +private: + CostDerivation(const Metadata& metadata, + const Memo& memo, + const CEType ce, + const PhysProps& physProps, + const ChildPropsType& childProps, + const NodeCEMap& nodeCEMap, + const CostModelCoefficients& coefficients) + : _metadata(metadata), + _memo(memo), + _physProps(physProps), + _cardinalityEstimate(getAdjustedCE(ce, _physProps)), + _childProps(childProps), + _nodeCEMap(nodeCEMap), + _coefficients(coefficients) {} + + template + static bool isTrivialExpr(const ABT& n) { + if (n.is() || n.is()) { + return true; + } + if (const auto* ptr = n.cast(); ptr != nullptr && + ptr->getPath().template is() && isTrivialExpr(ptr->getInput())) { + return true; + } + return false; + } + + static CostAndCEInternal deriveInternal(const Metadata& metadata, + const Memo& memo, + const PhysProps& physProps, + const ABT::reference_type physNodeRef, + const ChildPropsType& childProps, + const NodeCEMap& nodeCEMap, + const CostModelCoefficients& coefficients) { + auto it = nodeCEMap.find(physNodeRef.cast()); + bool found = (it != nodeCEMap.cend()); + uassert(7034003, + "Only MemoLogicalDelegatorNode can be missing from nodeCEMap.", + found || physNodeRef.is()); + const CEType ce = (found ? it->second : 0.0); + + CostDerivation instance(metadata, memo, ce, physProps, childProps, nodeCEMap, coefficients); + CostAndCEInternal costCEestimates = physNodeRef.visit(instance); + return costCEestimates; + } + + CostAndCEInternal deriveChild(const ABT& child, const size_t childIndex) { + PhysProps physProps = _childProps.empty() ? _physProps : _childProps.at(childIndex).second; + return deriveInternal( + _metadata, _memo, physProps, child.ref(), {}, _nodeCEMap, _coefficients); + } + + static CEType getAdjustedCE(CEType baseCE, const PhysProps& physProps) { + CEType result = baseCE; + + // First: correct for un-enforced limit. + if (hasProperty(physProps)) { + const auto limit = getPropertyConst(physProps).getAbsoluteLimit(); + if (result > limit) { + result = limit; + } + } + + // Second: correct for enforced limit. + if (hasProperty(physProps)) { + const auto limit = getPropertyConst(physProps).getEstimate(); + if (result > limit) { + result = limit; + } + } + + // Third: correct for repetition. + if (hasProperty(physProps)) { + result *= getPropertyConst(physProps).getEstimate(); + } + + return result; + } + + // We don't own this. 
+ const Metadata& _metadata; + const Memo& _memo; + const PhysProps& _physProps; + const CEType _cardinalityEstimate; + const ChildPropsType& _childProps; + const NodeCEMap& _nodeCEMap; + const CostModelCoefficients& _coefficients; +}; +} // namespace + +CostAndCE CostEstimatorImpl::deriveCost(const Metadata& metadata, + const Memo& memo, + const PhysProps& physProps, + const ABT::reference_type physNodeRef, + const ChildPropsType& childProps, + const NodeCEMap& nodeCEMap) const { + const CostAndCEInternal result = CostDerivation::derive( + metadata, memo, physProps, physNodeRef, childProps, nodeCEMap, _coefficients); + return {CostType::fromDouble(result._cost), result._ce}; +} + +} // namespace mongo::cost_model diff --git a/src/mongo/db/query/cost_model/cost_estimator_impl.h b/src/mongo/db/query/cost_model/cost_estimator_impl.h new file mode 100644 index 00000000000..0ed094c02a9 --- /dev/null +++ b/src/mongo/db/query/cost_model/cost_estimator_impl.h @@ -0,0 +1,56 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/cost_model/cost_model_gen.h" +#include "mongo/db/query/optimizer/cascades/interfaces.h" +#include "mongo/db/query/optimizer/cascades/memo.h" + +namespace mongo::cost_model { +/** + * Default costing for physical nodes with logical delegator (not-yet-optimized) inputs. 
+ */
+class CostEstimatorImpl : public optimizer::cascades::CostEstimator {
+public:
+    CostEstimatorImpl(CostModelCoefficients coefficients)
+        : _coefficients{std::move(coefficients)} {}
+
+    optimizer::CostAndCE deriveCost(const optimizer::Metadata& metadata,
+                                    const optimizer::cascades::Memo& memo,
+                                    const optimizer::properties::PhysProps& physProps,
+                                    optimizer::ABT::reference_type physNodeRef,
+                                    const optimizer::ChildPropsType& childProps,
+                                    const optimizer::NodeCEMap& nodeCEMap) const override final;
+
+private:
+    const CostModelCoefficients _coefficients;
+};
+
+}  // namespace mongo::cost_model
diff --git a/src/mongo/db/query/cost_model/cost_estimator_test.cpp b/src/mongo/db/query/cost_model/cost_estimator_test.cpp
index 420da954b06..5b68139a72e 100644
--- a/src/mongo/db/query/cost_model/cost_estimator_test.cpp
+++ b/src/mongo/db/query/cost_model/cost_estimator_test.cpp
@@ -27,7 +27,7 @@
  * it in the license file.
  */
 
-#include "mongo/db/query/cost_model/cost_estimator.h"
+#include "mongo/db/query/cost_model/cost_estimator_impl.h"
 #include "mongo/db/query/cost_model/cost_model_gen.h"
 #include "mongo/db/query/cost_model/cost_model_utils.h"
 #include "mongo/db/query/optimizer/cascades/memo.h"
@@ -48,7 +48,7 @@ TEST(CostEstimatorTest, PhysicalScanCost) {
     costModel.setScanStartupCost(startupCost);
     costModel.setScanIncrementalCost(scanCost);
 
-    CostEstimator costEstimator{costModel};
+    CostEstimatorImpl costEstimator{costModel};
 
     optimizer::Metadata metadata{{}};
     optimizer::cascades::Memo memo{};
@@ -94,7 +94,7 @@ TEST(CostEstimatorTest, PhysicalScanCostWithAdjustedCE) {
     costModel.setScanStartupCost(startupCost);
     costModel.setScanIncrementalCost(scanCost);
 
-    CostEstimator costEstimator{costModel};
+    CostEstimatorImpl costEstimator{costModel};
 
     optimizer::Metadata metadata{{}};
     optimizer::cascades::Memo memo{};
@@ -130,7 +130,7 @@ TEST(CostEstimatorTest, IndexScanCost) {
     costModel.setIndexScanStartupCost(startupCost);
     costModel.setIndexScanIncrementalCost(indexScanCost);
 
-    CostEstimator costEstimator{costModel};
+    CostEstimatorImpl costEstimator{costModel};
 
     optimizer::Metadata metadata{{}};
     optimizer::cascades::Memo memo{};
@@ -173,7 +173,7 @@ TEST(CostEstimatorTest, FilterAndEvaluationCost) {
     costModel.setEvalIncrementalCost(evalCost);
     costModel.setEvalStartupCost(startupCost);
 
-    CostEstimator costEstimator{costModel};
+    CostEstimatorImpl costEstimator{costModel};
 
     optimizer::Metadata metadata{{}};
     optimizer::cascades::Memo memo{};
@@ -273,7 +273,7 @@ TEST(CostEstimatorTest, MergeJoinCost) {
 
     nodeCEMap[evalNodeRight.cast<optimizer::Node>()] = ce;
 
-    CostEstimator costEstimator{costModel};
+    CostEstimatorImpl costEstimator{costModel};
 
     optimizer::Metadata metadata{{}};
     optimizer::cascades::Memo memo{};
diff --git a/src/mongo/db/query/cqf_get_executor.cpp b/src/mongo/db/query/cqf_get_executor.cpp
index 642a8ad6322..be762fc0b9e 100644
--- a/src/mongo/db/query/cqf_get_executor.cpp
+++ b/src/mongo/db/query/cqf_get_executor.cpp
@@ -35,12 +35,11 @@
 #include "mongo/db/pipeline/abt/document_source_visitor.h"
 #include "mongo/db/pipeline/abt/match_expression_visitor.h"
 #include "mongo/db/pipeline/abt/utils.h"
-#include "mongo/db/query/ce/ce_heuristic.h"
-#include "mongo/db/query/ce/ce_histogram.h"
-#include "mongo/db/query/ce/ce_sampling.h"
-#include "mongo/db/query/ce/collection_statistics_impl.h"
+#include "mongo/db/query/ce/heuristic_estimator.h"
+#include "mongo/db/query/ce/histogram_estimator.h"
+#include "mongo/db/query/ce/sampling_estimator.h"
 #include "mongo/db/query/ce_mode_parameter.h"
-#include
"mongo/db/query/cost_model/cost_estimator.h" +#include "mongo/db/query/cost_model/cost_estimator_impl.h" #include "mongo/db/query/cost_model/cost_model_gen.h" #include "mongo/db/query/cost_model/cost_model_manager.h" #include "mongo/db/query/cost_model/on_coefficients_change_updater_impl.h" @@ -54,6 +53,7 @@ #include "mongo/db/query/query_knobs_gen.h" #include "mongo/db/query/query_planner_params.h" #include "mongo/db/query/sbe_stage_builder.h" +#include "mongo/db/query/stats/collection_statistics_impl.h" #include "mongo/db/query/yield_policy_callbacks_impl.h" #include "mongo/logv2/log.h" #include "mongo/logv2/log_attr.h" @@ -65,7 +65,10 @@ MONGO_FAIL_POINT_DEFINE(failConstructingBonsaiExecutor); namespace mongo { using namespace optimizer; -using cost_model::CostEstimator; +using ce::HeuristicEstimator; +using ce::HistogramEstimator; +using ce::SamplingEstimator; +using cost_model::CostEstimatorImpl; using cost_model::CostModelManager; static opt::unordered_map buildIndexSpecsOptimizer( @@ -582,9 +585,9 @@ static OptPhaseManager createPhaseManager(const CEMode mode, prefixId, false /*requireRID*/, std::move(metadataForSampling), - std::make_unique(), - std::make_unique(), - std::make_unique(costModel), + std::make_unique(), + std::make_unique(), + std::make_unique(costModel), defaultConvertPathToInterval, constFold, DebugInfo::kDefaultForProd, @@ -593,12 +596,12 @@ static OptPhaseManager createPhaseManager(const CEMode mode, prefixId, requireRID, std::move(metadata), - std::make_unique(opCtx, - std::move(phaseManagerForSampling), - collectionSize, - std::make_unique()), - std::make_unique(), - std::make_unique(costModel), + std::make_unique(opCtx, + std::move(phaseManagerForSampling), + collectionSize, + std::make_unique()), + std::make_unique(), + std::make_unique(costModel), defaultConvertPathToInterval, constFold, DebugInfo::kDefaultForProd, @@ -610,11 +613,11 @@ static OptPhaseManager createPhaseManager(const CEMode mode, prefixId, requireRID, std::move(metadata), - std::make_unique( - std::make_shared(collectionSize, nss), - std::make_unique()), - std::make_unique(), - std::make_unique(costModel), + std::make_unique( + std::make_shared(collectionSize, nss), + std::make_unique()), + std::make_unique(), + std::make_unique(costModel), defaultConvertPathToInterval, constFold, DebugInfo::kDefaultForProd, @@ -625,9 +628,9 @@ static OptPhaseManager createPhaseManager(const CEMode mode, prefixId, requireRID, std::move(metadata), - std::make_unique(), - std::make_unique(), - std::make_unique(costModel), + std::make_unique(), + std::make_unique(), + std::make_unique(costModel), defaultConvertPathToInterval, constFold, DebugInfo::kDefaultForProd, diff --git a/src/mongo/db/query/optimizer/cascades/interfaces.h b/src/mongo/db/query/optimizer/cascades/interfaces.h index 2eb2f801b0b..63d20fe9845 100644 --- a/src/mongo/db/query/optimizer/cascades/interfaces.h +++ b/src/mongo/db/query/optimizer/cascades/interfaces.h @@ -56,9 +56,9 @@ public: /** * Interface for deriving CE for a newly added logical node in a new memo group. */ -class CEInterface { +class CardinalityEstimator { public: - virtual ~CEInterface() = default; + virtual ~CardinalityEstimator() = default; virtual CEType deriveCE(const Metadata& metadata, const Memo& memo, @@ -69,9 +69,9 @@ public: /** * Interface for deriving costs and adjusted CE (based on physical props) for a physical node. 
*/ -class CostingInterface { +class CostEstimator { public: - virtual ~CostingInterface() = default; + virtual ~CostEstimator() = default; virtual CostAndCE deriveCost(const Metadata& metadata, const Memo& memo, const properties::PhysProps& physProps, diff --git a/src/mongo/db/query/optimizer/cascades/logical_rewriter.cpp b/src/mongo/db/query/optimizer/cascades/logical_rewriter.cpp index 7f527d394cf..f1192426a1d 100644 --- a/src/mongo/db/query/optimizer/cascades/logical_rewriter.cpp +++ b/src/mongo/db/query/optimizer/cascades/logical_rewriter.cpp @@ -88,7 +88,7 @@ LogicalRewriter::LogicalRewriter(const Metadata& metadata, const PathToIntervalFn& pathToInterval, const ConstFoldFn& constFold, const LogicalPropsInterface& logicalPropsDerivation, - const CEInterface& ceDerivation) + const CardinalityEstimator& cardinalityEstimator) : _activeRewriteSet(std::move(rewriteSet)), _groupsPending(), _metadata(metadata), @@ -99,7 +99,7 @@ LogicalRewriter::LogicalRewriter(const Metadata& metadata, _pathToInterval(pathToInterval), _constFold(constFold), _logicalPropsDerivation(logicalPropsDerivation), - _ceDerivation(ceDerivation) { + _cardinalityEstimator(cardinalityEstimator) { initializeRewrites(); if (_activeRewriteSet.count(LogicalRewriteType::SargableSplit) > 0) { @@ -132,7 +132,7 @@ std::pair LogicalRewriter::addNode(const ABT& node, } const GroupIdType resultGroupId = _memo.integrate( - Memo::Context{&_metadata, &_debugInfo, &_logicalPropsDerivation, &_ceDerivation}, + Memo::Context{&_metadata, &_debugInfo, &_logicalPropsDerivation, &_cardinalityEstimator}, node, std::move(targetGroupMap), insertNodeIds, diff --git a/src/mongo/db/query/optimizer/cascades/logical_rewriter.h b/src/mongo/db/query/optimizer/cascades/logical_rewriter.h index 0af43abae90..5ed180a222b 100644 --- a/src/mongo/db/query/optimizer/cascades/logical_rewriter.h +++ b/src/mongo/db/query/optimizer/cascades/logical_rewriter.h @@ -61,7 +61,7 @@ public: const PathToIntervalFn& pathToInterval, const ConstFoldFn& constFold, const LogicalPropsInterface& logicalPropsDerivation, - const CEInterface& ceDerivation); + const CardinalityEstimator& cardinalityEstimator); // This is a transient structure. We do not allow copying or moving. 
LogicalRewriter() = delete; @@ -130,7 +130,7 @@ private: const PathToIntervalFn& _pathToInterval; const ConstFoldFn& _constFold; const LogicalPropsInterface& _logicalPropsDerivation; - const CEInterface& _ceDerivation; + const CardinalityEstimator& _cardinalityEstimator; RewriteFnMap _rewriteMap; diff --git a/src/mongo/db/query/optimizer/cascades/memo.cpp b/src/mongo/db/query/optimizer/cascades/memo.cpp index 12f43bb2d9d..f847b1df58b 100644 --- a/src/mongo/db/query/optimizer/cascades/memo.cpp +++ b/src/mongo/db/query/optimizer/cascades/memo.cpp @@ -535,15 +535,15 @@ private: Memo::Context::Context(const Metadata* metadata, const DebugInfo* debugInfo, const LogicalPropsInterface* logicalPropsDerivation, - const CEInterface* ceDerivation) + const CardinalityEstimator* cardinalityEstimator) : _metadata(metadata), _debugInfo(debugInfo), _logicalPropsDerivation(logicalPropsDerivation), - _ceDerivation(ceDerivation) { + _cardinalityEstimator(cardinalityEstimator) { invariant(_metadata != nullptr); invariant(_debugInfo != nullptr); invariant(_logicalPropsDerivation != nullptr); - invariant(_ceDerivation != nullptr); + invariant(_cardinalityEstimator != nullptr); } size_t Memo::GroupIdVectorHash::operator()(const Memo::GroupIdVector& v) const { @@ -617,7 +617,8 @@ void Memo::estimateCE(const Context& ctx, const GroupIdType groupId) { ctx._logicalPropsDerivation->deriveProps(*ctx._metadata, nodeRef, nullptr, this, groupId); props.merge(logicalProps); - const CEType estimate = ctx._ceDerivation->deriveCE(*ctx._metadata, *this, props, nodeRef); + const CEType estimate = + ctx._cardinalityEstimator->deriveCE(*ctx._metadata, *this, props, nodeRef); auto ceProp = properties::CardinalityEstimate(estimate); if (auto sargablePtr = nodeRef.cast(); sargablePtr != nullptr) { @@ -630,8 +631,8 @@ void Memo::estimateCE(const Context& ctx, const GroupIdType groupId) { ScanParams{}, sargablePtr->getTarget(), sargablePtr->getChild()); - const CEType singularEst = - ctx._ceDerivation->deriveCE(*ctx._metadata, *this, props, singularReq.ref()); + const CEType singularEst = ctx._cardinalityEstimator->deriveCE( + *ctx._metadata, *this, props, singularReq.ref()); partialSchemaKeyCE.emplace_back(key, singularEst); } } diff --git a/src/mongo/db/query/optimizer/cascades/memo.h b/src/mongo/db/query/optimizer/cascades/memo.h index c2c2b6a2d93..93321bce7e1 100644 --- a/src/mongo/db/query/optimizer/cascades/memo.h +++ b/src/mongo/db/query/optimizer/cascades/memo.h @@ -127,13 +127,13 @@ public: Context(const Metadata* metadata, const DebugInfo* debugInfo, const LogicalPropsInterface* logicalPropsDerivation, - const CEInterface* ceDerivation); + const CardinalityEstimator* cardinalityEstimator); // None of those should be null. 
const Metadata* _metadata; const DebugInfo* _debugInfo; const LogicalPropsInterface* _logicalPropsDerivation; - const CEInterface* _ceDerivation; + const CardinalityEstimator* _cardinalityEstimator; }; struct Stats { diff --git a/src/mongo/db/query/optimizer/cascades/physical_rewriter.cpp b/src/mongo/db/query/optimizer/cascades/physical_rewriter.cpp index c0b3423ab0c..bdbe168e146 100644 --- a/src/mongo/db/query/optimizer/cascades/physical_rewriter.cpp +++ b/src/mongo/db/query/optimizer/cascades/physical_rewriter.cpp @@ -110,13 +110,13 @@ PhysicalRewriter::PhysicalRewriter(const Metadata& metadata, const DebugInfo& debugInfo, const QueryHints& hints, const RIDProjectionsMap& ridProjections, - const CostingInterface& costDerivation, + const CostEstimator& costEstimator, const PathToIntervalFn& pathToInterval, std::unique_ptr& logicalRewriter) : _metadata(metadata), _memo(memo), _rootGroupId(rootGroupId), - _costDerivation(costDerivation), + _costEstimator(costEstimator), _debugInfo(debugInfo), _hints(hints), _ridProjections(ridProjections), @@ -149,7 +149,7 @@ void PhysicalRewriter::costAndRetainBestNode(std::unique_ptr node, const GroupIdType groupId, PrefixId& prefixId, PhysOptimizationResult& bestResult) { - const CostAndCE nodeCostAndCE = _costDerivation.deriveCost( + const CostAndCE nodeCostAndCE = _costEstimator.deriveCost( _metadata, _memo, bestResult._physProps, node->ref(), childProps, nodeCEMap); const CostType nodeCost = nodeCostAndCE._cost; uassert(6624056, "Must get non-infinity cost for physical node.", !nodeCost.isInfinite()); diff --git a/src/mongo/db/query/optimizer/cascades/physical_rewriter.h b/src/mongo/db/query/optimizer/cascades/physical_rewriter.h index a2fb936442e..50bc831dc46 100644 --- a/src/mongo/db/query/optimizer/cascades/physical_rewriter.h +++ b/src/mongo/db/query/optimizer/cascades/physical_rewriter.h @@ -58,7 +58,7 @@ public: const DebugInfo& debugInfo, const QueryHints& hints, const RIDProjectionsMap& ridProjections, - const CostingInterface& costDerivation, + const CostEstimator& costEstimator, const PathToIntervalFn& pathToInterval, std::unique_ptr& logicalRewriter); @@ -96,7 +96,7 @@ private: const Metadata& _metadata; Memo& _memo; const GroupIdType _rootGroupId; - const CostingInterface& _costDerivation; + const CostEstimator& _costEstimator; const DebugInfo& _debugInfo; const QueryHints& _hints; const RIDProjectionsMap& _ridProjections; diff --git a/src/mongo/db/query/optimizer/opt_phase_manager.cpp b/src/mongo/db/query/optimizer/opt_phase_manager.cpp index 79e6b34201a..2b710a1b2ca 100644 --- a/src/mongo/db/query/optimizer/opt_phase_manager.cpp +++ b/src/mongo/db/query/optimizer/opt_phase_manager.cpp @@ -49,9 +49,9 @@ OptPhaseManager::OptPhaseManager(OptPhaseManager::PhaseSet phaseSet, PrefixId& prefixId, const bool requireRID, Metadata metadata, - std::unique_ptr explorationCE, - std::unique_ptr substitutionCE, - std::unique_ptr costDerivation, + std::unique_ptr explorationCE, + std::unique_ptr substitutionCE, + std::unique_ptr costEstimator, PathToIntervalFn pathToInterval, ConstFoldFn constFold, DebugInfo debugInfo, @@ -64,14 +64,14 @@ OptPhaseManager::OptPhaseManager(OptPhaseManager::PhaseSet phaseSet, _logicalPropsDerivation(std::make_unique()), _explorationCE(std::move(explorationCE)), _substitutionCE(std::move(substitutionCE)), - _costDerivation(std::move(costDerivation)), + _costEstimator(std::move(costEstimator)), _pathToInterval(std::move(pathToInterval)), _constFold(std::move(constFold)), _physicalNodeId(), _requireRID(requireRID), 
      _ridProjections(),
      _prefixId(prefixId) {
-    uassert(6624093, "Cost derivation is null", _costDerivation);
+    uassert(6624093, "Cost derivation is null", _costEstimator);
     uassert(7088900, "Exploration CE is null", _explorationCE);
     uassert(7088901, "Substitution CE is null", _substitutionCE);
@@ -224,7 +224,7 @@ void OptPhaseManager::runMemoPhysicalRewrite(const OptPhase phase,
                                        _debugInfo,
                                        _hints,
                                        _ridProjections,
-                                       *_costDerivation,
+                                       *_costEstimator,
                                        _pathToInterval,
                                        logicalRewriter);
diff --git a/src/mongo/db/query/optimizer/opt_phase_manager.h b/src/mongo/db/query/optimizer/opt_phase_manager.h
index 4c6e8c431ae..c1219361f55 100644
--- a/src/mongo/db/query/optimizer/opt_phase_manager.h
+++ b/src/mongo/db/query/optimizer/opt_phase_manager.h
@@ -79,9 +79,9 @@ public:
                     PrefixId& prefixId,
                     bool requireRID,
                     Metadata metadata,
-                    std::unique_ptr<cascades::CEInterface> explorationCE,
-                    std::unique_ptr<cascades::CEInterface> substitutionCE,
-                    std::unique_ptr<cascades::CostingInterface> costDerivation,
+                    std::unique_ptr<cascades::CardinalityEstimator> explorationCE,
+                    std::unique_ptr<cascades::CardinalityEstimator> substitutionCE,
+                    std::unique_ptr<cascades::CostEstimator> costEstimator,
                     PathToIntervalFn pathToInterval,
                     ConstFoldFn constFold,
                     DebugInfo debugInfo,
@@ -172,7 +172,7 @@ private:
     /**
      * Cardinality estimation implementation to be used during the exploration phase.
      */
-    std::unique_ptr<cascades::CEInterface> _explorationCE;
+    std::unique_ptr<cascades::CardinalityEstimator> _explorationCE;
 
     /**
      * Cardinality estimation implementation to be used during the substitution phase.
@@ -181,12 +181,12 @@
      * alternatives. Since some CE implementations are expensive (sampling), we let the caller pass
      * a different one for this phase.
      */
-    std::unique_ptr<cascades::CEInterface> _substitutionCE;
+    std::unique_ptr<cascades::CardinalityEstimator> _substitutionCE;
 
     /**
      * Cost derivation implementation.
      */
-    std::unique_ptr<cascades::CostingInterface> _costDerivation;
+    std::unique_ptr<cascades::CostEstimator> _costEstimator;
 
     /**
      * Path ABT node to index bounds converter implementation.
diff --git a/src/mongo/db/query/optimizer/utils/ce_math.cpp b/src/mongo/db/query/optimizer/utils/ce_math.cpp
index 936686354ac..4eaa9372c4e 100644
--- a/src/mongo/db/query/optimizer/utils/ce_math.cpp
+++ b/src/mongo/db/query/optimizer/utils/ce_math.cpp
@@ -34,8 +34,7 @@
 #include "mongo/db/query/optimizer/utils/ce_math.h"
 #include "mongo/util/assert_util.h"
 
-namespace mongo::ce {
-
+namespace mongo::optimizer::ce {
 bool validSelectivity(SelectivityType sel) {
     return (sel >= 0.0 && sel <= 1.0);
 }
@@ -82,4 +81,4 @@ SelectivityType disjExponentialBackoff(std::vector<SelectivityType> disjSelectiv
     }
     return 1.0 - sel;
 }
-}  // namespace mongo::ce
+}  // namespace mongo::optimizer::ce
diff --git a/src/mongo/db/query/optimizer/utils/ce_math.h b/src/mongo/db/query/optimizer/utils/ce_math.h
index a29a7d1f34b..44b87b14904 100644
--- a/src/mongo/db/query/optimizer/utils/ce_math.h
+++ b/src/mongo/db/query/optimizer/utils/ce_math.h
@@ -34,9 +34,7 @@
 #include "mongo/db/query/optimizer/defs.h"
 
-namespace mongo::ce {
-
-using namespace mongo::optimizer;
+namespace mongo::optimizer::ce {
 
 // Default cardinality when actual collection cardinality is unknown.
 // Mostly used by unit tests.
@@ -68,4 +66,4 @@ SelectivityType conjExponentialBackoff(std::vector<SelectivityType> conjSelectiv
 * exponential backoff.
*/ SelectivityType disjExponentialBackoff(std::vector disjSelectivities); -} // namespace mongo::ce +} // namespace mongo::optimizer::ce diff --git a/src/mongo/db/query/optimizer/utils/unit_test_utils.cpp b/src/mongo/db/query/optimizer/utils/unit_test_utils.cpp index f4b3ba349e5..e261c5ac537 100644 --- a/src/mongo/db/query/optimizer/utils/unit_test_utils.cpp +++ b/src/mongo/db/query/optimizer/utils/unit_test_utils.cpp @@ -32,9 +32,9 @@ #include #include "mongo/db/pipeline/abt/utils.h" -#include "mongo/db/query/ce/ce_heuristic.h" -#include "mongo/db/query/ce/ce_hinted.h" -#include "mongo/db/query/cost_model/cost_estimator.h" +#include "mongo/db/query/ce/heuristic_estimator.h" +#include "mongo/db/query/ce/hinted_estimator.h" +#include "mongo/db/query/cost_model/cost_estimator_impl.h" #include "mongo/db/query/cost_model/cost_model_manager.h" #include "mongo/db/query/optimizer/explain.h" #include "mongo/db/query/optimizer/metadata.h" @@ -246,16 +246,16 @@ IndexDefinition makeCompositeIndexDefinition(std::vector indexFi return IndexDefinition{std::move(idxCollSpec), isMultiKey}; } -std::unique_ptr makeHeuristicCE() { - return std::make_unique(); +std::unique_ptr makeHeuristicCE() { + return std::make_unique(); } -std::unique_ptr makeHintedCE(ce::PartialSchemaSelHints hints) { - return std::make_unique(std::move(hints)); +std::unique_ptr makeHintedCE(ce::PartialSchemaSelHints hints) { + return std::make_unique(std::move(hints)); } -std::unique_ptr makeCosting() { - return std::make_unique( +std::unique_ptr makeCostEstimator() { + return std::make_unique( cost_model::CostModelManager::getDefaultCoefficients()); } @@ -270,7 +270,7 @@ OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, std::move(metadata), makeHeuristicCE(), // primary CE makeHeuristicCE(), // substitution phase CE, same as primary - makeCosting(), + makeCostEstimator(), defaultConvertPathToInterval, ConstEval::constFold, std::move(debugInfo), @@ -280,16 +280,16 @@ OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, PrefixId& prefixId, Metadata metadata, - std::unique_ptr ceDerivation, + std::unique_ptr ce, DebugInfo debugInfo, QueryHints queryHints) { return OptPhaseManager{std::move(phaseSet), prefixId, false /*requireRID*/, std::move(metadata), - std::move(ceDerivation), // primary CE - makeHeuristicCE(), // substitution phase CE - makeCosting(), + std::move(ce), // primary CE + makeHeuristicCE(), // substitution phase CE + makeCostEstimator(), defaultConvertPathToInterval, ConstEval::constFold, std::move(debugInfo), @@ -308,7 +308,7 @@ OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, std::move(metadata), makeHeuristicCE(), // primary CE makeHeuristicCE(), // substitution phase CE, same as primary - std::make_unique(coefs), + std::make_unique(coefs), defaultConvertPathToInterval, ConstEval::constFold, std::move(debugInfo), @@ -326,7 +326,7 @@ OptPhaseManager makePhaseManagerRequireRID(OptPhaseManager::PhaseSet phaseSet, std::move(metadata), makeHeuristicCE(), // primary CE makeHeuristicCE(), // substitution phase CE, same as primary - makeCosting(), + makeCostEstimator(), defaultConvertPathToInterval, ConstEval::constFold, std::move(debugInfo), diff --git a/src/mongo/db/query/optimizer/utils/unit_test_utils.h b/src/mongo/db/query/optimizer/utils/unit_test_utils.h index 953b8a07781..a1e6549e52f 100644 --- a/src/mongo/db/query/optimizer/utils/unit_test_utils.h +++ 
b/src/mongo/db/query/optimizer/utils/unit_test_utils.h @@ -30,7 +30,7 @@ #pragma once #include "mongo/db/bson/dotted_path_support.h" -#include "mongo/db/query/ce/ce_hinted.h" +#include "mongo/db/query/ce/hinted_estimator.h" #include "mongo/db/query/cost_model/cost_model_gen.h" #include "mongo/db/query/optimizer/defs.h" #include "mongo/db/query/optimizer/explain.h" @@ -168,17 +168,17 @@ IndexDefinition makeCompositeIndexDefinition(std::vector indexFi /** * A factory function to create a heuristic-based cardinality estimator. */ -std::unique_ptr makeHeuristicCE(); +std::unique_ptr makeHeuristicCE(); /** * A factory function to create a hint-based cardinality estimator. */ -std::unique_ptr makeHintedCE(ce::PartialSchemaSelHints hints); +std::unique_ptr makeHintedCE(ce::PartialSchemaSelHints hints); /** * A convenience factory function to create costing. */ -std::unique_ptr makeCosting(); +std::unique_ptr makeCostEstimator(); /** * A convenience factory function to create OptPhaseManager for unit tests. @@ -195,7 +195,7 @@ OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, OptPhaseManager makePhaseManager(OptPhaseManager::PhaseSet phaseSet, PrefixId& prefixId, Metadata metadata, - std::unique_ptr ceDerivation, + std::unique_ptr ce, DebugInfo debugInfo, QueryHints queryHints = {}); diff --git a/src/mongo/db/query/query_knobs.idl b/src/mongo/db/query/query_knobs.idl index 6b10ec2e9fd..af84c505323 100644 --- a/src/mongo/db/query/query_knobs.idl +++ b/src/mongo/db/query/query_knobs.idl @@ -742,7 +742,7 @@ server_parameters: cpp_vartype: std::string default: sampling validator: - callback: ce::validateCEMode + callback: optimizer::ce::validateCEMode internalCascadesOptimizerDisableScan: description: "Disable full collection scans in the Cascades optimizer." 
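
Taken together, the renames above change how a unit test assembles a phase manager: makeCosting() becomes makeCostEstimator(), and the CE factories now return the renamed cascades::CardinalityEstimator interface. The following is a minimal sketch (not part of this patch) of the resulting wiring; it assumes the pre-existing helpers OptPhaseManager::getAllRewritesSet() and DebugInfo::kDefaultForTests, neither of which appears in this diff, and only the names touched by this commit are taken from the changes above.

    // Sketch only: building a phase manager with a hint-based cardinality
    // estimator via the renamed factory helpers from unit_test_utils.h.
    using namespace mongo::optimizer;

    PrefixId prefixId;
    ce::PartialSchemaSelHints hints;  // selectivity hints, keyed by partial schema key

    OptPhaseManager phaseManager = makePhaseManager(
        OptPhaseManager::getAllRewritesSet(),  // assumed helper for the full rewrite set
        prefixId,
        Metadata{{}},                          // empty metadata, as in the tests above
        makeHintedCE(std::move(hints)),        // primary CE; substitution CE defaults to heuristic
        DebugInfo::kDefaultForTests);          // assumed debug-info constant
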
diff --git a/src/mongo/db/query/stats/SConscript b/src/mongo/db/query/stats/SConscript new file mode 100644 index 00000000000..1ef2b61f00e --- /dev/null +++ b/src/mongo/db/query/stats/SConscript @@ -0,0 +1,123 @@ +# -*- mode: python -*- + +Import("env") + +env = env.Clone() + +env.Library( + target="query_stats", + source=[ + 'collection_statistics_impl.cpp', + 'stats_catalog.cpp', + 'stats_cache.cpp', + 'stats_cache_loader_impl.cpp', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/dbdirectclient', + '$BUILD_DIR/mongo/util/caching', + '$BUILD_DIR/mongo/util/concurrency/thread_pool', + 'stats_histograms', + ], +) + +env.Library( + target="stats_histograms", + source=[ + 'array_histogram.cpp', + 'scalar_histogram.cpp', + 'stats.idl', + 'value_utils.cpp', + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_values', + ], +) + +env.Library( + target="stats_gen", + source=[ + 'max_diff.cpp', + ], + LIBDEPS=[ + 'stats_histograms', + ], +) + +env.CppUnitTest( + target='stats_cache_loader_test', + source=[ + 'stats_cache_loader_test.cpp', + 'stats_cache_loader_test_fixture.cpp', + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/db/auth/authmocks', + '$BUILD_DIR/mongo/db/catalog/collection_crud', + '$BUILD_DIR/mongo/db/commands/test_commands_enabled', + '$BUILD_DIR/mongo/db/index_builds_coordinator_mongod', + '$BUILD_DIR/mongo/db/multitenancy', + '$BUILD_DIR/mongo/db/op_observer/op_observer', + '$BUILD_DIR/mongo/db/op_observer/op_observer_impl', + '$BUILD_DIR/mongo/db/query/datetime/date_time_support', + '$BUILD_DIR/mongo/db/query/query_test_service_context', + '$BUILD_DIR/mongo/db/query_expressions', + '$BUILD_DIR/mongo/db/repl/drop_pending_collection_reaper', + '$BUILD_DIR/mongo/db/repl/oplog', + '$BUILD_DIR/mongo/db/repl/optime', + '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', + '$BUILD_DIR/mongo/db/repl/replmocks', + '$BUILD_DIR/mongo/db/repl/storage_interface_impl', + '$BUILD_DIR/mongo/db/server_base', + '$BUILD_DIR/mongo/db/service_context', + '$BUILD_DIR/mongo/db/service_context_d_test_fixture', + '$BUILD_DIR/mongo/db/service_context_test_fixture', + '$BUILD_DIR/mongo/db/shard_role', + '$BUILD_DIR/mongo/db/storage/wiredtiger/storage_wiredtiger', + '$BUILD_DIR/mongo/db/timeseries/timeseries_options', + '$BUILD_DIR/mongo/unittest/unittest', + '$BUILD_DIR/mongo/util/clock_source_mock', + '$BUILD_DIR/mongo/util/fail_point', + '$BUILD_DIR/mongo/util/pcre_wrapper', + ], +) + +env.CppUnitTest( + target="stats_cache_test", + source=[ + "stats_cache_test.cpp", + "stats_cache_loader_mock.cpp", + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/service_context', + 'stats_test_utils', + ], +) + +env.CppUnitTest( + target="stats_path_test", + source=[ + "stats_path_test.cpp", + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/service_context', + 'stats_test_utils', + ], +) + +env.Library( + target="stats_test_utils", + source=[ + 'collection_statistics_mock.cpp', + 'rand_utils.cpp', + 'rand_utils_new.cpp', + 'maxdiff_test_utils.cpp', + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/exec/sbe/sbe_abt_test_util', + "$BUILD_DIR/mongo/unittest/unittest", + 'stats_gen', + 'stats_histograms', + ], +) diff --git a/src/mongo/db/query/stats/array_histogram.cpp b/src/mongo/db/query/stats/array_histogram.cpp new file mode 100644 index 00000000000..ccf11bf02d2 --- /dev/null +++ b/src/mongo/db/query/stats/array_histogram.cpp @@ -0,0 +1,209 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/value_utils.h" + +namespace mongo::stats { +namespace { +TypeCounts mapStatsTypeCountToTypeCounts(std::vector tc) { + TypeCounts out; + for (const auto& t : tc) { + out.emplace(deserialize(t.getTypeName().toString()), t.getCount()); + } + return out; +} +} // namespace + +ArrayHistogram::ArrayHistogram() : ArrayHistogram(ScalarHistogram(), {}) {} + +ArrayHistogram::ArrayHistogram(Statistics stats) + : ArrayHistogram(stats.getScalarHistogram(), + mapStatsTypeCountToTypeCounts(stats.getTypeCount()), + stats.getTrueCount(), + stats.getFalseCount()) { + // TODO SERVER-71513: initialize non-scalar histogram fields. 
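+    // (Editorial note, added for clarity.) This constructor delegates to the
+    // scalar-field constructor below. mapStatsTypeCountToTypeCounts() above turns
+    // the IDL's [{typeName, count}, ...] array back into a TypeCounts map by
+    // re-deserializing each type name, e.g. "NumberInt64" -> TypeTags::NumberInt64.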
+} + +ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double emptyArrayCount, + double trueCount, + double falseCount) + : _scalar(std::move(scalar)), + _typeCounts(std::move(typeCounts)), + _emptyArrayCount(emptyArrayCount), + _trueCount(trueCount), + _falseCount(falseCount), + _arrayUnique(std::move(arrayUnique)), + _arrayMin(std::move(arrayMin)), + _arrayMax(std::move(arrayMax)), + _arrayTypeCounts(std::move(arrayTypeCounts)) { + invariant(isArray()); +} + +ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + double trueCount, + double falseCount) + : _scalar(std::move(scalar)), + _typeCounts(std::move(typeCounts)), + _emptyArrayCount(0.0), + _trueCount(trueCount), + _falseCount(falseCount), + _arrayUnique(boost::none), + _arrayMin(boost::none), + _arrayMax(boost::none), + _arrayTypeCounts(boost::none) { + invariant(!isArray()); +} + +bool ArrayHistogram::isArray() const { + return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts; +} + +std::string typeCountsToString(const TypeCounts& typeCounts) { + std::ostringstream os; + os << "{"; + bool first = true; + for (auto [tag, count] : typeCounts) { + if (!first) + os << ", "; + os << tag << ": " << count; + first = false; + } + os << "}"; + return os.str(); +} + +std::string ArrayHistogram::toString() const { + std::ostringstream os; + os << "{\n"; + os << " scalar: " << _scalar.toString(); + os << ",\n typeCounts: " << typeCountsToString(_typeCounts); + if (isArray()) { + os << ",\n arrayUnique: " << _arrayUnique->toString(); + os << ",\n arrayMin: " << _arrayMin->toString(); + os << ",\n arrayMax: " << _arrayMax->toString(); + os << ",\n arrayTypeCounts: " << typeCountsToString(*_arrayTypeCounts); + } + os << "\n}\n"; + return os.str(); +} + +const ScalarHistogram& ArrayHistogram::getScalar() const { + return _scalar; +} + +const ScalarHistogram& ArrayHistogram::getArrayUnique() const { + invariant(isArray()); + return *_arrayUnique; +} + +const ScalarHistogram& ArrayHistogram::getArrayMin() const { + invariant(isArray()); + return *_arrayMin; +} + +const ScalarHistogram& ArrayHistogram::getArrayMax() const { + invariant(isArray()); + return *_arrayMax; +} + +const TypeCounts& ArrayHistogram::getTypeCounts() const { + return _typeCounts; +} + +const TypeCounts& ArrayHistogram::getArrayTypeCounts() const { + invariant(isArray()); + return *_arrayTypeCounts; +} + +double ArrayHistogram::getArrayCount() const { + if (isArray()) { + auto findArray = _typeCounts.find(sbe::value::TypeTags::Array); + uassert(6979504, + "Histogram with array data must have a total array count.", + findArray != _typeCounts.end()); + double arrayCount = findArray->second; + uassert(6979503, "Histogram with array data must have at least one array.", arrayCount > 0); + return arrayCount; + } + return 0; +} + +BSONObj ArrayHistogram::serialize() const { + BSONObjBuilder histogramBuilder; + + // Serialize boolean type counters. + histogramBuilder.append("trueCount", getTrueCount()); + histogramBuilder.append("falseCount", getFalseCount()); + + // Serialize empty array counts. + histogramBuilder.appendNumber("emptyArrayCount", getEmptyArrayCount()); + + // Serialize type counts. 
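+    // (Editorial note.) With the counters appended above, the type-count array
+    // built below, and the scalar histogram appended after it, the serialized
+    // document has roughly this shape (values illustrative):
+    //   {trueCount: 0.0, falseCount: 0.0, emptyArrayCount: 0.0,
+    //    typeCount: [{typeName: "NumberInt64", count: 42}, ...],
+    //    scalarHistogram: {...}}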
+ BSONArrayBuilder typeCountBuilder(histogramBuilder.subarrayStart("typeCount")); + const auto& typeCounts = getTypeCounts(); + for (const auto& [sbeType, count] : typeCounts) { + auto typeCount = BSON("typeName" << stats::serialize(sbeType) << "count" << count); + typeCountBuilder.append(typeCount); + } + typeCountBuilder.doneFast(); + + // Serialize scalar histogram. + histogramBuilder.append("scalarHistogram", getScalar().serialize()); + + // TODO SERVER-71513: serialize array histograms. + + histogramBuilder.doneFast(); + return histogramBuilder.obj(); +} + +BSONObj makeStatistics(double documents, const ArrayHistogram& arrayHistogram) { + BSONObjBuilder builder; + builder.appendNumber("documents", documents); + builder.appendElements(arrayHistogram.serialize()); + builder.doneFast(); + return builder.obj(); +} + +BSONObj makeStatsPath(StringData path, double documents, const ArrayHistogram& arrayHistogram) { + BSONObjBuilder builder; + builder.append("_id", path); + builder.append("statistics", makeStatistics(documents, arrayHistogram)); + builder.doneFast(); + return builder.obj(); +} + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/array_histogram.h b/src/mongo/db/query/stats/array_histogram.h new file mode 100644 index 00000000000..9a80feae423 --- /dev/null +++ b/src/mongo/db/query/stats/array_histogram.h @@ -0,0 +1,142 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/stats/scalar_histogram.h" +#include "mongo/db/query/stats/stats_gen.h" + +namespace mongo::stats { +using TypeCounts = std::map; + +class ArrayHistogram { +public: + // Constructs an empty scalar histogram. + ArrayHistogram(); + + // Constructor using StatsPath IDL as input. + ArrayHistogram(Statistics stats); + + // Constructor for scalar field histograms. + ArrayHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + double trueCount = 0.0, + double falseCount = 0.0); + + // Constructor for array field histograms. We have to initialize all array fields in this case. 
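+    // (Editorial note.) createArrayEstimator() in max_diff.cpp is the typical
+    // caller: in one pass over the data it builds the scalar histogram, the
+    // unique/min/max array histograms, both type-count maps, and the
+    // empty-array/boolean counters, then constructs an ArrayHistogram from them.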
+ ArrayHistogram(ScalarHistogram scalar, + TypeCounts typeCounts, + ScalarHistogram arrayUnique, + ScalarHistogram arrayMin, + ScalarHistogram arrayMax, + TypeCounts arrayTypeCounts, + double emptyArrayCount = 0.0, + double trueCount = 0.0, + double falseCount = 0.0); + + // ArrayHistogram is neither copy-constructible nor copy-assignable. + ArrayHistogram(const ArrayHistogram&) = delete; + ArrayHistogram& operator=(const ArrayHistogram&) = delete; + + // However, it is move-constructible and move-assignable. + ArrayHistogram(ArrayHistogram&&) = default; + ArrayHistogram& operator=(ArrayHistogram&&) = default; + ~ArrayHistogram() = default; + + std::string toString() const; + + // Serialize to BSON for storage in stats collection. + BSONObj serialize() const; + + const ScalarHistogram& getScalar() const; + const ScalarHistogram& getArrayUnique() const; + const ScalarHistogram& getArrayMin() const; + const ScalarHistogram& getArrayMax() const; + const TypeCounts& getTypeCounts() const; + const TypeCounts& getArrayTypeCounts() const; + + // Returns whether or not this histogram includes array data points. + bool isArray() const; + + // Get the total number of arrays in the histogram's path including empty arrays. + double getArrayCount() const; + + // Get the total number of empty arrays ( [] ) in the histogram's path. + double getEmptyArrayCount() const { + return _emptyArrayCount; + } + + // Get the count of true booleans. + double getTrueCount() const { + return _trueCount; + } + + // Get the count of false booleans. + double getFalseCount() const { + return _falseCount; + } + +private: + /* Fields for all paths. */ + + // Contains values which appeared originally as scalars on the path. + ScalarHistogram _scalar; + // The number of values of each type. + TypeCounts _typeCounts; + // The number of empty arrays - they are not accounted for in the histograms. + double _emptyArrayCount; + // The counts of true & false booleans. + double _trueCount; + double _falseCount; + + /* Fields for array paths (only initialized if arrays are present). */ + + // Contains unique scalar values originating from arrays. + boost::optional _arrayUnique; + // Contains minimum values originating from arrays **per class**. + boost::optional _arrayMin; + // Contains maximum values originating from arrays **per class**. + boost::optional _arrayMax; + // The number of values of each type inside all arrays. + boost::optional _arrayTypeCounts; +}; + +/** + * Returns an owned BSON Object representing data matching mongo::Statistics IDL. + */ +BSONObj makeStatistics(double documents, const ArrayHistogram& arrayHistogram); + +/** + * Returns an owned BSON Object representing data matching mongo::StatsPath IDL. + */ +BSONObj makeStatsPath(StringData path, double documents, const ArrayHistogram& arrayHistogram); +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics.h b/src/mongo/db/query/stats/collection_statistics.h new file mode 100644 index 00000000000..22e48663a61 --- /dev/null +++ b/src/mongo/db/query/stats/collection_statistics.h @@ -0,0 +1,60 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/namespace_string.h" +#include "mongo/db/query/stats/array_histogram.h" + +namespace mongo::stats { + +using Histograms = std::map>; + +class CollectionStatistics { +public: + /** + * Returns the cardinality of the given collection. + */ + virtual double getCardinality() const = 0; + + /** + * Returns the histogram for the given field path, or nullptr if none exists. + */ + virtual const ArrayHistogram* getHistogram(const std::string& path) const = 0; + + /** + * Adds a histogram along the given path. + */ + virtual void addHistogram(const std::string& path, + std::shared_ptr histogram) const = 0; + + virtual ~CollectionStatistics() = default; +}; + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics_impl.cpp b/src/mongo/db/query/stats/collection_statistics_impl.cpp new file mode 100644 index 00000000000..b03829b3f1d --- /dev/null +++ b/src/mongo/db/query/stats/collection_statistics_impl.cpp @@ -0,0 +1,72 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/query/stats/collection_statistics_impl.h" + +#include "mongo/db/client.h" +#include "mongo/db/query/stats/stats_catalog.h" + +namespace mongo::stats { + +CollectionStatisticsImpl::CollectionStatisticsImpl(double cardinality, const NamespaceString& nss) + : _cardinality{cardinality}, _histograms{}, _nss{nss} {}; + +double CollectionStatisticsImpl::getCardinality() const { + return _cardinality; +} + +void CollectionStatisticsImpl::addHistogram(const std::string& path, + std::shared_ptr histogram) const { + _histograms[path] = histogram; +} + +const ArrayHistogram* CollectionStatisticsImpl::getHistogram(const std::string& path) const { + if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { + return mapIt->second.get(); + } else { + uassert(8423368, "no current client", Client::getCurrent()); + auto opCtx = Client::getCurrent()->getOperationContext(); + uassert(8423367, "no operation context", opCtx); + StatsCatalog& statsCatalog = StatsCatalog::get(opCtx); + const auto swHistogram = statsCatalog.getHistogram(opCtx, _nss, path); + if (!swHistogram.isOK()) { + if (swHistogram != ErrorCodes::NamespaceNotFound) { + uasserted(swHistogram.getStatus().code(), + str::stream() << "Error getting histograms for path " << _nss << " : " + << path << swHistogram.getStatus().reason()); + } + return nullptr; + } + const auto histogram = std::move(swHistogram.getValue()); + addHistogram(path, histogram); + return histogram.get(); + } +} + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics_impl.h b/src/mongo/db/query/stats/collection_statistics_impl.h new file mode 100644 index 00000000000..19c9612382f --- /dev/null +++ b/src/mongo/db/query/stats/collection_statistics_impl.h @@ -0,0 +1,67 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/collection_statistics.h" + +namespace mongo::stats { + +using Histograms = std::map>; + +class CollectionStatisticsImpl : public CollectionStatistics { +public: + CollectionStatisticsImpl(double cardinality, const NamespaceString& nss); + + /** + * Returns the cardinality of the given collection. + */ + double getCardinality() const override; + + /** + * Returns the histogram for the given field path, or nullptr if none exists. + */ + const ArrayHistogram* getHistogram(const std::string& path) const override; + + /** + * Adds a histogram along the given path. + */ + void addHistogram(const std::string& path, + std::shared_ptr histogram) const override; + + ~CollectionStatisticsImpl() = default; + +private: + double _cardinality; + mutable Histograms _histograms; + const NamespaceString _nss; +}; + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics_mock.cpp b/src/mongo/db/query/stats/collection_statistics_mock.cpp new file mode 100644 index 00000000000..39b2f65e527 --- /dev/null +++ b/src/mongo/db/query/stats/collection_statistics_mock.cpp @@ -0,0 +1,53 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/query/stats/collection_statistics_mock.h" + +namespace mongo::stats { + +CollectionStatisticsMock::CollectionStatisticsMock(double cardinality) + : _cardinality{cardinality}, _histograms{} {}; + +double CollectionStatisticsMock::getCardinality() const { + return _cardinality; +} + +void CollectionStatisticsMock::addHistogram(const std::string& path, + std::shared_ptr histogram) const { + _histograms[path] = histogram; +} + +const ArrayHistogram* CollectionStatisticsMock::getHistogram(const std::string& path) const { + if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { + return mapIt->second.get(); + } + return nullptr; +} + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/collection_statistics_mock.h b/src/mongo/db/query/stats/collection_statistics_mock.h new file mode 100644 index 00000000000..04fee5ff69c --- /dev/null +++ b/src/mongo/db/query/stats/collection_statistics_mock.h @@ -0,0 +1,64 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/namespace_string.h" +#include "mongo/db/query/stats/collection_statistics.h" + +namespace mongo::stats { + +class CollectionStatisticsMock : public CollectionStatistics { +public: + CollectionStatisticsMock(double cardinality); + + /** + * Returns the cardinality of the given collection. + */ + double getCardinality() const override; + + /** + * Adds a histogram along the given path. + */ + void addHistogram(const std::string& path, + std::shared_ptr histogram) const override; + + /** + * Returns the histogram for the given field path, or nullptr if none exists. + */ + const ArrayHistogram* getHistogram(const std::string& path) const override; + + ~CollectionStatisticsMock() = default; + +private: + double _cardinality; + mutable Histograms _histograms; +}; + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/max_diff.cpp b/src/mongo/db/query/stats/max_diff.cpp new file mode 100644 index 00000000000..9203b3d8321 --- /dev/null +++ b/src/mongo/db/query/stats/max_diff.cpp @@ -0,0 +1,378 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/max_diff.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mongo/base/string_data.h" +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/bson/bsontypes.h" +#include "mongo/db/exec/sbe/values/bson.h" +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/stats/value_utils.h" +#include "mongo/util/assert_util.h" + +namespace mongo::stats { +namespace { +namespace value = sbe::value; + +std::string printDistribution(const DataDistribution& distr, size_t nElems) { + std::ostringstream os; + for (size_t i = 0; i < std::min(nElems, distr._freq.size()); ++i) { + os << "{val: " << distr._bounds[i].get() << ", " << distr._freq[i].toString() << "}\n"; + } + return os.str(); +} + +double valueSpread(value::TypeTags tag1, + value::Value val1, + value::TypeTags tag2, + value::Value val2) { + double doubleVal1 = valueToDouble(tag1, val1); + double doubleVal2 = valueToDouble(tag2, val2); + uassert(6660502, + "Data distribution values must be monotonically increasing.", + doubleVal2 >= doubleVal1); + return doubleVal2 - doubleVal1; +} + +// TODO: This doesn't seem right -- it looks like we're sorting on the frequency, +// not the difference between buckets +std::vector generateTopKBuckets(const DataDistribution& dataDistrib, size_t numBuckets) { + struct AreaComparator { + bool operator()(const ValFreq& a, const ValFreq& b) const { + return a._normArea > b._normArea; + } + }; + std::priority_queue, AreaComparator> pq; + + for (const auto& valFreq : dataDistrib._freq) { + if (pq.size() < numBuckets) { + pq.emplace(valFreq); + } else if (AreaComparator()(valFreq, pq.top())) { + pq.pop(); + pq.emplace(valFreq); + } + } + + std::vector result; + while (!pq.empty()) { + result.push_back(pq.top()); + pq.pop(); + } + + std::sort(result.begin(), result.end(), [](const ValFreq& a, const ValFreq& b) { + return a._idx < b._idx; + }); + + return result; +} + +/** + * Helper for getting the input for constructing an array histogram for an array estimator using the + * values in an array. For each value in `arrayElements`, update the min, max, and unique value + * vectors. 
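(Editorial example: for a sorted input [1, 5, "a", "b"], the per-type-class
+ * minima {1, "a"} feed the min vector, the maxima {5, "b"} feed the max vector,
+ * and every distinct value feeds the unique vector.)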
These will be used to generate the corresponding histograms for array values. + */ +void updateMinMaxUniqArrayVals(std::vector& arrayElements, + std::vector& arrayMinData, + std::vector& arrayMaxData, + std::vector& arrayUniqueData) { + + if (arrayElements.size() == 0) { + return; + } + + sortValueVector(arrayElements); + + // Emit values for arrayMin and arrayMax histograms. + { + boost::optional prev; + for (const auto& element : arrayElements) { + if (!prev) { + arrayMinData.push_back(element); + } else if (!sameTypeClass(prev->getTag(), element.getTag())) { + arrayMaxData.push_back(*prev); + arrayMinData.push_back(element); + } + prev = element; + } + if (prev) { + arrayMaxData.push_back(*prev); + } + } + + // Emit values for arrayUnique histogram. + { + boost::optional prev; + for (const auto& element : arrayElements) { + if (!prev || + compareValues( + prev->getTag(), prev->getValue(), element.getTag(), element.getValue()) < 0) { + arrayUniqueData.push_back(element); + prev = element; + } + } + } +} +} // namespace + +DataDistribution getDataDistribution(const std::vector& sortedInput) { + if (sortedInput.empty()) { + return {}; + } + + DataDistribution result; + value::TypeTags prevTag; + value::Value prevValue; + bool first = true; + + // Aggregate the values in a sorted dataset into a frequency distribution. + size_t idx = 0; + for (size_t i = 0; i < sortedInput.size(); i++) { + const auto v = sortedInput[i].get(); + const auto comparison = first ? 1 : compareValues(v.first, v.second, prevTag, prevValue); + first = false; + + if (comparison != 0) { + uassert(6660550, "Input is not sorted", comparison > 0); + prevTag = v.first; + prevValue = v.second; + + const auto [tagCopy, valCopy] = copyValue(v.first, v.second); + result._bounds.emplace_back(tagCopy, valCopy); + result._freq.emplace_back(idx, 1); + ++idx; + } else { + ++result._freq.back()._freq; + } + } + + // Calculate the area for all values in the data distribution. + // The current minimum and maximum areas of the values of a type class. + double maxArea = 0.0; + + for (size_t i = 0; i + 1 < result._freq.size(); ++i) { + const auto v1 = result._bounds[i]; + const auto v2 = result._bounds[i + 1]; + const bool newTypeClass = !sameTypeClass(v1.getTag(), v2.getTag()); + + if (newTypeClass) { + const auto res = result.typeClassBounds.emplace(i, maxArea); + uassert(6660551, "There can't be duplicate type class bounds.", res.second); + maxArea = 0.0; + } else if (i == 0) { + const double spread = + valueSpread(v1.getTag(), v1.getValue(), v2.getTag(), v2.getValue()); + maxArea = result._freq[i]._freq * spread; + } + + if (i == 0 || newTypeClass) { + // Make sure we insert bucket boundaries between different types, and also make sure + // first value is picked for a boundary. + result._freq[i]._area = std::numeric_limits::infinity(); + } else { + const double spread = + valueSpread(v1.getTag(), v1.getValue(), v2.getTag(), v2.getValue()); + result._freq[i]._area = result._freq[i]._freq * spread; + maxArea = std::max(maxArea, result._freq[i]._area); + } + } + + // Make sure last value is picked as a histogram bucket boundary. + result._freq.back()._area = std::numeric_limits::infinity(); + const auto res = result.typeClassBounds.emplace(result._freq.size(), maxArea); + uassert(6660503, "There can't be duplicate type class bounds.", res.second); + + // Compute normalized areas. If the spread is 0, the area may also be 0. 
This could happen, + // for instance, if there is only a single value of a given type, + size_t beginIdx = 0; + for (const auto [endIdx, area] : result.typeClassBounds) { + for (size_t i = beginIdx; i < endIdx; ++i) { + result._freq[i]._normArea = area > 0.0 ? (result._freq[i]._area / area) : 0.0; + } + beginIdx = endIdx; + } + + // std::cout << "Distribution sorted by value:\n" + // << printDistribution(result, result._freq.size()) << "\n" + // << std::flush; + + return result; +} + +ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistrib, size_t numBuckets) { + if (dataDistrib._freq.empty()) { + return {}; + } + + std::vector topKBuckets = generateTopKBuckets(dataDistrib, numBuckets); + uassert(6660504, + "Must have bucket boundary on first value", + topKBuckets[0]._idx == dataDistrib._freq[0]._idx); + uassert(6660505, + "Must have bucket boundary on last value", + topKBuckets.back()._idx == dataDistrib._freq.back()._idx); + + std::vector buckets; + value::Array bounds; + + // Create histogram buckets out of the top-K bucket values. + size_t startBucketIdx = 0; + double cumulativeFreq = 0.0; + double cumulativeNDV = 0.0; + for (size_t i = 0; i < std::min(dataDistrib._freq.size(), numBuckets); i++) { + const size_t bucketBoundIdx = topKBuckets[i]._idx; + const double freq = dataDistrib._freq.at(bucketBoundIdx)._freq; + + // Compute per-bucket statistics. + double rangeFreq = 0.0; + double ndv = 0.0; + while (startBucketIdx < bucketBoundIdx) { + rangeFreq += dataDistrib._freq[startBucketIdx++]._freq; + ++ndv; + } + cumulativeFreq += rangeFreq + freq; + cumulativeNDV += ndv + 1.0; + + // Add a histogram bucket. + const auto v = dataDistrib._bounds[startBucketIdx]; + const auto [copyTag, copyVal] = value::copyValue(v.getTag(), v.getValue()); + bounds.push_back(copyTag, copyVal); + buckets.emplace_back(freq, rangeFreq, cumulativeFreq, ndv, cumulativeNDV); + startBucketIdx++; + } + + return {std::move(bounds), std::move(buckets)}; +} + +ArrayHistogram createArrayEstimator(const std::vector& arrayData, size_t nBuckets) { + // Values that will be used as inputs to histogram generation code. + std::vector scalarData; + std::vector arrayMinData; + std::vector arrayMaxData; + std::vector arrayUniqueData; + + // Type counters. + TypeCounts typeCounts; + TypeCounts arrayTypeCounts; + + // Value counters. + double emptyArrayCount = 0; + double trueCount = 0; + double falseCount = 0; + + for (const auto& v : arrayData) { + const auto val = v.getValue(); + const auto tag = v.getTag(); + + // Increment type counters. + auto tagCount = typeCounts.insert({tag, 1}); + if (!tagCount.second) { + ++tagCount.first->second; + } + + if (tag == value::TypeTags::Array) { + // If we have an array, we can construct min, max, and unique histograms from its + // elements, provided that they are histogrammable. + std::vector arrayElements; + + value::Array* arr = value::getArrayView(val); + size_t arrSize = arr->size(); + if (arrSize == 0) { + ++emptyArrayCount; + continue; + } + + for (size_t i = 0; i < arrSize; i++) { + const auto [tag, val] = arr->getAt(i); + + // Increment array type tag counts. 
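+                // (Editorial note.) std::map::insert() returns an {iterator,
+                // inserted} pair; when the tag is already present, 'inserted' is
+                // false and the existing counter is incremented instead, matching
+                // the scalar type counters above.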
+ auto arrTagCount = arrayTypeCounts.insert({tag, 1}); + if (!arrTagCount.second) { + ++arrTagCount.first->second; + } + + if (!canEstimateTypeViaHistogram(tag)) { + // If the elements of this array are not histogrammable, then we can only update + // the array type counters + continue; + } + + const auto [tagCopy, valCopy] = value::copyValue(tag, val); + arrayElements.emplace_back(tagCopy, valCopy); + } + updateMinMaxUniqArrayVals(arrayElements, arrayMinData, arrayMaxData, arrayUniqueData); + + } else if (tag == value::TypeTags::Boolean) { + // If we have a boolean, we also have counters for true and false values we should + // increment here. + if (value::bitcastTo(val)) { + trueCount++; + } else { + falseCount++; + } + continue; + + } else if (!canEstimateTypeViaHistogram(tag)) { + // If we have a non-histogrammable type, we can only increment the type counters for it; + // we cannot build a scalar histogram on it. + continue; + + } else { + // Assume non-arrays are scalars. Emit values for the scalar histogram. + scalarData.push_back(v); + } + } + + // Lambda helper to construct histogram from an unsorted value vector. + const auto makeHistogram = [&nBuckets](std::vector& values) { + sortValueVector(values); + return genMaxDiffHistogram(getDataDistribution(values), nBuckets); + }; + + return {makeHistogram(scalarData), + std::move(typeCounts), + makeHistogram(arrayUniqueData), + makeHistogram(arrayMinData), + makeHistogram(arrayMaxData), + std::move(arrayTypeCounts), + emptyArrayCount, + trueCount, + falseCount}; +} + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/max_diff.h b/src/mongo/db/query/stats/max_diff.h new file mode 100644 index 00000000000..147cb35af8d --- /dev/null +++ b/src/mongo/db/query/stats/max_diff.h @@ -0,0 +1,82 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include +#include + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/scalar_histogram.h" +#include "mongo/db/query/stats/value_utils.h" + +namespace mongo::stats { + +struct ValFreq { + ValFreq(size_t idx, size_t freq) : _idx(idx), _freq(freq), _area(-1.0), _normArea(-1) {} + + std::string toString() const { + std::ostringstream os; + os << "idx: " << _idx << ", freq: " << _freq << ", area: " << _area + << ", normArea: " << _normArea; + return os.str(); + } + + size_t _idx; // Original index according to value order. + size_t _freq; // Frequency of the value. + double _area; // Derived as: spread * frequency + double _normArea; // Area normalized to the maximum in a type class. +}; + +struct DataDistribution { + std::vector _bounds; + std::vector _freq; + // The min/max areas of each type class. The key is the index of the last boundary of the class. + std::map typeClassBounds; +}; + +/** + Given a set of values sorted in BSON order, generate a data distribution consisting of + counts for each value with the values in sorted order +*/ +DataDistribution getDataDistribution(const std::vector& sortedInput); + +/** + Given a data distribution, generate a scalar histogram with the supplied number of buckets +*/ +ScalarHistogram genMaxDiffHistogram(const DataDistribution& dataDistrib, size_t numBuckets); + +/** + Given a vector containing SBEValues, generate a set of statistics to summarize the supplied + data. Histograms will use the supplied number of buckets. +*/ +ArrayHistogram createArrayEstimator(const std::vector& arrayData, size_t nBuckets); + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/maxdiff_test_utils.cpp b/src/mongo/db/query/stats/maxdiff_test_utils.cpp new file mode 100644 index 00000000000..cb0e66dc285 --- /dev/null +++ b/src/mongo/db/query/stats/maxdiff_test_utils.cpp @@ -0,0 +1,120 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/query/stats/maxdiff_test_utils.h" + +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/max_diff.h" + +namespace mongo::stats { + +static std::vector convertToJSON(const std::vector& input) { + std::vector result; + + for (size_t i = 0; i < input.size(); i++) { + const auto [objTag, objVal] = sbe::value::makeNewObject(); + sbe::value::ValueGuard vg(objTag, objVal); + + const auto [tag, val] = input[i].get(); + // Copy the value because objVal owns its value, and the ValueGuard releases not only + // objVal, but also its Value (in the case below - copyVal). + const auto [copyTag, copyVal] = sbe::value::copyValue(tag, val); + sbe::value::getObjectView(objVal)->push_back("a", copyTag, copyVal); + + std::ostringstream os; + os << std::make_pair(objTag, objVal); + result.push_back(os.str()); + } + + return result; +} + +size_t getActualCard(OperationContext* opCtx, + const std::vector& input, + const std::string& query) { + return mongo::optimizer::runPipeline(opCtx, query, convertToJSON(input)).size(); +} + +std::string makeMatchExpr(const SBEValue& val, optimizer::ce::EstimationType cmpOp) { + std::stringstream matchExpr; + std::string cmpOpName = optimizer::ce::estimationTypeName.at(cmpOp); + matchExpr << "[{$match: {a: {$" << cmpOpName << ": " << val.get() << "}}}]"; + return matchExpr.str(); +} + +ScalarHistogram makeHistogram(std::vector& randData, size_t nBuckets) { + sortValueVector(randData); + const DataDistribution& dataDistrib = getDataDistribution(randData); + return genMaxDiffHistogram(dataDistrib, nBuckets); +} + +std::string printValueArray(const std::vector& values) { + std::stringstream strStream; + for (size_t i = 0; i < values.size(); ++i) { + strStream << " " << values[i].get(); + } + return strStream.str(); +} + +std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header) { + std::ostringstream os; + os << header << "\n"; + if (!estimator.getScalar().empty()) { + os << "Scalar histogram:\n" << estimator.getScalar().plot(); + } + if (!estimator.getArrayUnique().empty()) { + os << "Array unique histogram:\n" << estimator.getArrayUnique().plot(); + } + if (!estimator.getArrayMin().empty()) { + os << "Array min histogram:\n" << estimator.getArrayMin().plot(); + } + if (!estimator.getArrayMax().empty()) { + os << "Array max histogram:\n" << estimator.getArrayMax().plot(); + } + if (!estimator.getTypeCounts().empty()) { + os << "Per scalar data type value counts: "; + for (auto tagCount : estimator.getTypeCounts()) { + os << tagCount.first << "=" << tagCount.second << " "; + } + } + if (!estimator.getArrayTypeCounts().empty()) { + os << "\nPer array data type value counts: "; + for (auto tagCount : estimator.getArrayTypeCounts()) { + os << tagCount.first << "=" << tagCount.second << " "; + } + } + if (estimator.isArray()) { + os << "\nEmpty array count: " << estimator.getEmptyArrayCount(); + } + os << "\n"; + + return os.str(); +} + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/maxdiff_test_utils.h b/src/mongo/db/query/stats/maxdiff_test_utils.h new file mode 100644 index 00000000000..a34f7dd41ee --- /dev/null +++ b/src/mongo/db/query/stats/maxdiff_test_utils.h @@ -0,0 +1,74 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include +#include + +#include "mongo/db/exec/sbe/abt/sbe_abt_test_util.h" +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/ce/histogram_predicate_estimation.h" +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/db/query/stats/scalar_histogram.h" +#include "mongo/db/query/stats/value_utils.h" + +namespace mongo::stats { + +/** + Given a list of SBE values and a query, create a collection containing the data, + and count the results from the supplied query. + */ +size_t getActualCard(OperationContext* opCtx, + const std::vector& input, + const std::string& query); + +/** + Given a value and a comparison operator, generate a match expression reflecting + x cmpOp val. +*/ +std::string makeMatchExpr(const SBEValue& val, optimizer::ce::EstimationType cmpOp); + +/** + Given a vector of values, create a histogram reflection the distribution of the vector + with the supplied number of buckets. +*/ +ScalarHistogram makeHistogram(std::vector& randData, size_t nBuckets); + +/** + Serialize a vector of values. +*/ +std::string printValueArray(const std::vector& values); + +/** + Plot a set of statistics as stored in ArrayHistogram. +*/ +std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::string& header); + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/rand_utils.cpp b/src/mongo/db/query/stats/rand_utils.cpp new file mode 100644 index 00000000000..ff66272a681 --- /dev/null +++ b/src/mongo/db/query/stats/rand_utils.cpp @@ -0,0 +1,392 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/rand_utils.h" + +#include +#include +#include +#include + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/util/assert_util.h" + +namespace mongo::stats { +namespace value = sbe::value; + +const std::string DatasetDescriptor::_alphabet = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +DatasetDescriptor::DatasetDescriptor(const DataTypeDistribution& dataTypeDistribution, + size_t intNDV, + int minInt, + int maxInt, + size_t strNDV, + size_t minStrLen, + size_t maxStrLen, + std::shared_ptr nestedDataDescriptor, + double reuseScalarsRatio, + size_t arrNDV, + size_t minArrLen, + size_t maxArrLen) + : _gen{42}, + _reuseScalarsRatio(reuseScalarsRatio), + _intNDV(std::min(intNDV, static_cast(std::abs(maxInt - minInt)))), + _uniformIntDist{minInt, maxInt}, + _arrNDV(arrNDV), + _uniformArrSizeDist{minArrLen, maxArrLen}, + _nestedDataDescriptor(nestedDataDescriptor) { + uassert(6660520, "Maximum integer number must be >= the minimum one.", (maxInt >= minInt)); + uassert(6660521, "Maximum string size must be >= the minimum one.", (maxStrLen >= minStrLen)); + uassert(6660522, + "Array specs must be 0 if there is no array data descriptor.", + _nestedDataDescriptor || (arrNDV == 0 && minArrLen == 0 && maxArrLen == 0)); + uassert(6660523, + "Nested arrays requires sensible array lengths", + !_nestedDataDescriptor || maxArrLen >= minArrLen); + uassert(6660524, "Recursive descriptors are not allowed.", nestedDataDescriptor.get() != this); + uassert(6660525, + "reuseScalarsRatio is a probability, must be in [0, 1].", + reuseScalarsRatio >= 0 && reuseScalarsRatio <= 1.0); + + // Compute absolute ranges given relative weights of each value type. + double sumWeights = 0; + for (const auto& weightedType : dataTypeDistribution) { + sumWeights += weightedType.second; + } + double sumRelativeWeights = 0; + auto lastKey = dataTypeDistribution.crbegin()->first; + for (auto it = dataTypeDistribution.cbegin(); it != dataTypeDistribution.cend(); ++it) { + const auto weightedType = *it; + if (weightedType.first != lastKey) { + sumRelativeWeights += weightedType.second / sumWeights; + uassert(6660526, "The sum of weights can't be >= 1", sumRelativeWeights < 1); + } else { + // Due to rounding errors the last relative weight may not be exactly 1.0. Set it + // to 1.0. + sumRelativeWeights = 1.0; + } + _dataTypeDistribution.emplace(sumRelativeWeights, weightedType.first); + } + + // Generate a set of random integers. + mongo::stdx::unordered_set tmpIntSet; + tmpIntSet.reserve(_intNDV); + if (_intNDV == intNDV) { + for (int i = minInt; i <= maxInt; ++i) { + tmpIntSet.insert(i); // This is a dense set of all ints the range. 
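+            // (Editorial note.) This branch runs when the requested NDV was not
+            // reduced by the [minInt, maxInt] clamp above. Otherwise the
+            // else-branch draws random integers until _intNDV distinct values
+            // are collected, giving up after 10 * _intNDV attempts; the uassert
+            // that follows requires the set to reach more than 99% of _intNDV.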
+ } + } else { + size_t randCount = 0; + while (tmpIntSet.size() < _intNDV && randCount < 10 * _intNDV) { + int randInt = _uniformIntDist(_gen); + ++randCount; + tmpIntSet.insert(randInt); + } + } + uassert( + 6660527, "Too few integers generated.", (double)tmpIntSet.size() / (double)_intNDV > 0.99); + _intSet.reserve(tmpIntSet.size()); + _intSet.insert(_intSet.end(), tmpIntSet.begin(), tmpIntSet.end()); + _uniformIntIdxDist.param( + std::uniform_int_distribution::param_type(0, _intSet.size() - 1)); + + // Generate a set of random strings with random sizes so that each string can be chosen + // multiple times in the test data set. + _stringSet.reserve(strNDV); + std::uniform_int_distribution uniformStrSizeDistr{minStrLen, maxStrLen}; + for (size_t i = 0; i < strNDV; ++i) { + size_t len = uniformStrSizeDistr(_gen); + const auto randStr = genRandomString(len); + _stringSet.push_back(randStr); + } + _uniformStrIdxDist.param( + std::uniform_int_distribution::param_type(0, _stringSet.size() - 1)); + + // Generate a set of random arrays that are chosen from when generating array data. + fillRandomArraySet(); +} + +std::vector DatasetDescriptor::genRandomDataset(size_t nElems, + DatasetDescriptor* parentDesc) { + std::vector randValues; + randValues.reserve(nElems); + DatasetDescriptor* curDesc = this; + + if (parentDesc) { + double reuseProb = _uniformRandProbability(_gen); + if (reuseProb < parentDesc->_reuseScalarsRatio) { + curDesc = parentDesc; + } + } + + for (size_t i = 0; i < nElems; ++i) { + // Get the data type of the current value to be generated. + value::TypeTags genTag = this->getRandDataType(); + // Generate a random value of the corresponding type. + switch (genTag) { + case value::TypeTags::NumberInt64: { + size_t idx = curDesc->_uniformIntIdxDist(_gen); + auto randInt = curDesc->_intSet.at(idx); + const auto [tag, val] = makeInt64Value(randInt); + randValues.emplace_back(tag, val); + break; + } + case value::TypeTags::StringBig: + case value::TypeTags::StringSmall: { + size_t idx = curDesc->_uniformStrIdxDist(_gen); + const auto randStr = curDesc->_stringSet.at(idx); + const auto [tag, val] = value::makeNewString(randStr); + const auto [copyTag, copyVal] = value::copyValue(tag, val); + randValues.emplace_back(copyTag, copyVal); + break; + } + case value::TypeTags::Array: { + if (_nestedDataDescriptor) { + const auto randArray = genRandomArray(); + auto [arrayTag, arrayVal] = value::makeNewArray(); + value::Array* arr = value::getArrayView(arrayVal); + for (const auto& elem : randArray) { + const auto [copyTag, copyVal] = + value::copyValue(elem.getTag(), elem.getValue()); + arr->push_back(copyTag, copyVal); + } + randValues.emplace_back(arrayTag, arrayVal); + } + break; + } + default: + uasserted(6660528, "Unsupported data type"); + } + } + + return randValues; +} + +std::string DatasetDescriptor::genRandomString(size_t len) { + std::string randStr; + randStr.reserve(len); + for (size_t i = 0; i < len; ++i) { + size_t idx = _uniformCharIdxDist(_gen); + const char ch = _alphabet[idx]; + randStr += ch; + } + + return randStr; +} + +std::vector DatasetDescriptor::genRandomArray() { + uassert(6660529, + "There must be a nested data descriptor for random array generation.", + _nestedDataDescriptor); + if (_arrNDV == 0) { + size_t randArraySize = _uniformArrSizeDist(_gen); + return _nestedDataDescriptor->genRandomDataset(randArraySize, this); + } else { + size_t idx = _uniformArrIdxDist(_gen); + return _arraySet.at(idx); + } +} + +void DatasetDescriptor::fillRandomArraySet() { + for 
(size_t i = 0; i < _arrNDV; ++i) { + size_t randArraySize = _uniformArrSizeDist(_gen); + const auto randArray = _nestedDataDescriptor->genRandomDataset(randArraySize, this); + _arraySet.push_back(randArray); + } + + if (_arrNDV > 0) { + _uniformArrIdxDist.param( + std::uniform_int_distribution::param_type(0, _arraySet.size() - 1)); + } +} + +/** + Generate a random string. It is possible (even expected) that the same parameters + will generate different strings on successive calls +*/ +std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed) { + std::string randStr; + randStr.reserve(len); + const constexpr char* kAlphabet = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::uniform_int_distribution uniformDist{0, std::strlen(kAlphabet) - 1}; + + for (size_t i = 0; i < len; ++i) { + size_t idx = uniformDist(gen); + const char ch = kAlphabet[idx]; + randStr += ch; + } + + return randStr; +} + +/** + Generate a string. This string will be deterministic in that the same + parameters will always generate the same string, even on different platforms. +*/ +std::string genString(size_t len, size_t seed) { + std::string str; + str.reserve(len); + + const constexpr char* kAlphabet = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + const int kAlphabetLength = strlen(kAlphabet); + + unsigned long long rand = seed; + for (size_t i = 0; i < len; ++i) { + // Library implementations of rand vary by compiler, naturally, Since we still + // want the appearance of randomness, but consistency across compilers, we use a linear + // congruential generator to choose characters for the string. The parameters chosen + // are from Numerical Recipes. We use the upper 32 bits when calculating the character + // index, as the lower 32 are essentially nonrandom -- a weakness of LCGs in general. + rand = 3935559000370003845ULL * rand + 269134368944950781ULL; + + int idx = (rand >> 32) % kAlphabetLength; + str += kAlphabet[idx]; + } + + return str; +} + +/** + Generate an array of values with the required ratio of int to string. This array will be + deterministic in that the same parameters will always generate the same array, even on + different platforms. +*/ +std::vector genFixedValueArray(size_t nElems, double intRatio, double strRatio) { + + std::vector values; + + const int intNDV = static_cast(nElems) / 4; + for (size_t i = 0; i < std::round(nElems * intRatio); ++i) { + const auto [tag, val] = makeInt64Value((i % intNDV) + 1); + values.emplace_back(tag, val); + } + + if (strRatio == 0.0) { + return values; + } + + // Generate a set of strings so that each string can be chosen multiple times in the test + // data set. 
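+    // (Editorial note.) genString(len, seed) is deterministic across platforms,
+    // so a call such as genFixedValueArray(100, 0.5, 0.5) always yields the same
+    // dataset: 50 integers cycling through 1..25 (intNDV = 100 / 4 = 25) and 50
+    // strings drawn from the 20 fixed 8-character strings generated below.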
+    // Generate a set of strings so that each string can be chosen multiple times in the test
+    // data set.
+    const size_t strNDV = nElems / 5;
+    std::vector<std::string> stringSet;
+    stringSet.reserve(strNDV);
+    for (size_t i = 0; i < strNDV; ++i) {
+        const auto randStr = genString(8, i);
+        stringSet.push_back(randStr);
+    }
+
+    for (size_t i = 0; i < std::round(nElems * strRatio); ++i) {
+        size_t idx = i % stringSet.size();
+        const auto randStr = stringSet[idx];
+        const auto [tag, val] = value::makeNewString(randStr);
+        values.emplace_back(tag, val);
+    }
+
+    return values;
+}
+
+std::vector<SBEValue> genRandomValueArray(size_t nElems,
+                                          double intRatio,
+                                          double strRatio,
+                                          size_t seed) {
+    std::vector<SBEValue> randValues;
+    const int intNDV = static_cast<int>(nElems) / 4;
+    const size_t strNDV = nElems / 5;
+    std::vector<std::string> stringSet;
+    stringSet.reserve(strNDV);
+
+    std::mt19937_64 gen{seed};
+    std::uniform_int_distribution<int> uniformDist{1, intNDV};
+
+    for (size_t i = 0; i < std::round(nElems * intRatio); ++i) {
+        const auto [tag, val] = makeInt64Value(uniformDist(gen));
+        randValues.emplace_back(tag, val);
+    }
+
+    // Generate a set of strings so that each string can be chosen multiple times in the test
+    // data set.
+    for (size_t i = 0; i < strNDV; ++i) {
+        const auto randStr = genRandomString(8, gen, seed);
+        stringSet.push_back(randStr);
+    }
+
+    std::uniform_int_distribution<size_t> idxDistr{0, stringSet.size() - 1};
+    for (size_t i = 0; i < std::round(nElems * strRatio); ++i) {
+        size_t idx = idxDistr(gen);
+        const auto randStr = stringSet[idx];
+        const auto [tag, val] = value::makeNewString(randStr);
+        randValues.emplace_back(tag, val);
+    }
+
+    return randValues;
+}
+
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input, size_t emptyArrayCount) {
+    std::vector<SBEValue> result;
+    auto [arrayTag, arrayVal] = value::makeNewArray();
+
+    for (size_t i = 0; i < input.size(); i++) {
+        const auto v = input[i].get();
+        const auto [tagCopy, valCopy] = value::copyValue(v.first, v.second);
+
+        if (i % 10 < 5) {
+            // 50% of values remain scalar.
+            result.emplace_back(tagCopy, valCopy);
+        } else {
+            // 50% of the values are grouped into arrays of size 10.
+            value::Array* arr = value::getArrayView(arrayVal);
+            arr->push_back(tagCopy, valCopy);
+            if (arr->size() == 10) {
+                result.emplace_back(arrayTag, arrayVal);
+                std::tie(arrayTag, arrayVal) = value::makeNewArray();
+            }
+        }
+    }
+
+    for (size_t i = 0; i < emptyArrayCount; ++i) {
+        auto [emptyArrayTag, emptyArrayVal] = value::makeNewArray();
+        result.emplace_back(emptyArrayTag, emptyArrayVal);
+    }
+
+    // It's possible that the array still contains something. If it's empty,
+    // we can safely release it. If not, append it to the result.
+    value::Array* arr = value::getArrayView(arrayVal);
+    if (arr->size() > 0) {
+        result.emplace_back(arrayTag, arrayVal);
+    } else {
+        value::releaseValue(arrayTag, arrayVal);
+    }
+
+    return result;
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/rand_utils.h b/src/mongo/db/query/stats/rand_utils.h
new file mode 100644
index 00000000000..89e4741fd2a
--- /dev/null
+++ b/src/mongo/db/query/stats/rand_utils.h
@@ -0,0 +1,188 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include "mongo/db/query/stats/value_utils.h"
+
+namespace mongo::stats {
+// A simple histogram describing the distribution of values of each data type.
+using DataTypeDistribution = std::map<double, sbe::value::TypeTags>;
+
+/**
+    Describes the distribution of a dataset according to type and weight. The other constructor
+    parameters describe the various data types that can be emitted and correspond to the
+    similarly named fields.
+ */
+class DatasetDescriptor {
+public:
+    DatasetDescriptor(const DataTypeDistribution& dataTypeDistribution,
+                      size_t intNDV,
+                      int minInt,
+                      int maxInt,
+                      size_t strNDV,
+                      size_t minStrLen,
+                      size_t maxStrLen,
+                      std::shared_ptr<DatasetDescriptor> nestedDataDescriptor = nullptr,
+                      double reuseScalarsRatio = 0,
+                      size_t arrNDV = 0,
+                      size_t minArrLen = 0,
+                      size_t maxArrLen = 0);
+
+    // Generate a random dataset of 'nElems' according to the data distribution characteristics in
+    // this object.
+    std::vector<SBEValue> genRandomDataset(size_t nElems, DatasetDescriptor* parentDesc = nullptr);
+
+private:
+    // Select a random value data type.
+    sbe::value::TypeTags getRandDataType() {
+        double key = _uniformRandProbability(_gen);
+        return (*_dataTypeDistribution.upper_bound(key)).second;
+    }
+
+    // Generate a random string with size 'len'.
+    std::string genRandomString(size_t len);
+
+    // Generate a random array with length determined uniformly between minArrLen and maxArrLen.
+    std::vector<SBEValue> genRandomArray();
+
+    // Generate a set of random arrays that are chosen from when generating array data.
+    void fillRandomArraySet();
+
+private:
+    using InternalDataTypeDistribution = std::map<double, sbe::value::TypeTags>;
+    /*
+     * General distribution characteristics.
+     */
+
+    // Pseudo-random generator.
+    std::mt19937_64 _gen;
+    // Random probabilities. Used to:
+    // - Select Value data types as random indexes in '_dataTypeDistribution'.
+    // - Select the source of values - either existing scalars or new.
+    std::uniform_real_distribution<double> _uniformRandProbability{0.0, 1.0};
+    // Distribution of different SBE data types. Each type receives its specified share of the
+    // generated values.
+    InternalDataTypeDistribution _dataTypeDistribution;
+    double _reuseScalarsRatio;
+
+    /*
+     * Integer data parameters.
+     */
+
+    // Number of distinct integer values.
+    const size_t _intNDV;
+    // A set of integers to choose from while generating random integers.
+    std::vector<int> _intSet;
+    // Generator of random integers with uniform distribution.
+    std::uniform_int_distribution<int> _uniformIntDist;
+    // Generator of random indexes into the set of integers '_intSet'.
+    std::uniform_int_distribution<size_t> _uniformIntIdxDist;
+
+    /*
+     * String data parameters.
+     */
+
+    // All strings draw characters from this alphabet.
+    static const std::string _alphabet;
+    // A set of random strings to choose from. In theory there can be duplicates, but this is very
+    // unlikely. We don't care much if there are a few duplicates anyway.
+    std::vector<std::string> _stringSet;
+    // Generator of random indexes into the set of characters '_alphabet'.
+    std::uniform_int_distribution<size_t> _uniformCharIdxDist{0, _alphabet.size() - 1};
+    // Generator of random indexes into the set of strings '_stringSet'.
+    std::uniform_int_distribution<size_t> _uniformStrIdxDist;
+
+    /*
+     * Array data parameters.
+     */
+
+    // Number of distinct arrays.
+    // TODO: currently not used. The idea is to use it in the same way as arrays - pre-generate
+    // '_arrNDV' arrays, then select randomly from this initial set.
+    size_t _arrNDV;
+    // Set of arrays to pick from when generating random data.
+    std::vector<std::vector<SBEValue>> _arraySet;
+    // Generator of random array sizes.
+    std::uniform_int_distribution<size_t> _uniformArrSizeDist;
+    // Descriptor of the dataset within each array.
+    std::shared_ptr<DatasetDescriptor> _nestedDataDescriptor;
+    // Generator of random indexes into the set of arrays '_arraySet'.
+    std::uniform_int_distribution<size_t> _uniformArrIdxDist;
+};
+
+/**
+    Generate a pseudorandom string of length 'len'.
+    * The alphabet is fixed as [0-9][a-z][A-Z]
+    * Characters are chosen uniformly from the alphabet
+    * Randomness is implemented such that it is independent of the platform,
+      i.e. given the same length and seed on any platform, we will produce the
+      same string.
+*/
+std::string genString(size_t len, size_t seed);
+
+/**
+    Generate a set of elements consisting of strings and ints in the
+    requested ratio. The generated array will contain the same values given the same
+    inputs on all platforms.
+ */
+std::vector<SBEValue> genFixedValueArray(size_t nElems, double intRatio, double strRatio);
+
+/**
+    Generate a random string of length 'len'.
+    * The alphabet is fixed as [0-9][a-z][A-Z].
+    * Characters are chosen uniformly from the alphabet.
+    * Generated strings are likely to differ by platform, so derived values depending on them
+      are also likely to change.
+ */
+std::string genRandomString(size_t len, std::mt19937_64& gen, size_t seed);
+
+
+/**
+    Generate a uniformly random set of elements consisting of strings and ints in the
+    requested ratio. The resulting array is very likely to differ between platforms, even
+    with the same seed. Thus, derived values are also likely to change.
+
+    Prefer genFixedValueArray when comparing derived values against constants.
+ */
+std::vector<SBEValue> genRandomValueArray(size_t nElems,
+                                          double intRatio,
+                                          double strRatio,
+                                          size_t seed);
+
+/**
+    Generate a set of values consisting of half scalars, and half arrays of length 10.
+
+    Values contained in the result will be drawn from the input vector.
+ */
+std::vector<SBEValue> nestArrays(const std::vector<SBEValue>& input, size_t emptyArrayCount);
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/rand_utils_new.cpp b/src/mongo/db/query/stats/rand_utils_new.cpp
new file mode 100644
index 00000000000..a8e8fab3bb8
--- /dev/null
+++ b/src/mongo/db/query/stats/rand_utils_new.cpp
@@ -0,0 +1,250 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/rand_utils_new.h" + +#include +#include +#include +#include +#include + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/util/assert_util.h" + +namespace mongo::stats { +namespace value = sbe::value; + +const std::string StrDistribution::_alphabet = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +void DataTypeDistrNew::generate(std::vector& randValues, std::mt19937_64& gen) { + if (_nullsRatio > 0 && _nullSelector(gen) < _nullsRatio) { + auto [tag, val] = makeNullValue(); + randValues.emplace_back(tag, val); + } else { + size_t idx = (*_idxDist)(gen); + const auto val = _valSet.at(idx); + auto [copyTag, copyVal] = copyValue(val.getTag(), val.getValue()); + randValues.emplace_back(copyTag, copyVal); + } +} + +void DataTypeDistrNew::generate(value::Array* randValueArray, std::mt19937_64& gen) { + if (_nullsRatio > 0 && _nullSelector(gen) < _nullsRatio) { + auto [tag, val] = makeNullValue(); + randValueArray->push_back(tag, val); + } else { + size_t idx = (*_idxDist)(gen); + const auto val = _valSet.at(idx); + auto [copyTag, copyVal] = copyValue(val.getTag(), val.getValue()); + randValueArray->push_back(copyTag, copyVal); + } +} + +IntDistribution::IntDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + int minInt, + int maxInt, + double nullsRatio) + : DataTypeDistrNew(distrDescriptor, + value::TypeTags::NumberInt64, + weight, + std::min(ndv, static_cast(std::abs(maxInt - minInt))), + nullsRatio), + _minInt(minInt), + _maxInt(maxInt) { + uassert(6660507, "Maximum integer number must be >= the minimum one.", (maxInt >= minInt)); +} + +void IntDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { + std::set tmpIntSet; + std::uniform_int_distribution uniformIntDist{_minInt, _maxInt}; + + if (_ndv == static_cast(std::abs(_maxInt - _minInt))) { + // This is a dense set of all ints in the range. 
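+        // For illustration: the constructor above clamps _ndv to the width of the range, so
+        // with minInt = 0, maxInt = 100 and a requested ndv of 500, _ndv becomes 100, this
+        // branch is taken, and every integer in [minInt, maxInt] is enumerated below instead
+        // of being sampled: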
+ for (int i = _minInt; i <= _maxInt; ++i) { + tmpIntSet.insert(i); + } + } else { + size_t randCount = 0; + while (tmpIntSet.size() < _ndv && randCount < 10 * _ndv) { + int randInt = uniformIntDist(gen); + ++randCount; + tmpIntSet.insert(randInt); + } + } + uassert(6660508, "Too few integers generated.", (double)tmpIntSet.size() / (double)_ndv > 0.99); + _valSet.reserve(tmpIntSet.size()); + for (const auto randInt : tmpIntSet) { + const auto [tag, val] = makeInt64Value(randInt); + _valSet.emplace_back(tag, val); + } + + _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); +} + +StrDistribution::StrDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + size_t minStrLen, + size_t maxStrLen, + double nullsRatio) + : DataTypeDistrNew(distrDescriptor, value::TypeTags::StringBig, weight, ndv, nullsRatio), + _minStrLen(minStrLen), + _maxStrLen(maxStrLen) { + uassert(6660509, "Maximum string size must be >= the minimum one.", (maxStrLen >= minStrLen)); +} + +void StrDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { + // Generate a set of random strings with random sizes between _minStrLen and _maxStrLen. + _valSet.reserve(_ndv); + std::uniform_int_distribution uniformStrSizeDistr{_minStrLen, _maxStrLen}; + for (size_t i = 0; i < _ndv; ++i) { + size_t len = uniformStrSizeDistr(gen); + const auto randStr = genRandomString(len, gen); + const auto [tag, val] = value::makeNewString(randStr); + _valSet.emplace_back(tag, val); + } + + _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); +} + +std::string StrDistribution::genRandomString(size_t len, std::mt19937_64& gen) { + std::string randStr; + randStr.reserve(len); + for (size_t i = 0; i < len; ++i) { + size_t idx = _uniformCharIdxDist(gen); + const char ch = _alphabet[idx]; + randStr += ch; + } + + return randStr; +} + +ArrDistribution::ArrDistribution(MixedDistributionDescriptor distrDescriptor, + double weight, + size_t ndv, + size_t minArrLen, + size_t maxArrLen, + std::unique_ptr arrayDataDescriptor, + double reuseScalarsRatio, + double nullsRatio) + : DataTypeDistrNew(distrDescriptor, value::TypeTags::Array, weight, ndv, nullsRatio), + _uniformArrSizeDist{minArrLen, maxArrLen}, + _arrayDataDescriptor(std::move(arrayDataDescriptor)), + _reuseScalarsRatio(reuseScalarsRatio) { + uassert(6660510, + "Array specs must be 0 if there is no array data descriptor.", + _arrayDataDescriptor || (ndv == 0 && minArrLen == 0 && maxArrLen == 0)); + uassert(6660511, + "Nested arrays requires sensible array lengths.", + !_arrayDataDescriptor || maxArrLen >= minArrLen); + uassert(6660512, + "reuseScalarsRatio must be in [0, 1].", + reuseScalarsRatio >= 0 && reuseScalarsRatio <= 1.0); +} + +void ArrDistribution::init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) { + uassert(6660513, "There must always be a parent data descriptor.", parentDesc); + + // Extract the per-type probabilities from the parent descriptor, but set the array probability + // to 0 to avoid self-recursion. + std::vector parentProbabilities; + for (const auto& dtd : parentDesc->_dataTypeDistributions) { + double prob = (dtd->tag() == value::TypeTags::Array) ? 0 : dtd->weight(); + parentProbabilities.push_back(prob); + } + std::discrete_distribution parentDataTypeSelector; + parentDataTypeSelector.param(std::discrete_distribution::param_type( + parentProbabilities.begin(), parentProbabilities.end())); + + // Generate _ndv distinct arrays, and store them in _valSet. 
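+    // For illustration: with reuseScalarsRatio = 0.5, roughly half of the array elements are
+    // drawn from the parent's scalar distributions via 'parentDataTypeSelector' (whose array
+    // weight was zeroed above, so arrays never recurse into themselves through this path),
+    // and the other half from '_arrayDataDescriptor'. The loop below then materializes the
+    // _ndv candidate arrays: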
+ for (size_t i = 0; i < _ndv; ++i) { + auto [arrayTag, arrayVal] = value::makeNewArray(); + value::Array* arr = value::getArrayView(arrayVal); + size_t randArraySize = _uniformArrSizeDist(gen); + arr->reserve(randArraySize); + // Generate the data for one random array. + for (size_t j = 0; j < randArraySize; ++j) { + DataTypeDistrNew* dtd = nullptr; + size_t idx; + double reuseParentProb = _uniformRandProbability(gen); + if (reuseParentProb < _reuseScalarsRatio) { + // Pick a random data type descriptor from the parent. + idx = parentDataTypeSelector(gen); + dtd = parentDesc->_dataTypeDistributions.at(idx).get(); + } else { + idx = _arrayDataDescriptor->_dataTypeSelector(gen); + dtd = _arrayDataDescriptor->_dataTypeDistributions.at(idx).get(); + } + dtd->generate(arr, gen); + } + _valSet.emplace_back(arrayTag, arrayVal); + } + + _idxDist = MixedDistribution::make(_mixedDistrDescriptor, 0, _valSet.size() - 1); +} + +DatasetDescriptorNew::DatasetDescriptorNew(TypeDistrVector dataTypeDistributions, + std::mt19937_64& gen) + : _dataTypeDistributions(std::move(dataTypeDistributions)), _gen{gen} { + + // The probability of each type to be chosen. Extracted into a vector in order to setup a + // discrete_distribution. + std::vector probabilities; + probabilities.reserve(_dataTypeDistributions.size()); + for (auto& dtd : _dataTypeDistributions) { + dtd->init(this, gen); + probabilities.push_back(dtd->weight()); + } + _dataTypeSelector.param( + std::discrete_distribution::param_type(probabilities.begin(), probabilities.end())); +} + +DataTypeDistrNew* DatasetDescriptorNew::getRandDataTypeDist() { + size_t idx = _dataTypeSelector(_gen); + return _dataTypeDistributions[idx].get(); +} + +std::vector DatasetDescriptorNew::genRandomDataset(size_t nElems) { + std::vector randValues; + randValues.reserve(nElems); + + for (size_t i = 0; i < nElems; ++i) { + DataTypeDistrNew* dtd = getRandDataTypeDist(); + dtd->generate(randValues, _gen); + } + + return randValues; +} + + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/rand_utils_new.h b/src/mongo/db/query/stats/rand_utils_new.h new file mode 100644 index 00000000000..be77578fc28 --- /dev/null +++ b/src/mongo/db/query/stats/rand_utils_new.h @@ -0,0 +1,353 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. 
If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include "mongo/db/query/stats/value_utils.h"
+
+namespace mongo::stats {
+
+class DatasetDescriptorNew;
+
+/**
+ * A base class for wrappers of STL random distributions that produce size_t values within a range.
+ * This class enables polymorphic usage of random distributions, for instance to implement a mix of
+ * distributions.
+ */
+class RandomDistribution {
+public:
+    RandomDistribution() = default;
+    RandomDistribution(const RandomDistribution&) = default;
+    RandomDistribution(RandomDistribution&&) = default;
+    RandomDistribution& operator=(const RandomDistribution&) = default;
+    RandomDistribution& operator=(RandomDistribution&&) = default;
+    virtual ~RandomDistribution() = default;
+
+    virtual size_t operator()(std::mt19937_64& gen) = 0;
+};
+
+/**
+    A uniform random distribution of size_t within a range
+ */
+class UniformDistr : public RandomDistribution {
+public:
+    UniformDistr(size_t min, size_t max) : _distr{min, max}, _min(min), _max(max) {}
+
+    size_t operator()(std::mt19937_64& gen) override {
+        size_t result = _distr(gen);
+        uassert(6660540, "Random index out of range", result >= _min && result <= _max);
+        return result;
+    }
+
+private:
+    std::uniform_int_distribution<size_t> _distr;
+    size_t _min;
+    size_t _max;
+};
+
+/**
+ * Wrapper of normal distribution that is guaranteed to produce size_t values within a certain
+ * range. The STL class normal_distribution takes a mean and standard deviation. This class
+ * computes a suitable mean and standard deviation from the required [min,max] boundaries.
+ */
+class NormalDistr : public RandomDistribution {
+public:
+    NormalDistr(size_t min, size_t max)
+        : _distr{(double)(min + max) / 2.0, (double)(max - min) / 4.0},
+          _backup{min, max},
+          _min((double)min),
+          _max((double)max) {}
+
+    size_t operator()(std::mt19937_64& gen) override {
+        size_t result = std::round(_distr(gen));
+        size_t trials = 0;
+        // If the result is outside the range (an event with low probability), try 10 more times to
+        // get a number in the range.
+        while (!(result >= _min && result <= _max) && trials < 10) {
+            double randNum = _distr(gen);
+            if (randNum < _min) {
+                result = std::ceil(randNum);
+            } else if (randNum > _max) {
+                result = std::floor(randNum);
+            } else {
+                result = std::round(randNum);
+            }
+            ++trials;
+        }
+        if (result < _min || result > _max) {
+            // We couldn't generate a number in [min,max] within 10 attempts. Generate a uniform
+            // number.
+            result = _backup(gen);
+        }
+        uassert(6660541, "Random index out of range", result >= _min && result <= _max);
+        return result;
+    }
+
+private:
+    std::normal_distribution<double> _distr;
+    std::uniform_int_distribution<size_t> _backup;
+    double _min;
+    double _max;
+};
+
+enum class DistrType { kUniform, kNormal };
+
+using MixedDistributionDescriptor = std::vector<std::pair<DistrType, double>>;
+
+/**
+ * Generator for a mixed distribution, where the mix is over types of distributions, with the
+ * probabilities specified in 'distrProbabilities'.
+ */
+class MixedDistribution {
+public:
+    MixedDistribution(std::vector<std::unique_ptr<RandomDistribution>> distrMix,
+                      std::vector<double>& distrProbabilities)
+        : _distrMix(std::move(distrMix)) {
+        _distDist.param(std::discrete_distribution<size_t>::param_type(distrProbabilities.begin(),
+                                                                       distrProbabilities.end()));
+    }
+
+    static std::unique_ptr<MixedDistribution> make(MixedDistributionDescriptor& descriptor,
+                                                   size_t min,
+                                                   size_t max) {
+        std::vector<double> distrProbabilities;
+        std::vector<std::unique_ptr<RandomDistribution>> distrMix;
+
+        for (const auto& [distrType, weight] : descriptor) {
+            distrProbabilities.push_back(weight);
+            switch (distrType) {
+                case DistrType::kUniform:
+                    distrMix.emplace_back(std::make_unique<UniformDistr>(min, max));
+                    break;
+                case DistrType::kNormal:
+                    distrMix.emplace_back(std::make_unique<NormalDistr>(min, max));
+                    break;
+                default:
+                    MONGO_UNREACHABLE;
+            }
+        }
+
+        return std::make_unique<MixedDistribution>(std::move(distrMix), distrProbabilities);
+    }
+
+    size_t operator()(std::mt19937_64& gen) {
+        size_t distIdx = _distDist(gen);
+        size_t result = (*_distrMix.at(distIdx))(gen);
+        return result;
+    }
+
+private:
+    // Mix of different distributions. There can be instances of the same type of distribution,
+    // because they can still be defined differently.
+    std::vector<std::unique_ptr<RandomDistribution>> _distrMix;
+    // Distribution of distributions - select the current distribution with a certain probability.
+    std::discrete_distribution<size_t> _distDist;
+};
+
+/**
+ * Descriptor of a typed data distribution
+ */
+class DataTypeDistrNew {
+public:
+    DataTypeDistrNew(MixedDistributionDescriptor distrDescriptor,
+                     sbe::value::TypeTags tag,
+                     double weight,
+                     size_t ndv,
+                     double nullsRatio = 0.0)
+        : _mixedDistrDescriptor(distrDescriptor),
+          _tag(tag),
+          _weight(weight),
+          _ndv(ndv),
+          _nullsRatio(nullsRatio) {
+        uassert(6660542, "NDV must be > 0.", ndv > 0);
+        uassert(6660543, "nullsRatio must be in [0, 1].", nullsRatio >= 0 && nullsRatio <= 1);
+    }
+
+    virtual ~DataTypeDistrNew() = default;
+
+    /**
+     * Generate all unique values that generation chooses from, and store them in '_valSet'.
+     * Different data types provide different implementations.
+     * @todo: The 'parentDesc' parameter is used only by array generation. Consider a different way
+     * of passing it only to that type.
+     */
+    virtual void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) = 0;
+
+    /**
+     * Generate a single random value, and store it in the 'randValues' vector.
+     */
+    void generate(std::vector<SBEValue>& randValues, std::mt19937_64& gen);
+
+    /**
+     * Generate a single random value, and store it in the 'randValueArray' array.
+     */
+    void generate(sbe::value::Array* randValueArray, std::mt19937_64& gen);
+
+    /**
+     * Custom equality comparison for storage in sets. There can be only one datatype in a set.
+     */
+    bool operator==(const DataTypeDistrNew& d) const {
+        return this->_tag == d._tag;
+    }
+
+    sbe::value::TypeTags tag() const {
+        return _tag;
+    }
+
+    double weight() const {
+        return _weight;
+    }
+
+protected:
+    MixedDistributionDescriptor _mixedDistrDescriptor;
+    sbe::value::TypeTags _tag;
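+    // A minimal usage sketch for the mixed index distribution stored in '_idxDist' below
+    // (weights and bounds illustrative): an 80/20 uniform/normal mix over indexes [0, 99]
+    // can be built and sampled as
+    //     MixedDistributionDescriptor desc{{DistrType::kUniform, 0.8},
+    //                                      {DistrType::kNormal, 0.2}};
+    //     auto idxDist = MixedDistribution::make(desc, 0, 99);
+    //     std::mt19937_64 gen{42};
+    //     size_t idx = (*idxDist)(gen);  // ~80% uniform draws, ~20% normal-shaped draws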
+    // Weight that determines the probability of a value of this type.
+    const double _weight;
+    const size_t _ndv;
+    // A set of (randomly generated) values to choose from when generating random datasets.
+    std::vector<SBEValue> _valSet;
+    // Generator of random indexes into a set of values.
+    // std::uniform_int_distribution<size_t> _idxDist;
+    std::unique_ptr<RandomDistribution> _idxDist;
+    // Percent of null values in the dataset.
+    double _nullsRatio;
+    std::uniform_real_distribution<double> _nullSelector{0, 1};
+
+    friend class DatasetDescriptorNew;
+};
+
+using TypeDistrVector = std::vector<std::unique_ptr<DataTypeDistrNew>>;
+
+/**
+ * Integer data distribution.
+ */
+class IntDistribution : public DataTypeDistrNew {
+public:
+    IntDistribution(MixedDistributionDescriptor distrDescriptor,
+                    double weight,
+                    size_t ndv,
+                    int minInt,
+                    int maxInt,
+                    double nullsRatio = 0);
+
+    /*
+     * Generate a set of random integers, and store them in _valSet.
+     */
+    void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override;
+
+protected:
+    int _minInt;
+    int _maxInt;
+};
+
+/**
+ * String data distribution.
+ */
+class StrDistribution : public DataTypeDistrNew {
+public:
+    StrDistribution(MixedDistributionDescriptor distrDescriptor,
+                    double weight,
+                    size_t ndv,
+                    size_t minStrLen,
+                    size_t maxStrLen,
+                    double nullsRatio = 0);
+
+    /*
+     * Generate a set of random strings, and store them in _valSet.
+     */
+    void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override;
+
+protected:
+    std::string genRandomString(size_t len, std::mt19937_64& gen);
+
+    size_t _minStrLen;
+    size_t _maxStrLen;
+    // All strings draw characters from this alphabet.
+    static const std::string _alphabet;
+    // Generator of random indexes into the set of characters '_alphabet'.
+    std::uniform_int_distribution<size_t> _uniformCharIdxDist{0, _alphabet.size() - 1};
+};
+
+/**
+ * SBE array data distribution.
+ */
+class ArrDistribution : public DataTypeDistrNew {
+public:
+    ArrDistribution(MixedDistributionDescriptor distrDescriptor,
+                    double weight,
+                    size_t ndv,
+                    size_t minArrLen,
+                    size_t maxArrLen,
+                    std::unique_ptr<DatasetDescriptorNew> arrayDataDescriptor,
+                    double reuseScalarsRatio = 0,
+                    double nullsRatio = 0);
+
+private:
+    void init(DatasetDescriptorNew* parentDesc, std::mt19937_64& gen) override;
+
+    // Generator of random array sizes.
+    std::uniform_int_distribution<size_t> _uniformArrSizeDist;
+    // Descriptor of the dataset within each array.
+    std::unique_ptr<DatasetDescriptorNew> _arrayDataDescriptor;
+    // Randomly select a parent or a child distribution when generating random array data.
+    std::uniform_real_distribution<double> _uniformRandProbability{0.0, 1.0};
+    double _reuseScalarsRatio;
+};
+
+/**
+    Given a list of typed data distributions, this class is used to generate a vector of values
+    according to the distribution weights.
+*/
+class DatasetDescriptorNew {
+public:
+    DatasetDescriptorNew(TypeDistrVector dataTypeDistributions, std::mt19937_64& gen);
+
+    // Generate a random dataset of 'nElems' according to the data distribution characteristics in
+    // this object.
+    std::vector<SBEValue> genRandomDataset(size_t nElems);
+
+private:
+    // Select a random value data type.
+    DataTypeDistrNew* getRandDataTypeDist();
+
+    // Distribution of different SBE data types. Each type receives its specified share of the
+    // generated values.
+    // TODO: is it a better idea to store shared_ptr or raw pointers to enable reuse?
+    TypeDistrVector _dataTypeDistributions;
+    // Pseudo-random generator.
+    std::mt19937_64& _gen;
+    // Select a random data type distribution.
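+    // A minimal end-to-end sketch of driving this class (weights and parameters
+    // illustrative): build per-type distributions, wrap them in a descriptor, and request
+    // N values:
+    //     std::mt19937_64 gen{42};
+    //     MixedDistributionDescriptor uniform{{DistrType::kUniform, 1.0}};
+    //     TypeDistrVector types;
+    //     types.emplace_back(std::make_unique<IntDistribution>(uniform, 0.7, 100, 0, 1000));
+    //     types.emplace_back(std::make_unique<StrDistribution>(uniform, 0.3, 100, 5, 20));
+    //     DatasetDescriptorNew desc{std::move(types), gen};
+    //     auto data = desc.genRandomDataset(1000);  // ~70% int64 values, ~30% strings
+    // '_dataTypeSelector' below performs the weighted type choice for each generated value.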
+ std::discrete_distribution _dataTypeSelector; + + friend class ArrDistribution; +}; + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/scalar_histogram.cpp b/src/mongo/db/query/stats/scalar_histogram.cpp new file mode 100644 index 00000000000..87ab175fe83 --- /dev/null +++ b/src/mongo/db/query/stats/scalar_histogram.cpp @@ -0,0 +1,192 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/scalar_histogram.h" + +#include "mongo/db/exec/sbe/values/bson.h" +#include "mongo/db/exec/sbe/values/value.h" + +namespace mongo::stats { +Bucket::Bucket( + double equalFreq, double rangeFreq, double cumulativeFreq, double ndv, double cumulativeNDV) + : _equalFreq(equalFreq), + _rangeFreq(rangeFreq), + _cumulativeFreq(cumulativeFreq), + _ndv(ndv), + _cumulativeNDV(cumulativeNDV) { + uassert(6695702, "Invalid equalFreq", _equalFreq >= 0.0); + uassert(6695703, "Invalid rangeFreq", _rangeFreq >= 0.0); + uassert(6695704, "Invalid ndv", _ndv <= _rangeFreq); + uassert(6695705, "Invalid cumulative frequency", _cumulativeFreq >= _equalFreq + _rangeFreq); + uassert(6695706, "Invalid cumulative ndv", _cumulativeNDV >= _ndv + 1.0); +} + +std::string Bucket::toString() const { + std::ostringstream os; + os << "equalFreq: " << _equalFreq << ", rangeFreq: " << _rangeFreq + << ", cumulativeFreq: " << _cumulativeFreq << ", ndv: " << _ndv + << ", cumulativeNDV: " << _cumulativeNDV; + return os.str(); +} + +std::string Bucket::dump() const { + std::ostringstream os; + os << _equalFreq << ", " << _rangeFreq << ", " << _ndv; + return os.str(); +} + +BSONObj Bucket::serialize() const { + BSONObjBuilder bob; + bob.appendNumber("boundaryCount", _equalFreq); + bob.appendNumber("rangeCount", _rangeFreq); + bob.appendNumber("rangeDistincts", _ndv); + bob.appendNumber("cumulativeCount", _cumulativeFreq); + bob.appendNumber("cumulativeDistincts", _cumulativeNDV); + bob.doneFast(); + return bob.obj(); +} + +ScalarHistogram::ScalarHistogram() : ScalarHistogram({}, {}) {} + +ScalarHistogram::ScalarHistogram(const StatsHistogram& histogram) { + for (const auto& bucket : histogram.getBuckets()) { + Bucket b(bucket.getBoundaryCount(), + 
bucket.getRangeCount(),
+                 bucket.getCumulativeCount(),
+                 bucket.getRangeDistincts(),
+                 bucket.getCumulativeDistincts());
+        _buckets.push_back(std::move(b));
+    }
+    for (const auto& bound : histogram.getBounds()) {
+        // We cannot insert a view here, because the lifetime of the bound is shorter than
+        // that of the histogram. In the case of a larger type, e.g. BigString/bsonString, we need
+        // to copy over the entire string as well, not just a pointer to memory which may be
+        // deallocated before we need it.
+        auto value = sbe::bson::convertFrom<false>(bound.getElement());
+        _bounds.push_back(value.first, value.second);
+    }
+}
+
+ScalarHistogram::ScalarHistogram(sbe::value::Array bounds, std::vector<Bucket> buckets)
+    : _bounds(std::move(bounds)), _buckets(std::move(buckets)) {
+    uassert(6695707, "Invalid sizes", _bounds.size() == _buckets.size());
+}
+
+std::string ScalarHistogram::toString() const {
+    std::ostringstream os;
+    os << "[";
+    for (size_t i = 0; i < _buckets.size(); i++) {
+        os << "{val: " << _bounds.getAt(i) << ", " << _buckets.at(i).toString() << "}";
+        if (_buckets.size() - i > 1)
+            os << ",";
+    }
+    os << "]";
+    return os.str();
+}
+
+std::string ScalarHistogram::plot() const {
+    std::ostringstream os;
+    double maxFreq = 0;
+    const double maxBucketSize = 100;
+
+    for (const auto& bucket : _buckets) {
+        double maxBucketFreq = std::max(bucket._equalFreq, bucket._rangeFreq);
+        maxFreq = std::max(maxFreq, maxBucketFreq);
+    }
+
+    std::vector<std::pair<double, std::string>> headers;
+    size_t maxHeaderSize = 0;
+    for (size_t i = 0; i < _buckets.size(); ++i) {
+        std::ostringstream rngHeader;
+        std::ostringstream eqlHeader;
+        double scaledRngF = maxBucketSize * _buckets[i]._rangeFreq / maxFreq;
+        double scaledEqlF = maxBucketSize * _buckets[i]._equalFreq / maxFreq;
+        rngHeader << _bounds.getAt(i) << ": " << _buckets[i]._rangeFreq;
+        eqlHeader << _bounds.getAt(i) << ": " << _buckets[i]._equalFreq;
+        auto rngStr = rngHeader.str();
+        maxHeaderSize = std::max(maxHeaderSize, rngStr.size());
+        headers.emplace_back(scaledRngF, rngStr);
+        auto eqlStr = eqlHeader.str();
+        maxHeaderSize = std::max(maxHeaderSize, eqlStr.size());
+        headers.emplace_back(scaledEqlF, eqlStr);
+    }
+
+    const std::string maxLine(maxBucketSize + maxHeaderSize + 3, '-');
+    os << maxLine << "\n";
+    for (size_t j = 0; j < headers.size(); ++j) {
+        auto header = headers.at(j);
+        header.second.resize(maxHeaderSize, ' ');
+        const std::string bar(std::round(header.first), '*');
+        os << header.second << " | " << bar << "\n";
+    }
+    os << maxLine << "\n";
+
+    return os.str();
+}
+
+std::string ScalarHistogram::dump() const {
+    std::ostringstream os;
+    os << "Histogram:\n{";
+    for (size_t i = 0; i < _buckets.size(); i++) {
+        os << "{" << _bounds.getAt(i) << ", " << _buckets.at(i).dump() << "},\n";
+    }
+    os << "}";
+    return os.str();
+}
+
+const sbe::value::Array& ScalarHistogram::getBounds() const {
+    return _bounds;
+}
+
+const std::vector<Bucket>& ScalarHistogram::getBuckets() const {
+    return _buckets;
+}
+
+BSONObj ScalarHistogram::serialize() const {
+    BSONObjBuilder histogramBuilder;
+
+    // Construct bucket BSON.
+    auto buckets = getBuckets();
+    BSONArrayBuilder bucketsBuilder(histogramBuilder.subarrayStart("buckets"));
+    for (const auto& bucket : buckets) {
+        bucketsBuilder.append(bucket.serialize());
+    }
+    bucketsBuilder.doneFast();
+
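+    // For illustration, the finished document has the shape (field names as emitted by
+    // Bucket::serialize(); values hypothetical):
+    //     {buckets: [{boundaryCount: 3, rangeCount: 4, rangeDistincts: 2,
+    //                 cumulativeCount: 7, cumulativeDistincts: 3}, ...],
+    //      bounds: [5, 10, ...]}
+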
+    // Construct bucket bounds BSON.
+    auto bounds = getBounds();
+    BSONArrayBuilder boundsBuilder(histogramBuilder.subarrayStart("bounds"));
+    sbe::bson::convertToBsonObj(boundsBuilder, &bounds);
+    boundsBuilder.doneFast();
+
+    histogramBuilder.doneFast();
+    return histogramBuilder.obj();
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/scalar_histogram.h b/src/mongo/db/query/stats/scalar_histogram.h
new file mode 100644
index 00000000000..0473f369af1
--- /dev/null
+++ b/src/mongo/db/query/stats/scalar_histogram.h
@@ -0,0 +1,120 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "mongo/db/exec/sbe/values/value.h"
+#include "mongo/db/query/stats/stats_gen.h"
+
+namespace mongo::stats {
+
+/**
+ * Statistics related to a single ScalarHistogram bucket. The boundary value is kept in a separate
+ * array, so that each bucket has a corresponding boundary value. The reason for this is to manage
+ * the memory of the values.
+ */
+struct Bucket {
+    Bucket(double equalFreq,
+           double rangeFreq,
+           double cumulativeFreq,
+           double ndv,
+           double cumulativeNDV);
+
+    std::string toString() const;
+    // Helper function to dump the bucket content as needed by histogram creation in the unit
+    // tests.
+    std::string dump() const;
+
+    // Frequency of the bound value itself.
+    double _equalFreq;
+
+    // Frequency of other values.
+    double _rangeFreq;
+
+    // Sum of frequencies of preceding buckets to avoid recomputing. Includes both _equalFreq and
+    // _rangeFreq.
+    double _cumulativeFreq;
+
+    // Number of distinct values in this bucket, excludes the bound.
+    double _ndv;
+
+    // Sum of distinct values in preceding buckets including this bucket.
+    double _cumulativeNDV;
+
+    // Serialize to BSON for storage in stats collection.
+    BSONObj serialize() const;
+};
+
+/**
+ * A ScalarHistogram over a set of values. The ScalarHistogram consists of two parallel vectors -
+ * one with the individual value statistics, and another one with the actual boundary values.
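+ *
+ * For example (illustrative values): with bounds [5, 10] and buckets
+ * [{_equalFreq: 3, _rangeFreq: 0, _ndv: 0}, {_equalFreq: 1, _rangeFreq: 4, _ndv: 2}],
+ * the value 5 occurs 3 times, the value 10 occurs once, and 4 more occurrences spanning
+ * 2 distinct values fall strictly between 5 and 10.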
+ */
+class ScalarHistogram {
+public:
+    ScalarHistogram();
+    ScalarHistogram(const StatsHistogram& histogram);
+    ScalarHistogram(sbe::value::Array bounds, std::vector<Bucket> buckets);
+
+    // Print a human-readable representation of a histogram.
+    std::string toString() const;
+    std::string plot() const;
+    // Helper function to dump the content of the histogram as needed by the manual histogram
+    // creation in the unit tests (without cumulative frequency and NDV).
+    std::string dump() const;
+
+    const sbe::value::Array& getBounds() const;
+    const std::vector<Bucket>& getBuckets() const;
+    // Return the total number of histogrammed values.
+    size_t getCardinality() const {
+        if (_buckets.empty()) {
+            return 0;
+        }
+        return _buckets.back()._cumulativeFreq;
+    }
+
+    bool empty() const {
+        return _buckets.empty();
+    }
+
+    // Serialize to BSON for storage in stats collection.
+    BSONObj serialize() const;
+
+    static constexpr size_t kMaxBuckets = 100;
+
+private:
+    // Bucket bounds representing the **highest** value in each bucket.
+    sbe::value::Array _bounds;
+
+    std::vector<Bucket> _buckets;
+};
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats.idl b/src/mongo/db/query/stats/stats.idl
new file mode 100644
index 00000000000..eb6220d45b9
--- /dev/null
+++ b/src/mongo/db/query/stats/stats.idl
@@ -0,0 +1,102 @@
+# Copyright (C) 2022-present MongoDB, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the Server Side Public License, version 1,
+# as published by MongoDB, Inc.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Server Side Public License for more details.
+#
+# You should have received a copy of the Server Side Public License
+# along with this program. If not, see
+# .
+#
+# As a special exception, the copyright holders give permission to link the
+# code of portions of this program with the OpenSSL library under certain
+# conditions as described in each individual source file and distribute
+# linked combinations including the program with the OpenSSL library. You
+# must comply with the Server Side Public License in all respects for
+# all of the code used other than as permitted herein. If you modify file(s)
+# with this exception, you may extend this exception to your version of the
+# file(s), but you are not obligated to do so. If you do not wish to do so,
+# delete this exception statement from your version. If you delete this
+# exception statement from all source files in the program, then also delete
+# it in the license file.
+#
+global:
+    cpp_namespace: "mongo"
+
+imports:
+    - "mongo/db/basic_types.idl"
+
+structs:
+    StatsBucket:
+        description: "Histogram bucket"
+        fields:
+            boundaryCount:
+                type: double
+            rangeCount:
+                type: double
+            rangeDistincts:
+                type: double
+            cumulativeCount:
+                type: double
+            cumulativeDistincts:
+                type: double
+
+    StatsHistogram:
+        description: "MaxDiff Histogram"
+        fields:
+            buckets:
+                type: array<StatsBucket>
+            bounds:
+                type: array
+
+    TypeTag:
+        description: "SBE types and their corresponding frequencies in the histogram"
+        fields:
+            typeName:
+                type: string
+            count:
+                type: double
+
+    StatsArrayHistogram:
+        description: "Array Histogram"
+        fields:
+            minHistogram:
+                type: StatsHistogram
+            maxHistogram:
+                type: StatsHistogram
+            uniqueHistogram:
+                type: StatsHistogram
+            typeCount:
+                type: array<TypeTag>
+
+    Statistics:
+        description: "Serialized representation of data statistics for a key path"
+        fields:
+            documents:
+                type: double
+            trueCount:
+                type: double
+            falseCount:
+                type: double
+            emptyArrayCount:
+                type: double
+            typeCount:
+                type: array<TypeTag>
+            scalarHistogram:
+                type: StatsHistogram
+            arrayStatistics:
+                type: StatsArrayHistogram
+                optional: true
+
+    StatsPath:
+        description: "Key path to statistics"
+        fields:
+            _id:
+                type: string
+            statistics:
+                type: Statistics
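+
+# For illustration, a serialized StatsPath document for a key path might look like the
+# following (values hypothetical):
+#     {_id: "a.b",
+#      statistics: {documents: 100.0, trueCount: 0.0, falseCount: 0.0,
+#                   emptyArrayCount: 0.0,
+#                   typeCount: [{typeName: "NumberInt64", count: 100.0}],
+#                   scalarHistogram: {buckets: [...], bounds: [...]}}}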
diff --git a/src/mongo/db/query/stats/stats_cache.cpp b/src/mongo/db/query/stats/stats_cache.cpp
new file mode 100644
index 00000000000..dfe5a43890e
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache.cpp
@@ -0,0 +1,74 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/stats/stats_cache.h"
+
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/util/read_through_cache.h"
+
+#include "mongo/logv2/log.h"
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery
+
+namespace mongo::stats {
+namespace {
+const auto statsCacheDecoration = ServiceContext::declareDecoration<std::unique_ptr<StatsCache>>();
+} // namespace
+
+StatsCache::StatsCache(ServiceContext* service,
+                       std::unique_ptr<StatsCacheLoader> cacheLoader,
+                       ThreadPoolInterface& threadPool,
+                       int size)
+    : ReadThroughCache(
+          _mutex,
+          service,
+          threadPool,
+          [this](OperationContext* opCtx,
+                 const StatsPathString& statsPath,
+                 const ValueHandle& stats) { return _lookupStats(opCtx, statsPath, stats); },
+          size),
+      _statsCacheLoader(std::move(cacheLoader)) {}
+
+StatsCache::LookupResult StatsCache::_lookupStats(OperationContext* opCtx,
+                                                  const StatsPathString& statsPath,
+                                                  const StatsCacheValueHandle& stats) {
+
+    try {
+        invariant(_statsCacheLoader);
+        auto newStats = _statsCacheLoader->getStats(opCtx, statsPath).get();
+        return LookupResult(std::move(newStats));
+    } catch (const DBException& ex) {
+        if (ex.code() == ErrorCodes::NamespaceNotFound) {
+            return StatsCache::LookupResult(boost::none);
+        }
+        throw;
+    }
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache.h b/src/mongo/db/query/stats/stats_cache.h
new file mode 100644
index 00000000000..37d3d238a4d
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache.h
@@ -0,0 +1,81 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/string_data.h"
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/db/query/stats/stats_cache_loader.h"
+#include "mongo/util/concurrency/thread_pool.h"
+#include "mongo/util/read_through_cache.h"
+
+namespace mongo::stats {
+using StatsCacheType = ReadThroughCache<StatsPathString, StatsCacheVal>;
+using StatsCacheValueHandle = StatsCacheType::ValueHandle;
+
+/**
+ * Collection statistics read-through cache. It reads from the persistent storage but never writes
+ * to it.
+ */
+class StatsCache : public StatsCacheType {
+public:
+    /**
+     * The constructor provides the Service context under which this cache has been instantiated,
+     * and a Thread pool to be used for invoking the blocking 'lookup' calls. The size is the
+     * number of entries the underlying LRU cache will hold.
+     */
+    StatsCache(ServiceContext* service,
+               std::unique_ptr<StatsCacheLoader> cacheLoader,
+               ThreadPoolInterface& threadPool,
+               int size);
+
+    /**
+     * Returns the stats cache loader. Currently used for testing only.
+     */
+    StatsCacheLoader* getStatsCacheLoader() {
+        invariant(_statsCacheLoader);
+
+        return _statsCacheLoader.get();
+    }
+
+private:
+    /**
+     * Reads collection stats from the underlying storage if they are not found in the in-memory
+     * cache.
+     */
+    LookupResult _lookupStats(OperationContext* opCtx,
+                              const StatsPathString& statsPath,
+                              const ValueHandle& stats);
+
+    Mutex _mutex = MONGO_MAKE_LATCH("StatsCache::_mutex");
+
+    std::unique_ptr<StatsCacheLoader> _statsCacheLoader;
+};
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader.h b/src/mongo/db/query/stats/stats_cache_loader.h
new file mode 100644
index 00000000000..7bad4b64304
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/stdx/thread.h"
+
+namespace mongo::stats {
+using StatsPathString = std::pair<NamespaceString, std::string>;
+using StatsCacheVal = std::shared_ptr<ArrayHistogram>;
+
+class StatsCacheLoader {
+public:
+    /**
+     * Non-blocking call, which returns CollectionStatistics from the persistent metadata store.
+     *
+     * If for some reason the asynchronous fetch operation cannot be dispatched (for example on
+     * shutdown), throws a DBException.
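+     *
+     * A typical call site blocks on the returned future (names illustrative):
+     *     StatsCacheVal hist = loader->getStats(opCtx, {nss, "a.b"}).get();
+     * where 'nss' is the NamespaceString of the collection and "a.b" is the key path.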
+ */ + virtual SemiFuture getStats(OperationContext* opCtx, + const StatsPathString& statsPath) = 0; + + virtual void setStatsReturnValueForTest(StatusWith swStats){}; + + virtual ~StatsCacheLoader() {} + + static constexpr StringData kStatsPrefix = "system.statistics"_sd; +}; + +} // namespace mongo::stats diff --git a/src/mongo/db/query/stats/stats_cache_loader_impl.cpp b/src/mongo/db/query/stats/stats_cache_loader_impl.cpp new file mode 100644 index 00000000000..e41912eafc0 --- /dev/null +++ b/src/mongo/db/query/stats/stats_cache_loader_impl.cpp @@ -0,0 +1,82 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/stats_cache_loader_impl.h" + +#include "mongo/db/dbdirectclient.h" +#include "mongo/db/namespace_string.h" +#include "mongo/db/query/stats/stats_gen.h" +#include "mongo/logv2/log.h" +#include "mongo/stdx/thread.h" + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery + +namespace mongo::stats { +SemiFuture StatsCacheLoaderImpl::getStats(OperationContext* opCtx, + const StatsPathString& statsPath) { + + std::string statsColl(kStatsPrefix + "." 
+ statsPath.first.coll());
+
+    NamespaceString statsNss(statsPath.first.db(), statsColl);
+    DBDirectClient client(opCtx);
+
+    FindCommandRequest findRequest{statsNss};
+    BSONObj filter = BSON("_id" << statsPath.second);
+    LOGV2_DEBUG(7085600, 1, "findRequest filter", "filter"_attr = filter.toString());
+    findRequest.setFilter(filter.getOwned());
+
+    try {
+        auto cursor = client.find(std::move(findRequest));
+
+        if (!cursor) {
+            uasserted(ErrorCodes::OperationFailed,
+                      str::stream()
+                          << "Failed to establish a cursor for reading " << statsPath.first.ns()
+                          << ", path " << statsPath.second << " from local storage");
+        }
+
+        if (cursor->more()) {
+            IDLParserContext ctx("StatsPath");
+            BSONObj document = cursor->nextSafe().getOwned();
+            auto parsedStats = StatsPath::parse(ctx, document);
+            StatsCacheVal statsPtr(new ArrayHistogram(parsedStats.getStatistics()));
+            return makeReadyFutureWith([this, statsPtr] { return statsPtr; }).semi();
+        }
+
+        uasserted(ErrorCodes::NamespaceNotFound,
+                  str::stream() << "Stats do not exist for " << statsNss.ns() << ", path "
+                                << statsPath.second);
+    } catch (const DBException& ex) {
+        uassertStatusOK(ex.toStatus());
+    }
+    MONGO_UNREACHABLE;
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader_impl.h b/src/mongo/db/query/stats/stats_cache_loader_impl.h
new file mode 100644
index 00000000000..979a1009acb
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_impl.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * .
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
diff --git a/src/mongo/db/query/stats/stats_cache_loader_impl.h b/src/mongo/db/query/stats/stats_cache_loader_impl.h
new file mode 100644
index 00000000000..979a1009acb
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_impl.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/db/query/stats/stats_cache_loader.h"
+#include "mongo/stdx/thread.h"
+
+namespace mongo::stats {
+
+class StatsCacheLoaderImpl : public StatsCacheLoader {
+public:
+    SemiFuture<StatsCacheVal> getStats(OperationContext* opCtx,
+                                       const StatsPathString& statsPath) override;
+};
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader_mock.cpp b/src/mongo/db/query/stats/stats_cache_loader_mock.cpp
new file mode 100644
index 00000000000..c190d61c312
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_mock.cpp
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/stats/stats_cache_loader_mock.h"
+
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/stdx/thread.h"
+
+namespace mongo::stats {
+
+const Status StatsCacheLoaderMock::kInternalErrorStatus = {
+    ErrorCodes::InternalError, "Stats cache loader received unexpected request"};
+
+SemiFuture<StatsCacheVal> StatsCacheLoaderMock::getStats(OperationContext* opCtx,
+                                                         const StatsPathString& statsPath) {
+
+    return makeReadyFutureWith([this] { return _swStatsReturnValueForTest; }).semi();
+}
+
+void StatsCacheLoaderMock::setStatsReturnValueForTest(StatusWith<StatsCacheVal> swStats) {
+    _swStatsReturnValueForTest = std::move(swStats);
+}
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader_mock.h b/src/mongo/db/query/stats/stats_cache_loader_mock.h
new file mode 100644
index 00000000000..9951bcfd2ca
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_mock.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/db/query/stats/stats_cache_loader.h"
+#include "mongo/stdx/thread.h"
+
+namespace mongo::stats {
+
+class StatsCacheLoaderMock : public StatsCacheLoader {
+public:
+    SemiFuture<StatsCacheVal> getStats(OperationContext* opCtx,
+                                       const StatsPathString& statsPath) override;
+
+    void setStatsReturnValueForTest(StatusWith<StatsCacheVal> swStats);
+
+    static const Status kInternalErrorStatus;
+
+private:
+    StatusWith<StatsCacheVal> _swStatsReturnValueForTest{kInternalErrorStatus};
+};
+
+} // namespace mongo::stats
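Since every lookup resolves to whatever was last supplied via setStatsReturnValueForTest(), tests can pin the cache's behavior without touching storage. A minimal sketch, assuming an OperationContext* opCtx and an illustrative path:

    auto loader = std::make_unique<StatsCacheLoaderMock>();
    // Before this call, any getStats() resolves to kInternalErrorStatus.
    loader->setStatsReturnValueForTest(StatsCacheVal(new ArrayHistogram()));
    auto statsPath = std::make_pair(NamespaceString("db", "coll"), std::string("somePath"));
    StatsCacheVal canned = loader->getStats(opCtx, statsPath).get();  // resolves immediately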
diff --git a/src/mongo/db/query/stats/stats_cache_loader_test.cpp b/src/mongo/db/query/stats/stats_cache_loader_test.cpp
new file mode 100644
index 00000000000..a22e6dd9044
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_test.cpp
@@ -0,0 +1,116 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/bson/oid.h"
+#include "mongo/db/catalog/collection_write_path.h"
+#include "mongo/db/db_raii.h"
+#include "mongo/db/query/stats/scalar_histogram.h"
+#include "mongo/db/query/stats/stats_cache_loader_impl.h"
+#include "mongo/db/query/stats/stats_cache_loader_test_fixture.h"
+#include "mongo/db/query/stats/stats_gen.h"
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/assert_util.h"
+#include "mongo/util/fail_point.h"
+
+namespace mongo::stats {
+namespace {
+
+class StatsCacheLoaderTest : public StatsCacheLoaderTestFixture {
+protected:
+    void createStatsCollection(NamespaceString nss);
+    StatsCacheLoaderImpl _statsCacheLoader;
+};
+
+void StatsCacheLoaderTest::createStatsCollection(NamespaceString nss) {
+    auto opCtx = operationContext();
+    AutoGetCollection autoColl(opCtx, nss, MODE_IX);
+    auto db = autoColl.ensureDbExists(opCtx);
+    WriteUnitOfWork wuow(opCtx);
+    ASSERT(db->createCollection(opCtx, nss));
+    wuow.commit();
+}
+
+TEST_F(StatsCacheLoaderTest, VerifyStatsLoad) {
+    // Initialize histogram buckets.
+    constexpr double doubleCount = 15.0;
+    constexpr double trueCount = 12.0;
+    constexpr double falseCount = 16.0;
+    constexpr double numDocs = doubleCount + trueCount + falseCount;
+    std::vector<Bucket> buckets{
+        Bucket{1.0, 0.0, 1.0, 0.0, 1.0},
+        Bucket{2.0, 5.0, 8.0, 1.0, 2.0},
+        Bucket{3.0, 4.0, 15.0, 2.0, 6.0},
+    };
+
+    // Initialize histogram bounds.
+    auto [boundsTag, boundsVal] = sbe::value::makeNewArray();
+    sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal};
+    auto bounds = sbe::value::getArrayView(boundsVal);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 1.0);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 2.0);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 3.0);
+
+    // Create a scalar histogram.
+    TypeCounts tc{
+        {sbe::value::TypeTags::NumberDouble, doubleCount},
+        {sbe::value::TypeTags::Boolean, trueCount + falseCount},
+    };
+    ScalarHistogram sh(*bounds, buckets);
+    ArrayHistogram ah(sh, tc, trueCount, falseCount);
+    auto expectedSerialized = ah.serialize();
+
+    // Serialize histogram into a stats path.
+    std::string path = "somePath";
+    auto serialized = stats::makeStatsPath(path, numDocs, ah);
+
+    // Initialize stats collection.
+    NamespaceString nss("test", "stats");
+    std::string statsColl(StatsCacheLoader::kStatsPrefix + "." + nss.coll());
+    NamespaceString statsNss(nss.db(), statsColl);
+    createStatsCollection(statsNss);
+
+    // Write serialized stats path to collection.
+    AutoGetCollection autoColl(operationContext(), statsNss, MODE_IX);
+    const CollectionPtr& coll = autoColl.getCollection();
+    {
+        WriteUnitOfWork wuow(operationContext());
+        ASSERT_OK(collection_internal::insertDocument(
+            operationContext(), coll, InsertStatement(serialized), nullptr));
+        wuow.commit();
+    }
+
+    // Read stats path & verify values are consistent with what we expect.
+    auto actualAH = _statsCacheLoader.getStats(operationContext(), std::make_pair(nss, path)).get();
+    auto actualSerialized = actualAH->serialize();
+
+    ASSERT_BSONOBJ_EQ(expectedSerialized, actualSerialized);
+}
+
+} // namespace
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader_test_fixture.cpp b/src/mongo/db/query/stats/stats_cache_loader_test_fixture.cpp
new file mode 100644
index 00000000000..1e353196b83
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_test_fixture.cpp
@@ -0,0 +1,74 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/stats/stats_cache_loader_test_fixture.h"
+
+#include <memory>
+
+#include "mongo/db/repl/replication_coordinator_mock.h"
+#include "mongo/db/repl/storage_interface_impl.h"
+#include "mongo/db/service_context_d_test_fixture.h"
+
+namespace mongo::stats {
+
+void StatsCacheLoaderTestFixture::setUp() {
+    // Set up mongod.
+    ServiceContextMongoDTest::setUp();
+
+    auto service = getServiceContext();
+    _storage = std::make_unique<repl::StorageInterfaceImpl>();
+    _opCtx = cc().makeOperationContext();
+
+    // Set up ReplicationCoordinator and ensure that we are primary.
+    auto replCoord = std::make_unique<repl::ReplicationCoordinatorMock>(service);
+    ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_PRIMARY));
+    repl::ReplicationCoordinator::set(service, std::move(replCoord));
+
+    // Set up oplog collection. If the WT storage engine is used, the oplog collection is expected
+    // to exist when fetching the next opTime (LocalOplogInfo::getNextOpTimes) to use for a write.
+    repl::createOplog(operationContext());
+}
+
+void StatsCacheLoaderTestFixture::tearDown() {
+    _storage.reset();
+    _opCtx.reset();
+
+    // Tear down mongod.
+    ServiceContextMongoDTest::tearDown();
+}
+
+OperationContext* StatsCacheLoaderTestFixture::operationContext() {
+    return _opCtx.get();
+}
+
+repl::StorageInterface* StatsCacheLoaderTestFixture::storageInterface() {
+    return _storage.get();
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_loader_test_fixture.h b/src/mongo/db/query/stats/stats_cache_loader_test_fixture.h
new file mode 100644
index 00000000000..6c7d502fdf1
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_loader_test_fixture.h
@@ -0,0 +1,60 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/operation_context.h"
+#include "mongo/db/query/stats/stats_cache_loader.h"
+#include "mongo/db/repl/storage_interface_impl.h"
+#include "mongo/db/service_context_d_test_fixture.h"
+
+namespace mongo::stats {
+
+/**
+ * Sets up and provides a repl::StorageInterface and OperationContext.
+ * Database data is cleared between test runs.
+ */
+class StatsCacheLoaderTestFixture : public ServiceContextMongoDTest {
+public:
+    explicit StatsCacheLoaderTestFixture(Options options = {})
+        : ServiceContextMongoDTest(std::move(options)) {}
+
+    OperationContext* operationContext();
+    repl::StorageInterface* storageInterface();
+
+protected:
+    void setUp() override;
+    void tearDown() override;
+
+private:
+    ServiceContext::UniqueOperationContext _opCtx;
+    std::unique_ptr<repl::StorageInterfaceImpl> _storage;
+};
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_cache_test.cpp b/src/mongo/db/query/stats/stats_cache_test.cpp
new file mode 100644
index 00000000000..b95dc2c3bd8
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_cache_test.cpp
@@ -0,0 +1,131 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <memory>
+
+#include "mongo/db/client.h"
+#include "mongo/db/concurrency/locker_noop_service_context_test_fixture.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/query/stats/stats_cache.h"
+#include "mongo/db/query/stats/stats_cache_loader_mock.h"
+#include "mongo/unittest/barrier.h"
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/concurrency/thread_pool.h"
+#include "mongo/util/read_through_cache.h"
+#include "mongo/util/scopeguard.h"
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
+
+namespace mongo::stats {
+namespace {
+
+using unittest::assertGet;
+
+/**
+ * Fixture for tests that do not need to exercise the multi-threading capabilities of the cache
+ * and, as such, do not require control over the creation/destruction of their operation contexts.
+ */
+class StatsCacheTest : public LockerNoopServiceContextTest {
+protected:
+    // Extends StatsCache and automatically provides it with a thread pool, which will be
+    // shut down and joined before the StatsCache is destroyed (which is part of the contract
+    // of ReadThroughCache).
+    class CacheWithThreadPool : public StatsCache {
+    public:
+        CacheWithThreadPool(ServiceContext* service,
+                            std::unique_ptr<StatsCacheLoaderMock> cacheLoaderMock,
+                            size_t size)
+            : StatsCache(service, std::move(cacheLoaderMock), _threadPool, size) {
+            _threadPool.startup();
+        }
+
+    private:
+        ThreadPool _threadPool{[] {
+            ThreadPool::Options options;
+            options.poolName = "StatsCacheTest";
+            options.minThreads = 1;
+            options.maxThreads = 1;
+            return options;
+        }()};
+    };
+
+    const ServiceContext::UniqueOperationContext _opCtxHolder{makeOperationContext()};
+    OperationContext* const _opCtx{_opCtxHolder.get()};
+};
+
+TEST(StatsCacheTest, StandaloneValueHandle) {
+    StatsCacheVal statsPtr(new ArrayHistogram());
+    StatsCache::ValueHandle standaloneHandle(std::move(statsPtr));
+    ASSERT(standaloneHandle.isValid());
+}
+
+TEST_F(StatsCacheTest, KeyDoesNotExist) {
+    Status namespaceNotFoundErrorStatus = {ErrorCodes::NamespaceNotFound,
+                                           "The key does not exist"};
+    auto cacheLoaderMock = std::make_unique<StatsCacheLoaderMock>();
+    auto cache = CacheWithThreadPool(getServiceContext(), std::move(cacheLoaderMock), 1);
+    cache.getStatsCacheLoader()->setStatsReturnValueForTest(
+        std::move(namespaceNotFoundErrorStatus));
+    auto handle = cache.acquire(_opCtx, std::make_pair(NamespaceString("db", "coll"), "somePath"));
+    ASSERT(!handle);
+}
+
+/*
+TEST_F(StatsCacheTest, LoadStats) {
+    auto cacheLoaderMock = std::make_unique<StatsCacheLoaderMock>();
+    auto cache = CacheWithThreadPool(getServiceContext(), std::move(cacheLoaderMock), 1);
+
+    auto stats1 = CollectionStatistics(1);
+    auto stats2 = CollectionStatistics(2);
+
+    cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(stats1));
+
+    auto handle = cache.acquire(_opCtx, NamespaceString("db", "coll1"));
+    ASSERT(handle.isValid());
+    ASSERT_EQ(1, handle->getCardinality());
+
+    // Make all requests to StatsCacheLoader throw an exception to ensure that the test returns
+    // the value from the cache.
+    Status internalErrorStatus = {ErrorCodes::InternalError,
+                                  "Stats cache loader received unexpected request"};
+    cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(internalErrorStatus));
+
+    handle = cache.acquire(_opCtx, NamespaceString("db", "coll1"));
+    ASSERT(handle.isValid());
+    ASSERT_EQ(1, handle->getCardinality());
+
+    cache.getStatsCacheLoader()->setStatsReturnValueForTest(std::move(stats2));
+    handle = cache.acquire(_opCtx, NamespaceString("db", "coll2"));
+    ASSERT(handle.isValid());
+    ASSERT_EQ(2, handle->getCardinality());
+}
+*/
+
+} // namespace
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_catalog.cpp b/src/mongo/db/query/stats/stats_catalog.cpp
new file mode 100644
index 00000000000..99891f1dc4c
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_catalog.cpp
@@ -0,0 +1,108 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/stats/stats_catalog.h"
+
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/db/query/stats/stats_cache.h"
+#include "mongo/util/read_through_cache.h"
+
+#include "mongo/logv2/log.h"
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery
+
+namespace mongo::stats {
+namespace {
+const auto statsCatalogDecoration =
+    ServiceContext::declareDecoration<std::unique_ptr<StatsCatalog>>();
+} // namespace
+
+StatsCatalog::StatsCatalog(ServiceContext* service,
+                           std::unique_ptr<StatsCacheLoader> statsCacheLoader)
+    : _executor(std::make_shared<ThreadPool>([] {
+          ThreadPool::Options options;
+          options.poolName = "StatsCache";
+          options.minThreads = 0;
+          options.maxThreads = 2;
+          return options;
+      }())),
+      _statsCache(service, std::move(statsCacheLoader), *_executor, 1000) {
+    _executor->startup();
+}
+
+StatsCatalog::~StatsCatalog() {
+    // The executor is used by the StatsCache, so it must be joined before the cache is
+    // destroyed, per the contract of ReadThroughCache.
+    _executor->shutdown();
+    _executor->join();
+}
+
+void StatsCatalog::set(ServiceContext* serviceContext, std::unique_ptr<StatsCatalog> cache) {
+    auto& statsCatalog = statsCatalogDecoration(serviceContext);
+    invariant(!statsCatalog);
+
+    statsCatalog = std::move(cache);
+}
+
+StatsCatalog& StatsCatalog::get(ServiceContext* serviceContext) {
+    auto& statsCatalog = statsCatalogDecoration(serviceContext);
+    invariant(statsCatalog);
+
+    return *statsCatalog;
+}
+
+StatsCatalog& StatsCatalog::get(OperationContext* opCtx) {
+    return get(opCtx->getServiceContext());
+}
+
+StatusWith<std::shared_ptr<ArrayHistogram>> StatsCatalog::getHistogram(OperationContext* opCtx,
+                                                                       const NamespaceString& nss,
+                                                                       const std::string& path) {
+    try {
+        auto handle = _statsCache.acquire(opCtx, std::make_pair(nss, path));
+        uassert(ErrorCodes::NamespaceNotFound,
+                str::stream() << "path " << nss << " : " << path << " not found",
+                handle);
+
+        return *(handle.get());
+    } catch (const DBException& ex) {
+        return ex.toStatus();
+    }
+}
+
+Status StatsCatalog::invalidatePath(const NamespaceString& nss, const std::string& path) {
+    try {
+        _statsCache.invalidateKey(std::make_pair(nss, path));
+        return Status::OK();
+    } catch (const DBException& ex) {
+        return ex.toStatus();
+    }
+}
+} // namespace mongo::stats
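A sketch of how a consumer is expected to reach the catalog; the operation context, namespace, path, and fallback are illustrative assumptions rather than part of this patch:

    // The catalog is a ServiceContext decoration, so any caller holding an
    // OperationContext* opCtx can reach it.
    auto& catalog = StatsCatalog::get(opCtx);
    auto swHistogram = catalog.getHistogram(opCtx, NamespaceString("test", "coll"), "a.b");
    if (!swHistogram.isOK()) {
        // e.g. ErrorCodes::NamespaceNotFound: no histogram exists for this path,
        // so the caller would fall back to non-histogram estimation.
    }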
diff --git a/src/mongo/db/query/stats/stats_catalog.h b/src/mongo/db/query/stats/stats_catalog.h
new file mode 100644
index 00000000000..e86b4562b8e
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_catalog.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/string_data.h"
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/query/stats/collection_statistics.h"
+#include "mongo/db/query/stats/stats_cache.h"
+#include "mongo/db/query/stats/stats_cache_loader.h"
+#include "mongo/util/concurrency/thread_pool.h"
+
+namespace mongo::stats {
+/**
+ * This class owns the StatsCache and manages the executor's lifetime.
+ */
+class StatsCatalog {
+public:
+    /**
+     * Stores the catalog on the specified service context. May only be called once for the
+     * lifetime of the service context.
+     */
+    static void set(ServiceContext* serviceContext, std::unique_ptr<StatsCatalog> catalog);
+
+    static StatsCatalog& get(ServiceContext* serviceContext);
+    static StatsCatalog& get(OperationContext* opCtx);
+
+    /**
+     * The constructor takes the service context under which the cache needs to be instantiated
+     * and a thread pool to be used for invoking the blocking 'lookup' calls. The size is the
+     * number of entries the underlying LRU cache will hold.
+     */
+    StatsCatalog(ServiceContext* service, std::unique_ptr<StatsCacheLoader> cacheLoader);
+
+    ~StatsCatalog();
+
+    StatusWith<std::shared_ptr<ArrayHistogram>> getHistogram(OperationContext* opCtx,
+                                                             const NamespaceString& nss,
+                                                             const std::string& path);
+
+    Status invalidatePath(const NamespaceString& nss, const std::string& path);
+
+private:
+    /**
+     * The executor is used by the cache.
+     */
+    std::shared_ptr<ThreadPool> _executor;
+    StatsCache _statsCache;
+};
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/stats_path_test.cpp b/src/mongo/db/query/stats/stats_path_test.cpp
new file mode 100644
index 00000000000..3e3afe50b32
--- /dev/null
+++ b/src/mongo/db/query/stats/stats_path_test.cpp
@@ -0,0 +1,129 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/bson/bsonobjbuilder.h"
+#include "mongo/db/exec/sbe/values/bson.h"
+#include "mongo/db/exec/sbe/values/value.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/db/query/stats/scalar_histogram.h"
+#include "mongo/db/query/stats/stats_gen.h"
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/assert_util.h"
+
+namespace mongo::stats {
+namespace {
+
+IDLParserContext ctx("StatsPath");
+
+/**
+ * Validate round-trip conversion for a histogram bucket.
+ */
+TEST(StatsPath, BasicValidStatsBucketDouble) {
+    // Create & parse StatsBucket.
+    auto serializedBucket = Bucket{3.0, 4.0, 15.0, 2.0, 6.0}.serialize();
+    auto parsedBucket = StatsBucket::parse(ctx, serializedBucket);
+
+    // Round-trip conversion.
+    auto bucketToBSON = parsedBucket.toBSON();
+    ASSERT_BSONOBJ_EQ(serializedBucket, bucketToBSON);
+}
+
+/**
+ * Validate round-trip conversion for the StatsPath datatype.
+ */
+TEST(StatsPath, BasicValidStatsPath) {
+    // Initialize histogram buckets.
+    constexpr double doubleCount = 15.0;
+    constexpr double trueCount = 12.0;
+    constexpr double falseCount = 16.0;
+    constexpr double numDocs = doubleCount + trueCount + falseCount;
+    std::vector<Bucket> buckets{
+        Bucket{1.0, 0.0, 1.0, 0.0, 1.0},
+        Bucket{2.0, 5.0, 8.0, 1.0, 2.0},
+        Bucket{3.0, 4.0, 15.0, 2.0, 6.0},
+    };
+
+    // Initialize histogram bounds.
+    auto [boundsTag, boundsVal] = sbe::value::makeNewArray();
+    sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal};
+    auto bounds = sbe::value::getArrayView(boundsVal);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 1.0);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 2.0);
+    bounds->push_back(sbe::value::TypeTags::NumberDouble, 3.0);
+
+    // Create a scalar histogram.
+    TypeCounts tc{
+        {sbe::value::TypeTags::NumberDouble, doubleCount},
+        {sbe::value::TypeTags::Boolean, trueCount + falseCount},
+    };
+    ScalarHistogram sh(*bounds, buckets);
+    ArrayHistogram ah(sh, tc, trueCount, falseCount);
+
+    // Serialize to BSON.
+    auto serializedPath = stats::makeStatsPath("somePath", numDocs, ah);
+
+    // Parse StatsPath via IDL & serialize to BSON.
+    auto parsedPath = StatsPath::parse(ctx, serializedPath);
+    auto parsedPathToBSON = parsedPath.toBSON();
+
+    // We should end up with the same serialized BSON in the end.
+    ASSERT_BSONOBJ_EQ(serializedPath, parsedPathToBSON);
+}
+
+/**
+ * Validate round-trip conversion for an empty StatsPath datatype.
+ */
+TEST(StatsPath, BasicValidEmptyStatsPath) {
+    // Initialize histogram buckets.
+    constexpr double numDocs = 0.0;
+    std::vector<Bucket> buckets;
+
+    // Initialize histogram bounds.
+    auto [boundsTag, boundsVal] = sbe::value::makeNewArray();
+    sbe::value::ValueGuard boundsGuard{boundsTag, boundsVal};
+    auto bounds = sbe::value::getArrayView(boundsVal);
+
+    // Create an empty scalar histogram.
+    TypeCounts tc;
+    ScalarHistogram sh(*bounds, buckets);
+    ArrayHistogram ah(sh, tc);
+
+    // Serialize to BSON.
+    auto serializedPath = stats::makeStatsPath("someEmptyPath", numDocs, ah);
+
+    // Parse StatsPath via IDL & serialize to BSON.
+    auto parsedPath = StatsPath::parse(ctx, serializedPath);
+    auto parsedPathToBSON = parsedPath.toBSON();
+
+    // We should end up with the same serialized BSON in the end.
+    ASSERT_BSONOBJ_EQ(serializedPath, parsedPathToBSON);
+}
+
+} // namespace
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/value_utils.cpp b/src/mongo/db/query/stats/value_utils.cpp
new file mode 100644
index 00000000000..5af0f1c248c
--- /dev/null
+++ b/src/mongo/db/query/stats/value_utils.cpp
@@ -0,0 +1,252 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/stats/value_utils.h"
+
+#include "mongo/db/query/stats/scalar_histogram.h"
+
+namespace mongo::stats {
+namespace value = sbe::value;
+
+SBEValue::SBEValue(value::TypeTags tag, value::Value val) : _tag(tag), _val(val) {}
+
+SBEValue::SBEValue(std::pair<value::TypeTags, value::Value> v) : SBEValue(v.first, v.second) {}
+
+SBEValue::SBEValue(const SBEValue& other) {
+    auto [tag, val] = copyValue(other._tag, other._val);
+    _tag = tag;
+    _val = val;
+}
+
+SBEValue::SBEValue(SBEValue&& other) {
+    _tag = other._tag;
+    _val = other._val;
+
+    other._tag = value::TypeTags::Nothing;
+    other._val = 0;
+}
+
+SBEValue::~SBEValue() {
+    value::releaseValue(_tag, _val);
+}
+
+SBEValue& SBEValue::operator=(const SBEValue& other) {
+    value::releaseValue(_tag, _val);
+
+    auto [tag, val] = copyValue(other._tag, other._val);
+    _tag = tag;
+    _val = val;
+    return *this;
+}
+
+SBEValue& SBEValue::operator=(SBEValue&& other) {
+    value::releaseValue(_tag, _val);
+
+    _tag = other._tag;
+    _val = other._val;
+
+    other._tag = value::TypeTags::Nothing;
+    other._val = 0;
+
+    return *this;
+}
+
+std::pair<value::TypeTags, value::Value> SBEValue::get() const {
+    return std::make_pair(_tag, _val);
+}
+
+value::TypeTags SBEValue::getTag() const {
+    return _tag;
+}
+
+value::Value SBEValue::getValue() const {
+    return _val;
+}
+
+std::pair<value::TypeTags, value::Value> makeInt64Value(int v) {
+    return std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v));
+};
+
+std::pair<value::TypeTags, value::Value> makeNullValue() {
+    return std::make_pair(value::TypeTags::Null, 0);
+};
+
+bool sameTypeClass(value::TypeTags tag1, value::TypeTags tag2) {
+    if (tag1 == tag2) {
+        return true;
+    }
+
+    static constexpr const char* kTempFieldName = "temp";
+
+    BSONObjBuilder minb1;
+    minb1.appendMinForType(kTempFieldName, value::tagToType(tag1));
+    const BSONObj min1 = minb1.obj();
+
+    BSONObjBuilder minb2;
+    minb2.appendMinForType(kTempFieldName, value::tagToType(tag2));
+    const BSONObj min2 = minb2.obj();
+
+    return min1.woCompare(min2) == 0;
+}
+
+bool sameTypeBracket(value::TypeTags tag1, value::TypeTags tag2) {
+    if (tag1 == tag2) {
+        return true;
+    }
+    return ((value::isNumber(tag1) && value::isNumber(tag2)) ||
+            (value::isString(tag1) && value::isString(tag2)));
+}
+
+int32_t compareValues(value::TypeTags tag1,
+                      value::Value val1,
+                      value::TypeTags tag2,
+                      value::Value val2) {
+    const auto [compareTag, compareVal] = value::compareValue(tag1, val1, tag2, val2);
+    uassert(6660547, "Invalid comparison result", compareTag == value::TypeTags::NumberInt32);
+    return value::bitcastTo<int32_t>(compareVal);
+}
+
+void sortValueVector(std::vector<SBEValue>& sortVector) {
+    const auto cmp = [](const SBEValue& a, const SBEValue& b) {
+        return compareValues(a.getTag(), a.getValue(), b.getTag(), b.getValue()) < 0;
+    };
+    std::sort(sortVector.begin(), sortVector.end(), cmp);
+}
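To make the ordering contract concrete, a small hypothetical illustration using only the helpers defined in this file; the values are arbitrary:

    // BSON order places Null before the integers, so after sorting the vector
    // is [Null, 7, 42]; compareValues() then confirms 7 < 42.
    std::vector<SBEValue> values;
    values.emplace_back(makeNullValue());
    values.emplace_back(makeInt64Value(42));
    values.emplace_back(makeInt64Value(7));
    sortValueVector(values);
    invariant(compareValues(values[1].getTag(), values[1].getValue(),
                            values[2].getTag(), values[2].getValue()) < 0);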
+
+double valueToDouble(value::TypeTags tag, value::Value val) {
+    double result = 0;
+    if (value::isNumber(tag)) {
+        result = value::numericCast<double>(tag, val);
+    } else if (value::isString(tag)) {
+        const StringData sd = value::getStringView(tag, val);
+
+        // Convert a prefix of the string to a double.
+        const size_t maxPrecision = std::min(sd.size(), sizeof(double));
+        for (size_t i = 0; i < maxPrecision; ++i) {
+            const char ch = sd[i];
+            const double charToDbl = ch / std::pow(2, i * 8);
+            result += charToDbl;
+        }
+    } else if (tag == value::TypeTags::Date || tag == value::TypeTags::Timestamp) {
+        int64_t v = value::bitcastTo<int64_t>(val);
+        result = value::numericCast<double>(value::TypeTags::NumberInt64, v);
+
+    } else if (tag == value::TypeTags::ObjectId) {
+        auto objView =
+            ConstDataView(reinterpret_cast<const char*>(sbe::value::getObjectIdView(val)->data()));
+        // Take the first 8 bytes of the ObjectId.
+        // TODO: consider using the entire ObjectId or other parts of it.
+        // auto v = objView.read<BigEndian<uint64_t>>(sizeof(uint32_t));
+        auto v = objView.read<BigEndian<uint64_t>>();
+        result = value::numericCast<double>(value::TypeTags::NumberInt64, v);
+    } else {
+        uassert(6844500, "Unexpected value type", false);
+    }
+
+    return result;
+}
+
+bool canEstimateTypeViaHistogram(value::TypeTags tag) {
+    if (sbe::value::isNumber(tag) || value::isString(tag)) {
+        return true;
+    }
+
+    switch (tag) {
+        // Other types that we can/do build histograms on:
+        // - Date/time types.
+        case value::TypeTags::Date:
+        case value::TypeTags::Timestamp:
+        // - ObjectId.
+        case value::TypeTags::ObjectId:
+            return true;
+
+        // Types that can only be estimated via the type-counters.
+        case value::TypeTags::Object:
+        case value::TypeTags::Array:
+        case value::TypeTags::Null:
+        case value::TypeTags::Nothing:
+        case value::TypeTags::Boolean:
+            return false;
+
+        // Trying to estimate any other types should result in an error.
+        default:
+            uasserted(7051100,
+                      str::stream()
+                          << "Type " << tag << " is not supported by histogram estimation.");
+    }
+
+    MONGO_UNREACHABLE;
+}
+
+std::string serialize(value::TypeTags tag) {
+    std::ostringstream os;
+    os << tag;
+    return os.str();
+}
+
+// TODO: does this belong in SBE value utils?
+value::TypeTags deserialize(const std::string& name) {
+    if ("NumberInt32" == name) {
+        return value::TypeTags::NumberInt32;
+    } else if ("NumberInt64" == name) {
+        return value::TypeTags::NumberInt64;
+    } else if ("NumberDecimal" == name) {
+        return value::TypeTags::NumberDecimal;
+    } else if ("NumberDouble" == name) {
+        return value::TypeTags::NumberDouble;
+    } else if ("StringBig" == name) {
+        return value::TypeTags::StringBig;
+    } else if ("StringSmall" == name) {
+        return value::TypeTags::StringSmall;
+    } else if ("bsonString" == name) {
+        return value::TypeTags::bsonString;
+    } else if ("Date" == name) {
+        return value::TypeTags::Date;
+    } else if ("Timestamp" == name) {
+        return value::TypeTags::Timestamp;
+    } else if ("ObjectId" == name) {
+        return value::TypeTags::ObjectId;
+    } else if ("Object" == name) {
+        return value::TypeTags::Object;
+    } else if ("Boolean" == name) {
+        return value::TypeTags::Boolean;
+    } else if ("Array" == name) {
+        return value::TypeTags::Array;
+    } else if ("Null" == name) {
+        return value::TypeTags::Null;
+    } else if ("Nothing" == name) {
+        return value::TypeTags::Nothing;
+    }
+
+    // Trying to deserialize any other types should result in an error.
+    uasserted(6660600,
+              str::stream() << "String " << name << " is not convertible to SBE type tag.");
+}
+
+} // namespace mongo::stats
diff --git a/src/mongo/db/query/stats/value_utils.h b/src/mongo/db/query/stats/value_utils.h
new file mode 100644
index 00000000000..d79417ea724
--- /dev/null
+++ b/src/mongo/db/query/stats/value_utils.h
@@ -0,0 +1,120 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/exec/sbe/values/value.h"
+
+namespace mongo::stats {
+/**
+    Container object for SBE value/tag pairs. Supplied values are owned by this object
+    and are released on destruction.
+*/
+class SBEValue {
+public:
+    SBEValue(sbe::value::TypeTags tag, sbe::value::Value val);
+    SBEValue(std::pair<sbe::value::TypeTags, sbe::value::Value> v);
+    ~SBEValue();
+
+    SBEValue(const SBEValue& other);
+    SBEValue(SBEValue&& other);
+
+    SBEValue& operator=(const SBEValue& other);
+    SBEValue& operator=(SBEValue&& other);
+
+    std::pair<sbe::value::TypeTags, sbe::value::Value> get() const;
+    sbe::value::TypeTags getTag() const;
+    sbe::value::Value getValue() const;
+
+private:
+    sbe::value::TypeTags _tag;
+    sbe::value::Value _val;
+};
+
+/**
+    Generate an SBE Value pair that represents the supplied int with
+    type Int64.
+*/
+std::pair<sbe::value::TypeTags, sbe::value::Value> makeInt64Value(int v);
+
+/**
+    Generate an SBE Value pair representing a BSON null value.
+*/
+std::pair<sbe::value::TypeTags, sbe::value::Value> makeNullValue();
+
+/**
+    Do the supplied type tags represent the same BSON type?
+*/
+bool sameTypeClass(sbe::value::TypeTags tag1, sbe::value::TypeTags tag2);
+
+/**
+    Do the supplied type tags represent the same BSON type?
+    TODO: This may be the same as sameTypeClass. @timourk?
+*/
+bool sameTypeBracket(sbe::value::TypeTags tag1, sbe::value::TypeTags tag2);
+
+/**
+    Compare a pair of SBE values.
+
+    The return will be
+        <0 if val1 < val2 in BSON order
+        0 if val1 == val2 in BSON order
+        >0 if val1 > val2 in BSON order
+*/
+int32_t compareValues(sbe::value::TypeTags tag1,
+                      sbe::value::Value val1,
+                      sbe::value::TypeTags tag2,
+                      sbe::value::Value val2);
+
+/**
+    Sort a vector of values in place in BSON order.
+*/
+void sortValueVector(std::vector<SBEValue>& sortVector);
+
+/**
+    Convert a value of any supported type into a double according to some metric. This
+    metric will be consistent with ordering in the type.
+*/
+double valueToDouble(sbe::value::TypeTags tag, sbe::value::Value val);
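A small illustration of the ordering guarantee stated above; the values are hypothetical and only helpers declared in this header are used:

    // valueToDouble is order-preserving within a type class, e.g. for int64s:
    auto [tagA, valA] = makeInt64Value(10);
    auto [tagB, valB] = makeInt64Value(20);
    invariant(valueToDouble(tagA, valA) < valueToDouble(tagB, valB));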
+
+/**
+ * Returns true for types that can be estimated via histograms, and false for types that need type
+ * counters. Any other type results in a uassert.
+ *
+ * NOTE: This should be kept in sync with 'valueToDouble' above.
+ */
+bool canEstimateTypeViaHistogram(sbe::value::TypeTags tag);
+
+/**
+ * Serialize/Deserialize a TypeTag to a string for TypeCount storage in the stats collection.
+ */
+std::string serialize(sbe::value::TypeTags tag);
+sbe::value::TypeTags deserialize(const std::string& name);
+
+} // namespace mongo::stats
-- 
cgit v1.2.1