diff options
Diffstat (limited to 'src/mongo/db/query/ce/histogram.cpp')
-rw-r--r-- | src/mongo/db/query/ce/histogram.cpp | 220 |
1 files changed, 220 insertions, 0 deletions
diff --git a/src/mongo/db/query/ce/histogram.cpp b/src/mongo/db/query/ce/histogram.cpp new file mode 100644 index 00000000000..2359929707d --- /dev/null +++ b/src/mongo/db/query/ce/histogram.cpp @@ -0,0 +1,220 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/histogram.h" + +namespace mongo::ce { + +using namespace sbe; + +Bucket::Bucket( + double equalFreq, double rangeFreq, double cumulativeFreq, double ndv, double cumulativeNDV) + : _equalFreq(equalFreq), + _rangeFreq(rangeFreq), + _cumulativeFreq(cumulativeFreq), + _ndv(ndv), + _cumulativeNDV(cumulativeNDV) { + uassert(6695702, "Invalid equalFreq", _equalFreq >= 0.0); + uassert(6695703, "Invalid rangeFreq", _rangeFreq >= 0.0); + uassert(6695704, "Invalid ndv", _ndv <= _rangeFreq); + uassert(6695705, "Invalid cumulative frequency", _cumulativeFreq >= _equalFreq + _rangeFreq); + uassert(6695706, "Invalid cumulative ndv", _cumulativeNDV >= _ndv + 1.0); +} + +std::string Bucket::toString() const { + std::ostringstream os; + os << "equalFreq: " << _equalFreq << ", rangeFreq: " << _rangeFreq + << ", cumulativeFreq: " << _cumulativeFreq << ", ndv: " << _ndv + << ", cumulativeNDV: " << _cumulativeNDV; + return os.str(); +} + +Histogram::Histogram() : Histogram({}, {}) {} + +Histogram::Histogram(value::Array bounds, std::vector<Bucket> buckets) + : _bounds(std::move(bounds)), _buckets(std::move(buckets)) { + uassert(6695707, "Invalid sizes", bounds.size() == buckets.size()); +} + +std::string Histogram::toString() const { + std::ostringstream os; + os << "["; + for (size_t i = 0; i < _buckets.size(); i++) { + os << "{val: " << _bounds.getAt(i) << ", " << _buckets.at(i).toString() << "}"; + if (_buckets.size() - i > 1) + os << ","; + } + os << "]"; + return os.str(); +} + +std::string Histogram::plot() const { + std::ostringstream os; + double maxFreq = 0; + const double maxBucketSize = 100; + + for (const auto& bucket : _buckets) { + double maxBucketFreq = std::max(bucket._equalFreq, bucket._rangeFreq); + maxFreq = std::max(maxFreq, maxBucketFreq); + } + + std::vector<std::pair<double, std::string>> headers; + size_t maxHeaderSize = 0; + for (size_t i = 0; i < _buckets.size(); ++i) { + std::ostringstream rngHeader; + std::ostringstream eqlHeader; + double scaledRngF = maxBucketSize * _buckets[i]._rangeFreq / maxFreq; + double scaledEqlF = maxBucketSize * _buckets[i]._equalFreq / maxFreq; + rngHeader << _bounds.getAt(i) << ": " << _buckets[i]._rangeFreq; + eqlHeader << _bounds.getAt(i) << ": " << _buckets[i]._equalFreq; + auto rngStr = rngHeader.str(); + maxHeaderSize = std::max(maxHeaderSize, rngStr.size()); + headers.emplace_back(scaledRngF, rngStr); + auto eqlStr = eqlHeader.str(); + maxHeaderSize = std::max(maxHeaderSize, eqlStr.size()); + headers.emplace_back(scaledEqlF, eqlStr); + } + + const std::string maxLine(maxBucketSize + maxHeaderSize + 3, '-'); + os << maxLine << "\n"; + for (size_t j = 0; j < headers.size(); ++j) { + auto header = headers.at(j); + header.second.resize(maxHeaderSize, ' '); + const std::string bar(std::round(header.first), '*'); + os << header.second << " | " << bar << "\n"; + } + os << maxLine << "\n"; + + return os.str(); +} + +EstimationResult Histogram::getTotals() const { + if (_buckets.empty()) { + return {0.0, 0.0}; + } + + const Bucket& last = _buckets.back(); + return {last._cumulativeFreq, last._cumulativeNDV}; +} + +EstimationResult Histogram::estimate(value::TypeTags tag, + value::Value val, + EstimationType type) const { + switch (type) { + case EstimationType::kGreater: + return getTotals() - estimate(tag, val, EstimationType::kLessOrEqual); + + case EstimationType::kGreaterOrEqual: + return getTotals() - estimate(tag, val, EstimationType::kLess); + + default: + // Continue. + break; + } + + size_t bucketIndex = 0; + { + size_t len = _buckets.size(); + while (len > 0) { + const size_t half = len >> 1; + const auto [boundTag, boundVal] = _bounds.getAt(bucketIndex + half); + + if (compareValues3w(boundTag, boundVal, tag, val) < 0) { + bucketIndex += half + 1; + len -= half + 1; + } else { + len = half; + } + } + } + if (bucketIndex == _buckets.size()) { + // Value beyond the largest endpoint. + switch (type) { + case EstimationType::kEqual: + return {0.0, 0.0}; + + case EstimationType::kLess: + case EstimationType::kLessOrEqual: + return getTotals(); + + default: + MONGO_UNREACHABLE; + } + } + + const Bucket& bucket = _buckets.at(bucketIndex); + const auto [boundTag, boundVal] = _bounds.getAt(bucketIndex); + const bool isEndpoint = compareValues3w(boundTag, boundVal, tag, val) == 0; + + switch (type) { + case EstimationType::kEqual: { + if (isEndpoint) { + return {bucket._equalFreq, 1.0}; + } + return {(bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv, 1.0}; + } + + case EstimationType::kLess: { + double resultCard = bucket._cumulativeFreq - bucket._equalFreq; + double resultNDV = bucket._cumulativeNDV - 1.0; + + if (!isEndpoint) { + // TODO: consider value interpolation instead of assigning 50% of the weight. + resultCard -= bucket._rangeFreq / 2.0; + resultNDV -= bucket._ndv / 2.0; + } + return {resultCard, resultNDV}; + } + + case EstimationType::kLessOrEqual: { + double resultCard = bucket._cumulativeFreq; + double resultNDV = bucket._cumulativeNDV; + + if (!isEndpoint) { + // TODO: consider value interpolation instead of assigning 50% of the weight. + resultCard -= bucket._equalFreq + bucket._rangeFreq / 2.0; + resultNDV -= 1.0 + bucket._ndv / 2.0; + } + return {resultCard, resultNDV}; + } + + default: + MONGO_UNREACHABLE; + } +} + +const value::Array& Histogram::getBounds() const { + return _bounds; +} + +const std::vector<Bucket>& Histogram::getBuckets() const { + return _buckets; +} + +} // namespace mongo::ce |