/**
 *    Copyright (C) 2022-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/db/query/ce/histogram_estimation.h"

#include "mongo/db/exec/sbe/abt/abt_lower.h"
#include "mongo/db/query/ce/value_utils.h"
#include "mongo/db/query/optimizer/syntax/expr.h"

namespace mongo::ce {
using namespace sbe;

EstimationResult getTotals(const ScalarHistogram& h) {
    if (h.empty()) {
        return {0.0, 0.0};
    }

    const Bucket& last = h.getBuckets().back();
    return {last._cumulativeFreq, last._cumulativeNDV};
}

/**
 * Helper function that uses linear interpolation to estimate the cardinality and NDV for a value
 * that falls inside of a histogram bucket.
 */
EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
                                             value::TypeTags tag,
                                             value::Value val,
                                             EstimationType type,
                                             size_t bucketIndex) {

    const Bucket& bucket = h.getBuckets().at(bucketIndex);
    const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex);

    double resultCard = bucket._cumulativeFreq - bucket._equalFreq - bucket._rangeFreq;
    double resultNDV = bucket._cumulativeNDV - bucket._ndv - 1.0;

    // Check if the estimate is at the point where the type brackets switch. If the current bucket
    // is the first bucket of a new type bracket and the value is of another type, estimate the
    // cardinality from the current bucket as 0.
    //
    // For example, let bound 1 = 1000, bound 2 = "abc". The value 100000000 falls in bucket 2, the
    // first bucket for strings, but should not get a cardinality/NDV fraction from it.
    if (!sameTypeBracket(tag, boundTag)) {
        if (type == EstimationType::kEqual) {
            return {0.0, 0.0};
        } else {
            return {resultCard, resultNDV};
        }
    }

    // Estimate for equality frequency inside of the bucket.
    const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv;

    if (type == EstimationType::kEqual) {
        return {innerEqFreq, 1.0};
    }

    // For $lt and $lte operations use linear interpolation to take a fraction of the bucket
    // cardinality and NDV if there is a preceding bucket with a bound of the same type. Otherwise,
    // use half of the bucket estimates.
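    // A hedged worked example with hypothetical numbers: if the preceding bound is 10, the
    // current bound is 20, and the query value is 15, then ratio = (15 - 10) / (20 - 10) = 0.5,
    // so half of the bucket's range frequency and NDV are attributed to values below the query
    // value.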
    double ratio = 0.5;
    if (bucketIndex > 0) {
        const auto [lowBoundTag, lowBoundVal] = h.getBounds().getAt(bucketIndex - 1);
        if (sameTypeBracket(lowBoundTag, boundTag)) {
            double doubleLowBound = valueToDouble(lowBoundTag, lowBoundVal);
            double doubleUpperBound = valueToDouble(boundTag, boundVal);
            double doubleVal = valueToDouble(tag, val);
            ratio = (doubleVal - doubleLowBound) / (doubleUpperBound - doubleLowBound);
        }
    }

    resultCard += bucket._rangeFreq * ratio;
    resultNDV += bucket._ndv * ratio;

    if (type == EstimationType::kLess) {
        // Subtract from the estimate the cardinality and NDV corresponding to the equality
        // operation.
        const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0;
        resultCard -= innerEqFreq;
        resultNDV -= innerEqNdv;
    }

    return {resultCard, resultNDV};
}

EstimationResult estimate(const ScalarHistogram& h,
                          value::TypeTags tag,
                          value::Value val,
                          EstimationType type) {
    switch (type) {
        case EstimationType::kGreater:
            return getTotals(h) - estimate(h, tag, val, EstimationType::kLessOrEqual);

        case EstimationType::kGreaterOrEqual:
            return getTotals(h) - estimate(h, tag, val, EstimationType::kLess);

        default:
            // Continue.
            break;
    }

    // Binary search for the first bucket whose bound is not less than the query value.
    size_t bucketIndex = 0;
    {
        size_t len = h.getBuckets().size();
        while (len > 0) {
            const size_t half = len >> 1;
            const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex + half);

            if (compareValues(boundTag, boundVal, tag, val) < 0) {
                bucketIndex += half + 1;
                len -= half + 1;
            } else {
                len = half;
            }
        }
    }
    if (bucketIndex == h.getBuckets().size()) {
        // Value beyond the largest endpoint.
        switch (type) {
            case EstimationType::kEqual:
                return {0.0, 0.0};

            case EstimationType::kLess:
            case EstimationType::kLessOrEqual:
                return getTotals(h);

            default:
                MONGO_UNREACHABLE;
        }
    }

    const Bucket& bucket = h.getBuckets().at(bucketIndex);
    const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex);
    const bool isEndpoint = compareValues(boundTag, boundVal, tag, val) == 0;

    if (isEndpoint) {
        switch (type) {
            case EstimationType::kEqual: {
                return {bucket._equalFreq, 1.0};
            }

            case EstimationType::kLess: {
                double resultCard = bucket._cumulativeFreq - bucket._equalFreq;
                double resultNDV = bucket._cumulativeNDV - 1.0;
                return {resultCard, resultNDV};
            }

            case EstimationType::kLessOrEqual: {
                double resultCard = bucket._cumulativeFreq;
                double resultNDV = bucket._cumulativeNDV;
                return {resultCard, resultNDV};
            }

            default:
                MONGO_UNREACHABLE;
        }
    } else {
        return interpolateEstimateInBucket(h, tag, val, type, bucketIndex);
    }
}

double estimateCardEq(const ArrayHistogram& ah,
                      value::TypeTags tag,
                      value::Value val,
                      bool includeScalar) {
    if (tag != value::TypeTags::Null) {
        double card = 0.0;
        if (includeScalar) {
            card = estimate(ah.getScalar(), tag, val, EstimationType::kEqual).card;
        }
        if (ah.isArray()) {
            card += estimate(ah.getArrayUnique(), tag, val, EstimationType::kEqual).card;
        }
        return card;
    } else {
        // Predicate: {field: null}
        // Count the values that are either null or that do not contain the field.
        // TODO:
        // This prototype doesn't have the concept of missing values. It can be added easily
        // by adding a cardinality estimate that is >= the number of values.
        // Estimation of $exists can be built on top of this estimate:
        // {$exists: true} matches the documents that contain the field, including those where the
        // field value is null.
        // {$exists: false} matches only the documents that do not contain the field.
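        // A hedged illustration with hypothetical counts: if getTypeCounts() maps
        // value::TypeTags::Null to 7, the predicate {field: null} is estimated at 7 matches.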
        auto findNull = ah.getTypeCounts().find(value::TypeTags::Null);
        if (findNull != ah.getTypeCounts().end()) {
            return findNull->second;
        }
        return 0.0;
    }
}

static EstimationResult estimateRange(const ScalarHistogram& histogram,
                                      bool lowInclusive,
                                      value::TypeTags tagLow,
                                      value::Value valLow,
                                      bool highInclusive,
                                      value::TypeTags tagHigh,
                                      value::Value valHigh) {
    const EstimationType highType =
        highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess;
    const EstimationResult highEstimate = estimate(histogram, tagHigh, valHigh, highType);

    const EstimationType lowType =
        lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual;
    const EstimationResult lowEstimate = estimate(histogram, tagLow, valLow, lowType);

    return highEstimate - lowEstimate;
}

/**
 * Compute an estimate for a range query on array data with the formula:
 * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow))
 */
static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin,
                                                  const ScalarHistogram& histogramAmax,
                                                  bool lowInclusive,
                                                  value::TypeTags tagLow,
                                                  value::Value valLow,
                                                  bool highInclusive,
                                                  value::TypeTags tagHigh,
                                                  value::Value valHigh) {
    const EstimationType highType =
        highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess;
    const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType);

    const EstimationType lowType =
        lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual;
    const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType);

    return highEstimate - lowEstimate;
}

double estimateCardRange(const ArrayHistogram& ah,
                         /* Define lower bound. */
                         bool lowInclusive,
                         value::TypeTags tagLow,
                         value::Value valLow,
                         /* Define upper bound. */
                         bool highInclusive,
                         value::TypeTags tagHigh,
                         value::Value valHigh,
                         bool includeScalar,
                         EstimationAlgo estimationAlgo) {
    uassert(6695701,
            "Low bound must not be higher than high",
            compareValues(tagLow, valLow, tagHigh, valHigh) <= 0);

    // Helper lambda to shorten code for legibility.
    auto estRange = [&](const ScalarHistogram& h) {
        return estimateRange(h, lowInclusive, tagLow, valLow, highInclusive, tagHigh, valHigh);
    };

    double result = 0.0;
    if (ah.isArray()) {
        if (includeScalar) {
            // Range query on array data.
            const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(),
                                                                                ah.getArrayMax(),
                                                                                lowInclusive,
                                                                                tagLow,
                                                                                valLow,
                                                                                highInclusive,
                                                                                tagHigh,
                                                                                valHigh);
            result += rangeCardOnArray.card;
        } else {
            // $elemMatch query on array data.
            const auto arrayMinEst = estRange(ah.getArrayMin());
            const auto arrayMaxEst = estRange(ah.getArrayMax());
            const auto arrayUniqueEst = estRange(ah.getArrayUnique());

            const double totalArrayCount = getTotals(ah.getArrayMin()).card;
            uassert(6715101,
                    "Array histograms should contain at least one array",
                    totalArrayCount > 0);
            switch (estimationAlgo) {
                case EstimationAlgo::HistogramV1: {
                    const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0)
                        ? 0.0
                        : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv));
                    result =
                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity);
                    break;
                }
                case EstimationAlgo::HistogramV2: {
                    const double avgArraySize =
                        getTotals(ah.getArrayUnique()).card / totalArrayCount;
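                    // A hedged note on the damping below: dividing by avgArraySize^0.2 shrinks
                    // the unique-values estimate slowly as arrays grow; e.g. an average array
                    // size of 32 halves the estimate (32^0.2 = 2). The result is capped at
                    // totalArrayCount, since $elemMatch matches each array at most once.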
                    const double adjustedUniqueCard = (avgArraySize == 0.0)
                        ? 0.0
                        : std::min(arrayUniqueEst.card / std::pow(avgArraySize, 0.2),
                                   totalArrayCount);
                    result =
                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
                    break;
                }
                case EstimationAlgo::HistogramV3: {
                    const double adjustedUniqueCard =
                        0.85 * std::min(arrayUniqueEst.card, totalArrayCount);
                    result =
                        std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
                    break;
                }
                default:
                    MONGO_UNREACHABLE;
            }
        }
    }

    if (includeScalar) {
        const auto scalarEst = estRange(ah.getScalar());
        result += scalarEst.card;
    }

    return result;
}

double estimateIntervalCardinality(const ce::ArrayHistogram& ah,
                                   const optimizer::IntervalRequirement& interval,
                                   optimizer::CEType childResult,
                                   bool includeScalar) {
    auto getBound = [](const optimizer::BoundRequirement& boundReq) {
        // Assumes the bound expression is a constant; casting to optimizer::Constant yields its
        // {tag, value} pair.
        return boundReq.getBound().cast<optimizer::Constant>()->get();
    };

    if (interval.isFullyOpen()) {
        return childResult;
    } else if (interval.isEquality()) {
        auto [tag, val] = getBound(interval.getLowBound());
        return estimateCardEq(ah, tag, val, includeScalar);
    }

    // Otherwise, we have a range.
    auto lowBound = interval.getLowBound();
    auto [lowTag, lowVal] = getBound(lowBound);

    auto highBound = interval.getHighBound();
    auto [highTag, highVal] = getBound(highBound);

    return estimateCardRange(ah,
                             lowBound.isInclusive(),
                             lowTag,
                             lowVal,
                             highBound.isInclusive(),
                             highTag,
                             highVal,
                             includeScalar);
}

}  // namespace mongo::ce
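// A minimal usage sketch (hypothetical histogram and values, not part of this file): for a closed
// interval [5, 10] over an int64 field, estimateIntervalCardinality() above dispatches to
//
//     estimateCardRange(ah,
//                       true /*lowInclusive*/, value::TypeTags::NumberInt64, valLow /*5*/,
//                       true /*highInclusive*/, value::TypeTags::NumberInt64, valHigh /*10*/,
//                       true /*includeScalar*/);
//
// which, on the scalar histogram, evaluates estimate(..., 10, kLessOrEqual) minus
// estimate(..., 5, kLess): the cumulative mass at 10 inclusive less the mass strictly below 5,
// i.e. the estimated frequency of [5, 10].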