diff options
Diffstat (limited to 'src/mongo/db/query/ce/histogram_estimation.cpp')
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.cpp | 85 |
1 files changed, 75 insertions, 10 deletions
diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp index 948729ee1b4..4fd2d8376db 100644 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ b/src/mongo/db/query/ce/histogram_estimation.cpp @@ -266,6 +266,29 @@ static EstimationResult estimateRange(const ScalarHistogram& histogram, return highEstimate - lowEstimate; } +/** + * Compute an estimate for range query on array data with formula: + * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow)) + */ +static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin, + const ScalarHistogram& histogramAmax, + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + const EstimationType highType = + highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess; + const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType); + + const EstimationType lowType = + lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual; + const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType); + + return highEstimate - lowEstimate; +} + double estimateCardRange(const ArrayHistogram& ah, bool includeScalar, /* Define lower bound. */ @@ -275,7 +298,8 @@ double estimateCardRange(const ArrayHistogram& ah, /* Define upper bound. */ bool highInclusive, value::TypeTags tagHigh, - value::Value valHigh) { + value::Value valHigh, + EstimationAlgo estimationAlgo) { uassert(6695701, "Low bound must not be higher than high", compareValues3w(tagLow, valLow, tagHigh, valHigh) <= 0); @@ -287,16 +311,57 @@ double estimateCardRange(const ArrayHistogram& ah, double result = 0.0; if (ah.isArray()) { - const auto arrayMinEst = estRange(ah.getArrayMin()); - const auto arrayMaxEst = estRange(ah.getArrayMax()); - const auto arrayUniqueEst = estRange(ah.getArrayUnique()); - - // TODO: should we consider diving by sqrt(ndv) or just by ndv? - const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0) - ? 0.0 - : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv)); - result = std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity); + if (includeScalar) { + // Range query on array data. + const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(), + ah.getArrayMax(), + lowInclusive, + tagLow, + valLow, + highInclusive, + tagHigh, + valHigh); + result += rangeCardOnArray.card; + } else { + // $elemMatch query on array data. + const auto arrayMinEst = estRange(ah.getArrayMin()); + const auto arrayMaxEst = estRange(ah.getArrayMax()); + const auto arrayUniqueEst = estRange(ah.getArrayUnique()); + + const double totalArrayCount = getTotals(ah.getArrayMin()).card; + uassert( + 6715101, "Array histograms should contain at least one array", totalArrayCount > 0); + switch (estimationAlgo) { + case EstimationAlgo::HistogramV1: { + const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0) + ? 0.0 + : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv)); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity); + break; + } + case EstimationAlgo::HistogramV2: { + const double avgArraySize = + getTotals(ah.getArrayUnique()).card / totalArrayCount; + const double adjustedUniqueCard = (avgArraySize == 0.0) + ? 0.0 + : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); + break; + } + case EstimationAlgo::HistogramV3: { + const double adjustedUniqueCard = + 0.85 * std::min(arrayUniqueEst.card, totalArrayCount); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); + break; + } + default: + MONGO_UNREACHABLE; + } + } } if (includeScalar) { |