summaryrefslogtreecommitdiff
path: root/src/mongo/db/exec/geo_near.cpp
diff options
context:
space:
mode:
authorSiyuan Zhou <siyuan.zhou@mongodb.com>2014-10-07 15:59:32 -0400
committerSiyuan Zhou <siyuan.zhou@mongodb.com>2014-10-28 18:19:32 -0400
commit385f03dc7205ef60bbb9cb8b475afd9c802bc67d (patch)
treefcbfbc4b1e0bb44d751f43c5536187fe1f803d30 /src/mongo/db/exec/geo_near.cpp
parent33d59e5d8cded2e0eab2b9e370727eac5e4d020c (diff)
downloadmongo-385f03dc7205ef60bbb9cb8b475afd9c802bc67d.tar.gz
SERVER-15562 Estimate density before near search
Diffstat (limited to 'src/mongo/db/exec/geo_near.cpp')
-rw-r--r--src/mongo/db/exec/geo_near.cpp351
1 files changed, 307 insertions, 44 deletions
diff --git a/src/mongo/db/exec/geo_near.cpp b/src/mongo/db/exec/geo_near.cpp
index 32059002265..60f67bc08db 100644
--- a/src/mongo/db/exec/geo_near.cpp
+++ b/src/mongo/db/exec/geo_near.cpp
@@ -42,8 +42,11 @@
#include "mongo/db/matcher/expression.h"
#include "mongo/db/query/expression_index.h"
#include "mongo/db/query/expression_index_knobs.h"
+#include "mongo/db/index/expression_params.h"
#include "mongo/util/log.h"
+#include <algorithm>
+
namespace mongo {
//
@@ -267,18 +270,147 @@ namespace mongo {
return fullBounds;
}
- static double twoDBoundsIncrement(IndexDescriptor* twoDIndex, const GeoNearParams& nearParams) {
- if (FLAT == nearParams.nearQuery->centroid->crs) {
- GeoHashConverter::Parameters hashParams;
- Status status = GeoHashConverter::parseParameters(twoDIndex->infoObj(), &hashParams);
- invariant(status.isOK()); // The index status should always be valid
+ class GeoNear2DStage::DensityEstimator {
+ public:
+ DensityEstimator(GeoNear2DStage* stage) : _stage(stage) { }
+ PlanStage::StageState work(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection,
+ double* estimatedDistance);
+
+ private:
+ void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection);
+
+ const GeoNear2DStage* _stage; // Not owned here.
+ scoped_ptr<IndexScan> _indexScan;
+ scoped_ptr<GeoHashConverter> _converter;
+ GeoHash _centroidCell;
+ };
+
+ // Initialize the internal states
+ void GeoNear2DStage::DensityEstimator::buildIndexScan(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection)
+ {
+ // Estimate density with 2d index.
+ GeoHashConverter::Parameters hashParams;
+ Status status = GeoHashConverter::parseParameters(_stage->_twoDIndex->infoObj(),
+ &hashParams);
+ // The index status should always be valid.
+ invariant(status.isOK());
- GeoHashConverter converter(hashParams);
- return 5 * converter.sizeEdge(hashParams.bits);
+ _converter.reset(new GeoHashConverter(hashParams));
+ _centroidCell = _converter->hash(_stage->_nearParams.nearQuery->centroid->oldPoint);
+
+ // Setup the stages for this interval
+
+ // Build index bound [hash(centerCell), minKey] in reverse order
+ mongo::BSONObjBuilder builder;
+ _centroidCell.appendHashMax(&builder, "");
+ builder.appendMinForType("", mongo::BinData);
+ OrderedIntervalList oil;
+ oil.intervals.push_back(IndexBoundsBuilder::makeRangeInterval(builder.obj(),
+ true,
+ true));
+ IndexScanParams scanParams;
+ scanParams.descriptor = _stage->_twoDIndex;
+ scanParams.direction = -1;
+ scanParams.doNotDedup = true;
+
+ // Scan bounds on 2D indexes are only over the 2D field - other bounds aren't applicable.
+ // This is handled in query planning.
+ scanParams.bounds = _stage->_nearParams.baseBounds;
+
+ // The "2d" field is always the first in the index
+ const string twoDFieldName = _stage->_nearParams.nearQuery->field;
+ const int twoDFieldPosition = 0;
+
+ oil.name = scanParams.bounds.fields[twoDFieldPosition].name;
+
+ // Intersect the $near bounds we just generated into the bounds we have for anything else
+ // in the scan (i.e. $within)
+ IndexBoundsBuilder::intersectize(oil,
+ &scanParams.bounds.fields[twoDFieldPosition]);
+
+ _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL));
+ }
+
+ // Return IS_EOF is we find a document in it's ancestor cells and set estimated distance
+ // from the nearest document.
+ PlanStage::StageState GeoNear2DStage::DensityEstimator::work(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection,
+ double* estimatedDistance)
+ {
+ if (!_indexScan) {
+ buildIndexScan(txn, workingSet, collection);
}
- else {
- return kMaxEarthDistanceInMeters / 1000.0;
+
+ // Fetch the fist document
+ WorkingSetID workingSetID;
+ PlanStage::StageState stageState = _indexScan->work(&workingSetID);
+
+ if (stageState == PlanStage::ADVANCED) {
+ // Found a document in ancestors of center cell. Extract its key and calculate
+ // the distance from it.
+ IndexKeyDatum datum = workingSet->get(workingSetID)->keyData[0];
+ BSONElement keyElt = datum.keyData.firstElement();
+ invariant(BinData == keyElt.type());
+ GeoHash previousKey = _converter->hash(keyElt);
+ GeoHash commonPrefix = _centroidCell.commonPrefix(previousKey);
+
+ //
+ // +---+---+
+ // | | A |
+ // +---+---+
+ // | | B |
+ // +---+---+
+ //
+ // Say A is the the centroid cell of (0100)11
+ // and B is the the previous cell of (0100)01
+ // The common prefix (0100) gives the full square in the figure above.
+ // At the scale of cell size at commonPrefix's level, we found a document.
+ //
+ // TODO: make a threshold for the coarsest level.
+ *estimatedDistance = _converter->sizeEdge(commonPrefix.getBits());
+ return PlanStage::IS_EOF;
+
+ } else if (stageState == PlanStage::IS_EOF) {
+ // Found nothing. Return the distance proportional to the finest cell size by default.
+ *estimatedDistance = 2 * _converter->sizeEdge(_centroidCell.getBits());
+ return PlanStage::IS_EOF;
}
+
+ // Propagate NEED_TIME or errors
+ return stageState;
+ }
+
+ PlanStage::StageState GeoNear2DStage::initialize(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection)
+ {
+ if (SPHERE == _nearParams.nearQuery->centroid->crs) {
+ _boundsIncrement = kMaxEarthDistanceInMeters / 1000.0;
+ return PlanStage::IS_EOF;
+ }
+
+ if (!_densityEstimator) {
+ _densityEstimator.reset(new DensityEstimator(this));
+ }
+
+ double estimatedDistance;
+ PlanStage::StageState state = _densityEstimator->work(txn, workingSet, collection, &estimatedDistance);
+
+ if (state == PlanStage::IS_EOF) {
+ // Estimator finished its work, we need to finish initialization too.
+ _boundsIncrement = 5 * estimatedDistance;
+ invariant(_boundsIncrement > 0.0);
+
+ // Clean up
+ _densityEstimator.reset(NULL);
+ }
+
+ return state;
}
static const string kTwoDIndexNearStage("GEO_NEAR_2D");
@@ -297,7 +429,7 @@ namespace mongo {
_twoDIndex(twoDIndex),
_fullBounds(twoDDistanceBounds(nearParams, twoDIndex)),
_currBounds(_fullBounds.center(), -1, _fullBounds.getInner()),
- _boundsIncrement(twoDBoundsIncrement(twoDIndex, nearParams)) {
+ _boundsIncrement(0.0) {
getNearStats()->keyPattern = twoDIndex->keyPattern();
}
@@ -620,7 +752,7 @@ namespace mongo {
// in the scan (i.e. $within)
IndexBoundsBuilder::intersectize(coveredIntervals,
&scanParams.bounds.fields[twoDFieldPosition]);
-
+
// These parameters are stored by the index, and so must be ok
GeoHashConverter::Parameters hashParams;
GeoHashConverter::parseParameters(_twoDIndex->infoObj(), &hashParams);
@@ -641,19 +773,19 @@ namespace mongo {
// IndexScanWithMatch owns the matcher
IndexScan* scan = new IndexScanWithMatch(txn, scanParams, workingSet, keyMatcher);
-
+
MatchExpression* docMatcher = NULL;
-
+
// FLAT searches need to add an additional annulus $within matcher, see above
if (FLAT == queryCRS) {
docMatcher = new TwoDPtInAnnulusExpression(_fullBounds, twoDFieldName);
}
-
+
// FetchStage owns index scan
FetchStage* fetcher(new FetchStageWithMatch(txn,
- workingSet,
- scan,
- docMatcher,
+ workingSet,
+ scan,
+ docMatcher,
collection));
return StatusWith<CoveredInterval*>(new CoveredInterval(fetcher,
@@ -671,23 +803,22 @@ namespace mongo {
// GeoNear2DSphereStage
//
- static double twoDSphereBoundsIncrement(const IndexDescriptor* s2Index) {
+ static int getFieldPosition(const IndexDescriptor* index, const string& fieldName) {
- // The user can override this so we honor it. We could ignore it though -- it's just used
- // to set _radiusIncrement, not to do any covering.
- // TODO: Make this parsed somewhere else
- int finestIndexedLevel;
- BSONElement finestLevelEl = s2Index->infoObj()["finestIndexedLevel"];
- if (finestLevelEl.isNumber()) {
- finestIndexedLevel = finestLevelEl.numberInt();
- }
- else {
- finestIndexedLevel = S2::kAvgEdge.GetClosestLevel(500.0 / kRadiusOfEarthInMeters);
+ int fieldPosition = 0;
+
+ BSONObjIterator specIt(index->keyPattern());
+ while (specIt.more()) {
+ if (specIt.next().fieldName() == fieldName) {
+ break;
+ }
+ ++fieldPosition;
}
- // Start with a conservative bounds increment. When we're done searching a shell we
- // increment the two radii by this.
- return 5 * S2::kAvgEdge.GetValue(finestIndexedLevel) * kRadiusOfEarthInMeters;
+ if (fieldPosition == index->keyPattern().nFields())
+ return -1;
+
+ return fieldPosition;
}
static const string kS2IndexNearStage("GEO_NEAR_2DSPHERE");
@@ -706,7 +837,7 @@ namespace mongo {
_s2Index(s2Index),
_fullBounds(geoNearDistanceBounds(*nearParams.nearQuery)),
_currBounds(_fullBounds.center(), -1, _fullBounds.getInner()),
- _boundsIncrement(twoDSphereBoundsIncrement(s2Index)) {
+ _boundsIncrement(0.0) {
getNearStats()->keyPattern = s2Index->keyPattern();
}
@@ -800,22 +931,152 @@ namespace mongo {
};
}
- static int getFieldPosition(const IndexDescriptor* index, const string& fieldName) {
+ // Estimate the density of data by search the nearest cells level by level around center.
+ class GeoNear2DSphereStage::DensityEstimator {
+ public:
+ DensityEstimator(const IndexDescriptor* s2Index, const GeoNearParams* nearParams) :
+ _s2Index(s2Index), _nearParams(nearParams), _currentLevel(0)
+ {
+ S2IndexingParams params;
+ ExpressionParams::parse2dsphereParams(_s2Index->infoObj(), &params);
+ // Since cellId.AppendVertexNeighbors(level, output) requires level > cellId.level(),
+ // we have to start to find documents at most S2::kMaxCellLevel - 1. Thus the finest
+ // search area is 16 * finest cell area at S2::kMaxCellLevel, which is less than
+ // (1.4 inch X 1.4 inch) on the earth.
+ _currentLevel = std::max(0,
+ std::min(S2::kMaxCellLevel - 1,
+ params.finestIndexedLevel - 1));
+ }
- int fieldPosition = 0;
+ // Search for a document in neighbors at current level.
+ // Return IS_EOF is such document exists and set the estimated distance to the nearest doc.
+ PlanStage::StageState work(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection,
+ double* estimatedDistance);
+
+ private:
+ void buildIndexScan(OperationContext* txn, WorkingSet* workingSet, Collection* collection);
+
+ const IndexDescriptor* _s2Index; // Not owned here.
+ const GeoNearParams* _nearParams; // Not owned here.
+ int _currentLevel;
+ scoped_ptr<IndexScan> _indexScan;
+ };
+
+ // Setup the index scan stage for neighbors at this level.
+ void GeoNear2DSphereStage::DensityEstimator::buildIndexScan(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection)
+ {
+ IndexScanParams scanParams;
+ scanParams.descriptor = _s2Index;
+ scanParams.direction = 1;
+ scanParams.doNotDedup = true;
+ scanParams.bounds = _nearParams->baseBounds;
- BSONObjIterator specIt(index->keyPattern());
- while (specIt.more()) {
- if (specIt.next().fieldName() == fieldName) {
- break;
+ // Because the planner doesn't yet set up 2D index bounds, do it ourselves here
+ const string s2Field = _nearParams->nearQuery->field;
+ const int s2FieldPosition = getFieldPosition(_s2Index, s2Field);
+ OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition];
+ coveredIntervals->intervals.clear();
+
+ // Find 4 neighbors (3 neighbors at face vertex) at current level.
+ const S2CellId& centerId = _nearParams->nearQuery->centroid->cell.id();
+ vector<S2CellId> neighbors;
+
+ // The search area expands 4X each time.
+ // Return the neighbors of closest vertex to this cell at the given level.
+ invariant(_currentLevel < centerId.level());
+ centerId.AppendVertexNeighbors(_currentLevel, &neighbors);
+
+ // Convert S2CellId to string and sort
+ vector<string> neighborKeys;
+ for (vector<S2CellId>::const_iterator it = neighbors.begin(); it != neighbors.end(); it++) {
+ neighborKeys.push_back(it->toString());
+ }
+ std::sort(neighborKeys.begin(), neighborKeys.end());
+
+ for (vector<string>::const_iterator it = neighborKeys.begin(); it != neighborKeys.end();
+ it++)
+ {
+ // construct interval [*it, end) for this cell.
+ std::string end = *it;
+ end[end.size() - 1]++;
+ coveredIntervals->intervals.push_back(
+ IndexBoundsBuilder::makeRangeInterval(*it, end, true, false));
+ }
+
+ invariant(coveredIntervals->isValidFor(1));
+
+ // Index scan
+ _indexScan.reset(new IndexScan(txn, scanParams, workingSet, NULL));
+ }
+
+ PlanStage::StageState GeoNear2DSphereStage::DensityEstimator::work(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection,
+ double* estimatedDistance)
+ {
+ if (!_indexScan) {
+ // Setup index scan stage for current level.
+ buildIndexScan(txn, workingSet, collection);
+ }
+
+ WorkingSetID workingSetID;
+ PlanStage::StageState state = _indexScan->work(&workingSetID);
+
+ if (state == PlanStage::IS_EOF) {
+ // We ran through the neighbors but found nothing.
+ if (_currentLevel > 0) {
+ // Advance to the next level and search again.
+ _currentLevel--;
+ // Reset index scan for the next level.
+ _indexScan.reset(NULL);
+ return PlanStage::NEED_TIME;
}
- ++fieldPosition;
+
+ // We are already at the top level.
+ *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters;
+ return PlanStage::IS_EOF;
+ } else if (state == PlanStage::ADVANCED) {
+ // We found something!
+ *estimatedDistance = S2::kAvgEdge.GetValue(_currentLevel) * kRadiusOfEarthInMeters;
+ return PlanStage::IS_EOF;
}
- if (fieldPosition == index->keyPattern().nFields())
- return -1;
+ // Propagate NEED_TIME or errors
+ return state;
+ }
- return fieldPosition;
+
+ PlanStage::StageState GeoNear2DSphereStage::initialize(OperationContext* txn,
+ WorkingSet* workingSet,
+ Collection* collection)
+ {
+ if (!_densityEstimator) {
+ _densityEstimator.reset(new DensityEstimator(_s2Index, &_nearParams));
+ }
+
+ double estimatedDistance;
+ PlanStage::StageState state = _densityEstimator->work(txn, workingSet, collection, &estimatedDistance);
+
+ if (state == IS_EOF) {
+ // We find a document in 4 neighbors at current level, but didn't at previous level.
+ //
+ // Assuming cell size at current level is d and data is even distributed, the distance
+ // between two nearest points are at least d. The following circle with radius of 3 * d
+ // covers PI * 9 * d^2, giving at most 30 documents.
+ //
+ // At the coarsest level, the search area is the whole earth.
+ _boundsIncrement = 3 * estimatedDistance;
+ invariant(_boundsIncrement > 0.0);
+
+ // Clean up
+ _densityEstimator.reset(NULL);
+ }
+
+ return state;
}
StatusWith<NearStage::CoveredInterval*> //
@@ -845,11 +1106,13 @@ namespace mongo {
_boundsIncrement /= 2;
}
+ invariant(_boundsIncrement > 0.0);
+
R2Annulus nextBounds(_currBounds.center(),
_currBounds.getOuter(),
min(_currBounds.getOuter() + _boundsIncrement,
_fullBounds.getOuter()));
-
+
bool isLastInterval = (nextBounds.getOuter() == _fullBounds.getOuter());
_currBounds = nextBounds;
@@ -874,7 +1137,7 @@ namespace mongo {
scanParams.bounds.fields[s2FieldPosition].intervals.clear();
OrderedIntervalList* coveredIntervals = &scanParams.bounds.fields[s2FieldPosition];
- TwoDSphereKeyInRegionExpression* keyMatcher =
+ TwoDSphereKeyInRegionExpression* keyMatcher =
new TwoDSphereKeyInRegionExpression(_currBounds, s2Field);
ExpressionMapping::cover2dsphere(keyMatcher->getRegion(),